diff --git a/.gitattributes b/.gitattributes index bcc7d57b904..efb059f169a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ contrib/* linguist-vendored *.h linguist-language=C++ +# to avoid frequent conflicts +tests/queries/0_stateless/arcadia_skip_list.txt text merge=union diff --git a/.gitmodules b/.gitmodules index 7a2c5600e65..f7dcf5f4ac1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -93,7 +93,7 @@ url = https://github.com/ClickHouse-Extras/libunwind.git [submodule "contrib/simdjson"] path = contrib/simdjson - url = https://github.com/ClickHouse-Extras/simdjson.git + url = https://github.com/simdjson/simdjson.git [submodule "contrib/rapidjson"] path = contrib/rapidjson url = https://github.com/ClickHouse-Extras/rapidjson @@ -133,7 +133,7 @@ url = https://github.com/unicode-org/icu.git [submodule "contrib/flatbuffers"] path = contrib/flatbuffers - url = https://github.com/google/flatbuffers.git + url = https://github.com/ClickHouse-Extras/flatbuffers.git [submodule "contrib/libc-headers"] path = contrib/libc-headers url = https://github.com/ClickHouse-Extras/libc-headers.git @@ -221,3 +221,9 @@ [submodule "contrib/NuRaft"] path = contrib/NuRaft url = https://github.com/ClickHouse-Extras/NuRaft.git +[submodule "contrib/nanodbc"] + path = contrib/nanodbc + url = https://github.com/ClickHouse-Extras/nanodbc.git +[submodule "contrib/datasketches-cpp"] + path = contrib/datasketches-cpp + url = https://github.com/ClickHouse-Extras/datasketches-cpp.git diff --git a/CHANGELOG.md b/CHANGELOG.md index e2c777b3bcf..cc1ec835a7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,312 @@ +## ClickHouse release 21.4 + +### ClickHouse release 21.4.1 2021-04-12 + +#### Backward Incompatible Change + +* The `toStartOfInterval` function will align hour intervals to midnight (in previous versions they were aligned to the start of the unix epoch). For example, `toStartOfInterval(x, INTERVAL 11 HOUR)` will split every day into three intervals: `00:00:00..10:59:59`, `11:00:00..21:59:59` and `22:00:00..23:59:59`. This behaviour is better suited for practical needs. This closes [#9510](https://github.com/ClickHouse/ClickHouse/issues/9510). [#22060](https://github.com/ClickHouse/ClickHouse/pull/22060) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* `Age` and `Precision` in graphite rollup configs should increase from retention to retention. Now this is checked, and an invalid config raises an exception. [#21496](https://github.com/ClickHouse/ClickHouse/pull/21496) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix `cutToFirstSignificantSubdomainCustom()`/`firstSignificantSubdomainCustom()` returning a wrong result for 3+ level domains present in the custom top-level domain list. For input domains matching these custom top-level domains, the third-level domain was considered to be the first significant one. This is now fixed. This change may introduce incompatibility if the function is used in e.g. the sharding key. [#21946](https://github.com/ClickHouse/ClickHouse/pull/21946) ([Azat Khuzhin](https://github.com/azat)). +* Column `keys` in table `system.dictionaries` was replaced by the columns `key.names` and `key.types`. The columns `key.names`, `key.types`, `attribute.names` and `attribute.types` in the `system.dictionaries` table do not require the dictionary to be loaded. [#21884](https://github.com/ClickHouse/ClickHouse/pull/21884) ([Maksim Kita](https://github.com/kitaisreal)).
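+
+  A minimal query sketch illustrating the new `system.dictionaries` layout (the output naturally depends on which dictionaries are defined on the server):
+
+  ```sql
+  -- Key and attribute metadata is available without forcing the dictionaries to load.
+  SELECT database, name, `key.names`, `key.types`, `attribute.names`, `attribute.types`
+  FROM system.dictionaries;
+  ```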
+* Now replicas that are processing the `ALTER TABLE ATTACH PART[ITION]` command search in their `detached/` folders before fetching the data from other replicas. As an implementation detail, a new command `ATTACH_PART` is introduced in the replicated log. Parts are searched and compared by their checksums. [#18978](https://github.com/ClickHouse/ClickHouse/pull/18978) ([Mike Kot](https://github.com/myrrc)). **Note**: + * `ATTACH PART[ITION]` queries may not work during a cluster upgrade. + * It's not possible to roll back to an older ClickHouse version after executing an `ALTER ... ATTACH` query on the new version, as the old servers would fail to process the `ATTACH_PART` entry in the replicated log. +* In this version, an empty `remote_url_allow_hosts` element will block all access to remote hosts, while in previous versions it did nothing. If you want to keep the old behaviour and you have an empty `remote_url_allow_hosts` element in the configuration file, remove it. [#20058](https://github.com/ClickHouse/ClickHouse/pull/20058) ([Vladimir Chebotarev](https://github.com/excitoon)). + + +#### New Feature + +* Extended the range of `DateTime64` to support dates from the year 1925 to 2283. Improved support of `DateTime` around the zero date (`1970-01-01`). [#9404](https://github.com/ClickHouse/ClickHouse/pull/9404) ([alexey-milovidov](https://github.com/alexey-milovidov), [Vasily Nemkov](https://github.com/Enmk)). Not all time and date functions work for the extended range of dates. +* Added support for Kerberos authentication for preconfigured users and HTTP requests (GSS-SPNEGO). [#14995](https://github.com/ClickHouse/ClickHouse/pull/14995) ([Denis Glazachev](https://github.com/traceon)). +* Add `prefer_column_name_to_alias` setting to use original column names instead of aliases. It is needed for better compatibility with common databases' aliasing rules. This is for [#9715](https://github.com/ClickHouse/ClickHouse/issues/9715) and [#9887](https://github.com/ClickHouse/ClickHouse/issues/9887). [#22044](https://github.com/ClickHouse/ClickHouse/pull/22044) ([Amos Bird](https://github.com/amosbird)). +* Added functions `dictGetChildren(dictionary, key)`, `dictGetDescendants(dictionary, key, level)`. Function `dictGetChildren` returns all children as an array of indexes. It is the inverse transformation of `dictGetHierarchy`. Function `dictGetDescendants` returns all descendants as if `dictGetChildren` was applied `level` times recursively. A zero `level` value is equivalent to infinity. Closes [#14656](https://github.com/ClickHouse/ClickHouse/issues/14656). [#22096](https://github.com/ClickHouse/ClickHouse/pull/22096) ([Maksim Kita](https://github.com/kitaisreal)). +* Added `executable_pool` dictionary source. Closes [#14528](https://github.com/ClickHouse/ClickHouse/issues/14528). [#21321](https://github.com/ClickHouse/ClickHouse/pull/21321) ([Maksim Kita](https://github.com/kitaisreal)). +* Added table function `dictionary`. It works the same way as the `Dictionary` engine. Closes [#21560](https://github.com/ClickHouse/ClickHouse/issues/21560). [#21910](https://github.com/ClickHouse/ClickHouse/pull/21910) ([Maksim Kita](https://github.com/kitaisreal)). +* Support `Nullable` type for `PolygonDictionary` attribute. [#21890](https://github.com/ClickHouse/ClickHouse/pull/21890) ([Maksim Kita](https://github.com/kitaisreal)). +* Functions `dictGet`, `dictHas` use the current database name if it is not specified for dictionaries created with DDL. Closes [#21632](https://github.com/ClickHouse/ClickHouse/issues/21632).
[#21859](https://github.com/ClickHouse/ClickHouse/pull/21859) ([Maksim Kita](https://github.com/kitaisreal)). +* Added function `dictGetOrNull`. It works like `dictGet`, but return `Null` in case key was not found in dictionary. Closes [#22375](https://github.com/ClickHouse/ClickHouse/issues/22375). [#22413](https://github.com/ClickHouse/ClickHouse/pull/22413) ([Maksim Kita](https://github.com/kitaisreal)). +* Added async update in `ComplexKeyCache`, `SSDCache`, `SSDComplexKeyCache` dictionaries. Added support for `Nullable` type in `Cache`, `ComplexKeyCache`, `SSDCache`, `SSDComplexKeyCache` dictionaries. Added support for multiple attributes fetch with `dictGet`, `dictGetOrDefault` functions. Fixes [#21517](https://github.com/ClickHouse/ClickHouse/issues/21517). [#20595](https://github.com/ClickHouse/ClickHouse/pull/20595) ([Maksim Kita](https://github.com/kitaisreal)). +* Support `dictHas` function for `RangeHashedDictionary`. Fixes [#6680](https://github.com/ClickHouse/ClickHouse/issues/6680). [#19816](https://github.com/ClickHouse/ClickHouse/pull/19816) ([Maksim Kita](https://github.com/kitaisreal)). +* Add function `timezoneOf` that returns the timezone name of `DateTime` or `DateTime64` data types. This does not close [#9959](https://github.com/ClickHouse/ClickHouse/issues/9959). Fix inconsistencies in function names: add aliases `timezone` and `timeZone` as well as `toTimezone` and `toTimeZone` and `timezoneOf` and `timeZoneOf`. [#22001](https://github.com/ClickHouse/ClickHouse/pull/22001) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add new optional clause `GRANTEES` for `CREATE/ALTER USER` commands. It specifies users or roles which are allowed to receive grants from this user on condition this user has also all required access granted with grant option. By default `GRANTEES ANY` is used which means a user with grant option can grant to anyone. Syntax: `CREATE USER ... GRANTEES {user | role | ANY | NONE} [,...] [EXCEPT {user | role} [,...]]`. [#21641](https://github.com/ClickHouse/ClickHouse/pull/21641) ([Vitaly Baranov](https://github.com/vitlibar)). +* Add new column `slowdowns_count` to `system.clusters`. When using hedged requests, it shows how many times we switched to another replica because this replica was responding slowly. Also show actual value of `errors_count` in `system.clusters`. [#21480](https://github.com/ClickHouse/ClickHouse/pull/21480) ([Kruglov Pavel](https://github.com/Avogar)). +* Add `_partition_id` virtual column for `MergeTree*` engines. Allow to prune partitions by `_partition_id`. Add `partitionID()` function to calculate partition id string. [#21401](https://github.com/ClickHouse/ClickHouse/pull/21401) ([Amos Bird](https://github.com/amosbird)). +* Add function `isIPAddressInRange` to test if an IPv4 or IPv6 address is contained in a given CIDR network prefix. [#21329](https://github.com/ClickHouse/ClickHouse/pull/21329) ([PHO](https://github.com/depressed-pho)). +* Added new SQL command `ALTER TABLE 'table_name' UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name'`. This command is needed to properly remove 'freezed' partitions from all disks. [#21142](https://github.com/ClickHouse/ClickHouse/pull/21142) ([Pavel Kovalenko](https://github.com/Jokser)). +* Supports implicit key type conversion for JOIN. [#19885](https://github.com/ClickHouse/ClickHouse/pull/19885) ([Vladimir](https://github.com/vdimir)). + +#### Experimental Feature + +* Support `RANGE OFFSET` frame (for window functions) for floating point types. 
Implement `lagInFrame`/`leadInFrame` window functions, which are analogous to `lag`/`lead`, but respect the window frame. They are identical when the frame is `between unbounded preceding and unbounded following`. This closes [#5485](https://github.com/ClickHouse/ClickHouse/issues/5485). [#21895](https://github.com/ClickHouse/ClickHouse/pull/21895) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Zero-copy replication for `ReplicatedMergeTree` over S3 storage. [#16240](https://github.com/ClickHouse/ClickHouse/pull/16240) ([ianton-ru](https://github.com/ianton-ru)). +* Added possibility to migrate existing S3 disk to the schema with backup-restore capabilities. [#22070](https://github.com/ClickHouse/ClickHouse/pull/22070) ([Pavel Kovalenko](https://github.com/Jokser)). + +#### Performance Improvement + +* Supported parallel formatting in `clickhouse-local` and everywhere else. [#21630](https://github.com/ClickHouse/ClickHouse/pull/21630) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Support parallel parsing for `CSVWithNames` and `TSVWithNames` formats. This closes [#21085](https://github.com/ClickHouse/ClickHouse/issues/21085). [#21149](https://github.com/ClickHouse/ClickHouse/pull/21149) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Enable read with mmap IO for file ranges from 64 MiB (the settings `min_bytes_to_use_mmap_io`). It may lead to moderate performance improvement. [#22326](https://github.com/ClickHouse/ClickHouse/pull/22326) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add cache for files read with `min_bytes_to_use_mmap_io` setting. It makes significant (2x and more) performance improvement when the value of the setting is small by avoiding frequent mmap/munmap calls and the consequent page faults. Note that mmap IO has major drawbacks that makes it less reliable in production (e.g. hung or SIGBUS on faulty disks; less controllable memory usage). Nevertheless it is good in benchmarks. [#22206](https://github.com/ClickHouse/ClickHouse/pull/22206) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Avoid unnecessary data copy when using codec `NONE`. Please note that codec `NONE` is mostly useless - it's recommended to always use compression (`LZ4` is by default). Despite the common belief, disabling compression may not improve performance (the opposite effect is possible). The `NONE` codec is useful in some cases: - when data is uncompressable; - for synthetic benchmarks. [#22145](https://github.com/ClickHouse/ClickHouse/pull/22145) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Faster `GROUP BY` with small `max_rows_to_group_by` and `group_by_overflow_mode='any'`. [#21856](https://github.com/ClickHouse/ClickHouse/pull/21856) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Optimize performance of queries like `SELECT ... FINAL ... WHERE`. Now in queries with `FINAL` it's allowed to move to `PREWHERE` columns, which are in sorting key. [#21830](https://github.com/ClickHouse/ClickHouse/pull/21830) ([foolchi](https://github.com/foolchi)). +* Improved performance by replacing `memcpy` to another implementation. This closes [#18583](https://github.com/ClickHouse/ClickHouse/issues/18583). [#21520](https://github.com/ClickHouse/ClickHouse/pull/21520) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve performance of aggregation in order of sorting key (with enabled setting `optimize_aggregation_in_order`). 
[#19401](https://github.com/ClickHouse/ClickHouse/pull/19401) ([Anton Popov](https://github.com/CurtizJ)). + +#### Improvement + +* Add connection pool for PostgreSQL table/database engine and dictionary source. Should fix [#21444](https://github.com/ClickHouse/ClickHouse/issues/21444). [#21839](https://github.com/ClickHouse/ClickHouse/pull/21839) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support non-default table schema for postgres storage/table-function. Closes [#21701](https://github.com/ClickHouse/ClickHouse/issues/21701). [#21711](https://github.com/ClickHouse/ClickHouse/pull/21711) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support replicas priority for postgres dictionary source. [#21710](https://github.com/ClickHouse/ClickHouse/pull/21710) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Introduce a new merge tree setting `min_bytes_to_rebalance_partition_over_jbod` which allows assigning new parts to different disks of a JBOD volume in a balanced way. [#16481](https://github.com/ClickHouse/ClickHouse/pull/16481) ([Amos Bird](https://github.com/amosbird)). +* Added `Grant`, `Revoke` and `System` values of `query_kind` column for corresponding queries in `system.query_log`. [#21102](https://github.com/ClickHouse/ClickHouse/pull/21102) ([Vasily Nemkov](https://github.com/Enmk)). +* Allow customizing timeouts for HTTP connections used for replication independently from other HTTP timeouts. [#20088](https://github.com/ClickHouse/ClickHouse/pull/20088) ([nvartolomei](https://github.com/nvartolomei)). +* Better exception message in client in case of exception while server is writing blocks. In previous versions client may get misleading message like `Data compressed with different methods`. [#22427](https://github.com/ClickHouse/ClickHouse/pull/22427) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix error `Directory tmp_fetch_XXX already exists` which could happen after failed fetch part. Delete temporary fetch directory if it already exists. Fixes [#14197](https://github.com/ClickHouse/ClickHouse/issues/14197). [#22411](https://github.com/ClickHouse/ClickHouse/pull/22411) ([nvartolomei](https://github.com/nvartolomei)). +* Fix MSan report for function `range` with `UInt256` argument (support for large integers is experimental). This closes [#22157](https://github.com/ClickHouse/ClickHouse/issues/22157). [#22387](https://github.com/ClickHouse/ClickHouse/pull/22387) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add `current_database` column to `system.processes` table. It contains the current database of the query. [#22365](https://github.com/ClickHouse/ClickHouse/pull/22365) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Add case-insensitive history search/navigation and subword movement features to `clickhouse-client`. [#22105](https://github.com/ClickHouse/ClickHouse/pull/22105) ([Amos Bird](https://github.com/amosbird)). +* If tuple of NULLs, e.g. `(NULL, NULL)` is on the left hand side of `IN` operator with tuples of non-NULLs on the right hand side, e.g. `SELECT (NULL, NULL) IN ((0, 0), (3, 1))` return 0 instead of throwing an exception about incompatible types. The expression may also appear due to optimization of something like `SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1)`. This closes [#22017](https://github.com/ClickHouse/ClickHouse/issues/22017). 
[#22063](https://github.com/ClickHouse/ClickHouse/pull/22063) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Update used version of simdjson to 0.9.1. This fixes [#21984](https://github.com/ClickHouse/ClickHouse/issues/21984). [#22057](https://github.com/ClickHouse/ClickHouse/pull/22057) ([Vitaly Baranov](https://github.com/vitlibar)). +* Added case insensitive aliases for `CONNECTION_ID()` and `VERSION()` functions. This fixes [#22028](https://github.com/ClickHouse/ClickHouse/issues/22028). [#22042](https://github.com/ClickHouse/ClickHouse/pull/22042) ([Eugene Klimov](https://github.com/Slach)). +* Add option `strict_increase` to `windowFunnel` function to calculate each event once (resolve [#21835](https://github.com/ClickHouse/ClickHouse/issues/21835)). [#22025](https://github.com/ClickHouse/ClickHouse/pull/22025) ([Vladimir](https://github.com/vdimir)). +* If partition key of a `MergeTree` table does not include `Date` or `DateTime` columns but includes exactly one `DateTime64` column, expose its values in the `min_time` and `max_time` columns in `system.parts` and `system.parts_columns` tables. Add `min_time` and `max_time` columns to `system.parts_columns` table (these was inconsistency to the `system.parts` table). This closes [#18244](https://github.com/ClickHouse/ClickHouse/issues/18244). [#22011](https://github.com/ClickHouse/ClickHouse/pull/22011) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Supported `replication_alter_partitions_sync=1` setting in `clickhouse-copier` for moving partitions from helping table to destination. Decreased default timeouts. Fixes [#21911](https://github.com/ClickHouse/ClickHouse/issues/21911). [#21912](https://github.com/ClickHouse/ClickHouse/pull/21912) ([turbo jason](https://github.com/songenjie)). +* Show path to data directory of `EmbeddedRocksDB` tables in system tables. [#21903](https://github.com/ClickHouse/ClickHouse/pull/21903) ([tavplubix](https://github.com/tavplubix)). +* Add profile event `HedgedRequestsChangeReplica`, change read data timeout from sec to ms. [#21886](https://github.com/ClickHouse/ClickHouse/pull/21886) ([Kruglov Pavel](https://github.com/Avogar)). +* DiskS3 (experimental feature under development). Fixed bug with the impossibility to move directory if the destination is not empty and cache disk is used. [#21837](https://github.com/ClickHouse/ClickHouse/pull/21837) ([Pavel Kovalenko](https://github.com/Jokser)). +* Better formatting for `Array` and `Map` data types in Web UI. [#21798](https://github.com/ClickHouse/ClickHouse/pull/21798) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Update clusters only if their configurations were updated. [#21685](https://github.com/ClickHouse/ClickHouse/pull/21685) ([Kruglov Pavel](https://github.com/Avogar)). +* Propagate query and session settings for distributed DDL queries. Set `distributed_ddl_entry_format_version` to 2 to enable this. Added `distributed_ddl_output_mode` setting. Supported modes: `none`, `throw` (default), `null_status_on_timeout` and `never_throw`. Miscellaneous fixes and improvements for `Replicated` database engine. [#21535](https://github.com/ClickHouse/ClickHouse/pull/21535) ([tavplubix](https://github.com/tavplubix)). +* If `PODArray` was instantiated with element size that is neither a fraction or a multiple of 16, buffer overflow was possible. No bugs in current releases exist. [#21533](https://github.com/ClickHouse/ClickHouse/pull/21533) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
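+
+A sketch of how the distributed DDL settings described above could be combined (the cluster name `my_cluster` and the table are hypothetical):
+
+```sql
+-- Propagate session settings with the new entry format and return NULL statuses
+-- instead of throwing when some hosts time out.
+SET distributed_ddl_entry_format_version = 2;
+SET distributed_ddl_output_mode = 'null_status_on_timeout';
+
+CREATE TABLE db.events ON CLUSTER my_cluster (d Date, x UInt64)
+ENGINE = ReplicatedMergeTree
+ORDER BY x;
+```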
+* Add `last_error_time`/`last_error_message`/`last_error_stacktrace`/`remote` columns for `system.errors`. [#21529](https://github.com/ClickHouse/ClickHouse/pull/21529) ([Azat Khuzhin](https://github.com/azat)). +* Add aliases `simpleJSONExtract/simpleJSONHas` to `visitParam/visitParamExtract{UInt, Int, Bool, Float, Raw, String}`. Fixes #21383. [#21519](https://github.com/ClickHouse/ClickHouse/pull/21519) ([fastio](https://github.com/fastio)). +* Add setting `optimize_skip_unused_shards_limit` to limit the number of sharding key values for `optimize_skip_unused_shards`. [#21512](https://github.com/ClickHouse/ClickHouse/pull/21512) ([Azat Khuzhin](https://github.com/azat)). +* Improve `clickhouse-format` to not throw exception when there are extra spaces or comment after the last query, and throw exception early with readable message when format `ASTInsertQuery` with data . [#21311](https://github.com/ClickHouse/ClickHouse/pull/21311) ([flynn](https://github.com/ucasFL)). +* Improve support of integer keys in data type `Map`. [#21157](https://github.com/ClickHouse/ClickHouse/pull/21157) ([Anton Popov](https://github.com/CurtizJ)). +* MaterializeMySQL: attempt to reconnect to MySQL if the connection is lost. [#20961](https://github.com/ClickHouse/ClickHouse/pull/20961) ([Håvard Kvålen](https://github.com/havardk)). +* Support more cases to rewrite `CROSS JOIN` to `INNER JOIN`. [#20392](https://github.com/ClickHouse/ClickHouse/pull/20392) ([Vladimir](https://github.com/vdimir)). +* Do not create empty parts on INSERT when `optimize_on_insert` setting enabled. Fixes [#20304](https://github.com/ClickHouse/ClickHouse/issues/20304). [#20387](https://github.com/ClickHouse/ClickHouse/pull/20387) ([Kruglov Pavel](https://github.com/Avogar)). +* `MaterializeMySQL`: add minmax skipping index for `_version` column. [#20382](https://github.com/ClickHouse/ClickHouse/pull/20382) ([Stig Bakken](https://github.com/stigsb)). +* Add option `--backslash` for `clickhouse-format`, which can add a backslash at the end of each line of the formatted query. [#21494](https://github.com/ClickHouse/ClickHouse/pull/21494) ([flynn](https://github.com/ucasFL)). +* Now clickhouse will not throw `LOGICAL_ERROR` exception when we try to mutate the already covered part. Fixes [#22013](https://github.com/ClickHouse/ClickHouse/issues/22013). [#22291](https://github.com/ClickHouse/ClickHouse/pull/22291) ([alesapin](https://github.com/alesapin)). + +#### Bug Fix + +* Remove socket from epoll before cancelling packet receiver in `HedgedConnections` to prevent possible race. Fixes [#22161](https://github.com/ClickHouse/ClickHouse/issues/22161). [#22443](https://github.com/ClickHouse/ClickHouse/pull/22443) ([Kruglov Pavel](https://github.com/Avogar)). +* Add (missing) memory accounting in parallel parsing routines. In previous versions OOM was possible when the resultset contains very large blocks of data. This closes [#22008](https://github.com/ClickHouse/ClickHouse/issues/22008). [#22425](https://github.com/ClickHouse/ClickHouse/pull/22425) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix exception which may happen when `SELECT` has constant `WHERE` condition and source table has columns which names are digits. [#22270](https://github.com/ClickHouse/ClickHouse/pull/22270) ([LiuNeng](https://github.com/liuneng1994)). +* Fix query cancellation with `use_hedged_requests=0` and `async_socket_for_remote=1`. [#22183](https://github.com/ClickHouse/ClickHouse/pull/22183) ([Azat Khuzhin](https://github.com/azat)). 
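+
+For example, the new `system.errors` columns from the improvements above can be inspected with a query along these lines (a sketch; the result depends on which errors the server has actually seen):
+
+```sql
+SELECT name, code, value, last_error_time, last_error_message, remote
+FROM system.errors
+WHERE value > 0
+ORDER BY last_error_time DESC;
+```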
+* Fix uncaught exception in `InterserverIOHTTPHandler`. [#22146](https://github.com/ClickHouse/ClickHouse/pull/22146) ([Azat Khuzhin](https://github.com/azat)). +* Fix docker entrypoint in case `http_port` is not in the config. [#22132](https://github.com/ClickHouse/ClickHouse/pull/22132) ([Ewout](https://github.com/devwout)). +* Fix error `Invalid number of rows in Chunk` in `JOIN` with `TOTALS` and `arrayJoin`. Closes [#19303](https://github.com/ClickHouse/ClickHouse/issues/19303). [#22129](https://github.com/ClickHouse/ClickHouse/pull/22129) ([Vladimir](https://github.com/vdimir)). +* Fix the background thread pool name which used to poll message from Kafka. The Kafka engine with the broken thread pool will not consume the message from message queue. [#22122](https://github.com/ClickHouse/ClickHouse/pull/22122) ([fastio](https://github.com/fastio)). +* Fix waiting for `OPTIMIZE` and `ALTER` queries for `ReplicatedMergeTree` table engines. Now the query will not hang when the table was detached or restarted. [#22118](https://github.com/ClickHouse/ClickHouse/pull/22118) ([alesapin](https://github.com/alesapin)). +* Disable `async_socket_for_remote`/`use_hedged_requests` for buggy Linux kernels. [#22109](https://github.com/ClickHouse/ClickHouse/pull/22109) ([Azat Khuzhin](https://github.com/azat)). +* Docker entrypoint: avoid chown of `.` in case when `LOG_PATH` is empty. Closes [#22100](https://github.com/ClickHouse/ClickHouse/issues/22100). [#22102](https://github.com/ClickHouse/ClickHouse/pull/22102) ([filimonov](https://github.com/filimonov)). +* The function `decrypt` was lacking a check for the minimal size of data encrypted in `AEAD` mode. This closes [#21897](https://github.com/ClickHouse/ClickHouse/issues/21897). [#22064](https://github.com/ClickHouse/ClickHouse/pull/22064) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* In rare case, merge for `CollapsingMergeTree` may create granule with `index_granularity + 1` rows. Because of this, internal check, added in [#18928](https://github.com/ClickHouse/ClickHouse/issues/18928) (affects 21.2 and 21.3), may fail with error `Incomplete granules are not allowed while blocks are granules size`. This error did not allow parts to merge. [#21976](https://github.com/ClickHouse/ClickHouse/pull/21976) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Reverted [#15454](https://github.com/ClickHouse/ClickHouse/issues/15454) that may cause significant increase in memory usage while loading external dictionaries of hashed type. This closes [#21935](https://github.com/ClickHouse/ClickHouse/issues/21935). [#21948](https://github.com/ClickHouse/ClickHouse/pull/21948) ([Maksim Kita](https://github.com/kitaisreal)). +* Prevent hedged connections overlaps (`Unknown packet 9 from server` error). [#21941](https://github.com/ClickHouse/ClickHouse/pull/21941) ([Azat Khuzhin](https://github.com/azat)). +* Fix reading the HTTP POST request with "multipart/form-data" content type in some cases. [#21936](https://github.com/ClickHouse/ClickHouse/pull/21936) ([Ivan](https://github.com/abyss7)). +* Fix wrong `ORDER BY` results when a query contains window functions, and optimization for reading in primary key order is applied. Fixes [#21828](https://github.com/ClickHouse/ClickHouse/issues/21828). [#21915](https://github.com/ClickHouse/ClickHouse/pull/21915) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Fix deadlock in first catboost model execution. Closes [#13832](https://github.com/ClickHouse/ClickHouse/issues/13832). 
[#21844](https://github.com/ClickHouse/ClickHouse/pull/21844) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix incorrect query result (and possible crash) which could happen when `WHERE` or `HAVING` condition is pushed before `GROUP BY`. Fixes [#21773](https://github.com/ClickHouse/ClickHouse/issues/21773). [#21841](https://github.com/ClickHouse/ClickHouse/pull/21841) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Better error handling and logging in `WriteBufferFromS3`. [#21836](https://github.com/ClickHouse/ClickHouse/pull/21836) ([Pavel Kovalenko](https://github.com/Jokser)). +* Fix possible crashes in aggregate functions with combinator `Distinct`, while using two-level aggregation. This is a follow-up fix of [#18365](https://github.com/ClickHouse/ClickHouse/pull/18365) . Can only reproduced in production env. [#21818](https://github.com/ClickHouse/ClickHouse/pull/21818) ([Amos Bird](https://github.com/amosbird)). +* Fix scalar subquery index analysis. This fixes [#21717](https://github.com/ClickHouse/ClickHouse/issues/21717) , which was introduced in [#18896](https://github.com/ClickHouse/ClickHouse/pull/18896). [#21766](https://github.com/ClickHouse/ClickHouse/pull/21766) ([Amos Bird](https://github.com/amosbird)). +* Fix bug for `ReplicatedMerge` table engines when `ALTER MODIFY COLUMN` query doesn't change the type of `Decimal` column if its size (32 bit or 64 bit) doesn't change. [#21728](https://github.com/ClickHouse/ClickHouse/pull/21728) ([alesapin](https://github.com/alesapin)). +* Fix possible infinite waiting when concurrent `OPTIMIZE` and `DROP` are run for `ReplicatedMergeTree`. [#21716](https://github.com/ClickHouse/ClickHouse/pull/21716) ([Azat Khuzhin](https://github.com/azat)). +* Fix function `arrayElement` with type `Map` for constant integer arguments. [#21699](https://github.com/ClickHouse/ClickHouse/pull/21699) ([Anton Popov](https://github.com/CurtizJ)). +* Fix SIGSEGV on not existing attributes from `ip_trie` with `access_to_key_from_attributes`. [#21692](https://github.com/ClickHouse/ClickHouse/pull/21692) ([Azat Khuzhin](https://github.com/azat)). +* Server now start accepting connections only after `DDLWorker` and dictionaries initialization. [#21676](https://github.com/ClickHouse/ClickHouse/pull/21676) ([Azat Khuzhin](https://github.com/azat)). +* Add type conversion for keys of tables of type `Join` (previously led to SIGSEGV). [#21646](https://github.com/ClickHouse/ClickHouse/pull/21646) ([Azat Khuzhin](https://github.com/azat)). +* Fix distributed requests cancellation (for example simple select from multiple shards with limit, i.e. `select * from remote('127.{2,3}', system.numbers) limit 100`) with `async_socket_for_remote=1`. [#21643](https://github.com/ClickHouse/ClickHouse/pull/21643) ([Azat Khuzhin](https://github.com/azat)). +* Fix `fsync_part_directory` for horizontal merge. [#21642](https://github.com/ClickHouse/ClickHouse/pull/21642) ([Azat Khuzhin](https://github.com/azat)). +* Remove unknown columns from joined table in `WHERE` for queries to external database engines (MySQL, PostgreSQL). close [#14614](https://github.com/ClickHouse/ClickHouse/issues/14614), close [#19288](https://github.com/ClickHouse/ClickHouse/issues/19288) (dup), close [#19645](https://github.com/ClickHouse/ClickHouse/issues/19645) (dup). [#21640](https://github.com/ClickHouse/ClickHouse/pull/21640) ([Vladimir](https://github.com/vdimir)). +* `std::terminate` was called if there is an error writing data into s3. 
[#21624](https://github.com/ClickHouse/ClickHouse/pull/21624) ([Vladimir](https://github.com/vdimir)). +* Fix possible error `Cannot find column` when `optimize_skip_unused_shards` is enabled and zero shards are used. [#21579](https://github.com/ClickHouse/ClickHouse/pull/21579) ([Azat Khuzhin](https://github.com/azat)). +* In case if query has constant `WHERE` condition, and setting `optimize_skip_unused_shards` enabled, all shards may be skipped and query could return incorrect empty result. [#21550](https://github.com/ClickHouse/ClickHouse/pull/21550) ([Amos Bird](https://github.com/amosbird)). +* Fix table function `clusterAllReplicas` returns wrong `_shard_num`. close [#21481](https://github.com/ClickHouse/ClickHouse/issues/21481). [#21498](https://github.com/ClickHouse/ClickHouse/pull/21498) ([flynn](https://github.com/ucasFL)). +* Fix that S3 table holds old credentials after config update. [#21457](https://github.com/ClickHouse/ClickHouse/pull/21457) ([Grigory Pervakov](https://github.com/GrigoryPervakov)). +* Fixed race on SSL object inside `SecureSocket` in Poco. [#21456](https://github.com/ClickHouse/ClickHouse/pull/21456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix `Avro` format parsing for `Kafka`. Fixes [#21437](https://github.com/ClickHouse/ClickHouse/issues/21437). [#21438](https://github.com/ClickHouse/ClickHouse/pull/21438) ([Ilya Golshtein](https://github.com/ilejn)). +* Fix receive and send timeouts and non-blocking read in secure socket. [#21429](https://github.com/ClickHouse/ClickHouse/pull/21429) ([Kruglov Pavel](https://github.com/Avogar)). +* `force_drop_table` flag didn't work for `MATERIALIZED VIEW`, it's fixed. Fixes [#18943](https://github.com/ClickHouse/ClickHouse/issues/18943). [#20626](https://github.com/ClickHouse/ClickHouse/pull/20626) ([tavplubix](https://github.com/tavplubix)). +* Fix name clashes in `PredicateRewriteVisitor`. It caused incorrect `WHERE` filtration after full join. Close [#20497](https://github.com/ClickHouse/ClickHouse/issues/20497). [#20622](https://github.com/ClickHouse/ClickHouse/pull/20622) ([Vladimir](https://github.com/vdimir)). + +#### Build/Testing/Packaging Improvement + +* Add [Jepsen](https://github.com/jepsen-io/jepsen) tests for ClickHouse Keeper. [#21677](https://github.com/ClickHouse/ClickHouse/pull/21677) ([alesapin](https://github.com/alesapin)). +* Run stateless tests in parallel in CI. Depends on [#22181](https://github.com/ClickHouse/ClickHouse/issues/22181). [#22300](https://github.com/ClickHouse/ClickHouse/pull/22300) ([alesapin](https://github.com/alesapin)). +* Enable status check for [SQLancer](https://github.com/sqlancer/sqlancer) CI run. [#22015](https://github.com/ClickHouse/ClickHouse/pull/22015) ([Ilya Yatsishin](https://github.com/qoega)). +* Multiple preparations for PowerPC builds: Enable the bundled openldap on `ppc64le`. [#22487](https://github.com/ClickHouse/ClickHouse/pull/22487) ([Kfir Itzhak](https://github.com/mastertheknife)). Enable compiling on `ppc64le` with Clang. [#22476](https://github.com/ClickHouse/ClickHouse/pull/22476) ([Kfir Itzhak](https://github.com/mastertheknife)). Fix compiling boost on `ppc64le`. [#22474](https://github.com/ClickHouse/ClickHouse/pull/22474) ([Kfir Itzhak](https://github.com/mastertheknife)). Fix CMake error about internal CMake variable `CMAKE_ASM_COMPILE_OBJECT` not set on `ppc64le`. [#22469](https://github.com/ClickHouse/ClickHouse/pull/22469) ([Kfir Itzhak](https://github.com/mastertheknife)). 
Fix Fedora/RHEL/CentOS not finding `libclang_rt.builtins` on `ppc64le`. [#22458](https://github.com/ClickHouse/ClickHouse/pull/22458) ([Kfir Itzhak](https://github.com/mastertheknife)). Enable building with `jemalloc` on `ppc64le`. [#22447](https://github.com/ClickHouse/ClickHouse/pull/22447) ([Kfir Itzhak](https://github.com/mastertheknife)). Fix ClickHouse's config embedding and cctz's timezone embedding on `ppc64le`. [#22445](https://github.com/ClickHouse/ClickHouse/pull/22445) ([Kfir Itzhak](https://github.com/mastertheknife)). Fixed compiling on `ppc64le` and use the correct instruction pointer register on `ppc64le`. [#22430](https://github.com/ClickHouse/ClickHouse/pull/22430) ([Kfir Itzhak](https://github.com/mastertheknife)). +* Re-enable the S3 (AWS) library on `aarch64`. [#22484](https://github.com/ClickHouse/ClickHouse/pull/22484) ([Kfir Itzhak](https://github.com/mastertheknife)). +* Add `tzdata` to Docker containers because reading `ORC` formats requires it. This closes [#14156](https://github.com/ClickHouse/ClickHouse/issues/14156). [#22000](https://github.com/ClickHouse/ClickHouse/pull/22000) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Introduce 2 arguments for `clickhouse-server` image Dockerfile: `deb_location` & `single_binary_location`. [#21977](https://github.com/ClickHouse/ClickHouse/pull/21977) ([filimonov](https://github.com/filimonov)). +* Allow to use clang-tidy with release builds by enabling assertions if it is used. [#21914](https://github.com/ClickHouse/ClickHouse/pull/21914) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add llvm-12 binaries name to search in cmake scripts. Implicit constants conversions to mute clang warnings. Updated submodules to build with CMake 3.19. Mute recursion in macro expansion in `readpassphrase` library. Deprecated `-fuse-ld` changed to `--ld-path` for clang. [#21597](https://github.com/ClickHouse/ClickHouse/pull/21597) ([Ilya Yatsishin](https://github.com/qoega)). +* Updating `docker/test/testflows/runner/dockerd-entrypoint.sh` to use Yandex dockerhub-proxy, because Docker Hub has enabled very restrictive rate limits [#21551](https://github.com/ClickHouse/ClickHouse/pull/21551) ([vzakaznikov](https://github.com/vzakaznikov)). +* Fix macOS shared lib build. [#20184](https://github.com/ClickHouse/ClickHouse/pull/20184) ([nvartolomei](https://github.com/nvartolomei)). +* Add `ctime` option to `zookeeper-dump-tree`. It allows to dump node creation time. [#21842](https://github.com/ClickHouse/ClickHouse/pull/21842) ([Ilya](https://github.com/HumanUser)). + + +## ClickHouse release 21.3 (LTS) + +### ClickHouse release v21.3, 2021-03-12 + +#### Backward Incompatible Change + +* Now it's not allowed to create MergeTree tables in old syntax with table TTL because it's just ignored. Attach of old tables is still possible. [#20282](https://github.com/ClickHouse/ClickHouse/pull/20282) ([alesapin](https://github.com/alesapin)). +* Now all case-insensitive function names will be rewritten to their canonical representations. This is needed for projection query routing (the upcoming feature). [#20174](https://github.com/ClickHouse/ClickHouse/pull/20174) ([Amos Bird](https://github.com/amosbird)). +* Fix creation of `TTL` in cases, when its expression is a function and it is the same as `ORDER BY` key. Now it's allowed to set custom aggregation to primary key columns in `TTL` with `GROUP BY`. 
Backward incompatible: for primary key columns that are not in `GROUP BY` and are not set explicitly, the function `any` is now applied instead of `max` when the TTL expires. Also, if you use TTL with `WHERE` or `GROUP BY`, you may see exceptions at merges during a rolling update. [#15450](https://github.com/ClickHouse/ClickHouse/pull/15450) ([Anton Popov](https://github.com/CurtizJ)). + +#### New Feature + +* Add file engine settings: `engine_file_empty_if_not_exists` and `engine_file_truncate_on_insert`. [#20620](https://github.com/ClickHouse/ClickHouse/pull/20620) ([M0r64n](https://github.com/M0r64n)). +* Add aggregate function `deltaSum` for summing the differences between consecutive rows. [#20057](https://github.com/ClickHouse/ClickHouse/pull/20057) ([Russ Frank](https://github.com/rf)). +* New `event_time_microseconds` column in `system.part_log` table. [#20027](https://github.com/ClickHouse/ClickHouse/pull/20027) ([Bharat Nallan](https://github.com/bharatnc)). +* Added `timezoneOffset(datetime)` function which returns the offset from UTC in seconds. This closes [#19850](https://github.com/ClickHouse/ClickHouse/issues/19850). [#19962](https://github.com/ClickHouse/ClickHouse/pull/19962) ([keenwolf](https://github.com/keen-wolf)). +* Add setting `insert_shard_id` to support inserting data into a specific shard from a distributed table. [#19961](https://github.com/ClickHouse/ClickHouse/pull/19961) ([flynn](https://github.com/ucasFL)). +* Function `reinterpretAs` updated to support big integers. Fixes [#19691](https://github.com/ClickHouse/ClickHouse/issues/19691). [#19858](https://github.com/ClickHouse/ClickHouse/pull/19858) ([Maksim Kita](https://github.com/kitaisreal)). +* Added Server Side Encryption Customer Keys (the `x-amz-server-side-encryption-customer-(key/md5)` header) support in the S3 client. See [the link](https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html). Closes [#19428](https://github.com/ClickHouse/ClickHouse/issues/19428). [#19748](https://github.com/ClickHouse/ClickHouse/pull/19748) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Added `implicit_key` option for the `executable` dictionary source. It allows omitting the key for every record if records come in the same order as the input keys. Implements [#14527](https://github.com/ClickHouse/ClickHouse/issues/14527). [#19677](https://github.com/ClickHouse/ClickHouse/pull/19677) ([Maksim Kita](https://github.com/kitaisreal)). +* Add quota types `query_selects` and `query_inserts`. [#19603](https://github.com/ClickHouse/ClickHouse/pull/19603) ([JackyWoo](https://github.com/JackyWoo)). +* Add function `extractTextFromHTML`. [#19600](https://github.com/ClickHouse/ClickHouse/pull/19600) ([zlx19950903](https://github.com/zlx19950903)), ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Tables with `MergeTree*` engine now have two new table-level settings for query concurrency control. Setting `max_concurrent_queries` limits the number of concurrently executed queries which are related to this table. Setting `min_marks_to_honor_max_concurrent_queries` applies the previous setting only if the query reads at least this number of marks. [#19544](https://github.com/ClickHouse/ClickHouse/pull/19544) ([Amos Bird](https://github.com/amosbird)). +* Added `file` function to read a file from the user_files directory as a String. This is different from the `file` table function. This implements [#18851](https://github.com/ClickHouse/ClickHouse/issues/18851).
[#19204](https://github.com/ClickHouse/ClickHouse/pull/19204) ([keenwolf](https://github.com/keen-wolf)). + +#### Experimental feature + +* Add experimental `Replicated` database engine. It replicates DDL queries across multiple hosts. [#16193](https://github.com/ClickHouse/ClickHouse/pull/16193) ([tavplubix](https://github.com/tavplubix)). +* Introduce experimental support for window functions, enabled with `allow_experimental_window_functions = 1`. This is a preliminary, alpha-quality implementation that is not suitable for production use and will change in backward-incompatible ways in future releases. Please see [the documentation](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/sql-reference/window-functions/index.md#experimental-window-functions) for the list of supported features. [#20337](https://github.com/ClickHouse/ClickHouse/pull/20337) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Add the ability to backup/restore metadata files for DiskS3. [#18377](https://github.com/ClickHouse/ClickHouse/pull/18377) ([Pavel Kovalenko](https://github.com/Jokser)). + +#### Performance Improvement + +* Hedged requests for remote queries. When setting `use_hedged_requests` enabled (off by default), allow to establish many connections with different replicas for query. New connection is enabled in case existent connection(s) with replica(s) were not established within `hedged_connection_timeout` or no data was received within `receive_data_timeout`. Query uses the first connection which send non empty progress packet (or data packet, if `allow_changing_replica_until_first_data_packet`); other connections are cancelled. Queries with `max_parallel_replicas > 1` are supported. [#19291](https://github.com/ClickHouse/ClickHouse/pull/19291) ([Kruglov Pavel](https://github.com/Avogar)). This allows to significantly reduce tail latencies on very large clusters. +* Added support for `PREWHERE` (and enable the corresponding optimization) when tables have row-level security expressions specified. [#19576](https://github.com/ClickHouse/ClickHouse/pull/19576) ([Denis Glazachev](https://github.com/traceon)). +* The setting `distributed_aggregation_memory_efficient` is enabled by default. It will lower memory usage and improve performance of distributed queries. [#20599](https://github.com/ClickHouse/ClickHouse/pull/20599) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve performance of GROUP BY multiple fixed size keys. [#20472](https://github.com/ClickHouse/ClickHouse/pull/20472) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve performance of aggregate functions by more strict aliasing. [#19946](https://github.com/ClickHouse/ClickHouse/pull/19946) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Speed up reading from `Memory` tables in extreme cases (when reading speed is in order of 50 GB/sec) by simplification of pipeline and (consequently) less lock contention in pipeline scheduling. [#20468](https://github.com/ClickHouse/ClickHouse/pull/20468) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Partially reimplement HTTP server to make it making less copies of incoming and outgoing data. It gives up to 1.5 performance improvement on inserting long records over HTTP. [#19516](https://github.com/ClickHouse/ClickHouse/pull/19516) ([Ivan](https://github.com/abyss7)). +* Add `compress` setting for `Memory` tables. If it's enabled the table will use less RAM. 
On some machines and datasets it can also work faster on SELECT, but it is not always the case. This closes [#20093](https://github.com/ClickHouse/ClickHouse/issues/20093). Note: there are reasons why Memory tables can work slower than MergeTree: (1) lack of compression (2) static size of blocks (3) lack of indices and prewhere... [#20168](https://github.com/ClickHouse/ClickHouse/pull/20168) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Slightly better code in aggregation. [#20978](https://github.com/ClickHouse/ClickHouse/pull/20978) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add back `intDiv`/`modulo` specializations for better performance. This fixes [#21293](https://github.com/ClickHouse/ClickHouse/issues/21293) . The regression was introduced in https://github.com/ClickHouse/ClickHouse/pull/18145 . [#21307](https://github.com/ClickHouse/ClickHouse/pull/21307) ([Amos Bird](https://github.com/amosbird)). +* Do not squash blocks too much on INSERT SELECT if inserting into Memory table. In previous versions inefficient data representation was created in Memory table after INSERT SELECT. This closes [#13052](https://github.com/ClickHouse/ClickHouse/issues/13052). [#20169](https://github.com/ClickHouse/ClickHouse/pull/20169) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix at least one case when DataType parser may have exponential complexity (found by fuzzer). This closes [#20096](https://github.com/ClickHouse/ClickHouse/issues/20096). [#20132](https://github.com/ClickHouse/ClickHouse/pull/20132) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Parallelize SELECT with FINAL for single part with level > 0 when `do_not_merge_across_partitions_select_final` setting is 1. [#19375](https://github.com/ClickHouse/ClickHouse/pull/19375) ([Kruglov Pavel](https://github.com/Avogar)). +* Fill only requested columns when querying `system.parts` and `system.parts_columns`. Closes [#19570](https://github.com/ClickHouse/ClickHouse/issues/19570). [#21035](https://github.com/ClickHouse/ClickHouse/pull/21035) ([Anmol Arora](https://github.com/anmolarora)). +* Perform algebraic optimizations of arithmetic expressions inside `avg` aggregate function. close [#20092](https://github.com/ClickHouse/ClickHouse/issues/20092). [#20183](https://github.com/ClickHouse/ClickHouse/pull/20183) ([flynn](https://github.com/ucasFL)). + +#### Improvement + +* Case-insensitive compression methods for table functions. Also fixed LZMA compression method which was checked in upper case. [#21416](https://github.com/ClickHouse/ClickHouse/pull/21416) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Add two settings to delay or throw error during insertion when there are too many inactive parts. This is useful when server fails to clean up parts quickly enough. [#20178](https://github.com/ClickHouse/ClickHouse/pull/20178) ([Amos Bird](https://github.com/amosbird)). +* Provide better compatibility for mysql clients. 1. mysql jdbc 2. mycli. [#21367](https://github.com/ClickHouse/ClickHouse/pull/21367) ([Amos Bird](https://github.com/amosbird)). +* Forbid to drop a column if it's referenced by materialized view. Closes [#21164](https://github.com/ClickHouse/ClickHouse/issues/21164). [#21303](https://github.com/ClickHouse/ClickHouse/pull/21303) ([flynn](https://github.com/ucasFL)). +* MySQL dictionary source will now retry unexpected connection failures (Lost connection to MySQL server during query) which sometimes happen on SSL/TLS connections. 
[#21237](https://github.com/ClickHouse/ClickHouse/pull/21237) ([Alexander Kazakov](https://github.com/Akazz)). +* Usability improvement: more consistent `DateTime64` parsing: recognize the case when a unix timestamp with subsecond resolution is specified as a scaled integer (like `1111111111222` instead of `1111111111.222`). This closes [#13194](https://github.com/ClickHouse/ClickHouse/issues/13194). [#21053](https://github.com/ClickHouse/ClickHouse/pull/21053) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Perform only merging of sorted blocks on the initiator with `distributed_group_by_no_merge`. [#20882](https://github.com/ClickHouse/ClickHouse/pull/20882) ([Azat Khuzhin](https://github.com/azat)). +* When loading the config for a MySQL source, ClickHouse will now randomize the list of replicas with the same priority to ensure round-robin logic when picking a MySQL endpoint. This closes [#20629](https://github.com/ClickHouse/ClickHouse/issues/20629). [#20632](https://github.com/ClickHouse/ClickHouse/pull/20632) ([Alexander Kazakov](https://github.com/Akazz)). +* Function `reinterpretAs(x, Type)` was renamed to `reinterpret(x, Type)`. [#20611](https://github.com/ClickHouse/ClickHouse/pull/20611) ([Maksim Kita](https://github.com/kitaisreal)). +* Support vhost for the RabbitMQ engine [#20576](https://github.com/ClickHouse/ClickHouse/issues/20576). [#20596](https://github.com/ClickHouse/ClickHouse/pull/20596) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Improved serialization for data types composed of Arrays and Tuples. Improved matching of enum data types to the protobuf enum type. Fixed serialization of the `Map` data type. Omitted values are now set by default. [#20506](https://github.com/ClickHouse/ClickHouse/pull/20506) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fixed a race between execution of distributed DDL tasks and cleanup of the DDL queue. Now a DDL task cannot be removed from ZooKeeper if there are active workers. Fixes [#20016](https://github.com/ClickHouse/ClickHouse/issues/20016). [#20448](https://github.com/ClickHouse/ClickHouse/pull/20448) ([tavplubix](https://github.com/tavplubix)). +* Make FQDN and other DNS-related functions work correctly in alpine images. [#20336](https://github.com/ClickHouse/ClickHouse/pull/20336) ([filimonov](https://github.com/filimonov)). +* Do not allow early constant folding of explicitly forbidden functions. [#20303](https://github.com/ClickHouse/ClickHouse/pull/20303) ([Azat Khuzhin](https://github.com/azat)). +* Implicit conversion from an integer to a Decimal type could succeed even if the integer value did not fit into the Decimal type. Now it throws `ARGUMENT_OUT_OF_BOUND`. [#20232](https://github.com/ClickHouse/ClickHouse/pull/20232) ([tavplubix](https://github.com/tavplubix)). +* Lockless `SYSTEM FLUSH DISTRIBUTED`. [#20215](https://github.com/ClickHouse/ClickHouse/pull/20215) ([Azat Khuzhin](https://github.com/azat)). +* Normalize `count(constant)` and `sum(1)` to `count()`. This is needed for projection query routing. [#20175](https://github.com/ClickHouse/ClickHouse/pull/20175) ([Amos Bird](https://github.com/amosbird)). +* Support all native integer types in bitmap functions. [#20171](https://github.com/ClickHouse/ClickHouse/pull/20171) ([Amos Bird](https://github.com/amosbird)). +* Updated `CacheDictionary`, `ComplexCacheDictionary`, `SSDCacheDictionary`, `SSDComplexKeyDictionary` to use `LRUHashMap` as the underlying index. [#20164](https://github.com/ClickHouse/ClickHouse/pull/20164) ([Maksim Kita](https://github.com/kitaisreal)).
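+
+A minimal sketch of the renamed `reinterpret(x, Type)` function mentioned above (reinterpreting the single byte of an `Int8` value `-1` as `UInt8` yields `255`):
+
+```sql
+SELECT reinterpret(toInt8(-1), 'UInt8') AS value;  -- 255
+```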
+* The setting `access_management` is now configurable on startup by providing `CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT`, defaults to disabled (`0`) which was the prior value. [#20139](https://github.com/ClickHouse/ClickHouse/pull/20139) ([Marquitos](https://github.com/sonirico)). +* Fix toDateTime64(toDate()/toDateTime()) for DateTime64 - Implement DateTime64 clamping to match DateTime behaviour. [#20131](https://github.com/ClickHouse/ClickHouse/pull/20131) ([Azat Khuzhin](https://github.com/azat)). +* Quota improvements: SHOW TABLES is now considered as one query in the quota calculations, not two queries. SYSTEM queries now consume quota. Fix calculation of interval's end in quota consumption. [#20106](https://github.com/ClickHouse/ClickHouse/pull/20106) ([Vitaly Baranov](https://github.com/vitlibar)). +* Supports `path IN (set)` expressions for `system.zookeeper` table. [#20105](https://github.com/ClickHouse/ClickHouse/pull/20105) ([小路](https://github.com/nicelulu)). +* Show full details of `MaterializeMySQL` tables in `system.tables`. [#20051](https://github.com/ClickHouse/ClickHouse/pull/20051) ([Stig Bakken](https://github.com/stigsb)). +* Fix data race in executable dictionary that was possible only on misuse (when the script returns data ignoring its input). [#20045](https://github.com/ClickHouse/ClickHouse/pull/20045) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* The value of MYSQL_OPT_RECONNECT option can now be controlled by "opt_reconnect" parameter in the config section of mysql replica. [#19998](https://github.com/ClickHouse/ClickHouse/pull/19998) ([Alexander Kazakov](https://github.com/Akazz)). +* If user calls `JSONExtract` function with `Float32` type requested, allow inaccurate conversion to the result type. For example the number `0.1` in JSON is double precision and is not representable in Float32, but the user still wants to get it. Previous versions return 0 for non-Nullable type and NULL for Nullable type to indicate that conversion is imprecise. The logic was 100% correct but it was surprising to users and leading to questions. This closes [#13962](https://github.com/ClickHouse/ClickHouse/issues/13962). [#19960](https://github.com/ClickHouse/ClickHouse/pull/19960) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add conversion of block structure for INSERT into Distributed tables if it does not match. [#19947](https://github.com/ClickHouse/ClickHouse/pull/19947) ([Azat Khuzhin](https://github.com/azat)). +* Improvement for the `system.distributed_ddl_queue` table. Initialize MaxDDLEntryID to the last value after restarting. Before this PR, MaxDDLEntryID will remain zero until a new DDLTask is processed. [#19924](https://github.com/ClickHouse/ClickHouse/pull/19924) ([Amos Bird](https://github.com/amosbird)). +* Show `MaterializeMySQL` tables in `system.parts`. [#19770](https://github.com/ClickHouse/ClickHouse/pull/19770) ([Stig Bakken](https://github.com/stigsb)). +* Add separate config directive for `Buffer` profile. [#19721](https://github.com/ClickHouse/ClickHouse/pull/19721) ([Azat Khuzhin](https://github.com/azat)). +* Move conditions that are not related to JOIN to WHERE clause. [#18720](https://github.com/ClickHouse/ClickHouse/issues/18720). [#19685](https://github.com/ClickHouse/ClickHouse/pull/19685) ([hexiaoting](https://github.com/hexiaoting)). 
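+
+The relaxed `JSONExtract` behaviour described above can be illustrated with a small sketch (the result is the nearest representable `Float32`, not an exact `0.1`):
+
+```sql
+SELECT JSONExtract('{"value": 0.1}', 'value', 'Float32') AS v;
+```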
+* Add ability to throttle INSERT into Distributed based on amount of pending bytes for async send (`bytes_to_delay_insert`/`max_delay_to_insert` and `bytes_to_throw_insert` settings for `Distributed` engine has been added). [#19673](https://github.com/ClickHouse/ClickHouse/pull/19673) ([Azat Khuzhin](https://github.com/azat)). +* Fix some rare cases when write errors can be ignored in destructors. [#19451](https://github.com/ClickHouse/ClickHouse/pull/19451) ([Azat Khuzhin](https://github.com/azat)). +* Print inline frames in stack traces for fatal errors. [#19317](https://github.com/ClickHouse/ClickHouse/pull/19317) ([Ivan](https://github.com/abyss7)). + +#### Bug Fix + +* Fix redundant reconnects to ZooKeeper and the possibility of two active sessions for a single clickhouse server. Both problems introduced in #14678. [#21264](https://github.com/ClickHouse/ClickHouse/pull/21264) ([alesapin](https://github.com/alesapin)). +* Fix error `Bad cast from type ... to DB::ColumnLowCardinality` while inserting into table with `LowCardinality` column from `Values` format. Fixes #21140 [#21357](https://github.com/ClickHouse/ClickHouse/pull/21357) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix a deadlock in `ALTER DELETE` mutations for non replicated MergeTree table engines when the predicate contains the table itself. Fixes [#20558](https://github.com/ClickHouse/ClickHouse/issues/20558). [#21477](https://github.com/ClickHouse/ClickHouse/pull/21477) ([alesapin](https://github.com/alesapin)). +* Fix SIGSEGV for distributed queries on failures. [#21434](https://github.com/ClickHouse/ClickHouse/pull/21434) ([Azat Khuzhin](https://github.com/azat)). +* Now `ALTER MODIFY COLUMN` queries will correctly affect changes in partition key, skip indices, TTLs, and so on. Fixes [#13675](https://github.com/ClickHouse/ClickHouse/issues/13675). [#21334](https://github.com/ClickHouse/ClickHouse/pull/21334) ([alesapin](https://github.com/alesapin)). +* Fix bug with `join_use_nulls` and joining `TOTALS` from subqueries. This closes [#19362](https://github.com/ClickHouse/ClickHouse/issues/19362) and [#21137](https://github.com/ClickHouse/ClickHouse/issues/21137). [#21248](https://github.com/ClickHouse/ClickHouse/pull/21248) ([vdimir](https://github.com/vdimir)). +* Fix crash in `EXPLAIN` for query with `UNION`. Fixes [#20876](https://github.com/ClickHouse/ClickHouse/issues/20876), [#21170](https://github.com/ClickHouse/ClickHouse/issues/21170). [#21246](https://github.com/ClickHouse/ClickHouse/pull/21246) ([flynn](https://github.com/ucasFL)). +* Now mutations allowed only for table engines that support them (MergeTree family, Memory, MaterializedView). Other engines will report a more clear error. Fixes [#21168](https://github.com/ClickHouse/ClickHouse/issues/21168). [#21183](https://github.com/ClickHouse/ClickHouse/pull/21183) ([alesapin](https://github.com/alesapin)). +* Fixes [#21112](https://github.com/ClickHouse/ClickHouse/issues/21112). Fixed bug that could cause duplicates with insert query (if one of the callbacks came a little too late). [#21138](https://github.com/ClickHouse/ClickHouse/pull/21138) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix `input_format_null_as_default` take effective when types are nullable. This fixes [#21116](https://github.com/ClickHouse/ClickHouse/issues/21116) . [#21121](https://github.com/ClickHouse/ClickHouse/pull/21121) ([Amos Bird](https://github.com/amosbird)). +* fix bug related to cast Tuple to Map. 
Closes [#21029](https://github.com/ClickHouse/ClickHouse/issues/21029). [#21120](https://github.com/ClickHouse/ClickHouse/pull/21120) ([hexiaoting](https://github.com/hexiaoting)). +* Fix the metadata leak when the Replicated*MergeTree with custom (non default) ZooKeeper cluster is dropped. [#21119](https://github.com/ClickHouse/ClickHouse/pull/21119) ([fastio](https://github.com/fastio)). +* Fix type mismatch issue when using LowCardinality keys in joinGet. This fixes [#21114](https://github.com/ClickHouse/ClickHouse/issues/21114). [#21117](https://github.com/ClickHouse/ClickHouse/pull/21117) ([Amos Bird](https://github.com/amosbird)). +* fix default_replica_path and default_replica_name values are useless on Replicated(*)MergeTree engine when the engine needs specify other parameters. [#21060](https://github.com/ClickHouse/ClickHouse/pull/21060) ([mxzlxy](https://github.com/mxzlxy)). +* Out of bound memory access was possible when formatting specifically crafted out of range value of type `DateTime64`. This closes [#20494](https://github.com/ClickHouse/ClickHouse/issues/20494). This closes [#20543](https://github.com/ClickHouse/ClickHouse/issues/20543). [#21023](https://github.com/ClickHouse/ClickHouse/pull/21023) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Block parallel insertions into storage join. [#21009](https://github.com/ClickHouse/ClickHouse/pull/21009) ([vdimir](https://github.com/vdimir)). +* Fixed behaviour, when `ALTER MODIFY COLUMN` created mutation, that will knowingly fail. [#21007](https://github.com/ClickHouse/ClickHouse/pull/21007) ([Anton Popov](https://github.com/CurtizJ)). +* Closes [#9969](https://github.com/ClickHouse/ClickHouse/issues/9969). Fixed Brotli http compression error, which reproduced for large data sizes, slightly complicated structure and with json output format. Update Brotli to the latest version to include the "fix rare access to uninitialized data in ring-buffer". [#20991](https://github.com/ClickHouse/ClickHouse/pull/20991) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix 'Empty task was returned from async task queue' on query cancellation. [#20881](https://github.com/ClickHouse/ClickHouse/pull/20881) ([Azat Khuzhin](https://github.com/azat)). +* `USE database;` query did not work when using MySQL 5.7 client to connect to ClickHouse server, it's fixed. Fixes [#18926](https://github.com/ClickHouse/ClickHouse/issues/18926). [#20878](https://github.com/ClickHouse/ClickHouse/pull/20878) ([tavplubix](https://github.com/tavplubix)). +* Fix usage of `-Distinct` combinator with `-State` combinator in aggregate functions. [#20866](https://github.com/ClickHouse/ClickHouse/pull/20866) ([Anton Popov](https://github.com/CurtizJ)). +* Fix subquery with union distinct and limit clause. close [#20597](https://github.com/ClickHouse/ClickHouse/issues/20597). [#20610](https://github.com/ClickHouse/ClickHouse/pull/20610) ([flynn](https://github.com/ucasFL)). +* Fixed inconsistent behavior of dictionary in case of queries where we look for absent keys in dictionary. [#20578](https://github.com/ClickHouse/ClickHouse/pull/20578) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix the number of threads for scalar subqueries and subqueries for index (after [#19007](https://github.com/ClickHouse/ClickHouse/issues/19007) single thread was always used). Fixes [#20457](https://github.com/ClickHouse/ClickHouse/issues/20457), [#20512](https://github.com/ClickHouse/ClickHouse/issues/20512). 
[#20550](https://github.com/ClickHouse/ClickHouse/pull/20550) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix crash which could happen if unknown packet was received from remove query (was introduced in [#17868](https://github.com/ClickHouse/ClickHouse/issues/17868)). [#20547](https://github.com/ClickHouse/ClickHouse/pull/20547) ([Azat Khuzhin](https://github.com/azat)). +* Add proper checks while parsing directory names for async INSERT (fixes SIGSEGV). [#20498](https://github.com/ClickHouse/ClickHouse/pull/20498) ([Azat Khuzhin](https://github.com/azat)). +* Fix function `transform` does not work properly for floating point keys. Closes [#20460](https://github.com/ClickHouse/ClickHouse/issues/20460). [#20479](https://github.com/ClickHouse/ClickHouse/pull/20479) ([flynn](https://github.com/ucasFL)). +* Fix infinite loop when propagating WITH aliases to subqueries. This fixes [#20388](https://github.com/ClickHouse/ClickHouse/issues/20388). [#20476](https://github.com/ClickHouse/ClickHouse/pull/20476) ([Amos Bird](https://github.com/amosbird)). +* Fix abnormal server termination when http client goes away. [#20464](https://github.com/ClickHouse/ClickHouse/pull/20464) ([Azat Khuzhin](https://github.com/azat)). +* Fix `LOGICAL_ERROR` for `join_use_nulls=1` when JOIN contains const from SELECT. [#20461](https://github.com/ClickHouse/ClickHouse/pull/20461) ([Azat Khuzhin](https://github.com/azat)). +* Check if table function `view` is used in expression list and throw an error. This fixes [#20342](https://github.com/ClickHouse/ClickHouse/issues/20342). [#20350](https://github.com/ClickHouse/ClickHouse/pull/20350) ([Amos Bird](https://github.com/amosbird)). +* Avoid invalid dereference in RANGE_HASHED() dictionary. [#20345](https://github.com/ClickHouse/ClickHouse/pull/20345) ([Azat Khuzhin](https://github.com/azat)). +* Fix null dereference with `join_use_nulls=1`. [#20344](https://github.com/ClickHouse/ClickHouse/pull/20344) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect result of binary operations between two constant decimals of different scale. Fixes [#20283](https://github.com/ClickHouse/ClickHouse/issues/20283). [#20339](https://github.com/ClickHouse/ClickHouse/pull/20339) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix too often retries of failed background tasks for `ReplicatedMergeTree` table engines family. This could lead to too verbose logging and increased CPU load. Fixes [#20203](https://github.com/ClickHouse/ClickHouse/issues/20203). [#20335](https://github.com/ClickHouse/ClickHouse/pull/20335) ([alesapin](https://github.com/alesapin)). +* Restrict to `DROP` or `RENAME` version column of `*CollapsingMergeTree` and `ReplacingMergeTree` table engines. [#20300](https://github.com/ClickHouse/ClickHouse/pull/20300) ([alesapin](https://github.com/alesapin)). +* Fixed the behavior when in case of broken JSON we tried to read the whole file into memory which leads to exception from the allocator. Fixes [#19719](https://github.com/ClickHouse/ClickHouse/issues/19719). [#20286](https://github.com/ClickHouse/ClickHouse/pull/20286) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix exception during vertical merge for `MergeTree` table engines family which don't allow to perform vertical merges. Fixes [#20259](https://github.com/ClickHouse/ClickHouse/issues/20259). [#20279](https://github.com/ClickHouse/ClickHouse/pull/20279) ([alesapin](https://github.com/alesapin)). +* Fix rare server crash on config reload during the shutdown. 
Fixes [#19689](https://github.com/ClickHouse/ClickHouse/issues/19689). [#20224](https://github.com/ClickHouse/ClickHouse/pull/20224) ([alesapin](https://github.com/alesapin)). +* Fix CTE when using in INSERT SELECT. This fixes [#20187](https://github.com/ClickHouse/ClickHouse/issues/20187), fixes [#20195](https://github.com/ClickHouse/ClickHouse/issues/20195). [#20211](https://github.com/ClickHouse/ClickHouse/pull/20211) ([Amos Bird](https://github.com/amosbird)). +* Fixes [#19314](https://github.com/ClickHouse/ClickHouse/issues/19314). [#20156](https://github.com/ClickHouse/ClickHouse/pull/20156) ([Ivan](https://github.com/abyss7)). +* fix toMinute function to handle special timezone correctly. [#20149](https://github.com/ClickHouse/ClickHouse/pull/20149) ([keenwolf](https://github.com/keen-wolf)). +* Fix server crash after query with `if` function with `Tuple` type of then/else branches result. `Tuple` type must contain `Array` or another complex type. Fixes [#18356](https://github.com/ClickHouse/ClickHouse/issues/18356). [#20133](https://github.com/ClickHouse/ClickHouse/pull/20133) ([alesapin](https://github.com/alesapin)). +* The `MongoDB` table engine now establishes connection only when it's going to read data. `ATTACH TABLE` won't try to connect anymore. [#20110](https://github.com/ClickHouse/ClickHouse/pull/20110) ([Vitaly Baranov](https://github.com/vitlibar)). +* Bugfix in StorageJoin. [#20079](https://github.com/ClickHouse/ClickHouse/pull/20079) ([vdimir](https://github.com/vdimir)). +* Fix the case when calculating modulo of division of negative number by small divisor, the resulting data type was not large enough to accomodate the negative result. This closes [#20052](https://github.com/ClickHouse/ClickHouse/issues/20052). [#20067](https://github.com/ClickHouse/ClickHouse/pull/20067) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* MaterializeMySQL: Fix replication for statements that update several tables. [#20066](https://github.com/ClickHouse/ClickHouse/pull/20066) ([Håvard Kvålen](https://github.com/havardk)). +* Prevent "Connection refused" in docker during initialization script execution. [#20012](https://github.com/ClickHouse/ClickHouse/pull/20012) ([filimonov](https://github.com/filimonov)). +* `EmbeddedRocksDB` is an experimental storage. Fix the issue with lack of proper type checking. Simplified code. This closes [#19967](https://github.com/ClickHouse/ClickHouse/issues/19967). [#19972](https://github.com/ClickHouse/ClickHouse/pull/19972) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix a segfault in function `fromModifiedJulianDay` when the argument type is `Nullable(T)` for any integral types other than Int32. [#19959](https://github.com/ClickHouse/ClickHouse/pull/19959) ([PHO](https://github.com/depressed-pho)). +* BloomFilter index crash fix. Fixes [#19757](https://github.com/ClickHouse/ClickHouse/issues/19757). [#19884](https://github.com/ClickHouse/ClickHouse/pull/19884) ([Maksim Kita](https://github.com/kitaisreal)). +* Deadlock was possible if system.text_log is enabled. This fixes [#19874](https://github.com/ClickHouse/ClickHouse/issues/19874). [#19875](https://github.com/ClickHouse/ClickHouse/pull/19875) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix starting the server with tables having default expressions containing dictGet(). Allow getting return type of dictGet() without loading dictionary. [#19805](https://github.com/ClickHouse/ClickHouse/pull/19805) ([Vitaly Baranov](https://github.com/vitlibar)). 
+* Fix clickhouse-client abort exception while executing only `select`. [#19790](https://github.com/ClickHouse/ClickHouse/pull/19790) ([taiyang-li](https://github.com/taiyang-li)). +* Fix a bug that moving pieces to destination table may failed in case of launching multiple clickhouse-copiers. [#19743](https://github.com/ClickHouse/ClickHouse/pull/19743) ([madianjun](https://github.com/mdianjun)). +* Background thread which executes `ON CLUSTER` queries might hang waiting for dropped replicated table to do something. It's fixed. [#19684](https://github.com/ClickHouse/ClickHouse/pull/19684) ([yiguolei](https://github.com/yiguolei)). + +#### Build/Testing/Packaging Improvement + +* Allow to build ClickHouse with AVX-2 enabled globally. It gives slight performance benefits on modern CPUs. Not recommended for production and will not be supported as official build for now. [#20180](https://github.com/ClickHouse/ClickHouse/pull/20180) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix some of the issues found by Coverity. See [#19964](https://github.com/ClickHouse/ClickHouse/issues/19964). [#20010](https://github.com/ClickHouse/ClickHouse/pull/20010) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow to start up with modified binary under gdb. In previous version if you set up breakpoint in gdb before start, server will refuse to start up due to failed integrity check. [#21258](https://github.com/ClickHouse/ClickHouse/pull/21258) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add a test for different compression methods in Kafka. [#21111](https://github.com/ClickHouse/ClickHouse/pull/21111) ([filimonov](https://github.com/filimonov)). +* Fixed port clash from test_storage_kerberized_hdfs test. [#19974](https://github.com/ClickHouse/ClickHouse/pull/19974) ([Ilya Yatsishin](https://github.com/qoega)). +* Print `stdout` and `stderr` to log when failed to start docker in integration tests. Before this PR there was a very short error message in this case which didn't help to investigate the problems. [#20631](https://github.com/ClickHouse/ClickHouse/pull/20631) ([Vitaly Baranov](https://github.com/vitlibar)). + + ## ClickHouse release 21.2 ### ClickHouse release v21.2.2.8-stable, 2021-02-07 diff --git a/CMakeLists.txt b/CMakeLists.txt index 9002f1df140..1423f3a0bc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,8 @@ else() set(RECONFIGURE_MESSAGE_LEVEL STATUS) endif() +enable_language(C CXX ASM) + include (cmake/arch.cmake) include (cmake/target.cmake) include (cmake/tools.cmake) @@ -66,17 +68,30 @@ endif () include (cmake/find/ccache.cmake) -option(ENABLE_CHECK_HEAVY_BUILDS "Don't allow C++ translation units to compile too long or to take too much memory while compiling" OFF) +# Take care to add prlimit in command line before ccache, or else ccache thinks that +# prlimit is compiler, and clang++ is its input file, and refuses to work with +# multiple inputs, e.g in ccache log: +# [2021-03-31T18:06:32.655327 36900] Command line: /usr/bin/ccache prlimit --as=10000000000 --data=5000000000 --cpu=600 /usr/bin/clang++-11 - ...... 
std=gnu++2a -MD -MT src/CMakeFiles/dbms.dir/Storages/MergeTree/IMergeTreeDataPart.cpp.o -MF src/CMakeFiles/dbms.dir/Storages/MergeTree/IMergeTreeDataPart.cpp.o.d -o src/CMakeFiles/dbms.dir/Storages/MergeTree/IMergeTreeDataPart.cpp.o -c ../src/Storages/MergeTree/IMergeTreeDataPart.cpp +# +# [2021-03-31T18:06:32.656704 36900] Multiple input files: /usr/bin/clang++-11 and ../src/Storages/MergeTree/IMergeTreeDataPart.cpp +# +# Another way would be to use --ccache-skip option before clang++-11 to make +# ccache ignore it. +option(ENABLE_CHECK_HEAVY_BUILDS "Don't allow C++ translation units to compile too long or to take too much memory while compiling." OFF) if (ENABLE_CHECK_HEAVY_BUILDS) # set DATA (since RSS does not work since 2.6.x+) to 2G set (RLIMIT_DATA 5000000000) # set VIRT (RLIMIT_AS) to 10G (DATA*10) set (RLIMIT_AS 10000000000) + # set CPU time limit to 600 seconds + set (RLIMIT_CPU 600) + # gcc10/gcc10/clang -fsanitize=memory is too heavy if (SANITIZE STREQUAL "memory" OR COMPILER_GCC) set (RLIMIT_DATA 10000000000) endif() - set (CMAKE_CXX_COMPILER_LAUNCHER prlimit --as=${RLIMIT_AS} --data=${RLIMIT_DATA} --cpu=600) + + set (CMAKE_CXX_COMPILER_LAUNCHER prlimit --as=${RLIMIT_AS} --data=${RLIMIT_DATA} --cpu=${RLIMIT_CPU} ${CMAKE_CXX_COMPILER_LAUNCHER}) endif () if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None") @@ -155,7 +170,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0") # Only for Linux, x86_64. - # Implies ${ENABLE_FASTMEMCPY} option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON) elseif(GLIBC_COMPATIBILITY) message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration") @@ -169,7 +183,7 @@ endif () set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") if (OS_LINUX) - find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy") + find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-12" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy") if (OBJCOPY_PATH) message(STATUS "Using objcopy: ${OBJCOPY_PATH}.") @@ -241,9 +255,7 @@ else() message(STATUS "Disabling compiler -pipe option (have only ${AVAILABLE_PHYSICAL_MEMORY} mb of memory)") endif() -if(NOT DISABLE_CPU_OPTIMIZE) - include(cmake/cpu_features.cmake) -endif() +include(cmake/cpu_features.cmake) option(ARCH_NATIVE "Add -march=native compiler flag") @@ -251,25 +263,39 @@ if (ARCH_NATIVE) set (COMPILER_FLAGS "${COMPILER_FLAGS} -march=native") endif () -if (COMPILER_GCC OR COMPILER_CLANG) - # to make numeric_limits<__int128> works with GCC - set (_CXX_STANDARD "gnu++2a") -else() - set (_CXX_STANDARD "c++2a") -endif() +if (${CMAKE_VERSION} VERSION_LESS "3.12.4") + # CMake < 3.12 doesn't support setting 20 as a C++ standard version. + # We will add C++ standard controlling flag in CMAKE_CXX_FLAGS manually for now. -# cmake < 3.12 doesn't support 20. 
We'll set CMAKE_CXX_FLAGS for now -# set (CMAKE_CXX_STANDARD 20) -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=${_CXX_STANDARD}") + if (COMPILER_GCC OR COMPILER_CLANG) + # to make numeric_limits<__int128> works with GCC + set (_CXX_STANDARD "gnu++2a") + else () + set (_CXX_STANDARD "c++2a") + endif () -set (CMAKE_CXX_EXTENSIONS 0) # https://cmake.org/cmake/help/latest/prop_tgt/CXX_EXTENSIONS.html#prop_tgt:CXX_EXTENSIONS -set (CMAKE_CXX_STANDARD_REQUIRED ON) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=${_CXX_STANDARD}") +else () + set (CMAKE_CXX_STANDARD 20) + set (CMAKE_CXX_EXTENSIONS ON) # Same as gnu++2a (ON) vs c++2a (OFF): https://cmake.org/cmake/help/latest/prop_tgt/CXX_EXTENSIONS.html + set (CMAKE_CXX_STANDARD_REQUIRED ON) +endif () + +set (CMAKE_C_STANDARD 11) +set (CMAKE_C_EXTENSIONS ON) +set (CMAKE_C_STANDARD_REQUIRED ON) if (COMPILER_GCC OR COMPILER_CLANG) # Enable C++14 sized global deallocation functions. It should be enabled by setting -std=c++14 but I'm not sure. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsized-deallocation") endif () +# falign-functions=32 prevents from random performance regressions with the code change. Thus, providing more stable +# benchmarks. +if (COMPILER_GCC OR COMPILER_CLANG) + set(COMPILER_FLAGS "${COMPILER_FLAGS} -falign-functions=32") +endif () + # Compiler-specific coverage flags e.g. -fcoverage-mapping for gcc option(WITH_COVERAGE "Profile the resulting binary/binaries" OFF) @@ -331,7 +357,7 @@ if (COMPILER_CLANG) endif () # Always prefer llvm tools when using clang. For instance, we cannot use GNU ar when llvm LTO is enabled - find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-11" "llvm-ar-10" "llvm-ar-9" "llvm-ar-8") + find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-12" "llvm-ar-11" "llvm-ar-10" "llvm-ar-9" "llvm-ar-8") if (LLVM_AR_PATH) message(STATUS "Using llvm-ar: ${LLVM_AR_PATH}.") @@ -340,7 +366,7 @@ if (COMPILER_CLANG) message(WARNING "Cannot find llvm-ar. System ar will be used instead. It does not work with ThinLTO.") endif () - find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-11" "llvm-ranlib-10" "llvm-ranlib-9" "llvm-ranlib-8") + find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-12" "llvm-ranlib-11" "llvm-ranlib-10" "llvm-ranlib-9" "llvm-ranlib-8") if (LLVM_RANLIB_PATH) message(STATUS "Using llvm-ranlib: ${LLVM_RANLIB_PATH}.") @@ -457,6 +483,7 @@ find_contrib_lib(double-conversion) # Must be before parquet include (cmake/find/ssl.cmake) include (cmake/find/ldap.cmake) # after ssl include (cmake/find/icu.cmake) +include (cmake/find/xz.cmake) include (cmake/find/zlib.cmake) include (cmake/find/zstd.cmake) include (cmake/find/ltdl.cmake) # for odbc @@ -489,6 +516,7 @@ include (cmake/find/fast_float.cmake) include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/odbc.cmake) +include (cmake/find/nanodbc.cmake) include (cmake/find/rocksdb.cmake) include (cmake/find/libpqxx.cmake) include (cmake/find/nuraft.cmake) @@ -504,6 +532,7 @@ include (cmake/find/msgpack.cmake) include (cmake/find/cassandra.cmake) include (cmake/find/sentry.cmake) include (cmake/find/stats.cmake) +include (cmake/find/datasketches.cmake) set (USE_INTERNAL_CITYHASH_LIBRARY ON CACHE INTERNAL "") find_contrib_lib(cityhash) @@ -536,7 +565,7 @@ macro (add_executable target) # explicitly acquire and interpose malloc symbols by clickhouse_malloc # if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on than provide memcpy symbol explicitly to neutrialize thinlto's libcall generation. 
if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO) - _add_executable (${ARGV} $ $) + _add_executable (${ARGV} $ $) else () _add_executable (${ARGV} $) endif () diff --git a/README.md b/README.md index 3329a98877f..ea9f365a3c6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ClickHouse® is an open-source column-oriented database management system that a * [Tutorial](https://clickhouse.tech/docs/en/getting_started/tutorial/) shows how to set up and query small ClickHouse cluster. * [Documentation](https://clickhouse.tech/docs/en/) provides more in-depth information. * [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format. -* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-ly9m4w1x-6j7x5Ts_pQZqrctAbRZ3cg) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time. +* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-nwwakmk4-xOJ6cdy0sJC3It8j348~IA) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time. * [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events. * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation. * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any. diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 46bd57eda12..023dcaaccae 100644 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory (loggers) add_subdirectory (pcg-random) add_subdirectory (widechar_width) add_subdirectory (readpassphrase) +add_subdirectory (bridge) if (USE_MYSQL) add_subdirectory (mysqlxx) diff --git a/base/bridge/CMakeLists.txt b/base/bridge/CMakeLists.txt new file mode 100644 index 00000000000..20b0b651677 --- /dev/null +++ b/base/bridge/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library (bridge + IBridge.cpp +) + +target_include_directories (daemon PUBLIC ..) +target_link_libraries (bridge PRIVATE daemon dbms Poco::Data Poco::Data::ODBC) + diff --git a/base/bridge/IBridge.cpp b/base/bridge/IBridge.cpp new file mode 100644 index 00000000000..b1f71315fef --- /dev/null +++ b/base/bridge/IBridge.cpp @@ -0,0 +1,238 @@ +#include "IBridge.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if USE_ODBC +# include +#endif + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; +} + +namespace +{ + Poco::Net::SocketAddress makeSocketAddress(const std::string & host, UInt16 port, Poco::Logger * log) + { + Poco::Net::SocketAddress socket_address; + try + { + socket_address = Poco::Net::SocketAddress(host, port); + } + catch (const Poco::Net::DNSException & e) + { + const auto code = e.code(); + if (code == EAI_FAMILY +#if defined(EAI_ADDRFAMILY) + || code == EAI_ADDRFAMILY +#endif + ) + { + LOG_ERROR(log, "Cannot resolve listen_host ({}), error {}: {}. If it is an IPv6 address and your host has disabled IPv6, then consider to specify IPv4 address to listen in element of configuration file. 
Example: 0.0.0.0", host, e.code(), e.message()); + } + + throw; + } + return socket_address; + } + + Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, Poco::Logger * log) + { + auto address = makeSocketAddress(host, port, log); +#if POCO_VERSION < 0x01080000 + socket.bind(address, /* reuseAddress = */ true); +#else + socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ false); +#endif + + socket.listen(/* backlog = */ 64); + + return address; + } +} + + +void IBridge::handleHelp(const std::string &, const std::string &) +{ + Poco::Util::HelpFormatter help_formatter(options()); + help_formatter.setCommand(commandName()); + help_formatter.setHeader("HTTP-proxy for odbc requests"); + help_formatter.setUsage("--http-port "); + help_formatter.format(std::cerr); + + stopOptionsProcessing(); +} + + +void IBridge::defineOptions(Poco::Util::OptionSet & options) +{ + options.addOption( + Poco::Util::Option("http-port", "", "port to listen").argument("http-port", true) .binding("http-port")); + + options.addOption( + Poco::Util::Option("listen-host", "", "hostname or address to listen, default 127.0.0.1").argument("listen-host").binding("listen-host")); + + options.addOption( + Poco::Util::Option("http-timeout", "", "http timeout for socket, default 1800").argument("http-timeout").binding("http-timeout")); + + options.addOption( + Poco::Util::Option("max-server-connections", "", "max connections to server, default 1024").argument("max-server-connections").binding("max-server-connections")); + + options.addOption( + Poco::Util::Option("keep-alive-timeout", "", "keepalive timeout, default 10").argument("keep-alive-timeout").binding("keep-alive-timeout")); + + options.addOption( + Poco::Util::Option("log-level", "", "sets log level, default info") .argument("log-level").binding("logger.level")); + + options.addOption( + Poco::Util::Option("log-path", "", "log path for all logs, default console").argument("log-path").binding("logger.log")); + + options.addOption( + Poco::Util::Option("err-log-path", "", "err log path for all logs, default no").argument("err-log-path").binding("logger.errorlog")); + + options.addOption( + Poco::Util::Option("stdout-path", "", "stdout log path, default console").argument("stdout-path").binding("logger.stdout")); + + options.addOption( + Poco::Util::Option("stderr-path", "", "stderr log path, default console").argument("stderr-path").binding("logger.stderr")); + + using Me = std::decay_t; + + options.addOption( + Poco::Util::Option("help", "", "produce this help message").binding("help").callback(Poco::Util::OptionCallback(this, &Me::handleHelp))); + + ServerApplication::defineOptions(options); // NOLINT Don't need complex BaseDaemon's .xml config +} + + +void IBridge::initialize(Application & self) +{ + BaseDaemon::closeFDs(); + is_help = config().has("help"); + + if (is_help) + return; + + config().setString("logger", bridgeName()); + + /// Redirect stdout, stderr to specified files. + /// Some libraries and sanitizers write to stderr in case of errors. + const auto stdout_path = config().getString("logger.stdout", ""); + if (!stdout_path.empty()) + { + if (!freopen(stdout_path.c_str(), "a+", stdout)) + throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path); + + /// Disable buffering for stdout. 
+ setbuf(stdout, nullptr); + } + const auto stderr_path = config().getString("logger.stderr", ""); + if (!stderr_path.empty()) + { + if (!freopen(stderr_path.c_str(), "a+", stderr)) + throw Poco::OpenFileException("Cannot attach stderr to " + stderr_path); + + /// Disable buffering for stderr. + setbuf(stderr, nullptr); + } + + buildLoggers(config(), logger(), self.commandName()); + + BaseDaemon::logRevision(); + + log = &logger(); + hostname = config().getString("listen-host", "127.0.0.1"); + port = config().getUInt("http-port"); + if (port > 0xFFFF) + throw Exception("Out of range 'http-port': " + std::to_string(port), ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + http_timeout = config().getUInt("http-timeout", DEFAULT_HTTP_READ_BUFFER_TIMEOUT); + max_server_connections = config().getUInt("max-server-connections", 1024); + keep_alive_timeout = config().getUInt("keep-alive-timeout", 10); + + initializeTerminationAndSignalProcessing(); + +#if USE_ODBC + if (bridgeName() == "ODBCBridge") + Poco::Data::ODBC::Connector::registerConnector(); +#endif + + ServerApplication::initialize(self); // NOLINT +} + + +void IBridge::uninitialize() +{ + BaseDaemon::uninitialize(); +} + + +int IBridge::main(const std::vector & /*args*/) +{ + if (is_help) + return Application::EXIT_OK; + + registerFormats(); + LOG_INFO(log, "Starting up {} on host: {}, port: {}", bridgeName(), hostname, port); + + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, hostname, port, log); + socket.setReceiveTimeout(http_timeout); + socket.setSendTimeout(http_timeout); + + Poco::ThreadPool server_pool(3, max_server_connections); + + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(http_timeout); + http_params->setKeepAliveTimeout(keep_alive_timeout); + + auto shared_context = Context::createShared(); + auto context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); + + if (config().has("query_masking_rules")) + SensitiveDataMasker::setInstance(std::make_unique(config(), "query_masking_rules")); + + auto server = HTTPServer( + context, + getHandlerFactoryPtr(context), + server_pool, + socket, + http_params); + + SCOPE_EXIT({ + LOG_DEBUG(log, "Received termination signal."); + LOG_DEBUG(log, "Waiting for current connections to close."); + + server.stop(); + + for (size_t count : ext::range(1, 6)) + { + if (server.currentConnections() == 0) + break; + LOG_DEBUG(log, "Waiting for {} connections, try {}", server.currentConnections(), count); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + }); + + server.start(); + LOG_INFO(log, "Listening http://{}", address.toString()); + + waitForTerminationRequest(); + return Application::EXIT_OK; +} + +} diff --git a/base/bridge/IBridge.h b/base/bridge/IBridge.h new file mode 100644 index 00000000000..c64003d9959 --- /dev/null +++ b/base/bridge/IBridge.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include + +#include +#include + + +namespace DB +{ + +/// Class represents base for clickhouse-odbc-bridge and clickhouse-library-bridge servers. +/// Listens to incoming HTTP POST and GET requests on specified port and host. +/// Has two handlers '/' for all incoming POST requests and /ping for GET request about service status. 
+class IBridge : public BaseDaemon +{ + +public: + /// Define command line arguments + void defineOptions(Poco::Util::OptionSet & options) override; + +protected: + using HandlerFactoryPtr = std::shared_ptr; + + void initialize(Application & self) override; + + void uninitialize() override; + + int main(const std::vector & args) override; + + virtual std::string bridgeName() const = 0; + + virtual HandlerFactoryPtr getHandlerFactoryPtr(ContextPtr context) const = 0; + + size_t keep_alive_timeout; + +private: + void handleHelp(const std::string &, const std::string &); + + bool is_help; + std::string hostname; + size_t port; + std::string log_level; + size_t max_server_connections; + size_t http_timeout; + + Poco::Logger * log; +}; +} diff --git a/base/common/BorrowedObjectPool.h b/base/common/BorrowedObjectPool.h new file mode 100644 index 00000000000..6a90a7e7122 --- /dev/null +++ b/base/common/BorrowedObjectPool.h @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +/** Pool for limited size objects that cannot be used from different threads simultaneously. + * The main use case is to have fixed size of objects that can be reused in difference threads during their lifetime + * and have to be initialized on demand. + * Two main properties of pool are allocated objects size and borrowed objects size. + * Allocated objects size is size of objects that are currently allocated by the pool. + * Borrowed objects size is size of objects that are borrowed by clients. + * If max_size == 0 then pool has unlimited size and objects will be allocated without limit. + * + * Pool provides following strategy for borrowing object: + * If max_size == 0 then pool has unlimited size and objects will be allocated without limit. + * 1. If pool has objects that can be borrowed increase borrowed objects size and return it. + * 2. If pool allocatedObjectsSize is lower than max objects size or pool has unlimited size + * allocate new object, increase borrowed objects size and return it. + * 3. If pool is full wait on condition variable with or without timeout until some object + * will be returned to the pool. + */ +template +class BorrowedObjectPool final +{ +public: + explicit BorrowedObjectPool(size_t max_size_) : max_size(max_size_) {} + + /// Borrow object from pool. If pull is full and all objects were borrowed + /// then calling thread will wait until some object will be returned into pool. + template + void borrowObject(T & dest, FactoryFunc && func) + { + std::unique_lock lock(objects_mutex); + + if (!objects.empty()) + { + dest = borrowFromObjects(lock); + return; + } + + bool has_unlimited_size = (max_size == 0); + + if (unlikely(has_unlimited_size) || allocated_objects_size < max_size) + { + dest = allocateObjectForBorrowing(lock, std::forward(func)); + return; + } + + condition_variable.wait(lock, [this] { return !objects.empty(); }); + dest = borrowFromObjects(lock); + } + + /// Same as borrowObject function, but wait with timeout. + /// Returns true if object was borrowed during timeout. 
+ template + bool tryBorrowObject(T & dest, FactoryFunc && func, size_t timeout_in_milliseconds = 0) + { + std::unique_lock lock(objects_mutex); + + if (!objects.empty()) + { + dest = borrowFromObjects(lock); + return true; + } + + bool has_unlimited_size = (max_size == 0); + + if (unlikely(has_unlimited_size) || allocated_objects_size < max_size) + { + dest = allocateObjectForBorrowing(lock, std::forward(func)); + return true; + } + + bool wait_result = condition_variable.wait_for(lock, std::chrono::milliseconds(timeout_in_milliseconds), [this] { return !objects.empty(); }); + + if (wait_result) + dest = borrowFromObjects(lock); + + return wait_result; + } + + /// Return object into pool. Client must return same object that was borrowed. + inline void returnObject(T && object_to_return) + { + std::unique_lock lck(objects_mutex); + + objects.emplace_back(std::move(object_to_return)); + --borrowed_objects_size; + + condition_variable.notify_one(); + } + + /// Max pool size + inline size_t maxSize() const + { + return max_size; + } + + /// Allocated objects size by the pool. If allocatedObjectsSize == maxSize then pool is full. + inline size_t allocatedObjectsSize() const + { + std::unique_lock lock(objects_mutex); + return allocated_objects_size; + } + + /// Returns allocatedObjectsSize == maxSize + inline bool isFull() const + { + std::unique_lock lock(objects_mutex); + return allocated_objects_size == max_size; + } + + /// Borrowed objects size. If borrowedObjectsSize == allocatedObjectsSize and pool is full. + /// Then client will wait during borrowObject function call. + inline size_t borrowedObjectsSize() const + { + std::unique_lock lock(objects_mutex); + return borrowed_objects_size; + } + +private: + + template + inline T allocateObjectForBorrowing(const std::unique_lock &, FactoryFunc && func) + { + ++allocated_objects_size; + ++borrowed_objects_size; + + return std::forward(func)(); + } + + inline T borrowFromObjects(const std::unique_lock &) + { + T dst; + detail::moveOrCopyIfThrow(std::move(objects.back()), dst); + objects.pop_back(); + + ++borrowed_objects_size; + + return dst; + } + + size_t max_size; + + mutable std::mutex objects_mutex; + std::condition_variable condition_variable; + size_t allocated_objects_size = 0; + size_t borrowed_objects_size = 0; + std::vector objects; +}; diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt index cea52b443dd..7dfb9bc10c0 100644 --- a/base/common/CMakeLists.txt +++ b/base/common/CMakeLists.txt @@ -47,6 +47,10 @@ endif() target_include_directories(common PUBLIC .. ${CMAKE_CURRENT_BINARY_DIR}/..) 
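A minimal usage sketch of the `BorrowedObjectPool` interface added above; the pooled type, factory lambda, and include path are illustrative assumptions, not part of the patch:

```cpp
#include <string>
#include <utility>
#include <common/BorrowedObjectPool.h>  /// include path assumed from base/common/

int main()
{
    /// Pool that allocates at most two std::string objects, on demand.
    BorrowedObjectPool<std::string> pool(/* max_size_ = */ 2);

    std::string object;
    /// The factory is called only when no free object is available and the size limit is not reached.
    pool.borrowObject(object, [] { return std::string("fresh object"); });

    /// ... use the borrowed object ...

    /// The same object must be handed back, otherwise other borrowers may block when the pool is full.
    pool.returnObject(std::move(object));

    /// tryBorrowObject additionally takes a timeout in milliseconds and reports success.
    std::string another;
    bool borrowed = pool.tryBorrowObject(another, [] { return std::string(); }, /* timeout_in_milliseconds = */ 100);
    if (borrowed)
        pool.returnObject(std::move(another));
}
```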
+if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries(common PUBLIC -Wl,-U,_inside_main) +endif() + # Allow explicit fallback to readline if (NOT ENABLE_REPLXX AND ENABLE_READLINE) message (STATUS "Attempt to fallback to readline explicitly") @@ -74,7 +78,6 @@ target_link_libraries (common ${CITYHASH_LIBRARIES} boost::headers_only boost::system - FastMemcpy Poco::Net Poco::Net::SSL Poco::Util diff --git a/base/common/DateLUT.cpp b/base/common/DateLUT.cpp index 6ff0884701c..d14b63cd70a 100644 --- a/base/common/DateLUT.cpp +++ b/base/common/DateLUT.cpp @@ -152,7 +152,7 @@ const DateLUTImpl & DateLUT::getImplementation(const std::string & time_zone) co auto it = impls.emplace(time_zone, nullptr).first; if (!it->second) - it->second = std::make_unique(time_zone); + it->second = std::unique_ptr(new DateLUTImpl(time_zone)); return *it->second; } diff --git a/base/common/DateLUT.h b/base/common/DateLUT.h index 93c6cb403e2..378b4360f3b 100644 --- a/base/common/DateLUT.h +++ b/base/common/DateLUT.h @@ -32,7 +32,6 @@ public: return date_lut.getImplementation(time_zone); } - static void setDefaultTimezone(const std::string & time_zone) { auto & date_lut = getInstance(); diff --git a/base/common/DateLUTImpl.cpp b/base/common/DateLUTImpl.cpp index 50620e21b8f..e7faeb63760 100644 --- a/base/common/DateLUTImpl.cpp +++ b/base/common/DateLUTImpl.cpp @@ -46,24 +46,41 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_) if (&inside_main) assert(inside_main); - size_t i = 0; - time_t start_of_day = 0; - cctz::time_zone cctz_time_zone; if (!cctz::load_time_zone(time_zone, &cctz_time_zone)) throw Poco::Exception("Cannot load time zone " + time_zone_); - cctz::time_zone::absolute_lookup start_of_epoch_lookup = cctz_time_zone.lookup(std::chrono::system_clock::from_time_t(start_of_day)); - offset_at_start_of_epoch = start_of_epoch_lookup.offset; - offset_is_whole_number_of_hours_everytime = true; + constexpr cctz::civil_day epoch{1970, 1, 1}; + constexpr cctz::civil_day lut_start{DATE_LUT_MIN_YEAR, 1, 1}; + time_t start_of_day; - cctz::civil_day date{1970, 1, 1}; + /// Note: it's validated against all timezones in the system. + static_assert((epoch - lut_start) == daynum_offset_epoch); + offset_at_start_of_epoch = cctz_time_zone.lookup(cctz_time_zone.lookup(epoch).pre).offset; + offset_at_start_of_lut = cctz_time_zone.lookup(cctz_time_zone.lookup(lut_start).pre).offset; + offset_is_whole_number_of_hours_during_epoch = true; + + cctz::civil_day date = lut_start; + + UInt32 i = 0; do { cctz::time_zone::civil_lookup lookup = cctz_time_zone.lookup(date); - start_of_day = std::chrono::system_clock::to_time_t(lookup.pre); /// Ambiguity is possible. + /// Ambiguity is possible if time was changed backwards at the midnight + /// or after midnight time has been changed back to midnight, for example one hour backwards at 01:00 + /// or after midnight time has been changed to the previous day, for example two hours backwards at 01:00 + /// Then midnight appears twice. Usually time change happens exactly at 00:00 or 01:00. + + /// If transition did not involve previous day, we should use the first midnight as the start of the day, + /// otherwise it's better to use the second midnight. + + std::chrono::time_point start_of_day_time_point = lookup.trans < lookup.post + ? 
lookup.post /* Second midnight appears after transition, so there was a piece of previous day after transition */ + : lookup.pre; + + start_of_day = std::chrono::system_clock::to_time_t(start_of_day_time_point); Values & values = lut[i]; values.year = date.year(); @@ -72,7 +89,7 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_) values.day_of_week = getDayOfWeek(date); values.date = start_of_day; - assert(values.year >= DATE_LUT_MIN_YEAR && values.year <= DATE_LUT_MAX_YEAR); + assert(values.year >= DATE_LUT_MIN_YEAR && values.year <= DATE_LUT_MAX_YEAR + 1); assert(values.month >= 1 && values.month <= 12); assert(values.day_of_month >= 1 && values.day_of_month <= 31); assert(values.day_of_week >= 1 && values.day_of_week <= 7); @@ -85,50 +102,42 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_) else values.days_in_month = i != 0 ? lut[i - 1].days_in_month : 31; - values.time_at_offset_change = 0; - values.amount_of_offset_change = 0; + values.time_at_offset_change_value = 0; + values.amount_of_offset_change_value = 0; - if (start_of_day % 3600) - offset_is_whole_number_of_hours_everytime = false; + if (offset_is_whole_number_of_hours_during_epoch && start_of_day > 0 && start_of_day % 3600) + offset_is_whole_number_of_hours_during_epoch = false; - /// If UTC offset was changed in previous day. - if (i != 0) + /// If UTC offset was changed this day. + /// Change in time zone without transition is possible, e.g. Moscow 1991 Sun, 31 Mar, 02:00 MSK to EEST + cctz::time_zone::civil_transition transition{}; + if (cctz_time_zone.next_transition(start_of_day_time_point - std::chrono::seconds(1), &transition) + && (cctz::civil_day(transition.from) == date || cctz::civil_day(transition.to) == date) + && transition.from != transition.to) { - auto amount_of_offset_change_at_prev_day = 86400 - (lut[i].date - lut[i - 1].date); - if (amount_of_offset_change_at_prev_day) - { - lut[i - 1].amount_of_offset_change = amount_of_offset_change_at_prev_day; + values.time_at_offset_change_value = (transition.from - cctz::civil_second(date)) / Values::OffsetChangeFactor; + values.amount_of_offset_change_value = (transition.to - transition.from) / Values::OffsetChangeFactor; - const auto utc_offset_at_beginning_of_day = cctz_time_zone.lookup(std::chrono::system_clock::from_time_t(lut[i - 1].date)).offset; +// std::cerr << time_zone << ", " << date << ": change from " << transition.from << " to " << transition.to << "\n"; +// std::cerr << time_zone << ", " << date << ": change at " << values.time_at_offset_change() << " with " << values.amount_of_offset_change() << "\n"; - /// Find a time (timestamp offset from beginning of day), - /// when UTC offset was changed. Search is performed with 15-minute granularity, assuming it is enough. + /// We don't support too large changes. + if (values.amount_of_offset_change_value > 24 * 4) + values.amount_of_offset_change_value = 24 * 4; + else if (values.amount_of_offset_change_value < -24 * 4) + values.amount_of_offset_change_value = -24 * 4; - time_t time_at_offset_change = 900; - while (time_at_offset_change < 86400) - { - auto utc_offset_at_current_time = cctz_time_zone.lookup(std::chrono::system_clock::from_time_t( - lut[i - 1].date + time_at_offset_change)).offset; - - if (utc_offset_at_current_time != utc_offset_at_beginning_of_day) - break; - - time_at_offset_change += 900; - } - - lut[i - 1].time_at_offset_change = time_at_offset_change; - - /// We doesn't support cases when time change results in switching to previous day. 
- if (static_cast(lut[i - 1].time_at_offset_change) + static_cast(lut[i - 1].amount_of_offset_change) < 0) - lut[i - 1].time_at_offset_change = -lut[i - 1].amount_of_offset_change; - } + /// We don't support cases when time change results in switching to previous day. + /// Shift the point of time change later. + if (values.time_at_offset_change_value + values.amount_of_offset_change_value < 0) + values.time_at_offset_change_value = -values.amount_of_offset_change_value; } /// Going to next day. ++date; ++i; } - while (start_of_day <= DATE_LUT_MAX && i <= DATE_LUT_MAX_DAY_NUM); + while (i < DATE_LUT_SIZE && lut[i - 1].year <= DATE_LUT_MAX_YEAR); /// Fill excessive part of lookup table. This is needed only to simplify handling of overflow cases. while (i < DATE_LUT_SIZE) diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 064787fb64e..9e60181e802 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -5,23 +5,32 @@ #include "types.h" #include +#include #include +#include -#define DATE_LUT_MAX (0xFFFFFFFFU - 86400) -#define DATE_LUT_MAX_DAY_NUM (0xFFFFFFFFU / 86400) -/// Table size is bigger than DATE_LUT_MAX_DAY_NUM to fill all indices within UInt16 range: this allows to remove extra check. -#define DATE_LUT_SIZE 0x10000 -#define DATE_LUT_MIN_YEAR 1970 -#define DATE_LUT_MAX_YEAR 2106 /// Last supported year (incomplete) +#define DATE_LUT_MIN_YEAR 1925 /// 1925 since wast majority of timezones changed to 15-minute aligned offsets somewhere in 1924 or earlier. +#define DATE_LUT_MAX_YEAR 2283 /// Last supported year (complete) #define DATE_LUT_YEARS (1 + DATE_LUT_MAX_YEAR - DATE_LUT_MIN_YEAR) /// Number of years in lookup table +#define DATE_LUT_SIZE 0x20000 + +#define DATE_LUT_MAX (0xFFFFFFFFU - 86400) +#define DATE_LUT_MAX_DAY_NUM 0xFFFF + +/// A constant to add to time_t so every supported time point becomes non-negative and still has the same remainder of division by 3600. +/// If we treat "remainder of division" operation in the sense of modular arithmetic (not like in C++). +#define DATE_LUT_ADD ((1970 - DATE_LUT_MIN_YEAR) * 366 * 86400) + + #if defined(__PPC__) -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #endif + /// Flags for toYearWeek() function. enum class WeekModeFlag : UInt8 { @@ -37,7 +46,8 @@ using YearWeek = std::pair; */ class DateLUTImpl { -public: +private: + friend class DateLUT; explicit DateLUTImpl(const std::string & time_zone); DateLUTImpl(const DateLUTImpl &) = delete; @@ -45,14 +55,75 @@ public: DateLUTImpl(const DateLUTImpl &&) = delete; DateLUTImpl & operator=(const DateLUTImpl &&) = delete; + // Normalized and bound-checked index of element in lut, + // has to be a separate type to support overloading + // TODO: make sure that any arithmetic on LUTIndex actually results in valid LUTIndex. 
+ STRONG_TYPEDEF(UInt32, LUTIndex) + + template + friend inline LUTIndex operator+(const LUTIndex & index, const T v) + { + return LUTIndex{(index.toUnderType() + UInt32(v)) & date_lut_mask}; + } + + template + friend inline LUTIndex operator+(const T v, const LUTIndex & index) + { + return LUTIndex{(v + index.toUnderType()) & date_lut_mask}; + } + + friend inline LUTIndex operator+(const LUTIndex & index, const LUTIndex & v) + { + return LUTIndex{(index.toUnderType() + v.toUnderType()) & date_lut_mask}; + } + + template + friend inline LUTIndex operator-(const LUTIndex & index, const T v) + { + return LUTIndex{(index.toUnderType() - UInt32(v)) & date_lut_mask}; + } + + template + friend inline LUTIndex operator-(const T v, const LUTIndex & index) + { + return LUTIndex{(v - index.toUnderType()) & date_lut_mask}; + } + + friend inline LUTIndex operator-(const LUTIndex & index, const LUTIndex & v) + { + return LUTIndex{(index.toUnderType() - v.toUnderType()) & date_lut_mask}; + } + + template + friend inline LUTIndex operator*(const LUTIndex & index, const T v) + { + return LUTIndex{(index.toUnderType() * UInt32(v)) & date_lut_mask}; + } + + template + friend inline LUTIndex operator*(const T v, const LUTIndex & index) + { + return LUTIndex{(v * index.toUnderType()) & date_lut_mask}; + } + + template + friend inline LUTIndex operator/(const LUTIndex & index, const T v) + { + return LUTIndex{(index.toUnderType() / UInt32(v)) & date_lut_mask}; + } + + template + friend inline LUTIndex operator/(const T v, const LUTIndex & index) + { + return LUTIndex{(UInt32(v) / index.toUnderType()) & date_lut_mask}; + } + public: /// The order of fields matters for alignment and sizeof. struct Values { - /// Least significat 32 bits from time_t at beginning of the day. - /// If the unix timestamp of beginning of the day is negative (example: 1970-01-01 MSK, where time_t == -10800), then value will overflow. - /// Change to time_t; change constants above; and recompile the sources if you need to support time after 2105 year. - UInt32 date; + /// time_t at beginning of the day. + Int64 date; /// Properties of the day. UInt16 year; @@ -65,107 +136,189 @@ public: UInt8 days_in_month; /// For days, when offset from UTC was changed due to daylight saving time or permanent change, following values could be non zero. - Int16 amount_of_offset_change; /// Usually -3600 or 3600, but look at Lord Howe Island. - UInt32 time_at_offset_change; /// In seconds from beginning of the day. + /// All in OffsetChangeFactor (15 minute) intervals. + Int8 amount_of_offset_change_value; /// Usually -4 or 4, but look at Lord Howe Island. Multiply by OffsetChangeFactor + UInt8 time_at_offset_change_value; /// In seconds from beginning of the day. Multiply by OffsetChangeFactor + + inline Int32 amount_of_offset_change() const + { + return static_cast(amount_of_offset_change_value) * OffsetChangeFactor; + } + + inline UInt32 time_at_offset_change() const + { + return static_cast(time_at_offset_change_value) * OffsetChangeFactor; + } + + /// Since most of the modern timezones have a DST change aligned to 15 minutes, to save as much space as possible inside Value, + /// we are dividing any offset change related value by this factor before setting it to Value, + /// hence it has to be explicitly multiplied back by this factor before being used. + static constexpr UInt16 OffsetChangeFactor = 900; }; static_assert(sizeof(Values) == 16); private: - /// Lookup table is indexed by DayNum. 
+ + /// Mask is all-ones to allow efficient protection against overflow. + static constexpr UInt32 date_lut_mask = 0x1ffff; + static_assert(date_lut_mask == DATE_LUT_SIZE - 1); + + /// Offset to epoch in days (ExtendedDayNum) of the first day in LUT. + /// "epoch" is the Unix Epoch (starts at unix timestamp zero) + static constexpr UInt32 daynum_offset_epoch = 16436; + static_assert(daynum_offset_epoch == (1970 - DATE_LUT_MIN_YEAR) * 365 + (1970 - DATE_LUT_MIN_YEAR / 4 * 4) / 4); + + /// Lookup table is indexed by LUTIndex. /// Day nums are the same in all time zones. 1970-01-01 is 0 and so on. /// Table is relatively large, so better not to place the object on stack. /// In comparison to std::vector, plain array is cheaper by one indirection. - Values lut[DATE_LUT_SIZE]; + Values lut[DATE_LUT_SIZE + 1]; - /// Year number after DATE_LUT_MIN_YEAR -> day num for start of year. - DayNum years_lut[DATE_LUT_YEARS]; + /// Year number after DATE_LUT_MIN_YEAR -> LUTIndex in lut for start of year. + LUTIndex years_lut[DATE_LUT_YEARS]; /// Year number after DATE_LUT_MIN_YEAR * month number starting at zero -> day num for first day of month - DayNum years_months_lut[DATE_LUT_YEARS * 12]; + LUTIndex years_months_lut[DATE_LUT_YEARS * 12]; /// UTC offset at beginning of the Unix epoch. The same as unix timestamp of 1970-01-01 00:00:00 local time. time_t offset_at_start_of_epoch; - bool offset_is_whole_number_of_hours_everytime; + /// UTC offset at the beginning of the first supported year. + time_t offset_at_start_of_lut; + bool offset_is_whole_number_of_hours_during_epoch; /// Time zone name. std::string time_zone; - - /// We can correctly process only timestamps that less DATE_LUT_MAX (i.e. up to 2105 year inclusively) - /// We don't care about overflow. - inline DayNum findIndex(time_t t) const + inline LUTIndex findIndex(time_t t) const { /// First guess. - DayNum guess(t / 86400); + Int64 guess = (t / 86400) + daynum_offset_epoch; + + /// For negative time_t the integer division was rounded up, so the guess is offset by one. + if (unlikely(t < 0)) + --guess; + + if (guess < 0) + return LUTIndex(0); + if (guess >= DATE_LUT_SIZE) + return LUTIndex(DATE_LUT_SIZE - 1); /// UTC offset is from -12 to +14 in all known time zones. This requires checking only three indices. - if ((guess == 0 || t >= lut[guess].date) && t < lut[DayNum(guess + 1)].date) - return guess; + if (t >= lut[guess].date) + { + if (guess + 1 >= DATE_LUT_SIZE || t < lut[guess + 1].date) + return LUTIndex(guess); - /// Time zones that have offset 0 from UTC do daylight saving time change (if any) towards increasing UTC offset (example: British Standard Time). - if (t >= lut[DayNum(guess + 1)].date) - return DayNum(guess + 1); + return LUTIndex(guess + 1); + } - return DayNum(guess - 1); + return LUTIndex(guess ? 
guess - 1 : 0); } - inline const Values & find(time_t t) const + inline LUTIndex toLUTIndex(DayNum d) const { - return lut[findIndex(t)]; + return LUTIndex{(d + daynum_offset_epoch) & date_lut_mask}; + } + + inline LUTIndex toLUTIndex(ExtendedDayNum d) const + { + return LUTIndex{static_cast(d + daynum_offset_epoch) & date_lut_mask}; + } + + inline LUTIndex toLUTIndex(time_t t) const + { + return findIndex(t); + } + + inline LUTIndex toLUTIndex(LUTIndex i) const + { + return i; + } + + template + inline const Values & find(DateOrTime v) const + { + return lut[toLUTIndex(v)]; + } + + template + static inline T roundDown(T x, Divisor divisor) + { + static_assert(std::is_integral_v && std::is_integral_v); + assert(divisor > 0); + + if (likely(x >= 0)) + return x / divisor * divisor; + + /// Integer division for negative numbers rounds them towards zero (up). + /// We will shift the number so it will be rounded towards -inf (down). + + return (x + 1 - divisor) / divisor * divisor; } public: const std::string & getTimeZone() const { return time_zone; } + // Methods only for unit-testing, it makes very little sense to use it from user code. + auto getOffsetAtStartOfEpoch() const { return offset_at_start_of_epoch; } + auto getTimeOffsetAtStartOfLUT() const { return offset_at_start_of_lut; } + /// All functions below are thread-safe; arguments are not checked. - inline time_t toDate(time_t t) const { return find(t).date; } - inline unsigned toMonth(time_t t) const { return find(t).month; } - inline unsigned toQuarter(time_t t) const { return (find(t).month - 1) / 3 + 1; } - inline unsigned toYear(time_t t) const { return find(t).year; } - inline unsigned toDayOfWeek(time_t t) const { return find(t).day_of_week; } - inline unsigned toDayOfMonth(time_t t) const { return find(t).day_of_month; } + inline ExtendedDayNum toDayNum(ExtendedDayNum d) const + { + return d; + } + + template + inline ExtendedDayNum toDayNum(DateOrTime v) const + { + return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; + } /// Round down to start of monday. - inline time_t toFirstDayOfWeek(time_t t) const + template + inline time_t toFirstDayOfWeek(DateOrTime v) const { - DayNum index = findIndex(t); - return lut[DayNum(index - (lut[index].day_of_week - 1))].date; + const LUTIndex i = toLUTIndex(v); + return lut[i - (lut[i].day_of_week - 1)].date; } - inline DayNum toFirstDayNumOfWeek(DayNum d) const + template + inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v) const { - return DayNum(d - (lut[d].day_of_week - 1)); - } - - inline DayNum toFirstDayNumOfWeek(time_t t) const - { - return toFirstDayNumOfWeek(toDayNum(t)); + const LUTIndex i = toLUTIndex(v); + return toDayNum(i - (lut[i].day_of_week - 1)); } /// Round down to start of month. - inline time_t toFirstDayOfMonth(time_t t) const + template + inline time_t toFirstDayOfMonth(DateOrTime v) const { - DayNum index = findIndex(t); - return lut[index - (lut[index].day_of_month - 1)].date; + const LUTIndex i = toLUTIndex(v); + return lut[i - (lut[i].day_of_month - 1)].date; } - inline DayNum toFirstDayNumOfMonth(DayNum d) const + template + inline ExtendedDayNum toFirstDayNumOfMonth(DateOrTime v) const { - return DayNum(d - (lut[d].day_of_month - 1)); - } - - inline DayNum toFirstDayNumOfMonth(time_t t) const - { - return toFirstDayNumOfMonth(toDayNum(t)); + const LUTIndex i = toLUTIndex(v); + return toDayNum(i - (lut[i].day_of_month - 1)); } /// Round down to start of quarter. 
- inline DayNum toFirstDayNumOfQuarter(DayNum d) const + template + inline ExtendedDayNum toFirstDayNumOfQuarter(DateOrTime v) const { - DayNum index = d; + return toDayNum(toFirstDayOfQuarterIndex(v)); + } + + template + inline LUTIndex toFirstDayOfQuarterIndex(DateOrTime v) const + { + LUTIndex index = toLUTIndex(v); size_t month_inside_quarter = (lut[index].month - 1) % 3; index -= lut[index].day_of_month; @@ -175,17 +328,13 @@ public: --month_inside_quarter; } - return DayNum(index + 1); + return index + 1; } - inline DayNum toFirstDayNumOfQuarter(time_t t) const + template + inline time_t toFirstDayOfQuarter(DateOrTime v) const { - return toFirstDayNumOfQuarter(toDayNum(t)); - } - - inline time_t toFirstDayOfQuarter(time_t t) const - { - return fromDayNum(toFirstDayNumOfQuarter(t)); + return toDate(toFirstDayOfQuarterIndex(v)); } /// Round down to start of year. @@ -194,48 +343,47 @@ public: return lut[years_lut[lut[findIndex(t)].year - DATE_LUT_MIN_YEAR]].date; } - inline DayNum toFirstDayNumOfYear(DayNum d) const + template + inline LUTIndex toFirstDayNumOfYearIndex(DateOrTime v) const { - return years_lut[lut[d].year - DATE_LUT_MIN_YEAR]; + return years_lut[lut[toLUTIndex(v)].year - DATE_LUT_MIN_YEAR]; } - inline DayNum toFirstDayNumOfYear(time_t t) const + template + inline ExtendedDayNum toFirstDayNumOfYear(DateOrTime v) const { - return toFirstDayNumOfYear(toDayNum(t)); + return toDayNum(toFirstDayNumOfYearIndex(v)); } inline time_t toFirstDayOfNextMonth(time_t t) const { - DayNum index = findIndex(t); + LUTIndex index = findIndex(t); index += 32 - lut[index].day_of_month; return lut[index - (lut[index].day_of_month - 1)].date; } inline time_t toFirstDayOfPrevMonth(time_t t) const { - DayNum index = findIndex(t); + LUTIndex index = findIndex(t); index -= lut[index].day_of_month; return lut[index - (lut[index].day_of_month - 1)].date; } - inline UInt8 daysInMonth(DayNum d) const + template + inline UInt8 daysInMonth(DateOrTime value) const { - return lut[d].days_in_month; + const LUTIndex i = toLUTIndex(value); + return lut[i].days_in_month; } - inline UInt8 daysInMonth(time_t t) const - { - return find(t).days_in_month; - } - - inline UInt8 daysInMonth(UInt16 year, UInt8 month) const + inline UInt8 daysInMonth(Int16 year, UInt8 month) const { UInt16 idx = year - DATE_LUT_MIN_YEAR; if (unlikely(idx >= DATE_LUT_YEARS)) return 31; /// Implementation specific behaviour on overflow. /// 32 makes arithmetic more simple. - DayNum any_day_of_month = DayNum(years_lut[idx] + 32 * (month - 1)); + const auto any_day_of_month = years_lut[year - DATE_LUT_MIN_YEAR] + 32 * (month - 1); return lut[any_day_of_month].days_in_month; } @@ -243,101 +391,111 @@ public: */ inline time_t toDateAndShift(time_t t, Int32 days) const { - return lut[DayNum(findIndex(t) + days)].date; + return lut[findIndex(t) + days].date; } inline time_t toTime(time_t t) const { - DayNum index = findIndex(t); - - if (unlikely(index == 0 || index > DATE_LUT_MAX_DAY_NUM)) - return t + offset_at_start_of_epoch; + const LUTIndex index = findIndex(t); time_t res = t - lut[index].date; - if (res >= lut[index].time_at_offset_change) - res += lut[index].amount_of_offset_change; + if (res >= lut[index].time_at_offset_change()) + res += lut[index].amount_of_offset_change(); return res - offset_at_start_of_epoch; /// Starting at 1970-01-01 00:00:00 local time. 
} inline unsigned toHour(time_t t) const { - DayNum index = findIndex(t); - - /// If it is overflow case, - /// then limit number of hours to avoid insane results like 1970-01-01 89:28:15 - if (unlikely(index == 0 || index > DATE_LUT_MAX_DAY_NUM)) - return static_cast((t + offset_at_start_of_epoch) / 3600) % 24; + const LUTIndex index = findIndex(t); time_t time = t - lut[index].date; - if (time >= lut[index].time_at_offset_change) - time += lut[index].amount_of_offset_change; + if (time >= lut[index].time_at_offset_change()) + time += lut[index].amount_of_offset_change(); unsigned res = time / 3600; - return res <= 23 ? res : 0; + + /// In case time was changed backwards at the start of next day, we will repeat the hour 23. + return res <= 23 ? res : 23; } /** Calculating offset from UTC in seconds. - * which means Using the same literal time of "t" to get the corresponding timestamp in UTC, - * then subtract the former from the latter to get the offset result. - * The boundaries when meets DST(daylight saving time) change should be handled very carefully. - */ + * which means Using the same literal time of "t" to get the corresponding timestamp in UTC, + * then subtract the former from the latter to get the offset result. + * The boundaries when meets DST(daylight saving time) change should be handled very carefully. + */ inline time_t timezoneOffset(time_t t) const { - DayNum index = findIndex(t); + const LUTIndex index = findIndex(t); /// Calculate daylight saving offset first. /// Because the "amount_of_offset_change" in LUT entry only exists in the change day, it's costly to scan it from the very begin. /// but we can figure out all the accumulated offsets from 1970-01-01 to that day just by get the whole difference between lut[].date, /// and then, we can directly subtract multiple 86400s to get the real DST offsets for the leap seconds is not considered now. - time_t res = (lut[index].date - lut[0].date) % 86400; + time_t res = (lut[index].date - lut[daynum_offset_epoch].date) % 86400; + /// As so far to know, the maximal DST offset couldn't be more than 2 hours, so after the modulo operation the remainder /// will sits between [-offset --> 0 --> offset] which respectively corresponds to moving clock forward or backward. res = res > 43200 ? (86400 - res) : (0 - res); /// Check if has a offset change during this day. Add the change when cross the line - if (lut[index].amount_of_offset_change != 0 && t >= lut[index].date + lut[index].time_at_offset_change) - res += lut[index].amount_of_offset_change; + if (lut[index].amount_of_offset_change() != 0 && t >= lut[index].date + lut[index].time_at_offset_change()) + res += lut[index].amount_of_offset_change(); return res + offset_at_start_of_epoch; } - /** Only for time zones with/when offset from UTC is multiple of five minutes. - * This is true for all time zones: right now, all time zones have an offset that is multiple of 15 minutes. - * - * "By 1929, most major countries had adopted hourly time zones. Nepal was the last - * country to adopt a standard offset, shifting slightly to UTC+5:45 in 1986." - * - https://en.wikipedia.org/wiki/Time_zone#Offsets_from_UTC - * - * Also please note, that unix timestamp doesn't count "leap seconds": - * each minute, with added or subtracted leap second, spans exactly 60 unix timestamps. 
- */ - inline unsigned toSecond(time_t t) const { return UInt32(t) % 60; } + inline unsigned toSecond(time_t t) const + { + auto res = t % 60; + if (likely(res >= 0)) + return res; + return res + 60; + } inline unsigned toMinute(time_t t) const { - if (offset_is_whole_number_of_hours_everytime) - return (UInt32(t) / 60) % 60; + if (t >= 0 && offset_is_whole_number_of_hours_during_epoch) + return (t / 60) % 60; - UInt32 date = find(t).date; - return (UInt32(t) - date) / 60 % 60; + /// To consider the DST changing situation within this day + /// also make the special timezones with no whole hour offset such as 'Australia/Lord_Howe' been taken into account. + + LUTIndex index = findIndex(t); + UInt32 time = t - lut[index].date; + + if (time >= lut[index].time_at_offset_change()) + time += lut[index].amount_of_offset_change(); + + return time / 60 % 60; } - inline time_t toStartOfMinute(time_t t) const { return t / 60 * 60; } - inline time_t toStartOfFiveMinute(time_t t) const { return t / 300 * 300; } - inline time_t toStartOfFifteenMinutes(time_t t) const { return t / 900 * 900; } - inline time_t toStartOfTenMinutes(time_t t) const { return t / 600 * 600; } + /// NOTE: Assuming timezone offset is a multiple of 15 minutes. + inline time_t toStartOfMinute(time_t t) const { return roundDown(t, 60); } + inline time_t toStartOfFiveMinute(time_t t) const { return roundDown(t, 300); } + inline time_t toStartOfFifteenMinutes(time_t t) const { return roundDown(t, 900); } + inline time_t toStartOfTenMinutes(time_t t) const + { + if (t >= 0 && offset_is_whole_number_of_hours_during_epoch) + return t / 600 * 600; + + /// More complex logic is for Nepal - it has offset 05:45. Australia/Eucla is also unfortunate. + Int64 date = find(t).date; + return date + (t - date) / 600 * 600; + } + + /// NOTE: Assuming timezone transitions are multiple of hours. Lord Howe Island in Australia is a notable exception. 
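To make the `date + (t - date) / N * N` pattern used by toStartOfTenMinutes (and by toStartOfHour just below) concrete: for a zone with a fractional offset such as Asia/Kathmandu (UTC+05:45), rounding the raw Unix timestamp, as the old `t / 600 * 600` did, yields boundaries that are 45 minutes off on the local clock, while rounding the seconds elapsed since the local start of the day does not. A self-contained sketch under the simplifying assumption of a fixed +05:45 offset and no DST (so no LUT is involved; in the real code the local day start comes from find(t).date):

#include <cassert>
#include <cstdint>

int main()
{
    const int64_t offset = 5 * 3600 + 45 * 60;   /// assumed fixed +05:45 offset, no DST
    const int64_t t = 1000000000;                /// an arbitrary Unix timestamp

    /// Start of the local day, expressed as a Unix timestamp.
    const int64_t local = t + offset;
    const int64_t day_start = local / 86400 * 86400 - offset;

    const int64_t naive = t / 600 * 600;                            /// rounding the raw timestamp
    const int64_t aligned = day_start + (t - day_start) / 600 * 600;

    assert((naive + offset) % 600 != 0);    /// 300 s away from a local ten-minute boundary
    assert((aligned + offset) % 600 == 0);  /// lands exactly on a local ten-minute boundary
    return 0;
}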
inline time_t toStartOfHour(time_t t) const { - if (offset_is_whole_number_of_hours_everytime) + if (t >= 0 && offset_is_whole_number_of_hours_during_epoch) return t / 3600 * 3600; - UInt32 date = find(t).date; - return date + (UInt32(t) - date) / 3600 * 3600; + Int64 date = find(t).date; + return date + (t - date) / 3600 * 3600; } /** Number of calendar day since the beginning of UNIX epoch (1970-01-01 is zero) @@ -348,80 +506,89 @@ public: * because the same calendar day starts/ends at different timestamps in different time zones) */ - inline DayNum toDayNum(time_t t) const { return findIndex(t); } - inline time_t fromDayNum(DayNum d) const { return lut[d].date; } + inline time_t fromDayNum(DayNum d) const { return lut[toLUTIndex(d)].date; } + inline time_t fromDayNum(ExtendedDayNum d) const { return lut[toLUTIndex(d)].date; } - inline time_t toDate(DayNum d) const { return lut[d].date; } - inline unsigned toMonth(DayNum d) const { return lut[d].month; } - inline unsigned toQuarter(DayNum d) const { return (lut[d].month - 1) / 3 + 1; } - inline unsigned toYear(DayNum d) const { return lut[d].year; } - inline unsigned toDayOfWeek(DayNum d) const { return lut[d].day_of_week; } - inline unsigned toDayOfMonth(DayNum d) const { return lut[d].day_of_month; } - inline unsigned toDayOfYear(DayNum d) const { return d + 1 - toFirstDayNumOfYear(d); } + template + inline time_t toDate(DateOrTime v) const { return lut[toLUTIndex(v)].date; } - inline unsigned toDayOfYear(time_t t) const { return toDayOfYear(toDayNum(t)); } + template + inline unsigned toMonth(DateOrTime v) const { return lut[toLUTIndex(v)].month; } + + template + inline unsigned toQuarter(DateOrTime v) const { return (lut[toLUTIndex(v)].month - 1) / 3 + 1; } + + template + inline Int16 toYear(DateOrTime v) const { return lut[toLUTIndex(v)].year; } + + template + inline unsigned toDayOfWeek(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_week; } + + template + inline unsigned toDayOfMonth(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_month; } + + template + inline unsigned toDayOfYear(DateOrTime v) const + { + // TODO: different overload for ExtendedDayNum + const LUTIndex i = toLUTIndex(v); + return i + 1 - toFirstDayNumOfYearIndex(i); + } /// Number of week from some fixed moment in the past. Week begins at monday. /// (round down to monday and divide DayNum by 7; we made an assumption, /// that in domain of the function there was no weeks with any other number of days than 7) - inline unsigned toRelativeWeekNum(DayNum d) const + template + inline unsigned toRelativeWeekNum(DateOrTime v) const { + const LUTIndex i = toLUTIndex(v); /// We add 8 to avoid underflow at beginning of unix epoch. - return (d + 8 - toDayOfWeek(d)) / 7; - } - - inline unsigned toRelativeWeekNum(time_t t) const - { - return toRelativeWeekNum(toDayNum(t)); + return toDayNum(i + 8 - toDayOfWeek(i)) / 7; } /// Get year that contains most of the current week. Week begins at monday. - inline unsigned toISOYear(DayNum d) const + template + inline unsigned toISOYear(DateOrTime v) const { + const LUTIndex i = toLUTIndex(v); /// That's effectively the year of thursday of current week. - return toYear(DayNum(d + 4 - toDayOfWeek(d))); - } - - inline unsigned toISOYear(time_t t) const - { - return toISOYear(toDayNum(t)); + return toYear(toLUTIndex(i + 4 - toDayOfWeek(i))); } /// ISO year begins with a monday of the week that is contained more than by half in the corresponding calendar year. /// Example: ISO year 2019 begins at 2018-12-31. 
And ISO year 2017 begins at 2017-01-02. /// https://en.wikipedia.org/wiki/ISO_week_date - inline DayNum toFirstDayNumOfISOYear(DayNum d) const + template + inline LUTIndex toFirstDayNumOfISOYearIndex(DateOrTime v) const { - auto iso_year = toISOYear(d); + const LUTIndex i = toLUTIndex(v); + auto iso_year = toISOYear(i); - DayNum first_day_of_year = years_lut[iso_year - DATE_LUT_MIN_YEAR]; + const auto first_day_of_year = years_lut[iso_year - DATE_LUT_MIN_YEAR]; auto first_day_of_week_of_year = lut[first_day_of_year].day_of_week; - return DayNum(first_day_of_week_of_year <= 4 + return LUTIndex{first_day_of_week_of_year <= 4 ? first_day_of_year + 1 - first_day_of_week_of_year - : first_day_of_year + 8 - first_day_of_week_of_year); + : first_day_of_year + 8 - first_day_of_week_of_year}; } - inline DayNum toFirstDayNumOfISOYear(time_t t) const + template + inline ExtendedDayNum toFirstDayNumOfISOYear(DateOrTime v) const { - return toFirstDayNumOfISOYear(toDayNum(t)); + return toDayNum(toFirstDayNumOfISOYearIndex(v)); } inline time_t toFirstDayOfISOYear(time_t t) const { - return fromDayNum(toFirstDayNumOfISOYear(t)); + return lut[toFirstDayNumOfISOYearIndex(t)].date; } /// ISO 8601 week number. Week begins at monday. /// The week number 1 is the first week in year that contains 4 or more days (that's more than half). - inline unsigned toISOWeek(DayNum d) const + template + inline unsigned toISOWeek(DateOrTime v) const { - return 1 + DayNum(toFirstDayNumOfWeek(d) - toFirstDayNumOfISOYear(d)) / 7; - } - - inline unsigned toISOWeek(time_t t) const - { - return toISOWeek(toDayNum(t)); + return 1 + (toFirstDayNumOfWeek(v) - toFirstDayNumOfISOYear(v)) / 7; } /* @@ -457,30 +624,33 @@ public: Otherwise it is the last week of the previous year, and the next week is week 1. */ - inline YearWeek toYearWeek(DayNum d, UInt8 week_mode) const + template + inline YearWeek toYearWeek(DateOrTime v, UInt8 week_mode) const { - bool newyear_day_mode = week_mode & static_cast(WeekModeFlag::NEWYEAR_DAY); + const bool newyear_day_mode = week_mode & static_cast(WeekModeFlag::NEWYEAR_DAY); week_mode = check_week_mode(week_mode); - bool monday_first_mode = week_mode & static_cast(WeekModeFlag::MONDAY_FIRST); + const bool monday_first_mode = week_mode & static_cast(WeekModeFlag::MONDAY_FIRST); bool week_year_mode = week_mode & static_cast(WeekModeFlag::YEAR); - bool first_weekday_mode = week_mode & static_cast(WeekModeFlag::FIRST_WEEKDAY); + const bool first_weekday_mode = week_mode & static_cast(WeekModeFlag::FIRST_WEEKDAY); + + const LUTIndex i = toLUTIndex(v); // Calculate week number of WeekModeFlag::NEWYEAR_DAY mode if (newyear_day_mode) { - return toYearWeekOfNewyearMode(d, monday_first_mode); + return toYearWeekOfNewyearMode(i, monday_first_mode); } - YearWeek yw(toYear(d), 0); + YearWeek yw(toYear(i), 0); UInt16 days = 0; - UInt16 daynr = makeDayNum(yw.first, toMonth(d), toDayOfMonth(d)); - UInt16 first_daynr = makeDayNum(yw.first, 1, 1); + const auto daynr = makeDayNum(yw.first, toMonth(i), toDayOfMonth(i)); + auto first_daynr = makeDayNum(yw.first, 1, 1); // 0 for monday, 1 for tuesday ... // get weekday from first day in year. 
- UInt16 weekday = calc_weekday(DayNum(first_daynr), !monday_first_mode); + UInt16 weekday = calc_weekday(first_daynr, !monday_first_mode); - if (toMonth(d) == 1 && toDayOfMonth(d) <= static_cast(7 - weekday)) + if (toMonth(i) == 1 && toDayOfMonth(i) <= static_cast(7 - weekday)) { if (!week_year_mode && ((first_weekday_mode && weekday != 0) || (!first_weekday_mode && weekday >= 4))) return yw; @@ -511,48 +681,51 @@ public: /// Calculate week number of WeekModeFlag::NEWYEAR_DAY mode /// The week number 1 is the first week in year that contains January 1, - inline YearWeek toYearWeekOfNewyearMode(DayNum d, bool monday_first_mode) const + template + inline YearWeek toYearWeekOfNewyearMode(DateOrTime v, bool monday_first_mode) const { YearWeek yw(0, 0); UInt16 offset_day = monday_first_mode ? 0U : 1U; + const LUTIndex i = LUTIndex(v); + // Checking the week across the year - yw.first = toYear(DayNum(d + 7 - toDayOfWeek(DayNum(d + offset_day)))); + yw.first = toYear(i + 7 - toDayOfWeek(i + offset_day)); - DayNum first_day = makeDayNum(yw.first, 1, 1); - DayNum this_day = d; + auto first_day = makeLUTIndex(yw.first, 1, 1); + auto this_day = i; + // TODO: do not perform calculations in terms of DayNum, since that would under/overflow for extended range. if (monday_first_mode) { // Rounds down a date to the nearest Monday. first_day = toFirstDayNumOfWeek(first_day); - this_day = toFirstDayNumOfWeek(d); + this_day = toFirstDayNumOfWeek(i); } else { // Rounds down a date to the nearest Sunday. if (toDayOfWeek(first_day) != 7) - first_day = DayNum(first_day - toDayOfWeek(first_day)); - if (toDayOfWeek(d) != 7) - this_day = DayNum(d - toDayOfWeek(d)); + first_day = ExtendedDayNum(first_day - toDayOfWeek(first_day)); + if (toDayOfWeek(i) != 7) + this_day = ExtendedDayNum(i - toDayOfWeek(i)); } yw.second = (this_day - first_day) / 7 + 1; return yw; } - /** - * get first day of week with week_mode, return Sunday or Monday - */ - inline DayNum toFirstDayNumOfWeek(DayNum d, UInt8 week_mode) const + /// Get first day of week with week_mode, return Sunday or Monday + template + inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const { bool monday_first_mode = week_mode & static_cast(WeekModeFlag::MONDAY_FIRST); if (monday_first_mode) { - return toFirstDayNumOfWeek(d); + return toFirstDayNumOfWeek(v); } else { - return (toDayOfWeek(d) != 7) ? DayNum(d - toDayOfWeek(d)) : d; + return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v); } } @@ -568,192 +741,231 @@ public: /** Calculate weekday from d. * Returns 0 for monday, 1 for tuesday... */ - inline unsigned calc_weekday(DayNum d, bool sunday_first_day_of_week) const + template + inline unsigned calc_weekday(DateOrTime v, bool sunday_first_day_of_week) const { + const LUTIndex i = toLUTIndex(v); if (!sunday_first_day_of_week) - return toDayOfWeek(d) - 1; + return toDayOfWeek(i) - 1; else - return toDayOfWeek(DayNum(d + 1)) - 1; + return toDayOfWeek(i + 1) - 1; } /// Calculate days in one year. - inline unsigned calc_days_in_year(UInt16 year) const + inline unsigned calc_days_in_year(Int32 year) const { return ((year & 3) == 0 && (year % 100 || (year % 400 == 0 && year)) ? 
366 : 365); } /// Number of month from some fixed moment in the past (year * 12 + month) - inline unsigned toRelativeMonthNum(DayNum d) const + template + inline unsigned toRelativeMonthNum(DateOrTime v) const { - return lut[d].year * 12 + lut[d].month; + const LUTIndex i = toLUTIndex(v); + return lut[i].year * 12 + lut[i].month; } - inline unsigned toRelativeMonthNum(time_t t) const + template + inline unsigned toRelativeQuarterNum(DateOrTime v) const { - return toRelativeMonthNum(toDayNum(t)); - } - - inline unsigned toRelativeQuarterNum(DayNum d) const - { - return lut[d].year * 4 + (lut[d].month - 1) / 3; - } - - inline unsigned toRelativeQuarterNum(time_t t) const - { - return toRelativeQuarterNum(toDayNum(t)); + const LUTIndex i = toLUTIndex(v); + return lut[i].year * 4 + (lut[i].month - 1) / 3; } /// We count all hour-length intervals, unrelated to offset changes. inline time_t toRelativeHourNum(time_t t) const { - if (offset_is_whole_number_of_hours_everytime) + if (t >= 0 && offset_is_whole_number_of_hours_during_epoch) return t / 3600; /// Assume that if offset was fractional, then the fraction is the same as at the beginning of epoch. /// NOTE This assumption is false for "Pacific/Pitcairn" and "Pacific/Kiritimati" time zones. - return (t + 86400 - offset_at_start_of_epoch) / 3600; + return (t + DATE_LUT_ADD + 86400 - offset_at_start_of_epoch) / 3600 - (DATE_LUT_ADD / 3600); } - inline time_t toRelativeHourNum(DayNum d) const + template + inline time_t toRelativeHourNum(DateOrTime v) const { - return toRelativeHourNum(lut[d].date); + return toRelativeHourNum(lut[toLUTIndex(v)].date); } inline time_t toRelativeMinuteNum(time_t t) const { - return t / 60; + return (t + DATE_LUT_ADD) / 60 - (DATE_LUT_ADD / 60); } - inline time_t toRelativeMinuteNum(DayNum d) const + template + inline time_t toRelativeMinuteNum(DateOrTime v) const { - return toRelativeMinuteNum(lut[d].date); + return toRelativeMinuteNum(lut[toLUTIndex(v)].date); } - inline DayNum toStartOfYearInterval(DayNum d, UInt64 years) const + template + inline ExtendedDayNum toStartOfYearInterval(DateOrTime v, UInt64 years) const { if (years == 1) - return toFirstDayNumOfYear(d); - return years_lut[(lut[d].year - DATE_LUT_MIN_YEAR) / years * years]; + return toFirstDayNumOfYear(v); + + const LUTIndex i = toLUTIndex(v); + + UInt16 year = lut[i].year / years * years; + + /// For example, rounding down 1925 to 100 years will be 1900, but it's less than min supported year. 
+ if (unlikely(year < DATE_LUT_MIN_YEAR)) + year = DATE_LUT_MIN_YEAR; + + return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); } - inline DayNum toStartOfQuarterInterval(DayNum d, UInt64 quarters) const + inline ExtendedDayNum toStartOfQuarterInterval(ExtendedDayNum d, UInt64 quarters) const { if (quarters == 1) return toFirstDayNumOfQuarter(d); return toStartOfMonthInterval(d, quarters * 3); } - inline DayNum toStartOfMonthInterval(DayNum d, UInt64 months) const + inline ExtendedDayNum toStartOfMonthInterval(ExtendedDayNum d, UInt64 months) const { if (months == 1) return toFirstDayNumOfMonth(d); - const auto & date = lut[d]; - UInt32 month_total_index = (date.year - DATE_LUT_MIN_YEAR) * 12 + date.month - 1; - return years_months_lut[month_total_index / months * months]; + const Values & values = lut[toLUTIndex(d)]; + UInt32 month_total_index = (values.year - DATE_LUT_MIN_YEAR) * 12 + values.month - 1; + return toDayNum(years_months_lut[month_total_index / months * months]); } - inline DayNum toStartOfWeekInterval(DayNum d, UInt64 weeks) const + inline ExtendedDayNum toStartOfWeekInterval(ExtendedDayNum d, UInt64 weeks) const { if (weeks == 1) return toFirstDayNumOfWeek(d); UInt64 days = weeks * 7; // January 1st 1970 was Thursday so we need this 4-days offset to make weeks start on Monday. - return DayNum(4 + (d - 4) / days * days); + return ExtendedDayNum(4 + (d - 4) / days * days); } - inline time_t toStartOfDayInterval(DayNum d, UInt64 days) const + inline time_t toStartOfDayInterval(ExtendedDayNum d, UInt64 days) const { if (days == 1) return toDate(d); - return lut[d / days * days].date; + return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date; } inline time_t toStartOfHourInterval(time_t t, UInt64 hours) const { if (hours == 1) return toStartOfHour(t); + + /** We will round the hour number since the midnight. + * It may split the day into non-equal intervals. + * For example, if we will round to 11-hour interval, + * the day will be split to the intervals 00:00:00..10:59:59, 11:00:00..21:59:59, 22:00:00..23:59:59. + * In case of daylight saving time or other transitions, + * the intervals can be shortened or prolonged to the amount of transition. + */ + UInt64 seconds = hours * 3600; - t = t / seconds * seconds; - if (offset_is_whole_number_of_hours_everytime) - return t; - return toStartOfHour(t); + + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; + + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + { + /// Align to new hour numbers before rounding. + time += values.amount_of_offset_change(); + time = time / seconds * seconds; + + /// Should subtract the shift back but only if rounded time is not before shift. + if (time >= values.time_at_offset_change()) + { + time -= values.amount_of_offset_change(); + + /// With cutoff at the time of the shift. Otherwise we may end up with something like 23:00 previous day. + if (time < values.time_at_offset_change()) + time = values.time_at_offset_change(); + } + } + else + { + time = time / seconds * seconds; + } + + return values.date + time; } inline time_t toStartOfMinuteInterval(time_t t, UInt64 minutes) const { if (minutes == 1) return toStartOfMinute(t); + + /** In contrast to "toStartOfHourInterval" function above, + * the minute intervals are not aligned to the midnight. + * You will get unexpected results if for example, you round down to 60 minute interval + * and there was a time shift to 30 minutes. 
+ * + * But this is not specified in docs and can be changed in future. + */ + UInt64 seconds = 60 * minutes; - return t / seconds * seconds; + return roundDown(t, seconds); } inline time_t toStartOfSecondInterval(time_t t, UInt64 seconds) const { if (seconds == 1) return t; - return t / seconds * seconds; + + return roundDown(t, seconds); + } + + inline LUTIndex makeLUTIndex(Int16 year, UInt8 month, UInt8 day_of_month) const + { + if (unlikely(year < DATE_LUT_MIN_YEAR || year > DATE_LUT_MAX_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31)) + return LUTIndex(0); + + return LUTIndex{years_months_lut[(year - DATE_LUT_MIN_YEAR) * 12 + month - 1] + day_of_month - 1}; } /// Create DayNum from year, month, day of month. - inline DayNum makeDayNum(UInt16 year, UInt8 month, UInt8 day_of_month) const + inline ExtendedDayNum makeDayNum(Int16 year, UInt8 month, UInt8 day_of_month) const { if (unlikely(year < DATE_LUT_MIN_YEAR || year > DATE_LUT_MAX_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31)) - return DayNum(0); // TODO (nemkov, DateTime64 phase 2): implement creating real date for year outside of LUT range. + return ExtendedDayNum(0); - // The day after 2106-02-07 will not stored fully as struct Values, so just overflow it as 0 - if (unlikely(year == DATE_LUT_MAX_YEAR && (month > 2 || (month == 2 && day_of_month > 7)))) - return DayNum(0); - - return DayNum(years_months_lut[(year - DATE_LUT_MIN_YEAR) * 12 + month - 1] + day_of_month - 1); + return toDayNum(makeLUTIndex(year, month, day_of_month)); } - inline time_t makeDate(UInt16 year, UInt8 month, UInt8 day_of_month) const + inline time_t makeDate(Int16 year, UInt8 month, UInt8 day_of_month) const { - return lut[makeDayNum(year, month, day_of_month)].date; + return lut[makeLUTIndex(year, month, day_of_month)].date; } /** Does not accept daylight saving time as argument: in case of ambiguity, it choose greater timestamp. 
*/ - inline time_t makeDateTime(UInt16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second) const + inline time_t makeDateTime(Int16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second) const { - size_t index = makeDayNum(year, month, day_of_month); + size_t index = makeLUTIndex(year, month, day_of_month); UInt32 time_offset = hour * 3600 + minute * 60 + second; - if (time_offset >= lut[index].time_at_offset_change) - time_offset -= lut[index].amount_of_offset_change; + if (time_offset >= lut[index].time_at_offset_change()) + time_offset -= lut[index].amount_of_offset_change(); - UInt32 res = lut[index].date + time_offset; - - if (unlikely(res > DATE_LUT_MAX)) - return 0; - - return res; + return lut[index].date + time_offset; } - inline const Values & getValues(DayNum d) const { return lut[d]; } - inline const Values & getValues(time_t t) const { return lut[findIndex(t)]; } + template + inline const Values & getValues(DateOrTime v) const { return lut[toLUTIndex(v)]; } - inline UInt32 toNumYYYYMM(time_t t) const + template + inline UInt32 toNumYYYYMM(DateOrTime v) const { - const Values & values = find(t); + const Values & values = getValues(v); return values.year * 100 + values.month; } - inline UInt32 toNumYYYYMM(DayNum d) const + template + inline UInt32 toNumYYYYMMDD(DateOrTime v) const { - const Values & values = lut[d]; - return values.year * 100 + values.month; - } - - inline UInt32 toNumYYYYMMDD(time_t t) const - { - const Values & values = find(t); - return values.year * 10000 + values.month * 100 + values.day_of_month; - } - - inline UInt32 toNumYYYYMMDD(DayNum d) const - { - const Values & values = lut[d]; + const Values & values = getValues(v); return values.year * 10000 + values.month * 100 + values.day_of_month; } @@ -762,22 +974,85 @@ public: return makeDate(num / 10000, num / 100 % 100, num % 100); } - inline DayNum YYYYMMDDToDayNum(UInt32 num) const + inline ExtendedDayNum YYYYMMDDToDayNum(UInt32 num) const { return makeDayNum(num / 10000, num / 100 % 100, num % 100); } + struct DateComponents + { + uint16_t year; + uint8_t month; + uint8_t day; + }; + + struct TimeComponents + { + uint8_t hour; + uint8_t minute; + uint8_t second; + }; + + struct DateTimeComponents + { + DateComponents date; + TimeComponents time; + }; + + inline DateComponents toDateComponents(time_t t) const + { + const Values & values = getValues(t); + return { values.year, values.month, values.day_of_month }; + } + + inline DateTimeComponents toDateTimeComponents(time_t t) const + { + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; + + DateTimeComponents res; + + res.date.year = values.year; + res.date.month = values.month; + res.date.day = values.day_of_month; + + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + time += values.amount_of_offset_change(); + + if (unlikely(time < 0)) + { + res.time.second = 0; + res.time.minute = 0; + res.time.hour = 0; + } + else + { + res.time.second = time % 60; + res.time.minute = time / 60 % 60; + res.time.hour = time / 3600; + } + + /// In case time was changed backwards at the start of next day, we will repeat the hour 23. 
+ if (unlikely(res.time.hour > 23)) + res.time.hour = 23; + + return res; + } + + inline UInt64 toNumYYYYMMDDhhmmss(time_t t) const { - const Values & values = find(t); + DateTimeComponents components = toDateTimeComponents(t); + return - toSecond(t) - + toMinute(t) * 100 - + toHour(t) * 10000 - + UInt64(values.day_of_month) * 1000000 - + UInt64(values.month) * 100000000 - + UInt64(values.year) * 10000000000; + components.time.second + + components.time.minute * 100 + + components.time.hour * 10000 + + UInt64(components.date.day) * 1000000 + + UInt64(components.date.month) * 100000000 + + UInt64(components.date.year) * 10000000000; } inline time_t YYYYMMDDhhmmssToTime(UInt64 num) const @@ -796,15 +1071,19 @@ public: inline NO_SANITIZE_UNDEFINED time_t addDays(time_t t, Int64 delta) const { - DayNum index = findIndex(t); - time_t time_offset = toHour(t) * 3600 + toMinute(t) * 60 + toSecond(t); + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; - index += delta; + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + time += values.amount_of_offset_change(); - if (time_offset >= lut[index].time_at_offset_change) - time_offset -= lut[index].amount_of_offset_change; + const LUTIndex new_index = index + delta; - return lut[index].date + time_offset; + if (time >= lut[new_index].time_at_offset_change()) + time -= lut[new_index].amount_of_offset_change(); + + return lut[new_index].date + time; } inline NO_SANITIZE_UNDEFINED time_t addWeeks(time_t t, Int64 delta) const @@ -812,7 +1091,7 @@ public: return addDays(t, delta * 7); } - inline UInt8 saturateDayOfMonth(UInt16 year, UInt8 month, UInt8 day_of_month) const + inline UInt8 saturateDayOfMonth(Int16 year, UInt8 month, UInt8 day_of_month) const { if (likely(day_of_month <= 28)) return day_of_month; @@ -825,25 +1104,12 @@ public: return day_of_month; } - /// If resulting month has less deys than source month, then saturation can happen. - /// Example: 31 Aug + 1 month = 30 Sep. - inline time_t addMonths(time_t t, Int64 delta) const + template + inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const { - DayNum result_day = addMonths(toDayNum(t), delta); + const Values & values = lut[toLUTIndex(v)]; - time_t time_offset = toHour(t) * 3600 + toMinute(t) * 60 + toSecond(t); - - if (time_offset >= lut[result_day].time_at_offset_change) - time_offset -= lut[result_day].amount_of_offset_change; - - return lut[result_day].date + time_offset; - } - - inline NO_SANITIZE_UNDEFINED DayNum addMonths(DayNum d, Int64 delta) const - { - const Values & values = lut[d]; - - Int64 month = static_cast(values.month) + delta; + Int64 month = values.month + delta; if (month > 0) { @@ -851,7 +1117,7 @@ public: month = ((month - 1) % 12) + 1; auto day_of_month = saturateDayOfMonth(year, month, values.day_of_month); - return makeDayNum(year, month, day_of_month); + return makeLUTIndex(year, month, day_of_month); } else { @@ -859,36 +1125,48 @@ public: month = 12 - (-month % 12); auto day_of_month = saturateDayOfMonth(year, month, values.day_of_month); - return makeDayNum(year, month, day_of_month); + return makeLUTIndex(year, month, day_of_month); } } - inline NO_SANITIZE_UNDEFINED time_t addQuarters(time_t t, Int64 delta) const + /// If resulting month has less deys than source month, then saturation can happen. + /// Example: 31 Aug + 1 month = 30 Sep. 
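The saturation mentioned in the comment above ("31 Aug + 1 month = 30 Sep") comes from addMonthsIndex normalising the year/month pair and then clamping the day of month. A self-contained sketch of the same normalisation, assuming plain Gregorian month lengths instead of the LUT (all names here are illustrative):

#include <cassert>

struct YMD { int year; int month; int day; };

static bool is_leap(int year)
{
    return (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
}

static int days_in_month(int year, int month)
{
    static const int lengths[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    return (month == 2 && is_leap(year)) ? 29 : lengths[month - 1];
}

/// Mirrors the year/month arithmetic of addMonthsIndex and the day clamping of saturateDayOfMonth.
static YMD add_months(YMD v, long long delta)
{
    long long month = v.month + delta;
    long long year;

    if (month > 0)
    {
        year = v.year + (month - 1) / 12;
        month = (month - 1) % 12 + 1;
    }
    else
    {
        year = v.year - (12 - month) / 12;
        month = 12 - (-month % 12);
    }

    int day = v.day;
    const int limit = days_in_month(static_cast<int>(year), static_cast<int>(month));
    if (day > limit)
        day = limit;

    return {static_cast<int>(year), static_cast<int>(month), day};
}

int main()
{
    YMD r = add_months({2021, 8, 31}, 1);
    assert(r.year == 2021 && r.month == 9 && r.day == 30);   /// 31 Aug + 1 month = 30 Sep
    r = add_months({2021, 1, 31}, -2);
    assert(r.year == 2020 && r.month == 11 && r.day == 30);  /// going backwards across a year boundary
    return 0;
}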
+ inline time_t NO_SANITIZE_UNDEFINED addMonths(time_t t, Int64 delta) const + { + const auto result_day = addMonthsIndex(t, delta); + + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; + + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + time += values.amount_of_offset_change(); + + if (time >= lut[result_day].time_at_offset_change()) + time -= lut[result_day].amount_of_offset_change(); + + return lut[result_day].date + time; + } + + inline ExtendedDayNum NO_SANITIZE_UNDEFINED addMonths(ExtendedDayNum d, Int64 delta) const + { + return toDayNum(addMonthsIndex(d, delta)); + } + + inline time_t NO_SANITIZE_UNDEFINED addQuarters(time_t t, Int64 delta) const { return addMonths(t, delta * 3); } - inline NO_SANITIZE_UNDEFINED DayNum addQuarters(DayNum d, Int64 delta) const + inline ExtendedDayNum addQuarters(ExtendedDayNum d, Int64 delta) const { return addMonths(d, delta * 3); } - /// Saturation can occur if 29 Feb is mapped to non-leap year. - inline NO_SANITIZE_UNDEFINED time_t addYears(time_t t, Int64 delta) const + template + inline LUTIndex NO_SANITIZE_UNDEFINED addYearsIndex(DateOrTime v, Int64 delta) const { - DayNum result_day = addYears(toDayNum(t), delta); - - time_t time_offset = toHour(t) * 3600 + toMinute(t) * 60 + toSecond(t); - - if (time_offset >= lut[result_day].time_at_offset_change) - time_offset -= lut[result_day].amount_of_offset_change; - - return lut[result_day].date + time_offset; - } - - inline NO_SANITIZE_UNDEFINED DayNum addYears(DayNum d, Int64 delta) const - { - const Values & values = lut[d]; + const Values & values = lut[toLUTIndex(v)]; auto year = values.year + delta; auto month = values.month; @@ -898,42 +1176,61 @@ public: if (unlikely(day_of_month == 29 && month == 2)) day_of_month = saturateDayOfMonth(year, month, day_of_month); - return makeDayNum(year, month, day_of_month); + return makeLUTIndex(year, month, day_of_month); + } + + /// Saturation can occur if 29 Feb is mapped to non-leap year. 
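Similarly, the leap-day saturation noted above only ever adjusts 29 February. A tiny standalone sketch of the rule applied by addYearsIndex (illustrative names, not from the patch):

#include <cassert>

static bool is_leap(int year)
{
    return (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
}

/// Adding years keeps (month, day) and only clamps 29 Feb when the target year is not a leap year.
static int saturate_feb_29(int target_year, int month, int day)
{
    if (month == 2 && day == 29 && !is_leap(target_year))
        return 28;
    return day;
}

int main()
{
    assert(saturate_feb_29(2021, 2, 29) == 28);  /// 2020-02-29 + 1 year -> 2021-02-28
    assert(saturate_feb_29(2024, 2, 29) == 29);  /// 2020-02-29 + 4 years stays on 29 Feb
    assert(saturate_feb_29(2021, 3, 15) == 15);  /// other dates are unchanged
    return 0;
}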
+ inline time_t addYears(time_t t, Int64 delta) const + { + auto result_day = addYearsIndex(t, delta); + + const LUTIndex index = findIndex(t); + const Values & values = lut[index]; + + time_t time = t - values.date; + if (time >= values.time_at_offset_change()) + time += values.amount_of_offset_change(); + + if (time >= lut[result_day].time_at_offset_change()) + time -= lut[result_day].amount_of_offset_change(); + + return lut[result_day].date + time; + } + + inline ExtendedDayNum addYears(ExtendedDayNum d, Int64 delta) const + { + return toDayNum(addYearsIndex(d, delta)); } inline std::string timeToString(time_t t) const { - const Values & values = find(t); + DateTimeComponents components = toDateTimeComponents(t); std::string s {"0000-00-00 00:00:00"}; - s[0] += values.year / 1000; - s[1] += (values.year / 100) % 10; - s[2] += (values.year / 10) % 10; - s[3] += values.year % 10; - s[5] += values.month / 10; - s[6] += values.month % 10; - s[8] += values.day_of_month / 10; - s[9] += values.day_of_month % 10; + s[0] += components.date.year / 1000; + s[1] += (components.date.year / 100) % 10; + s[2] += (components.date.year / 10) % 10; + s[3] += components.date.year % 10; + s[5] += components.date.month / 10; + s[6] += components.date.month % 10; + s[8] += components.date.day / 10; + s[9] += components.date.day % 10; - auto hour = toHour(t); - auto minute = toMinute(t); - auto second = toSecond(t); - - s[11] += hour / 10; - s[12] += hour % 10; - s[14] += minute / 10; - s[15] += minute % 10; - s[17] += second / 10; - s[18] += second % 10; + s[11] += components.time.hour / 10; + s[12] += components.time.hour % 10; + s[14] += components.time.minute / 10; + s[15] += components.time.minute % 10; + s[17] += components.time.second / 10; + s[18] += components.time.second % 10; return s; } inline std::string dateToString(time_t t) const { - const Values & values = find(t); + const Values & values = getValues(t); std::string s {"0000-00-00"}; @@ -949,9 +1246,9 @@ public: return s; } - inline std::string dateToString(DayNum d) const + inline std::string dateToString(ExtendedDayNum d) const { - const Values & values = lut[d]; + const Values & values = getValues(d); std::string s {"0000-00-00"}; @@ -969,7 +1266,7 @@ public: }; #if defined(__PPC__) -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif #endif diff --git a/base/common/DayNum.h b/base/common/DayNum.h index a4ef0c43b69..5cf4d4635c8 100644 --- a/base/common/DayNum.h +++ b/base/common/DayNum.h @@ -7,3 +7,8 @@ * See DateLUTImpl for usage examples. 
*/ STRONG_TYPEDEF(UInt16, DayNum) + +/** Represent number of days since 1970-01-01 but in extended range, + * for dates before 1970-01-01 and after 2105 + */ +STRONG_TYPEDEF(Int32, ExtendedDayNum) diff --git a/base/common/LocalDate.h b/base/common/LocalDate.h index e5ebe877bc5..b1e6eeb907c 100644 --- a/base/common/LocalDate.h +++ b/base/common/LocalDate.h @@ -92,20 +92,10 @@ public: LocalDate(const LocalDate &) noexcept = default; LocalDate & operator= (const LocalDate &) noexcept = default; - LocalDate & operator= (time_t time) - { - init(time); - return *this; - } - - operator time_t() const - { - return DateLUT::instance().makeDate(m_year, m_month, m_day); - } - DayNum getDayNum() const { - return DateLUT::instance().makeDayNum(m_year, m_month, m_day); + const auto & lut = DateLUT::instance(); + return DayNum(lut.makeDayNum(m_year, m_month, m_day).toUnderType()); } operator DayNum() const @@ -166,12 +156,3 @@ public: }; static_assert(sizeof(LocalDate) == 4); - - -namespace std -{ -inline string to_string(const LocalDate & date) -{ - return date.toString(); -} -} diff --git a/base/common/LocalDateTime.h b/base/common/LocalDateTime.h index 0e237789bd1..dde283e5ebb 100644 --- a/base/common/LocalDateTime.h +++ b/base/common/LocalDateTime.h @@ -29,29 +29,16 @@ private: /// NOTE We may use attribute packed instead, but it is less portable. unsigned char pad = 0; - void init(time_t time) + void init(time_t time, const DateLUTImpl & time_zone) { - if (unlikely(time > DATE_LUT_MAX || time == 0)) - { - m_year = 0; - m_month = 0; - m_day = 0; - m_hour = 0; - m_minute = 0; - m_second = 0; + DateLUTImpl::DateTimeComponents components = time_zone.toDateTimeComponents(time); - return; - } - - const auto & date_lut = DateLUT::instance(); - const auto & values = date_lut.getValues(time); - - m_year = values.year; - m_month = values.month; - m_day = values.day_of_month; - m_hour = date_lut.toHour(time); - m_minute = date_lut.toMinute(time); - m_second = date_lut.toSecond(time); + m_year = components.date.year; + m_month = components.date.month; + m_day = components.date.day; + m_hour = components.time.hour; + m_minute = components.time.minute; + m_second = components.time.second; (void)pad; /// Suppress unused private field warning. } @@ -73,9 +60,9 @@ private: } public: - explicit LocalDateTime(time_t time) + explicit LocalDateTime(time_t time, const DateLUTImpl & time_zone = DateLUT::instance()) { - init(time); + init(time, time_zone); } LocalDateTime(unsigned short year_, unsigned char month_, unsigned char day_, @@ -104,19 +91,6 @@ public: LocalDateTime(const LocalDateTime &) noexcept = default; LocalDateTime & operator= (const LocalDateTime &) noexcept = default; - LocalDateTime & operator= (time_t time) - { - init(time); - return *this; - } - - operator time_t() const - { - return m_year == 0 - ? 
0 - : DateLUT::instance().makeDateTime(m_year, m_month, m_day, m_hour, m_minute, m_second); - } - unsigned short year() const { return m_year; } unsigned char month() const { return m_month; } unsigned char day() const { return m_day; } @@ -132,8 +106,30 @@ public: void second(unsigned char x) { m_second = x; } LocalDate toDate() const { return LocalDate(m_year, m_month, m_day); } + LocalDateTime toStartOfDate() const { return LocalDateTime(m_year, m_month, m_day, 0, 0, 0); } - LocalDateTime toStartOfDate() { return LocalDateTime(m_year, m_month, m_day, 0, 0, 0); } + std::string toString() const + { + std::string s{"0000-00-00 00:00:00"}; + + s[0] += m_year / 1000; + s[1] += (m_year / 100) % 10; + s[2] += (m_year / 10) % 10; + s[3] += m_year % 10; + s[5] += m_month / 10; + s[6] += m_month % 10; + s[8] += m_day / 10; + s[9] += m_day % 10; + + s[11] += m_hour / 10; + s[12] += m_hour % 10; + s[14] += m_minute / 10; + s[15] += m_minute % 10; + s[17] += m_second / 10; + s[18] += m_second % 10; + + return s; + } bool operator< (const LocalDateTime & other) const { @@ -167,14 +163,3 @@ public: }; static_assert(sizeof(LocalDateTime) == 8); - - -namespace std -{ -inline string to_string(const LocalDateTime & datetime) -{ - stringstream str; - str << datetime; - return str.str(); -} -} diff --git a/base/common/MoveOrCopyIfThrow.h b/base/common/MoveOrCopyIfThrow.h new file mode 100644 index 00000000000..caa1b51e2bc --- /dev/null +++ b/base/common/MoveOrCopyIfThrow.h @@ -0,0 +1,33 @@ +#pragma once + +#include + +namespace detail +{ + template > + struct MoveOrCopyIfThrow; + + template + struct MoveOrCopyIfThrow + { + void operator()(T && src, T & dst) const + { + dst = std::forward(src); + } + }; + + template + struct MoveOrCopyIfThrow + { + void operator()(T && src, T & dst) const + { + dst = src; + } + }; + + template + void moveOrCopyIfThrow(T && src, T & dst) + { + MoveOrCopyIfThrow()(std::forward(src), dst); + } +} diff --git a/base/common/arithmeticOverflow.h b/base/common/arithmeticOverflow.h index a92fe56b9cb..c170d214636 100644 --- a/base/common/arithmeticOverflow.h +++ b/base/common/arithmeticOverflow.h @@ -25,6 +25,12 @@ namespace common return x - y; } + template + inline auto NO_SANITIZE_UNDEFINED negateIgnoreOverflow(T x) + { + return -x; + } + template inline bool addOverflow(T x, T y, T & res) { diff --git a/base/common/defines.h b/base/common/defines.h index 845a53179ef..ada8245f494 100644 --- a/base/common/defines.h +++ b/base/common/defines.h @@ -1,5 +1,20 @@ #pragma once +/// __has_feature supported only by clang. +/// +/// But libcxx/libcxxabi overrides it to 0, +/// thus the checks for __has_feature will be wrong. 
+/// +/// NOTE: +/// - __has_feature cannot be simply undefined, +/// since this will be broken if some C++ header will be included after +/// including +/// - it should not have fallback to 0, +/// since this may create false-positive detection (common problem) +#if defined(__clang__) && defined(__has_feature) +# define ch_has_feature __has_feature +#endif + #if defined(_MSC_VER) # if !defined(likely) # define likely(x) (x) @@ -32,8 +47,8 @@ /// Check for presence of address sanitizer #if !defined(ADDRESS_SANITIZER) -# if defined(__has_feature) -# if __has_feature(address_sanitizer) +# if defined(ch_has_feature) +# if ch_has_feature(address_sanitizer) # define ADDRESS_SANITIZER 1 # endif # elif defined(__SANITIZE_ADDRESS__) @@ -42,8 +57,8 @@ #endif #if !defined(THREAD_SANITIZER) -# if defined(__has_feature) -# if __has_feature(thread_sanitizer) +# if defined(ch_has_feature) +# if ch_has_feature(thread_sanitizer) # define THREAD_SANITIZER 1 # endif # elif defined(__SANITIZE_THREAD__) @@ -52,8 +67,8 @@ #endif #if !defined(MEMORY_SANITIZER) -# if defined(__has_feature) -# if __has_feature(memory_sanitizer) +# if defined(ch_has_feature) +# if ch_has_feature(memory_sanitizer) # define MEMORY_SANITIZER 1 # endif # elif defined(__MEMORY_SANITIZER__) @@ -61,6 +76,16 @@ # endif #endif +#if !defined(UNDEFINED_BEHAVIOR_SANITIZER) +# if defined(__has_feature) +# if __has_feature(undefined_behavior_sanitizer) +# define UNDEFINED_BEHAVIOR_SANITIZER 1 +# endif +# elif defined(__UNDEFINED_BEHAVIOR_SANITIZER__) +# define UNDEFINED_BEHAVIOR_SANITIZER 1 +# endif +#endif + #if defined(ADDRESS_SANITIZER) # define BOOST_USE_ASAN 1 # define BOOST_USE_UCONTEXT 1 diff --git a/base/common/phdr_cache.cpp b/base/common/phdr_cache.cpp index 4f6a066adab..49d566dac19 100644 --- a/base/common/phdr_cache.cpp +++ b/base/common/phdr_cache.cpp @@ -15,11 +15,11 @@ #endif #define __msan_unpoison(X, Y) // NOLINT -#if defined(__has_feature) -# if __has_feature(memory_sanitizer) -# undef __msan_unpoison -# include -# endif +#if defined(ch_has_feature) +# if ch_has_feature(memory_sanitizer) +# undef __msan_unpoison +# include +# endif #endif #include diff --git a/base/common/setTerminalEcho.cpp b/base/common/setTerminalEcho.cpp index 658f27705ba..66db216a235 100644 --- a/base/common/setTerminalEcho.cpp +++ b/base/common/setTerminalEcho.cpp @@ -1,45 +1,28 @@ -// https://stackoverflow.com/questions/1413445/reading-a-password-from-stdcin - #include #include #include #include #include - -#ifdef WIN32 -#include -#else #include #include -#include -#endif + void setTerminalEcho(bool enable) { -#ifdef WIN32 - auto handle = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; - if (!GetConsoleMode(handle, &mode)) - throw std::runtime_error(std::string("setTerminalEcho failed get: ") + std::to_string(GetLastError())); + /// Obtain terminal attributes, + /// toggle the ECHO flag + /// and set them back. 
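setTerminalEcho becomes POSIX-only here (the WIN32 branch is being removed in this hunk). A hedged usage sketch, with the same tcgetattr/tcsetattr logic inlined so it stands alone, showing the typical reason to toggle echo, namely prompting for a password:

#include <iostream>
#include <stdexcept>
#include <string>
#include <termios.h>
#include <unistd.h>

/// Standalone copy of the termios dance used above: read attributes, flip ECHO, write them back.
static void set_terminal_echo(bool enable)
{
    termios tty{};
    if (0 != tcgetattr(STDIN_FILENO, &tty))
        throw std::runtime_error("tcgetattr failed");

    if (enable)
        tty.c_lflag |= ECHO;
    else
        tty.c_lflag &= ~ECHO;

    if (0 != tcsetattr(STDIN_FILENO, TCSANOW, &tty))
        throw std::runtime_error("tcsetattr failed");
}

int main()
{
    std::string password;
    std::cout << "Password: " << std::flush;

    set_terminal_echo(false);
    std::getline(std::cin, password);
    set_terminal_echo(true);

    std::cout << '\n';
    return 0;
}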
- if (!enable) - mode &= ~ENABLE_ECHO_INPUT; - else - mode |= ENABLE_ECHO_INPUT; + struct termios tty{}; - if (!SetConsoleMode(handle, mode)) - throw std::runtime_error(std::string("setTerminalEcho failed set: ") + std::to_string(GetLastError())); -#else - struct termios tty; - if (tcgetattr(STDIN_FILENO, &tty)) + if (0 != tcgetattr(STDIN_FILENO, &tty)) throw std::runtime_error(std::string("setTerminalEcho failed get: ") + errnoToString(errno)); - if (!enable) - tty.c_lflag &= ~ECHO; - else - tty.c_lflag |= ECHO; - auto ret = tcsetattr(STDIN_FILENO, TCSANOW, &tty); - if (ret) + if (enable) + tty.c_lflag |= ECHO; + else + tty.c_lflag &= ~ECHO; + + if (0 != tcsetattr(STDIN_FILENO, TCSANOW, &tty)) throw std::runtime_error(std::string("setTerminalEcho failed set: ") + errnoToString(errno)); -#endif } diff --git a/base/common/strong_typedef.h b/base/common/strong_typedef.h index d9850a25c37..77b83bfa6e5 100644 --- a/base/common/strong_typedef.h +++ b/base/common/strong_typedef.h @@ -12,6 +12,7 @@ private: T t; public: + using UnderlyingType = T; template ::type> explicit StrongTypedef(const T & t_) : t(t_) {} template ::type> diff --git a/base/common/tests/CMakeLists.txt b/base/common/tests/CMakeLists.txt index b7082ee9900..2a07a94055f 100644 --- a/base/common/tests/CMakeLists.txt +++ b/base/common/tests/CMakeLists.txt @@ -1,25 +1,2 @@ -include (${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake) - -add_executable (date_lut2 date_lut2.cpp) -add_executable (date_lut3 date_lut3.cpp) -add_executable (date_lut_default_timezone date_lut_default_timezone.cpp) -add_executable (local_date_time_comparison local_date_time_comparison.cpp) -add_executable (realloc-perf allocator.cpp) - -set(PLATFORM_LIBS ${CMAKE_DL_LIBS}) - -target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (local_date_time_comparison PRIVATE common) -target_link_libraries (realloc-perf PRIVATE common) -add_check(local_date_time_comparison) - -if(USE_GTEST) - add_executable(unit_tests_libcommon gtest_json_test.cpp gtest_strong_typedef.cpp gtest_find_symbols.cpp) - target_link_libraries(unit_tests_libcommon PRIVATE common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) - add_check(unit_tests_libcommon) -endif() - add_executable (dump_variable dump_variable.cpp) target_link_libraries (dump_variable PRIVATE clickhouse_common_io) diff --git a/base/common/tests/allocator.cpp b/base/common/tests/allocator.cpp deleted file mode 100644 index 03f6228e0f5..00000000000 --- a/base/common/tests/allocator.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include -#include - - -void thread_func() -{ - for (size_t i = 0; i < 100; ++i) - { - size_t size = 4096; - - void * buf = malloc(size); - if (!buf) - abort(); - memset(buf, 0, size); - - while (size < 1048576) - { - size_t next_size = size * 4; - - void * new_buf = realloc(buf, next_size); - if (!new_buf) - abort(); - buf = new_buf; - - memset(reinterpret_cast(buf) + size, 0, next_size - size); - size = next_size; - } - - free(buf); - } -} - - -int main(int, char **) -{ - std::vector threads(16); - for (size_t i = 0; i < 1000; ++i) - { - for (auto & thread : threads) - thread = std::thread(thread_func); - for (auto & thread : threads) - thread.join(); - } - return 0; -} diff --git a/base/common/tests/date_lut2.cpp b/base/common/tests/date_lut2.cpp deleted file mode 100644 index 6dcf5e8adf2..00000000000 --- 
a/base/common/tests/date_lut2.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include - -#include - - -static std::string toString(time_t Value) -{ - struct tm tm; - char buf[96]; - - localtime_r(&Value, &tm); - snprintf(buf, sizeof(buf), "%04d-%02d-%02d %02d:%02d:%02d", - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); - - return buf; -} - -static time_t orderedIdentifierToDate(unsigned value) -{ - struct tm tm; - - memset(&tm, 0, sizeof(tm)); - - tm.tm_year = value / 10000 - 1900; - tm.tm_mon = (value % 10000) / 100 - 1; - tm.tm_mday = value % 100; - tm.tm_isdst = -1; - - return mktime(&tm); -} - - -void loop(time_t begin, time_t end, int step) -{ - const auto & date_lut = DateLUT::instance(); - - for (time_t t = begin; t < end; t += step) - std::cout << toString(t) - << ", " << toString(date_lut.toTime(t)) - << ", " << date_lut.toHour(t) - << std::endl; -} - - -int main(int, char **) -{ - loop(orderedIdentifierToDate(20101031), orderedIdentifierToDate(20101101), 15 * 60); - loop(orderedIdentifierToDate(20100328), orderedIdentifierToDate(20100330), 15 * 60); - loop(orderedIdentifierToDate(20141020), orderedIdentifierToDate(20141106), 15 * 60); - - return 0; -} diff --git a/base/common/tests/date_lut3.cpp b/base/common/tests/date_lut3.cpp deleted file mode 100644 index 411765d2b2a..00000000000 --- a/base/common/tests/date_lut3.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include - -#include - -#include - - -static std::string toString(time_t Value) -{ - struct tm tm; - char buf[96]; - - localtime_r(&Value, &tm); - snprintf(buf, sizeof(buf), "%04d-%02d-%02d %02d:%02d:%02d", - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); - - return buf; -} - -static time_t orderedIdentifierToDate(unsigned value) -{ - struct tm tm; - - memset(&tm, 0, sizeof(tm)); - - tm.tm_year = value / 10000 - 1900; - tm.tm_mon = (value % 10000) / 100 - 1; - tm.tm_mday = value % 100; - tm.tm_isdst = -1; - - return mktime(&tm); -} - - -void loop(time_t begin, time_t end, int step) -{ - const auto & date_lut = DateLUT::instance(); - - for (time_t t = begin; t < end; t += step) - { - time_t t2 = date_lut.makeDateTime(date_lut.toYear(t), date_lut.toMonth(t), date_lut.toDayOfMonth(t), - date_lut.toHour(t), date_lut.toMinute(t), date_lut.toSecond(t)); - - std::string s1 = toString(t); - std::string s2 = toString(t2); - - std::cerr << s1 << ", " << s2 << std::endl; - - if (s1 != s2) - throw Poco::Exception("Test failed."); - } -} - - -int main(int, char **) -{ - loop(orderedIdentifierToDate(20101031), orderedIdentifierToDate(20101101), 15 * 60); - loop(orderedIdentifierToDate(20100328), orderedIdentifierToDate(20100330), 15 * 60); - - return 0; -} diff --git a/base/common/tests/date_lut_default_timezone.cpp b/base/common/tests/date_lut_default_timezone.cpp deleted file mode 100644 index b8e5aa08931..00000000000 --- a/base/common/tests/date_lut_default_timezone.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include -#include - -int main(int, char **) -{ - try - { - const auto & date_lut = DateLUT::instance(); - std::cout << "Detected default timezone: `" << date_lut.getTimeZone() << "'" << std::endl; - time_t now = time(nullptr); - std::cout << "Current time: " << date_lut.timeToString(now) - << ", UTC: " << DateLUT::instance("UTC").timeToString(now) << std::endl; - } - catch (const Poco::Exception & e) - { - std::cerr << e.displayText() << std::endl; - return 1; - } - catch (std::exception & e) - { - std::cerr << "std::exception: " << e.what() << std::endl; - 
return 2; - } - catch (...) - { - std::cerr << "Some exception" << std::endl; - return 3; - } - return 0; -} diff --git a/base/common/tests/gtest_json_test.cpp b/base/common/tests/gtest_json_test.cpp deleted file mode 100644 index 189a1a03d99..00000000000 --- a/base/common/tests/gtest_json_test.cpp +++ /dev/null @@ -1,656 +0,0 @@ -#include -#include -#include -#include - -#include - -using namespace std::literals::string_literals; - -#include - -enum class ResultType -{ - Return, - Throw -}; - -struct GetStringTestRecord -{ - const char * input; - ResultType result_type; - const char * result; -}; - -TEST(JSONSuite, SimpleTest) -{ - std::vector test_data = - { - { R"("name")", ResultType::Return, "name" }, - { R"("Вафельница Vitek WX-1102 FL")", ResultType::Return, "Вафельница Vitek WX-1102 FL" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("184509")", ResultType::Return, "184509" }, - { R"("category")", ResultType::Return, "category" }, - { R"("Все для детей/Детская техника/Vitek")", ResultType::Return, "Все для детей/Детская техника/Vitek" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("В наличии")", ResultType::Return, "В наличии" }, - { R"("price")", ResultType::Return, "price" }, - { R"("2390.00")", ResultType::Return, "2390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("Карточка")", ResultType::Return, "Карточка" }, - { R"("position")", ResultType::Return, "position" }, - { R"("detail")", ResultType::Return, "detail" }, - { R"("actionField")", ResultType::Return, "actionField" }, - { R"("list")", ResultType::Return, "list" }, - { R"("http://www.techport.ru/q/?t=вафельница&sort=price&sdim=asc")", ResultType::Return, "http://www.techport.ru/q/?t=вафельница&sort=price&sdim=asc" }, - { R"("action")", ResultType::Return, "action" }, - { R"("detail")", ResultType::Return, "detail" }, - { R"("products")", ResultType::Return, "products" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Вафельница Vitek WX-1102 FL")", ResultType::Return, "Вафельница Vitek WX-1102 FL" }, - { R"("id")", ResultType::Return, "id" }, - { R"("184509")", ResultType::Return, "184509" }, - { R"("price")", ResultType::Return, "price" }, - { R"("2390.00")", ResultType::Return, "2390.00" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("Vitek")", ResultType::Return, "Vitek" }, - { R"("category")", ResultType::Return, "category" }, - { R"("Все для детей/Детская техника/Vitek")", ResultType::Return, "Все для детей/Детская техника/Vitek" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("В наличии")", ResultType::Return, "В наличии" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("isAuthorized")", ResultType::Return, "isAuthorized" }, - { R"("isSubscriber")", ResultType::Return, "isSubscriber" }, - { R"("postType")", ResultType::Return, "postType" }, - { R"("Новости")", ResultType::Return, "Новости" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("lang")", 
ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("Электроплита GEFEST Брест ЭПНД 5140-01 0001")", ResultType::Return, "Электроплита GEFEST Брест ЭПНД 5140-01 0001" }, - { R"("price")", ResultType::Return, "price" }, - { R"("currencyCode")", ResultType::Return, "currencyCode" }, - { R"("RUB")", ResultType::Return, "RUB" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("trash_login")", ResultType::Return, "trash_login" }, - { R"("novikoff")", ResultType::Return, "novikoff" }, - { R"("trash_cat_link")", ResultType::Return, "trash_cat_link" }, - { R"("progs")", ResultType::Return, "progs" }, - { R"("trash_parent_link")", ResultType::Return, "trash_parent_link" }, - { R"("content")", ResultType::Return, "content" }, - { R"("trash_posted_parent")", ResultType::Return, "trash_posted_parent" }, - { R"("content.01.2016")", ResultType::Return, "content.01.2016" }, - { R"("trash_posted_cat")", ResultType::Return, "trash_posted_cat" }, - { R"("progs.01.2016")", ResultType::Return, "progs.01.2016" }, - { R"("trash_virus_count")", ResultType::Return, "trash_virus_count" }, - { R"("trash_is_android")", ResultType::Return, "trash_is_android" }, - { R"("trash_is_wp8")", ResultType::Return, "trash_is_wp8" }, - { R"("trash_is_ios")", ResultType::Return, "trash_is_ios" }, - { R"("trash_posted")", ResultType::Return, "trash_posted" }, - { R"("01.2016")", ResultType::Return, "01.2016" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("merchantId")", ResultType::Return, "merchantId" }, - { R"("13694_49246")", ResultType::Return, "13694_49246" }, - { R"("cps-source")", ResultType::Return, "cps-source" }, - { R"("wargaming")", ResultType::Return, "wargaming" }, - { R"("cps_provider")", ResultType::Return, "cps_provider" }, - { R"("default")", ResultType::Return, "default" }, - { R"("errorReason")", ResultType::Return, "errorReason" }, - { R"("no errors")", ResultType::Return, "no errors" }, - { R"("scid")", ResultType::Return, "scid" }, - { R"("isAuthPayment")", ResultType::Return, "isAuthPayment" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("rubric")", ResultType::Return, "rubric" }, - { R"("")", ResultType::Return, "" }, - { R"("rubric")", ResultType::Return, "rubric" }, - { R"("Мир")", ResultType::Return, "Мир" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("__ym")", ResultType::Return, "__ym" }, - { R"("ecommerce")", ResultType::Return, "ecommerce" }, 
- { R"("impressions")", ResultType::Return, "impressions" }, - { R"("id")", ResultType::Return, "id" }, - { R"("863813")", ResultType::Return, "863813" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Happy, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Happy, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("863839")", ResultType::Return, "863839" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Pretty kitten, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Pretty kitten, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("863847")", ResultType::Return, "863847" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Little tiger, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Little tiger, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911480")", ResultType::Return, "911480" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Puppy, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Puppy, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, 
"/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911484")", ResultType::Return, "911484" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Little bears, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Little bears, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911489")", ResultType::Return, "911489" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Dolphin, возраст 2-4 года, трикотаж")", ResultType::Return, "Футболка детская 3D Dolphin, возраст 2-4 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911496")", ResultType::Return, "911496" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Pretty, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Pretty, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { 
R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911504")", ResultType::Return, "911504" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Fairytale, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Fairytale, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911508")", ResultType::Return, "911508" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Kittens, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Kittens, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911512")", ResultType::Return, "911512" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Sunshine, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Sunshine, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911516")", ResultType::Return, "911516" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Dog in bag, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Dog in bag, возраст 1-2 года, трикотаж" }, - { R"("category")", 
ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911520")", ResultType::Return, "911520" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Cute puppy, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Cute puppy, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911524")", ResultType::Return, "911524" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Rabbit, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Rabbit, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("911528")", ResultType::Return, "911528" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Футболка детская 3D Turtle, возраст 1-2 года, трикотаж")", ResultType::Return, "Футболка детская 3D Turtle, возраст 1-2 года, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("390.00")", ResultType::Return, "390.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, 
"position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("888616")", ResultType::Return, "888616" }, - { R"("name")", ResultType::Return, "name" }, - { "\"3Д Футболка мужская \\\"Collorista\\\" Светлое завтра р-р XL(52-54), 100% хлопок, трикотаж\"", ResultType::Return, "3Д Футболка мужская \"Collorista\" Светлое завтра р-р XL(52-54), 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Одежда и обувь/Мужская одежда/Футболки/")", ResultType::Return, "/Одежда и обувь/Мужская одежда/Футболки/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("406.60")", ResultType::Return, "406.60" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("913361")", ResultType::Return, "913361" }, - { R"("name")", ResultType::Return, "name" }, - { R"("3Д Футболка детская World р-р 8-10, 100% хлопок, трикотаж")", ResultType::Return, "3Д Футболка детская World р-р 8-10, 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("470.00")", ResultType::Return, "470.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("913364")", ResultType::Return, "913364" }, - { R"("name")", ResultType::Return, "name" }, - { R"("3Д Футболка детская Force р-р 8-10, 100% хлопок, трикотаж")", ResultType::Return, "3Д Футболка детская Force р-р 8-10, 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("470.00")", ResultType::Return, "470.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("913367")", ResultType::Return, "913367" }, - { R"("name")", ResultType::Return, "name" }, - { R"("3Д Футболка 
детская Winter tale р-р 8-10, 100% хлопок, трикотаж")", ResultType::Return, "3Д Футболка детская Winter tale р-р 8-10, 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("470.00")", ResultType::Return, "470.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("913385")", ResultType::Return, "913385" }, - { R"("name")", ResultType::Return, "name" }, - { R"("3Д Футболка детская Moonshine р-р 8-10, 100% хлопок, трикотаж")", ResultType::Return, "3Д Футболка детская Moonshine р-р 8-10, 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("470.00")", ResultType::Return, "470.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("id")", ResultType::Return, "id" }, - { R"("913391")", ResultType::Return, "913391" }, - { R"("name")", ResultType::Return, "name" }, - { R"("3Д Футболка детская Shaman р-р 8-10, 100% хлопок, трикотаж")", ResultType::Return, "3Д Футболка детская Shaman р-р 8-10, 100% хлопок, трикотаж" }, - { R"("category")", ResultType::Return, "category" }, - { R"("/Летние товары/Летний текстиль/")", ResultType::Return, "/Летние товары/Летний текстиль/" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("")", ResultType::Return, "" }, - { R"("price")", ResultType::Return, "price" }, - { R"("470.00")", ResultType::Return, "470.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("/retailrocket/")", ResultType::Return, "/retailrocket/" }, - { R"("position")", ResultType::Return, "position" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")", ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/" }, - { R"("usertype")", ResultType::Return, "usertype" }, - { R"("visitor")", ResultType::Return, "visitor" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("__ym")", ResultType::Return, "__ym" }, - { R"("ecommerce")", ResultType::Return, "ecommerce" }, - { R"("impressions")", ResultType::Return, "impressions" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", 
ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("experiments")", ResultType::Return, "experiments" }, - { R"("lang")", ResultType::Return, "lang" }, - { R"("ru")", ResultType::Return, "ru" }, - { R"("los_portal")", ResultType::Return, "los_portal" }, - { R"("los_level")", ResultType::Return, "los_level" }, - { R"("none")", ResultType::Return, "none" }, - { R"("__ym")", ResultType::Return, "__ym" }, - { R"("ecommerce")", ResultType::Return, "ecommerce" }, - { R"("currencyCode")", ResultType::Return, "currencyCode" }, - { R"("RUR")", ResultType::Return, "RUR" }, - { R"("impressions")", ResultType::Return, "impressions" }, - { R"("name")", ResultType::Return, "name" }, - { R"("Чайник электрический Mystery MEK-1627, белый")", ResultType::Return, "Чайник электрический Mystery MEK-1627, белый" }, - { R"("brand")", ResultType::Return, "brand" }, - { R"("Mystery")", ResultType::Return, "Mystery" }, - { R"("id")", ResultType::Return, "id" }, - { R"("187180")", ResultType::Return, "187180" }, - { R"("category")", ResultType::Return, "category" }, - { R"("Мелкая бытовая техника/Мелкие кухонные приборы/Чайники электрические/Mystery")", ResultType::Return, "Мелкая бытовая техника/Мелкие кухонные приборы/Чайники электрические/Mystery" }, - { R"("variant")", ResultType::Return, "variant" }, - { R"("В наличии")", ResultType::Return, "В наличии" }, - { R"("price")", ResultType::Return, "price" }, - { R"("1630.00")", ResultType::Return, "1630.00" }, - { R"("list")", ResultType::Return, "list" }, - { R"("Карточка")", ResultType::Return, "Карточка" }, - { R"("position")", ResultType::Return, "position" }, - { R"("detail")", ResultType::Return, "detail" }, - { R"("actionField")", ResultType::Return, "actionField" }, - { R"("list")", ResultType::Return, "list" }, - { "\0\"", ResultType::Throw, "JSON: expected \", got \0" }, - { "\"/igrushki/konstruktory\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/1290414/komplekt-zhenskiy-dzhemper-plusbryuki-m-254-09-malina-plustemno-siniy-\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Творчество/Рисование/Инструменты и кра\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобильных аккумуляторов/Пуско-зарядные устр\xD0\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." 
}, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройств\xD0\0t", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобиль\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\0t", ResultType::Throw, "JSON: expected \", got \0" }, - { "\"/Хозтовары/Хранение вещей и организа\xD1\0t", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Хозтовары/Товары для стир\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"li\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/734859/samolet-radioupravlyaemyy-istrebitel-rabotaet-o\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/kosmetika-i-parfyum/parfyumeriya/mu\0t", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/ko\0\x04", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "", ResultType::Throw, "JSON: begin >= end." }, - { "\"/stroitelstvo-i-remont/stroit\0t", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/av\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/s\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Строительство и ремонт/Строительный инструмент/Изм\0e", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/avto/soputstvuy\0l", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/str\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Отвертка 2 в 1 \\\"TUNDRA basic\\\" 5х75 мм (+,-) \0\xFF", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/avtoinstrumen\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Мелкая бытовая техника/Мелки\xD0\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Пряжа \\\"Бамбук стрейч\\0\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Карандаш чёрнографитны\xD0\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0l", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/1071547/karandash-chernografitnyy-volshebstvo-nv-kruglyy-d-7-2mm-dl-176mm-plast-tuba/\0e", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"ca\0e", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"ca\0e", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/1165424/chipbord-vyrubnoy-dlya-skrapbukinga-malyshi-mikki-maus-disney-bebi\0t", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." 
}, - { "\"/posuda/kuhonnye-prinadlezhnosti-i-i\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Канцтовары/Ежедневники и блокн\xD0\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/kanctovary/ezhednevniki-i-blok\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Стакан \xD0\0a", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Набор бумаги для скрапбукинга \\\"Мои первый годик\\\": Микки Маус, Дисней бэби, 12 листов 29.5 х 29.5 см, 160\0\x80", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"c\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Органайзер для хранения аксессуаров, \0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"quantity\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Сменный блок для тетрадей на кольцах А5, 160 листов клетка, офсет \xE2\x84\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Сувениры/Ф\xD0\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"\0\"", ResultType::Return, "\0" }, - { "\"\0\x04", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"va\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"ca\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"В \0\x04", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/letnie-tovary/z\0\x04", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Посудомоечная машина Ha\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Крупная бытов\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Полочная акустическая система Magnat Needl\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"brand\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"pos\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"c\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"var\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Телевизоры и видеотехника/Всё для домашних кинотеатр\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Флеш-диск Transcend JetFlash 620 8GB (TS8GJF62\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Табурет Мег\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"variant\0\x04", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." 
}, - { "\"Катал\xD0\0\"", ResultType::Return, "Катал\xD0\0" }, - { "\"К\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Полочная акустическая система Magnat Needl\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"brand\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"pos\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"c\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"17\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/igrushki/razvivayusc\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Ключница \\\"\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Игр\xD1\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Игрушки/Игрушки для девочек/Игровые модули дл\xD1\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Крупная бытовая техника/Стиральные машины/С фронт\xD0\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\0 ", ResultType::Throw, "JSON: expected \", got \0" }, - { "\"Светодиодная лента SMD3528, 5 м. IP33, 60LED, зеленый, 4,8W/мет\xD1\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Сантехника/Мебель для ванных комнат/Стол\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\0o", ResultType::Throw, "JSON: expected \", got \0" }, - { "\"/igrushki/konstruktory\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/posuda/kuhonnye-prinadlezhnosti-i-instrumenty/kuhonnye-pr\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/1290414/komplekt-zhenskiy-dzhemper-plusbryuki-m-254-09-malina-plustemno-siniy-\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Творчество/Рисование/Инструменты и кра\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобильных аккумуляторов/Пуско-зарядные устр\xD0\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройств\xD0\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобиль\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\0 ", ResultType::Throw, "JSON: expected \", got \0" }, - { "\"/Хозтовары/Хранение вещей и организа\xD1\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Хозтовары/Товары для стир\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." 
}, - { "\"li\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/igrushki/igrus\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/734859/samolet-radioupravlyaemyy-istrebitel-rabotaet-o\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/kosmetika-i-parfyum/parfyumeriya/mu\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/ko\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/avto/avtomobilnyy\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/stroitelstvo-i-remont/stroit\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/av\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/s\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Строительство и ремонт/Строительный инструмент/Изм\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/avto/soputstvuy\0\"", ResultType::Return, "/avto/soputstvuy\0" }, - { "\"/str\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Отвертка 2 в 1 \\\"TUNDRA basic\\\" 5х75 мм (+,-) \0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/avtoinstrumen\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Чайник электрический Vitesse\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Мелкая бытовая техника/Мелки\xD0\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Пряжа \\\"Бамбук стрейч\\0о", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Карандаш чёрнографитны\xD0\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0\"", ResultType::Return, "/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0" }, - { "\"/1071547/karandash-chernografitnyy-volshebstvo-nv-kruglyy-d-7-2mm-dl-176mm-plast-tuba/\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"ca\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Подаро\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Средство для прочис\xD1\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"i\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/p\0\"", ResultType::Return, "/p\0" }, - { "\"/Сувениры/Магниты, н\xD0\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Дерев\xD0\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." 
}, - { "\"/prazdniki/svadba/svadebnaya-c\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Канцт\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Праздники/То\xD0\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"v\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Косметика \xD0\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Спорт и отдых/Настольные игры/Покер, руле\xD1\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"categ\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/retailr\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/retailrocket\0k", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Ежедневник недат А5 140л кл,ляссе,обл пв\0=", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/432809/ezhednevnik-organayzer-sredniy-s-remeshkom-na-knopke-v-oblozhke-kalkulyator-kalendar-do-\0\xD0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/1165424/chipbord-vyrubnoy-dlya-skrapbukinga-malyshi-mikki-maus-disney-bebi\0d", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/posuda/kuhonnye-prinadlezhnosti-i-i\0 ", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/Канцтовары/Ежедневники и блокн\xD0\0o", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"/kanctovary/ezhednevniki-i-blok\00", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Стакан \xD0\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"Набор бумаги для скрапбукинга \\\"Мои первый годик\\\": Микки Маус, Дисней бэби, 12 листов 29.5 х 29.5 см, 160\0\0", ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)." }, - { "\"c\0\"", ResultType::Return, "c\0" }, - }; - - for (auto i : boost::irange(0, 1/*00000*/)) - { - static_cast(i); - - for (auto & r : test_data) - { - try - { - JSON j(r.input, r.input + strlen(r.input)); - - ASSERT_EQ(j.getString(), r.result); - ASSERT_TRUE(r.result_type == ResultType::Return); - } - catch (JSONException & e) - { - ASSERT_TRUE(r.result_type == ResultType::Throw); - ASSERT_EQ(e.message(), r.result); - } - } - } -} diff --git a/base/common/wide_integer_impl.h b/base/common/wide_integer_impl.h index a34e757eaa5..456c10a22e4 100644 --- a/base/common/wide_integer_impl.h +++ b/base/common/wide_integer_impl.h @@ -249,15 +249,15 @@ struct integer::_impl return; } - const T alpha = t / max_int; + const T alpha = t / static_cast(max_int); - if (alpha <= max_int) + if (alpha <= static_cast(max_int)) self = static_cast(alpha); else // max(double) / 2^64 will surely contain less than 52 precision bits, so speed up computations. 
set_multiplier(self, alpha); self *= max_int; - self += static_cast(t - alpha * max_int); // += b_i + self += static_cast(t - alpha * static_cast(max_int)); // += b_i } constexpr static void wide_integer_from_bultin(integer& self, double rhs) noexcept { @@ -271,11 +271,15 @@ struct integer::_impl /// As to_Integral does a static_cast to int64_t, it may result in UB. /// The necessary check here is that long double has enough significant (mantissa) bits to store the /// int64_t max value precisely. + + //TODO Be compatible with Apple aarch64 +#if not (defined(__APPLE__) && defined(__aarch64__)) static_assert(LDBL_MANT_DIG >= 64, "On your system long double has less than 64 precision bits," "which may result in UB when initializing double from int64_t"); +#endif - if ((rhs > 0 && rhs < max_int) || (rhs < 0 && rhs > min_int)) + if ((rhs > 0 && rhs < static_cast(max_int)) || (rhs < 0 && rhs > static_cast(min_int))) { self = static_cast(rhs); return; diff --git a/base/daemon/CMakeLists.txt b/base/daemon/CMakeLists.txt index 26d59a57e7f..6ef87db6a61 100644 --- a/base/daemon/CMakeLists.txt +++ b/base/daemon/CMakeLists.txt @@ -5,6 +5,11 @@ add_library (daemon ) target_include_directories (daemon PUBLIC ..) + +if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries (daemon PUBLIC -Wl,-undefined,dynamic_lookup) +endif() + target_link_libraries (daemon PUBLIC loggers PRIVATE clickhouse_common_io clickhouse_common_config common ${EXECINFO_LIBRARIES}) if (USE_SENTRY) diff --git a/base/daemon/SentryWriter.cpp b/base/daemon/SentryWriter.cpp index 29430b65983..1b7d0064b99 100644 --- a/base/daemon/SentryWriter.cpp +++ b/base/daemon/SentryWriter.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/base/ext/scope_guard_safe.h b/base/ext/scope_guard_safe.h new file mode 100644 index 00000000000..55140213572 --- /dev/null +++ b/base/ext/scope_guard_safe.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include + +/// Same as SCOPE_EXIT() but block the MEMORY_LIMIT_EXCEEDED errors. +/// +/// Typical example of SCOPE_EXIT_MEMORY() usage is when code under it may do +/// some tiny allocations, that may fail under high memory pressure or/and low +/// max_memory_usage (and related limits). +/// +/// NOTE: it should be used with caution. +#define SCOPE_EXIT_MEMORY(...) SCOPE_EXIT( \ + MemoryTracker::LockExceptionInThread \ + lock_memory_tracker(VariableContext::Global); \ + __VA_ARGS__; \ +) + +/// Same as SCOPE_EXIT() but try/catch/tryLogCurrentException any exceptions. +/// +/// SCOPE_EXIT_SAFE() should be used in case the exception during the code +/// under SCOPE_EXIT() is not "that fatal" and error message in log is enough. +/// +/// Good example is calling CurrentThread::detachQueryIfNotDetached(). +/// +/// Anti-pattern is calling WriteBuffer::finalize() under SCOPE_EXIT_SAFE() +/// (since finalize() can do final write and it is better to fail abnormally +/// instead of ignoring write error). +/// +/// NOTE: it should be used with double caution. +#define SCOPE_EXIT_SAFE(...) SCOPE_EXIT( \ + try \ + { \ + __VA_ARGS__; \ + } \ + catch (...) \ + { \ + tryLogCurrentException(__PRETTY_FUNCTION__); \ + } \ +) + +/// Same as SCOPE_EXIT() but: +/// - block the MEMORY_LIMIT_EXCEEDED errors, +/// - try/catch/tryLogCurrentException any exceptions. +/// +/// SCOPE_EXIT_MEMORY_SAFE() can be used when the error can be ignored, and in +/// addition to SCOPE_EXIT_SAFE() it will also lock MEMORY_LIMIT_EXCEEDED to +/// avoid such exceptions. 
+/// +/// It exists as a separate helper, since you do not always need to lock +/// MEMORY_LIMIT_EXCEEDED (there are cases when code under SCOPE_EXIT does +/// not do any allocations, while LockExceptionInThread increments an atomic +/// variable). +/// +/// NOTE: it should be used with triple caution. +#define SCOPE_EXIT_MEMORY_SAFE(...) SCOPE_EXIT( \ + try \ + { \ + MemoryTracker::LockExceptionInThread \ + lock_memory_tracker(VariableContext::Global); \ + __VA_ARGS__; \ + } \ + catch (...) \ + { \ + tryLogCurrentException(__PRETTY_FUNCTION__); \ + } \ +) diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index 684c6162941..e785e2ab2ce 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -1,5 +1,8 @@ if (GLIBC_COMPATIBILITY) - set (ENABLE_FASTMEMCPY ON) + add_subdirectory(memcpy) + if(TARGET memcpy) + set(MEMCPY_LIBRARY memcpy) + endif() enable_language(ASM) include(CheckIncludeFile) @@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY) list(APPEND glibc_compatibility_sources musl/getentropy.c) endif() - if (NOT ARCH_ARM) - # clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951 - add_library (clickhouse_memcpy OBJECT - ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c - ) - endif() - # Need to omit frame pointers to match the performance of glibc set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer") @@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY) target_compile_options(glibc-compatibility PRIVATE -fPIC) endif () - target_link_libraries(global-libs INTERFACE glibc-compatibility) + target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY}) install( - TARGETS glibc-compatibility + TARGETS glibc-compatibility ${MEMCPY_LIBRARY} EXPORT global ARCHIVE DESTINATION lib ) message (STATUS "Some symbols from glibc will be replaced for compatibility") + elseif (YANDEX_OFFICIAL_BUILD) message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.") endif () diff --git a/base/glibc-compatibility/memcpy/CMakeLists.txt b/base/glibc-compatibility/memcpy/CMakeLists.txt new file mode 100644 index 00000000000..133995d9b96 --- /dev/null +++ b/base/glibc-compatibility/memcpy/CMakeLists.txt @@ -0,0 +1,8 @@ +if (ARCH_AMD64) + add_library(memcpy STATIC memcpy.cpp) + + # We allow including memcpy.h from user code for better inlining. + target_include_directories(memcpy PUBLIC $) + + target_compile_options(memcpy PRIVATE -fno-builtin-memcpy) +endif () diff --git a/base/glibc-compatibility/memcpy/memcpy.cpp b/base/glibc-compatibility/memcpy/memcpy.cpp new file mode 100644 index 00000000000..ec43a2c3649 --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.cpp @@ -0,0 +1,6 @@ +#include "memcpy.h" + +extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return inline_memcpy(dst, src, size); +} diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h new file mode 100644 index 00000000000..211d144cecb --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -0,0 +1,217 @@ +#include + +#include + + +/** Custom memcpy implementation for ClickHouse. + * It has the following benefits over using glibc's implementation: + * 1. Avoiding a dependency on a specific version of a glibc symbol, like memcpy@@GLIBC_2.14, for portability. + * 2. Avoiding an indirect call via the PLT due to shared linking, which can be less efficient. + * 3.
It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis. + * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% on average across all queries. + * + * Writing our own memcpy is extremely difficult for the following reasons: + * 1. The optimal variant depends on the specific CPU model. + * 2. The optimal variant depends on the distribution of size arguments. + * 3. It depends on the number of threads copying data concurrently. + * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other. + * Due to the vast range of scenarios, proper testing is especially difficult. + * When writing our own memcpy there is a risk of overoptimizing it + * on non-representative microbenchmarks while making real-world use cases actually worse. + * + * Most of the benchmarks for memcpy on the internet are wrong. + * + * Let's look at the details: + * + * For small sizes, the order of branches in the code is important. + * There are variants with a specific order of branches (like here or in glibc) + * or with a jump table (in asm code, see the example from Cosmopolitan libc: + * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61) + * or with a Duff device in C (see https://github.com/skywind3000/FastMemcpy/) + * + * It's also important how to copy uneven sizes. + * Almost every implementation, including this one, uses two overlapping movs. + * + * It is important to disable -ftree-loop-distribute-patterns when compiling the memcpy implementation, + * otherwise the compiler can replace internal loops with a call to memcpy, which will lead to infinite recursion. + * + * For larger sizes it's important to choose the instructions used: + * - SSE or AVX or AVX-512; + * - rep movsb; + * Performance will depend on the size threshold, on the CPU model, on the "erms" flag + * ("Enhanced Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes) + * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy + * + * Using AVX-512 can be bad due to throttling. + * Using AVX can be bad if most code is using SSE due to switching penalty + * (it also depends on the usage of "vzeroupper" instruction). + * But in some cases AVX gives a win. + * + * It also depends on how many times the loop will be unrolled. + * We are unrolling the loop 8 times (by the number of available registers), but it is not always the best. + * + * It also depends on the usage of aligned or unaligned loads/stores. + * We are using unaligned loads and aligned stores. + * + * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD. + * Setting up the correct offset for prefetching is non-obvious. + * + * Non-temporal (cache-bypassing) stores can be used for very large sizes (more than half of the L3 cache). + * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower, + * because the L3 cache is shared (and the L2 cache is partially shared). + * + * A very large memcpy size typically indicates suboptimal (not cache-friendly) algorithms in the code or unrealistic scenarios, + * so we don't pay attention to using non-temporal stores.
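To make the "two overlapping movs" idea above concrete, here is a minimal, self-contained sketch; it is not part of this diff, the function name and the 8..16 byte range are chosen purely for illustration, and __builtin_memcpy is the GCC/Clang builtin that the implementation below also relies on:

#include <cstddef>

/// Illustrative only: copy n bytes, where 8 <= n <= 16, using two overlapping 8-byte moves.
/// The bytes in the middle may be written twice, which is harmless and avoids a per-byte loop.
static void copy_8_to_16(char * __restrict dst, const char * __restrict src, size_t n)
{
    __builtin_memcpy(dst, src, 8);                  /// head: bytes [0, 8)
    __builtin_memcpy(dst + n - 8, src + n - 8, 8);  /// tail: bytes [n - 8, n), overlaps the head
}

The small-size branches of inline_memcpy further down use the same trick, just with the tail copied before the head.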
+ * + * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial, + * even compared to non-temporal aligned unrolled stores with the widest registers. + * + * memcpy can be written in asm, C or C++. The latter can also use inline asm. + * The asm implementation can be better for making sure that the compiler won't make the code worse, + * to ensure the order of branches, the code layout, the usage of all required registers. + * But if it is located in a separate translation unit, inlining will not be possible + * (inline asm can be used to overcome this limitation). + * Sometimes C or C++ code can be further optimized by the compiler. + * For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used. + * + * Please note that the compiler can replace plain code with memcpy and vice versa. + * - memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy; + * it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy. + * This is often used to implement unaligned load/store without undefined behaviour in C++. + * - a loop that copies bytes can be recognized and replaced by a call to memcpy; + * it is controlled by -ftree-loop-distribute-patterns. + * - also note that a loop that copies bytes can be unrolled, peeled and vectorized, which will give you + * inline code somewhat similar to a decent implementation of memcpy. + * + * This description is up to date as of Mar 2021. + * + * How to test the memcpy implementation for performance: + * 1. Test on real production workload. + * 2. For a synthetic test, see utils/memcpy-bench, but make sure you do your best to exhaust the wide range of scenarios. + * + * TODO: Add self-tuning memcpy with a Bayesian bandits algorithm for large sizes. + * See https://habr.com/en/company/yandex/blog/457612/ + */ + + +static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + /// We will use pointer arithmetic, so char pointers will be used. + /// Note that __restrict makes sense (otherwise the compiler will reload data from memory + /// instead of using the values in registers due to possible aliasing). + char * __restrict dst = reinterpret_cast<char * __restrict>(dst_); + const char * __restrict src = reinterpret_cast<const char * __restrict>(src_); + + /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it. + /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly + /// for inlining and removing this single instruction. + void * ret = dst; + +tail: + /// Small sizes and tails after the loop for large sizes. + /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. + /// This order of branches is from the disassembly of glibc's code. + /// We copy chunks of possibly uneven size with two overlapping movs. + /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. + if (size <= 16) + { + if (size >= 8) + { + /// Chunks of 8..16 bytes. + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if (size >= 4) + { + /// Chunks of 4..7 bytes. + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + /// Chunks of 2..3 bytes.
+ __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + /// A single byte. + *dst = *src; + } + /// No bytes remaining. + } + else + { + /// Medium and large sizes. + if (size <= 128) + { + /// Medium size, not enough for full loop unrolling. + + /// We will copy the last 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16))); + + /// Then we will copy every 16 bytes from the beginning in a loop. + /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes. + /// This is Ok, similar to the code for small sizes above. + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Large size with fully unrolled loop. + + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15; + + /// If not aligned - we will copy first 16 bytes with unaligned stores. + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. We will use half of available SSE registers. + /// It's not possible to have both src and dst aligned. + /// So, we will use aligned stores and unaligned loads. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + + /// The latest remaining 0..127 bytes will be processed as usual.
+ goto tail; + } + } + + return ret; +} + diff --git a/base/mysqlxx/Connection.cpp b/base/mysqlxx/Connection.cpp index 8a15115cb06..2dbbc0c73f3 100644 --- a/base/mysqlxx/Connection.cpp +++ b/base/mysqlxx/Connection.cpp @@ -51,10 +51,11 @@ Connection::Connection( const char* ssl_key, unsigned timeout, unsigned rw_timeout, - bool enable_local_infile) + bool enable_local_infile, + bool opt_reconnect) : Connection() { - connect(db, server, user, password, port, socket, ssl_ca, ssl_cert, ssl_key, timeout, rw_timeout, enable_local_infile); + connect(db, server, user, password, port, socket, ssl_ca, ssl_cert, ssl_key, timeout, rw_timeout, enable_local_infile, opt_reconnect); } Connection::Connection(const std::string & config_name) @@ -80,7 +81,8 @@ void Connection::connect(const char* db, const char * ssl_key, unsigned timeout, unsigned rw_timeout, - bool enable_local_infile) + bool enable_local_infile, + bool opt_reconnect) { if (is_connected) disconnect(); @@ -104,9 +106,8 @@ void Connection::connect(const char* db, if (mysql_options(driver.get(), MYSQL_OPT_LOCAL_INFILE, &enable_local_infile_arg)) throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get())); - /// Enables auto-reconnect. - bool reconnect = true; - if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast(&reconnect))) + /// See C API Developer Guide: Automatic Reconnection Control + if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast(&opt_reconnect))) throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get())); /// Specifies particular ssl key and certificate if it needs diff --git a/base/mysqlxx/Connection.h b/base/mysqlxx/Connection.h index ca67db0e0c6..65955136eb1 100644 --- a/base/mysqlxx/Connection.h +++ b/base/mysqlxx/Connection.h @@ -14,6 +14,8 @@ /// Disable LOAD DATA LOCAL INFILE because it is insecure #define MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE false +/// See https://dev.mysql.com/doc/c-api/5.7/en/c-api-auto-reconnect.html +#define MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT true namespace mysqlxx @@ -76,7 +78,8 @@ public: const char * ssl_key = "", unsigned timeout = MYSQLXX_DEFAULT_TIMEOUT, unsigned rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT, - bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE); + bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE, + bool opt_reconnect = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT); /// Creates connection. Can be used if Poco::Util::Application is using. /// All settings will be got from config_name section of configuration. @@ -96,7 +99,8 @@ public: const char* ssl_key, unsigned timeout = MYSQLXX_DEFAULT_TIMEOUT, unsigned rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT, - bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE); + bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE, + bool opt_reconnect = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT); void connect(const std::string & config_name) { @@ -112,6 +116,7 @@ public: std::string ssl_cert = cfg.getString(config_name + ".ssl_cert", ""); std::string ssl_key = cfg.getString(config_name + ".ssl_key", ""); bool enable_local_infile = cfg.getBool(config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE); + bool opt_reconnect = cfg.getBool(config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT); unsigned timeout = cfg.getInt(config_name + ".connect_timeout", @@ -135,7 +140,8 @@ public: ssl_key.c_str(), timeout, rw_timeout, - enable_local_infile); + enable_local_infile, + opt_reconnect); } /// If MySQL connection was established. 
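Side note on the memcpy part of this diff (illustration only, not part of the patch): the "two overlapping movs" trick described in the inline_memcpy comment above can be shown with a few lines of standalone C++. The helper name copy_8_to_16 and the small test driver are hypothetical, and the sketch assumes a GCC/Clang toolchain where __builtin_memcpy is available.

#include <cstddef>
#include <cstdio>
#include <cstring>

/// Copy a chunk of 8..16 bytes with two possibly overlapping 8-byte moves,
/// as in the "size >= 8" branch above: the tail [size - 8, size) is written
/// first, then the head [0, 8). For size == 16 the moves do not overlap;
/// for size == 8 they fully overlap.
static void copy_8_to_16(char * dst, const char * src, size_t size)
{
    __builtin_memcpy(dst + size - 8, src + size - 8, 8);
    __builtin_memcpy(dst, src, 8);
}

int main()
{
    const char src[] = "0123456789ABCDEF";   /// 16 bytes plus the terminating zero.
    char dst[17] = {};

    for (size_t size = 8; size <= 16; ++size)
    {
        std::memset(dst, 0, sizeof(dst));
        copy_8_to_16(dst, src, size);
        std::printf("size %zu: %.*s\n", size, static_cast<int>(size), dst);
    }
    return 0;
}

Writing the tail first and the head second means the 8..16 byte branch needs no further size dispatch: the two fixed-width copies simply overlap by 16 - size bytes.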
diff --git a/base/mysqlxx/Exception.h b/base/mysqlxx/Exception.h index eaeb3565af1..48cd0997b94 100644 --- a/base/mysqlxx/Exception.h +++ b/base/mysqlxx/Exception.h @@ -26,6 +26,15 @@ struct ConnectionFailed : public Exception }; +/// Connection to MySQL server was lost +struct ConnectionLost : public Exception +{ + ConnectionLost(const std::string & msg, int code = 0) : Exception(msg, code) {} + const char * name() const throw() override { return "mysqlxx::ConnectionLost"; } + const char * className() const throw() override { return "mysqlxx::ConnectionLost"; } +}; + + /// Erroneous query. struct BadQuery : public Exception { diff --git a/base/mysqlxx/Pool.cpp b/base/mysqlxx/Pool.cpp index 2cb3e62db84..386b4544b78 100644 --- a/base/mysqlxx/Pool.cpp +++ b/base/mysqlxx/Pool.cpp @@ -10,7 +10,6 @@ #include -#include #include @@ -41,7 +40,9 @@ void Pool::Entry::decrementRefCount() Pool::Pool(const Poco::Util::AbstractConfiguration & cfg, const std::string & config_name, unsigned default_connections_, unsigned max_connections_, const char * parent_config_name_) - : default_connections(default_connections_), max_connections(max_connections_) + : logger(Poco::Logger::get("mysqlxx::Pool")) + , default_connections(default_connections_) + , max_connections(max_connections_) { server = cfg.getString(config_name + ".host"); @@ -78,6 +79,9 @@ Pool::Pool(const Poco::Util::AbstractConfiguration & cfg, const std::string & co enable_local_infile = cfg.getBool(config_name + ".enable_local_infile", cfg.getBool(parent_config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE)); + + opt_reconnect = cfg.getBool(config_name + ".opt_reconnect", + cfg.getBool(parent_config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT)); } else { @@ -96,6 +100,8 @@ Pool::Pool(const Poco::Util::AbstractConfiguration & cfg, const std::string & co enable_local_infile = cfg.getBool( config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE); + + opt_reconnect = cfg.getBool(config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT); } connect_timeout = cfg.getInt(config_name + ".connect_timeout", @@ -125,20 +131,30 @@ Pool::Entry Pool::get() initialize(); for (;;) { + logger.trace("(%s): Iterating through existing MySQL connections", getDescription()); + for (auto & connection : connections) { if (connection->ref_count == 0) return Entry(connection, this); } + logger.trace("(%s): Trying to allocate a new connection.", getDescription()); if (connections.size() < static_cast(max_connections)) { Connection * conn = allocConnection(); if (conn) return Entry(conn, this); + + logger.trace("(%s): Unable to create a new connection: Allocation failed.", getDescription()); + } + else + { + logger.trace("(%s): Unable to create a new connection: Max number of connections has been reached.", getDescription()); } lock.unlock(); + logger.trace("(%s): Sleeping for %d seconds.", getDescription(), MYSQLXX_POOL_SLEEP_ON_CONNECT_FAIL); sleepForSeconds(MYSQLXX_POOL_SLEEP_ON_CONNECT_FAIL); lock.lock(); } @@ -158,12 +174,13 @@ Pool::Entry Pool::tryGet() /// Fixme: There is a race condition here b/c we do not synchronize with Pool::Entry's copy-assignment operator if (connection_ptr->ref_count == 0) { - Entry res(connection_ptr, this); - if (res.tryForceConnected()) /// Tries to reestablish connection as well - return res; + { + Entry res(connection_ptr, this); + if (res.tryForceConnected()) /// Tries to reestablish connection as well + return res; + } - auto & logger = 
Poco::Util::Application::instance().logger(); - logger.information("Idle connection to mysql server cannot be recovered, dropping it."); + logger.debug("(%s): Idle connection to MySQL server cannot be recovered, dropping it.", getDescription()); /// This one is disconnected, cannot be reestablished and so needs to be disposed of. connection_it = connections.erase(connection_it); @@ -186,6 +203,8 @@ Pool::Entry Pool::tryGet() void Pool::removeConnection(Connection* connection) { + logger.trace("(%s): Removing connection.", getDescription()); + std::lock_guard lock(mutex); if (connection) { @@ -210,8 +229,6 @@ void Pool::Entry::forceConnected() const if (data == nullptr) throw Poco::RuntimeException("Tried to access NULL database connection."); - Poco::Util::Application & app = Poco::Util::Application::instance(); - bool first = true; while (!tryForceConnected()) { @@ -220,7 +237,7 @@ void Pool::Entry::forceConnected() const else sleepForSeconds(MYSQLXX_POOL_SLEEP_ON_CONNECT_FAIL); - app.logger().information("MYSQL: Reconnecting to " + pool->description); + pool->logger.debug("Entry: Reconnecting to MySQL server %s", pool->description); data->conn.connect( pool->db.c_str(), pool->server.c_str(), @@ -233,7 +250,8 @@ void Pool::Entry::forceConnected() const pool->ssl_key.c_str(), pool->connect_timeout, pool->rw_timeout, - pool->enable_local_infile); + pool->enable_local_infile, + pool->opt_reconnect); } } @@ -242,18 +260,22 @@ bool Pool::Entry::tryForceConnected() const { auto * const mysql_driver = data->conn.getDriver(); const auto prev_connection_id = mysql_thread_id(mysql_driver); + + pool->logger.trace("Entry(connection %lu): sending PING to check if it is alive.", prev_connection_id); if (data->conn.ping()) /// Attempts to reestablish lost connection { const auto current_connection_id = mysql_thread_id(mysql_driver); if (prev_connection_id != current_connection_id) { - auto & logger = Poco::Util::Application::instance().logger(); - logger.information("Connection to mysql server has been reestablished. Connection id changed: %lu -> %lu", - prev_connection_id, current_connection_id); + pool->logger.debug("Entry(connection %lu): Reconnected to MySQL server. 
Connection id changed: %lu -> %lu", + current_connection_id, prev_connection_id, current_connection_id); } + + pool->logger.trace("Entry(connection %lu): PING ok.", current_connection_id); return true; } + pool->logger.trace("Entry(connection %lu): PING failed.", prev_connection_id); return false; } @@ -274,15 +296,13 @@ void Pool::initialize() Pool::Connection * Pool::allocConnection(bool dont_throw_if_failed_first_time) { - Poco::Util::Application & app = Poco::Util::Application::instance(); - - std::unique_ptr conn(new Connection); + std::unique_ptr conn_ptr{new Connection}; try { - app.logger().information("MYSQL: Connecting to " + description); + logger.debug("Connecting to %s", description); - conn->conn.connect( + conn_ptr->conn.connect( db.c_str(), server.c_str(), user.c_str(), @@ -294,29 +314,29 @@ Pool::Connection * Pool::allocConnection(bool dont_throw_if_failed_first_time) ssl_key.c_str(), connect_timeout, rw_timeout, - enable_local_infile); + enable_local_infile, + opt_reconnect); } catch (mysqlxx::ConnectionFailed & e) { + logger.error(e.what()); + if ((!was_successful && !dont_throw_if_failed_first_time) || e.errnum() == ER_ACCESS_DENIED_ERROR || e.errnum() == ER_DBACCESS_DENIED_ERROR || e.errnum() == ER_BAD_DB_ERROR) { - app.logger().error(e.what()); throw; } else { - app.logger().error(e.what()); return nullptr; } } + connections.push_back(conn_ptr.get()); was_successful = true; - auto * connection = conn.release(); - connections.push_back(connection); - return connection; + return conn_ptr.release(); } } diff --git a/base/mysqlxx/Pool.h b/base/mysqlxx/Pool.h index 83b00e0081a..530e2c78cf2 100644 --- a/base/mysqlxx/Pool.h +++ b/base/mysqlxx/Pool.h @@ -6,6 +6,8 @@ #include #include +#include + #include @@ -157,27 +159,29 @@ public: */ Pool(const std::string & db_, const std::string & server_, - const std::string & user_ = "", - const std::string & password_ = "", - unsigned port_ = 0, + const std::string & user_, + const std::string & password_, + unsigned port_, const std::string & socket_ = "", unsigned connect_timeout_ = MYSQLXX_DEFAULT_TIMEOUT, unsigned rw_timeout_ = MYSQLXX_DEFAULT_RW_TIMEOUT, unsigned default_connections_ = MYSQLXX_POOL_DEFAULT_START_CONNECTIONS, unsigned max_connections_ = MYSQLXX_POOL_DEFAULT_MAX_CONNECTIONS, - unsigned enable_local_infile_ = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE) - : default_connections(default_connections_), max_connections(max_connections_), - db(db_), server(server_), user(user_), password(password_), port(port_), socket(socket_), - connect_timeout(connect_timeout_), rw_timeout(rw_timeout_), enable_local_infile(enable_local_infile_) {} + unsigned enable_local_infile_ = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE, + bool opt_reconnect_ = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT) + : logger(Poco::Logger::get("mysqlxx::Pool")), default_connections(default_connections_), + max_connections(max_connections_), db(db_), server(server_), user(user_), password(password_), port(port_), socket(socket_), + connect_timeout(connect_timeout_), rw_timeout(rw_timeout_), enable_local_infile(enable_local_infile_), + opt_reconnect(opt_reconnect_) {} Pool(const Pool & other) - : default_connections{other.default_connections}, + : logger(other.logger), default_connections{other.default_connections}, max_connections{other.max_connections}, db{other.db}, server{other.server}, user{other.user}, password{other.password}, port{other.port}, socket{other.socket}, connect_timeout{other.connect_timeout}, rw_timeout{other.rw_timeout}, - 
enable_local_infile{other.enable_local_infile} + enable_local_infile{other.enable_local_infile}, opt_reconnect(other.opt_reconnect) {} Pool & operator=(const Pool &) = delete; @@ -201,6 +205,8 @@ public: void removeConnection(Connection * connection); protected: + Poco::Logger & logger; + /// Number of MySQL connections which are created at launch. unsigned default_connections; /// Maximum possible number of connections @@ -231,6 +237,7 @@ private: std::string ssl_cert; std::string ssl_key; bool enable_local_infile; + bool opt_reconnect; /// True if connection was established at least once. bool was_successful{false}; diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index 5bee75aab1b..ea2d060e596 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -1,3 +1,7 @@ +#include +#include +#include +#include #include @@ -10,9 +14,12 @@ static bool startsWith(const std::string & s, const char * prefix) using namespace mysqlxx; -PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & config_, - const std::string & config_name_, const unsigned default_connections_, - const unsigned max_connections_, const size_t max_tries_) +PoolWithFailover::PoolWithFailover( + const Poco::Util::AbstractConfiguration & config_, + const std::string & config_name_, + const unsigned default_connections_, + const unsigned max_connections_, + const size_t max_tries_) : max_tries(max_tries_) { shareable = config_.getBool(config_name_ + ".share_connection", false); @@ -33,6 +40,19 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con std::make_shared(config_, replica_name, default_connections_, max_connections_, config_name_.c_str())); } } + + /// PoolWithFailover objects are stored in a cache inside PoolFactory. + /// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES} + /// which triggers massive re-constructing of connection pools. + /// The state of PRNGs like std::mt19937 is considered to be quite heavy + /// thus here we attempt to optimize its construction. + static thread_local std::mt19937 rnd_generator( + std::hash{}(std::this_thread::get_id()) + std::clock()); + for (auto & [_, replicas] : replicas_by_priority) + { + if (replicas.size() > 1) + std::shuffle(replicas.begin(), replicas.end(), rnd_generator); + } } else { @@ -41,16 +61,38 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con } } -PoolWithFailover::PoolWithFailover(const std::string & config_name_, const unsigned default_connections_, - const unsigned max_connections_, const size_t max_tries_) - : PoolWithFailover{ - Poco::Util::Application::instance().config(), config_name_, - default_connections_, max_connections_, max_tries_} + +PoolWithFailover::PoolWithFailover( + const std::string & config_name_, + const unsigned default_connections_, + const unsigned max_connections_, + const size_t max_tries_) + : PoolWithFailover{Poco::Util::Application::instance().config(), + config_name_, default_connections_, max_connections_, max_tries_} { } + +PoolWithFailover::PoolWithFailover( + const std::string & database, + const RemoteDescription & addresses, + const std::string & user, + const std::string & password, + size_t max_tries_) + : max_tries(max_tries_) + , shareable(false) +{ + /// Replicas have the same priority, but traversed replicas are moved to the end of the queue. 
+ for (const auto & [host, port] : addresses) + { + replicas_by_priority[0].emplace_back(std::make_shared(database, host, user, password, port)); + } +} + + PoolWithFailover::PoolWithFailover(const PoolWithFailover & other) - : max_tries{other.max_tries}, shareable{other.shareable} + : max_tries{other.max_tries} + , shareable{other.shareable} { if (shareable) { diff --git a/base/mysqlxx/PoolWithFailover.h b/base/mysqlxx/PoolWithFailover.h index 029fc3ebad3..5154fc3e253 100644 --- a/base/mysqlxx/PoolWithFailover.h +++ b/base/mysqlxx/PoolWithFailover.h @@ -11,6 +11,8 @@ namespace mysqlxx { /** MySQL connection pool with support of failover. + * + * For dictionary source: * Have information about replicas and their priorities. * Tries to connect to replica in an order of priority. When equal priority, choose replica with maximum time without connections. * @@ -68,42 +70,58 @@ namespace mysqlxx using PoolPtr = std::shared_ptr; using Replicas = std::vector; - /// [priority][index] -> replica. + /// [priority][index] -> replica. Highest priority is 0. using ReplicasByPriority = std::map; - ReplicasByPriority replicas_by_priority; /// Number of connection tries. size_t max_tries; /// Mutex for set of replicas. std::mutex mutex; - /// Can the Pool be shared bool shareable; public: using Entry = Pool::Entry; + using RemoteDescription = std::vector>; /** - * config_name Name of parameter in configuration file. + * * Mysql dictionary sourse related params: + * config_name Name of parameter in configuration file for dictionary source. + * + * * Mysql storage related parameters: + * replicas_description + * + * * Mutual parameters: * default_connections Number of connection in pool to each replica at start. * max_connections Maximum number of connections in pool to each replica. * max_tries_ Max number of connection tries. */ - PoolWithFailover(const std::string & config_name_, + PoolWithFailover( + const std::string & config_name_, unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS, size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES); - PoolWithFailover(const Poco::Util::AbstractConfiguration & config_, + PoolWithFailover( + const Poco::Util::AbstractConfiguration & config_, const std::string & config_name_, unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS, size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES); + PoolWithFailover( + const std::string & database, + const RemoteDescription & addresses, + const std::string & user, + const std::string & password, + size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES); + PoolWithFailover(const PoolWithFailover & other); /** Allocates a connection to use. 
*/ Entry get(); }; + + using PoolWithFailoverPtr = std::shared_ptr; } diff --git a/base/mysqlxx/Query.cpp b/base/mysqlxx/Query.cpp index f3485c54edc..d4514c3e988 100644 --- a/base/mysqlxx/Query.cpp +++ b/base/mysqlxx/Query.cpp @@ -1,11 +1,16 @@ #if __has_include() +#include #include #else +#include #include #endif +#include + #include #include +#include namespace mysqlxx @@ -57,8 +62,24 @@ void Query::reset() void Query::executeImpl() { std::string query_string = query_buf.str(); - if (mysql_real_query(conn->getDriver(), query_string.data(), query_string.size())) - throw BadQuery(errorMessage(conn->getDriver()), mysql_errno(conn->getDriver())); + + MYSQL* mysql_driver = conn->getDriver(); + + auto & logger = Poco::Logger::get("mysqlxx::Query"); + logger.trace("Running MySQL query using connection %lu", mysql_thread_id(mysql_driver)); + if (mysql_real_query(mysql_driver, query_string.data(), query_string.size())) + { + const auto err_no = mysql_errno(mysql_driver); + switch (err_no) + { + case CR_SERVER_GONE_ERROR: + [[fallthrough]]; + case CR_SERVER_LOST: + throw ConnectionLost(errorMessage(mysql_driver), err_no); + default: + throw BadQuery(errorMessage(mysql_driver), err_no); + } + } } UseQueryResult Query::use() diff --git a/base/mysqlxx/tests/CMakeLists.txt b/base/mysqlxx/tests/CMakeLists.txt index 2cf19d78418..6473a927308 100644 --- a/base/mysqlxx/tests/CMakeLists.txt +++ b/base/mysqlxx/tests/CMakeLists.txt @@ -1,5 +1,2 @@ -add_executable (mysqlxx_test mysqlxx_test.cpp) -target_link_libraries (mysqlxx_test PRIVATE mysqlxx) - add_executable (mysqlxx_pool_test mysqlxx_pool_test.cpp) target_link_libraries (mysqlxx_pool_test PRIVATE mysqlxx) diff --git a/base/mysqlxx/tests/failover.xml b/base/mysqlxx/tests/failover.xml deleted file mode 100644 index 73702eabb29..00000000000 --- a/base/mysqlxx/tests/failover.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - 3306 - root - Metrica - qwerty - - example02t - 0 - - - example02t - 3306 - root - qwerty - Metrica - 1 - - - diff --git a/base/mysqlxx/tests/mysqlxx_test.cpp b/base/mysqlxx/tests/mysqlxx_test.cpp deleted file mode 100644 index c505d34a58d..00000000000 --- a/base/mysqlxx/tests/mysqlxx_test.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include - - -int main(int, char **) -{ - try - { - mysqlxx::Connection connection("test", "127.0.0.1", "root", "qwerty", 3306); - std::cerr << "Connected." << std::endl; - - { - mysqlxx::Query query = connection.query(); - query << "SELECT 1 x, '2010-01-01 01:01:01' d"; - mysqlxx::UseQueryResult result = query.use(); - std::cerr << "use() called." << std::endl; - - while (mysqlxx::Row row = result.fetch()) - { - std::cerr << "Fetched row." 
<< std::endl; - std::cerr << row[0] << ", " << row["x"] << std::endl; - std::cerr << row[1] << ", " << row["d"] - << ", " << row[1].getDate() - << ", " << row[1].getDateTime() - << ", " << row[1].getDate() - << ", " << row[1].getDateTime() - << std::endl - << row[1].getDate() << ", " << row[1].getDateTime() << std::endl - << row[1].getDate() << ", " << row[1].getDateTime() << std::endl - << row[1].getDate() << ", " << row[1].getDateTime() << std::endl - << row[1].getDate() << ", " << row[1].getDateTime() << std::endl - ; - - time_t t1 = row[0]; - time_t t2 = row[1]; - std::cerr << t1 << ", " << LocalDateTime(t1) << std::endl; - std::cerr << t2 << ", " << LocalDateTime(t2) << std::endl; - } - } - - { - mysqlxx::UseQueryResult result = connection.query("SELECT 'abc\\\\def' x").use(); - mysqlxx::Row row = result.fetch(); - std::cerr << row << std::endl; - std::cerr << row << std::endl; - } - - { - /// Копирование Query - mysqlxx::Query query1 = connection.query("SELECT"); - mysqlxx::Query query2 = query1; - query2 << " 1"; - - std::cerr << query1.str() << ", " << query2.str() << std::endl; - } - - { - /// NULL - mysqlxx::Null x = mysqlxx::null; - std::cerr << (x == mysqlxx::null ? "Ok" : "Fail") << std::endl; - std::cerr << (x == 0 ? "Fail" : "Ok") << std::endl; - std::cerr << (x.isNull() ? "Ok" : "Fail") << std::endl; - x = 1; - std::cerr << (x == mysqlxx::null ? "Fail" : "Ok") << std::endl; - std::cerr << (x == 0 ? "Fail" : "Ok") << std::endl; - std::cerr << (x == 1 ? "Ok" : "Fail") << std::endl; - std::cerr << (x.isNull() ? "Fail" : "Ok") << std::endl; - } - } - catch (const mysqlxx::Exception & e) - { - std::cerr << e.code() << ", " << e.message() << std::endl; - throw; - } - - return 0; -} diff --git a/base/readpassphrase/CMakeLists.txt b/base/readpassphrase/CMakeLists.txt index 574130ce6e3..51b12106eca 100644 --- a/base/readpassphrase/CMakeLists.txt +++ b/base/readpassphrase/CMakeLists.txt @@ -4,5 +4,5 @@ add_library(readpassphrase readpassphrase.c) set_target_properties(readpassphrase PROPERTIES LINKER_LANGUAGE C) -target_compile_options(readpassphrase PRIVATE -Wno-unused-result -Wno-reserved-id-macro) +target_compile_options(readpassphrase PRIVATE -Wno-unused-result -Wno-reserved-id-macro -Wno-disabled-macro-expansion) target_include_directories(readpassphrase PUBLIC .) diff --git a/base/readpassphrase/readpassphrase.c b/base/readpassphrase/readpassphrase.c index 9e8097643bb..8a7d3153915 100644 --- a/base/readpassphrase/readpassphrase.c +++ b/base/readpassphrase/readpassphrase.c @@ -94,7 +94,7 @@ restart: if (input != STDIN_FILENO && tcgetattr(input, &oterm) == 0) { memcpy(&term, &oterm, sizeof(term)); if (!(flags & RPP_ECHO_ON)) - term.c_lflag &= ~(ECHO | ECHONL); + term.c_lflag &= ~((unsigned int) (ECHO | ECHONL)); #ifdef VSTATUS if (term.c_cc[VSTATUS] != _POSIX_VDISABLE) term.c_cc[VSTATUS] = _POSIX_VDISABLE; diff --git a/cmake/analysis.cmake b/cmake/analysis.cmake index 369be295746..267bb34248b 100644 --- a/cmake/analysis.cmake +++ b/cmake/analysis.cmake @@ -16,6 +16,10 @@ if (ENABLE_CLANG_TIDY) set (USE_CLANG_TIDY ON) + # clang-tidy requires assertions to guide the analysis + # Note that NDEBUG is set implicitly by CMake for non-debug builds + set(COMPILER_FLAGS "${COMPILER_FLAGS} -UNDEBUG") + # The variable CMAKE_CXX_CLANG_TIDY will be set inside src and base directories with non third-party code. 
# set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") elseif (FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9604ef62b31..60e0346dbbf 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -1,7 +1,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") set (ARCH_AMD64 1) endif () -if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") set (ARCH_AARCH64 1) endif () if (ARCH_AARCH64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm") diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index ce92ae203ea..51f4b974161 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -1,9 +1,9 @@ # This strings autochanged from release_lib.sh: -SET(VERSION_REVISION 54448) +SET(VERSION_REVISION 54451) SET(VERSION_MAJOR 21) -SET(VERSION_MINOR 3) +SET(VERSION_MINOR 6) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH ef72ba7349f230321750c13ee63b49a11a7c0adc) -SET(VERSION_DESCRIBE v21.3.1.1-prestable) -SET(VERSION_STRING 21.3.1.1) +SET(VERSION_GITHASH 96fced4c3cf432fb0b401d2ab01f0c56e5f74a96) +SET(VERSION_DESCRIBE v21.6.1.1-prestable) +SET(VERSION_STRING 21.6.1.1) # end of autochange diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake index 79ac675f234..a6ee800d59b 100644 --- a/cmake/darwin/default_libs.cmake +++ b/cmake/darwin/default_libs.cmake @@ -1,11 +1,14 @@ set (DEFAULT_LIBS "-nodefaultlibs") -if (NOT COMPILER_CLANG) - message (FATAL_ERROR "Darwin build is supported only for Clang") -endif () - set (DEFAULT_LIBS "${DEFAULT_LIBS} ${COVERAGE_OPTION} -lc -lm -lpthread -ldl") +if (COMPILER_GCC) + set (DEFAULT_LIBS "${DEFAULT_LIBS} -lgcc_eh") + if (ARCH_AARCH64) + set (DEFAULT_LIBS "${DEFAULT_LIBS} -lgcc") + endif () +endif () + message(STATUS "Default libraries: ${DEFAULT_LIBS}") set(CMAKE_CXX_STANDARD_LIBRARIES ${DEFAULT_LIBS}) diff --git a/cmake/darwin/toolchain-aarch64.cmake b/cmake/darwin/toolchain-aarch64.cmake new file mode 100644 index 00000000000..81398111495 --- /dev/null +++ b/cmake/darwin/toolchain-aarch64.cmake @@ -0,0 +1,14 @@ +set (CMAKE_SYSTEM_NAME "Darwin") +set (CMAKE_SYSTEM_PROCESSOR "aarch64") +set (CMAKE_C_COMPILER_TARGET "aarch64-apple-darwin") +set (CMAKE_CXX_COMPILER_TARGET "aarch64-apple-darwin") +set (CMAKE_ASM_COMPILER_TARGET "aarch64-apple-darwin") +set (CMAKE_OSX_SYSROOT "${CMAKE_CURRENT_LIST_DIR}/../toolchain/darwin-aarch64") + +set (CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # disable linkage check - it doesn't work in CMake + +set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) +set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) + +set (HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) +set (HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) diff --git a/cmake/find/amqpcpp.cmake b/cmake/find/amqpcpp.cmake index 4191dce26bb..e3eaaf33ddb 100644 --- a/cmake/find/amqpcpp.cmake +++ b/cmake/find/amqpcpp.cmake @@ -1,3 +1,8 @@ +if (OS_DARWIN AND COMPILER_GCC) + # AMQP-CPP requires libuv which cannot be built with GCC in macOS due to a bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93082 + set (ENABLE_AMQPCPP OFF CACHE INTERNAL "") +endif() + option(ENABLE_AMQPCPP "Enalbe AMQP-CPP" ${ENABLE_LIBRARIES}) if (NOT ENABLE_AMQPCPP) diff --git a/cmake/find/base64.cmake b/cmake/find/base64.cmake index 7427baf9cad..acade11eb2f 100644 --- a/cmake/find/base64.cmake +++ 
b/cmake/find/base64.cmake @@ -1,4 +1,8 @@ -option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES}) +if(ARCH_AMD64 OR ARCH_ARM) + option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES}) +elseif(ENABLE_BASE64) + message (${RECONFIGURE_MESSAGE_LEVEL} "base64 library is only supported on x86_64 and aarch64") +endif() if (NOT ENABLE_BASE64) return() diff --git a/cmake/find/cassandra.cmake b/cmake/find/cassandra.cmake index 037d6c3f131..ded25a5bf41 100644 --- a/cmake/find/cassandra.cmake +++ b/cmake/find/cassandra.cmake @@ -1,3 +1,8 @@ +if (OS_DARWIN AND COMPILER_GCC) + # Cassandra requires libuv which cannot be built with GCC in macOS due to a bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93082 + set (ENABLE_CASSANDRA OFF CACHE INTERNAL "") +endif() + option(ENABLE_CASSANDRA "Enable Cassandra" ${ENABLE_LIBRARIES}) if (NOT ENABLE_CASSANDRA) diff --git a/cmake/find/ccache.cmake b/cmake/find/ccache.cmake index d9ccd1a9ac6..986c9cb5fe2 100644 --- a/cmake/find/ccache.cmake +++ b/cmake/find/ccache.cmake @@ -32,7 +32,12 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) if (CCACHE_VERSION VERSION_GREATER "3.2.0" OR NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "Using ${CCACHE_FOUND} ${CCACHE_VERSION}") - # debian (debhlpers) set SOURCE_DATE_EPOCH environment variable, that is + set (CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_FOUND} ${CMAKE_CXX_COMPILER_LAUNCHER}) + set (CMAKE_C_COMPILER_LAUNCHER ${CCACHE_FOUND} ${CMAKE_C_COMPILER_LAUNCHER}) + + set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) + + # debian (debhelpers) set SOURCE_DATE_EPOCH environment variable, that is # filled from the debian/changelog or current time. # # - 4.0+ ccache always includes this environment variable into the hash @@ -48,9 +53,6 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache") set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}") set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK "env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}") - else() - set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) - set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) endif() else () message(${RECONFIGURE_MESSAGE_LEVEL} "Not using ${CCACHE_FOUND} ${CCACHE_VERSION} bug: https://bugzilla.samba.org/show_bug.cgi?id=8118") diff --git a/cmake/find/datasketches.cmake b/cmake/find/datasketches.cmake new file mode 100644 index 00000000000..44ef324a9f2 --- /dev/null +++ b/cmake/find/datasketches.cmake @@ -0,0 +1,29 @@ +option (ENABLE_DATASKETCHES "Enable DataSketches" ${ENABLE_LIBRARIES}) + +if (ENABLE_DATASKETCHES) + +option (USE_INTERNAL_DATASKETCHES_LIBRARY "Set to FALSE to use system DataSketches library instead of bundled" ${NOT_UNBUNDLED}) + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/theta/CMakeLists.txt") + if (USE_INTERNAL_DATASKETCHES_LIBRARY) + message(WARNING "submodule contrib/datasketches-cpp is missing. 
to fix try run: \n git submodule update --init --recursive") + endif() + set(MISSING_INTERNAL_DATASKETCHES_LIBRARY 1) + set(USE_INTERNAL_DATASKETCHES_LIBRARY 0) +endif() + +if (USE_INTERNAL_DATASKETCHES_LIBRARY) + set(DATASKETCHES_LIBRARY theta) + set(DATASKETCHES_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/common/include" "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/theta/include") +elseif (NOT MISSING_INTERNAL_DATASKETCHES_LIBRARY) + find_library(DATASKETCHES_LIBRARY theta) + find_path(DATASKETCHES_INCLUDE_DIR NAMES theta_sketch.hpp PATHS ${DATASKETCHES_INCLUDE_PATHS}) +endif() + +if (DATASKETCHES_LIBRARY AND DATASKETCHES_INCLUDE_DIR) + set(USE_DATASKETCHES 1) +endif() + +endif() + +message (STATUS "Using datasketches=${USE_DATASKETCHES}: ${DATASKETCHES_INCLUDE_DIR} : ${DATASKETCHES_LIBRARY}") diff --git a/cmake/find/fastops.cmake b/cmake/find/fastops.cmake index 5ab320bdb7a..1675646654e 100644 --- a/cmake/find/fastops.cmake +++ b/cmake/find/fastops.cmake @@ -1,7 +1,7 @@ -if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT OS_DARWIN) +if(ARCH_AMD64 AND NOT OS_FREEBSD AND NOT OS_DARWIN) option(ENABLE_FASTOPS "Enable fast vectorized mathematical functions library by Mikhail Parakhin" ${ENABLE_LIBRARIES}) elseif(ENABLE_FASTOPS) - message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is not supported on ARM, FreeBSD and Darwin") + message (${RECONFIGURE_MESSAGE_LEVEL} "Fastops library is supported on x86_64 only, and not FreeBSD or Darwin") endif() if(NOT ENABLE_FASTOPS) diff --git a/cmake/find/hdfs3.cmake b/cmake/find/hdfs3.cmake index 7b385f24e1e..3aab2b612ef 100644 --- a/cmake/find/hdfs3.cmake +++ b/cmake/find/hdfs3.cmake @@ -1,4 +1,4 @@ -if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF) +if(NOT ARCH_ARM AND NOT OS_FREEBSD AND NOT APPLE AND USE_PROTOBUF AND NOT ARCH_PPC64LE) option(ENABLE_HDFS "Enable HDFS" ${ENABLE_LIBRARIES}) elseif(ENABLE_HDFS OR USE_INTERNAL_HDFS3_LIBRARY) message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use HDFS3 with current configuration") diff --git a/cmake/find/krb5.cmake b/cmake/find/krb5.cmake index bd9c8e239cd..49b7462b710 100644 --- a/cmake/find/krb5.cmake +++ b/cmake/find/krb5.cmake @@ -5,8 +5,8 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/krb5/README") set (ENABLE_KRB5 0) endif () -if (NOT CMAKE_SYSTEM_NAME MATCHES "Linux") - message (WARNING "krb5 disabled in non-Linux environments") +if (NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND NOT CMAKE_CROSSCOMPILING)) + message (WARNING "krb5 disabled in non-Linux and non-native-Darwin environments") set (ENABLE_KRB5 0) endif () diff --git a/cmake/find/ldap.cmake b/cmake/find/ldap.cmake index 369c1e42e8d..d8baea89429 100644 --- a/cmake/find/ldap.cmake +++ b/cmake/find/ldap.cmake @@ -62,8 +62,10 @@ if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY) if ( ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "x86_64" ) OR ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "aarch64" ) OR + ( "${_system_name}" STREQUAL "linux" AND "${_system_processor}" STREQUAL "ppc64le" ) OR ( "${_system_name}" STREQUAL "freebsd" AND "${_system_processor}" STREQUAL "x86_64" ) OR - ( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" ) + ( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "x86_64" ) OR + ( "${_system_name}" STREQUAL "darwin" AND "${_system_processor}" STREQUAL "aarch64" ) ) set (_ldap_supported_platform TRUE) endif () diff --git 
a/cmake/find/nanodbc.cmake b/cmake/find/nanodbc.cmake new file mode 100644 index 00000000000..894a2a60bad --- /dev/null +++ b/cmake/find/nanodbc.cmake @@ -0,0 +1,16 @@ +if (NOT ENABLE_ODBC) + return () +endif () + +if (NOT USE_INTERNAL_NANODBC_LIBRARY) + message (FATAL_ERROR "Only the bundled nanodbc library can be used") +endif () + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/nanodbc/CMakeLists.txt") + message (FATAL_ERROR "submodule contrib/nanodbc is missing. to fix try run: \n git submodule update --init --recursive") +endif() + +set (NANODBC_LIBRARY nanodbc) +set (NANODBC_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/nanodbc/nanodbc") + +message (STATUS "Using nanodbc: ${NANODBC_INCLUDE_DIR} : ${NANODBC_LIBRARY}") diff --git a/cmake/find/nuraft.cmake b/cmake/find/nuraft.cmake index 7fa5251946e..4e5258e132f 100644 --- a/cmake/find/nuraft.cmake +++ b/cmake/find/nuraft.cmake @@ -11,7 +11,7 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/NuRaft/CMakeLists.txt") return() endif () -if (NOT OS_FREEBSD AND NOT OS_DARWIN) +if (NOT OS_FREEBSD) set (USE_NURAFT 1) set (NURAFT_LIBRARY nuraft) diff --git a/cmake/find/odbc.cmake b/cmake/find/odbc.cmake index a23f0c831e9..c475e600c0d 100644 --- a/cmake/find/odbc.cmake +++ b/cmake/find/odbc.cmake @@ -50,4 +50,6 @@ if (NOT EXTERNAL_ODBC_LIBRARY_FOUND) set (USE_INTERNAL_ODBC_LIBRARY 1) endif () +set (USE_INTERNAL_NANODBC_LIBRARY 1) + message (STATUS "Using unixodbc") diff --git a/cmake/find/rocksdb.cmake b/cmake/find/rocksdb.cmake index 968cdb52407..94278a603d7 100644 --- a/cmake/find/rocksdb.cmake +++ b/cmake/find/rocksdb.cmake @@ -1,3 +1,7 @@ +if (OS_DARWIN AND ARCH_AARCH64) + set (ENABLE_ROCKSDB OFF CACHE INTERNAL "") +endif() + option(ENABLE_ROCKSDB "Enable ROCKSDB" ${ENABLE_LIBRARIES}) if (NOT ENABLE_ROCKSDB) diff --git a/cmake/find/s3.cmake b/cmake/find/s3.cmake index 1bbf48fd6b0..1b0c652a31a 100644 --- a/cmake/find/s3.cmake +++ b/cmake/find/s3.cmake @@ -1,7 +1,7 @@ -if(NOT OS_FREEBSD AND NOT APPLE AND NOT ARCH_ARM) +if(NOT OS_FREEBSD AND NOT APPLE) option(ENABLE_S3 "Enable S3" ${ENABLE_LIBRARIES}) elseif(ENABLE_S3 OR USE_INTERNAL_AWS_S3_LIBRARY) - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on ARM, Apple or FreeBSD") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use S3 on Apple or FreeBSD") endif() if(NOT ENABLE_S3) diff --git a/cmake/find/xz.cmake b/cmake/find/xz.cmake new file mode 100644 index 00000000000..0d19859c6b1 --- /dev/null +++ b/cmake/find/xz.cmake @@ -0,0 +1,27 @@ +option (USE_INTERNAL_XZ_LIBRARY "Set to OFF to use system xz (lzma) library instead of bundled" ${NOT_UNBUNDLED}) + +if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/xz/src/liblzma/api/lzma.h") + if(USE_INTERNAL_XZ_LIBRARY) + message(WARNING "submodule contrib/xz is missing. 
to fix try run: \n git submodule update --init --recursive") + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal xz (lzma) library") + set(USE_INTERNAL_XZ_LIBRARY 0) + endif() + set(MISSING_INTERNAL_XZ_LIBRARY 1) +endif() + +if (NOT USE_INTERNAL_XZ_LIBRARY) + find_library (XZ_LIBRARY lzma) + find_path (XZ_INCLUDE_DIR NAMES lzma.h PATHS ${XZ_INCLUDE_PATHS}) + if (NOT XZ_LIBRARY OR NOT XZ_INCLUDE_DIR) + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system xz (lzma) library") + endif () +endif () + +if (XZ_LIBRARY AND XZ_INCLUDE_DIR) +elseif (NOT MISSING_INTERNAL_XZ_LIBRARY) + set (USE_INTERNAL_XZ_LIBRARY 1) + set (XZ_LIBRARY liblzma) + set (XZ_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/xz/src/liblzma/api) +endif () + +message (STATUS "Using xz (lzma): ${XZ_INCLUDE_DIR} : ${XZ_LIBRARY}") diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake index d3a727e9cb8..c1e4d450389 100644 --- a/cmake/linux/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -6,7 +6,7 @@ set (DEFAULT_LIBS "-nodefaultlibs") # We need builtins from Clang's RT even without libcxx - for ubsan+int128. # See https://bugs.llvm.org/show_bug.cgi?id=16404 if (COMPILER_CLANG AND NOT (CMAKE_CROSSCOMPILING AND ARCH_AARCH64)) - execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) else () set (BUILTINS_LIBRARY "-lgcc") endif () diff --git a/cmake/tools.cmake b/cmake/tools.cmake index cc4046d2469..44fc3b3e530 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -75,14 +75,14 @@ if (OS_LINUX AND NOT LINKER_NAME) endif () if (LINKER_NAME) - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") + if (COMPILER_CLANG AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 12.0.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 12.0.0)) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LINKER_NAME}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LINKER_NAME}") + else () + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") + endif () message(STATUS "Using custom linker by name: ${LINKER_NAME}") endif () -if (ARCH_PPC64LE) - if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)) - message(FATAL_ERROR "Only gcc-8 or higher is supported for powerpc architecture") - endif () -endif () diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index 8122e9ef31e..a85fe8963c7 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -11,11 +11,6 @@ if (NOT MSVC) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra") endif () -if (USE_DEBUG_HELPERS) - set (INCLUDE_DEBUG_HELPERS "-I${ClickHouse_SOURCE_DIR}/base -include ${ClickHouse_SOURCE_DIR}/src/Core/iostream_debug_helpers.h") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${INCLUDE_DEBUG_HELPERS}") -endif () - # Add some warnings that are not available even with -Wall -Wextra -Wpedantic. # Intended for exploration of new compiler warnings that may be found useful. 
# Applies to clang only @@ -176,6 +171,7 @@ elseif (COMPILER_GCC) add_cxx_compile_options(-Wtrampolines) # Obvious add_cxx_compile_options(-Wunused) + add_cxx_compile_options(-Wundef) # Warn if vector operation is not implemented via SIMD capabilities of the architecture add_cxx_compile_options(-Wvector-operation-performance) # XXX: libstdc++ has some of these for 3way compare diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 20b4fad0437..087212ad3b0 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -32,12 +32,12 @@ endif() set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1) +add_subdirectory (abseil-cpp-cmake) add_subdirectory (antlr4-runtime-cmake) add_subdirectory (boost-cmake) add_subdirectory (cctz-cmake) add_subdirectory (consistent-hashing) add_subdirectory (dragonbox-cmake) -add_subdirectory (FastMemcpy) add_subdirectory (hyperscan-cmake) add_subdirectory (jemalloc-cmake) add_subdirectory (libcpuid-cmake) @@ -47,7 +47,11 @@ add_subdirectory (lz4-cmake) add_subdirectory (murmurhash) add_subdirectory (replxx-cmake) add_subdirectory (unixodbc-cmake) -add_subdirectory (xz) +add_subdirectory (nanodbc-cmake) + +if (USE_INTERNAL_XZ_LIBRARY) + add_subdirectory (xz) +endif() add_subdirectory (poco-cmake) add_subdirectory (croaring-cmake) @@ -93,14 +97,8 @@ if (USE_INTERNAL_ZLIB_LIBRARY) add_subdirectory (${INTERNAL_ZLIB_NAME}) # We should use same defines when including zlib.h as used when zlib compiled target_compile_definitions (zlib PUBLIC ZLIB_COMPAT WITH_GZFILEOP) - if (TARGET zlibstatic) - target_compile_definitions (zlibstatic PUBLIC ZLIB_COMPAT WITH_GZFILEOP) - endif () if (ARCH_AMD64 OR ARCH_AARCH64) target_compile_definitions (zlib PUBLIC X86_64 UNALIGNED_OK) - if (TARGET zlibstatic) - target_compile_definitions (zlibstatic PUBLIC X86_64 UNALIGNED_OK) - endif () endif () endif () @@ -215,15 +213,17 @@ if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY) set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "") set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "") set (LLVM_TARGETS_TO_BUILD "X86;AArch64" CACHE STRING "") - # Yes it is set globally, but this is not enough, since llvm will add -std=c++11 after default - # And c++2a cannot be used, due to ambiguous operator != - if (COMPILER_GCC OR COMPILER_CLANG) - set (_CXX_STANDARD "gnu++17") - else() - set (_CXX_STANDARD "c++17") - endif() - set (LLVM_CXX_STD ${_CXX_STANDARD} CACHE STRING "" FORCE) + + # Need to use C++17 since the compilation is not possible with C++20 currently, due to ambiguous operator != etc. + # LLVM project will set its default value for the -std=... but our global setting from CMake will override it. + set (CMAKE_CXX_STANDARD_bak ${CMAKE_CXX_STANDARD}) + set (CMAKE_CXX_STANDARD 17) + add_subdirectory (llvm/llvm) + + set (CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_bak}) + unset (CMAKE_CXX_STANDARD_bak) + target_include_directories(LLVMSupport SYSTEM BEFORE PRIVATE ${ZLIB_INCLUDE_DIR}) endif () @@ -280,7 +280,14 @@ if (USE_AMQPCPP) add_subdirectory (amqpcpp-cmake) endif() if (USE_CASSANDRA) + # Need to use C++17 since the compilation is not possible with C++20 currently. 
+ set (CMAKE_CXX_STANDARD_bak ${CMAKE_CXX_STANDARD}) + set (CMAKE_CXX_STANDARD 17) + add_subdirectory (cassandra) + + set (CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_bak}) + unset (CMAKE_CXX_STANDARD_bak) endif() # Should go before: diff --git a/contrib/FastMemcpy/CMakeLists.txt b/contrib/FastMemcpy/CMakeLists.txt deleted file mode 100644 index 8efe6d45dff..00000000000 --- a/contrib/FastMemcpy/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -option (ENABLE_FASTMEMCPY "Enable FastMemcpy library (only internal)" ${ENABLE_LIBRARIES}) - -if (NOT OS_LINUX OR ARCH_AARCH64) - set (ENABLE_FASTMEMCPY OFF) -endif () - -if (ENABLE_FASTMEMCPY) - set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy) - - set (SRCS - ${LIBRARY_DIR}/FastMemcpy.c - - memcpy_wrapper.c - ) - - add_library (FastMemcpy ${SRCS}) - target_include_directories (FastMemcpy PUBLIC ${LIBRARY_DIR}) - - target_compile_definitions(FastMemcpy PUBLIC USE_FASTMEMCPY=1) - - message (STATUS "Using FastMemcpy") -else () - add_library (FastMemcpy INTERFACE) - - target_compile_definitions(FastMemcpy INTERFACE USE_FASTMEMCPY=0) - - message (STATUS "Not using FastMemcpy") -endif () diff --git a/contrib/FastMemcpy/FastMemcpy.c b/contrib/FastMemcpy/FastMemcpy.c deleted file mode 100644 index 5021bcc7d16..00000000000 --- a/contrib/FastMemcpy/FastMemcpy.c +++ /dev/null @@ -1,220 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy.h" - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? 
"aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); - bench(4096, 0x80000); - bench(8192, 0x40000); - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* -benchmark(size=32 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms -result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms - -benchmark(size=64 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms -result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms -result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms - -benchmark(size=512 bytes, times=8388608): -result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms -result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms -result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms - -benchmark(size=1024 bytes, times=4194304): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms -result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms -result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms - -benchmark(size=4096 bytes, times=524288): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms - -benchmark(size=8192 bytes, times=262144): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms 
-result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms - -benchmark(size=1048576 bytes, times=2048): -result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms -result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms -result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms -result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms - -benchmark(size=4194304 bytes, times=512): -result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms -result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms -result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms -result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms - -benchmark(size=8388608 bytes, times=256): -result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms -result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms -result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms -result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms - -benchmark random access: -memcpy_fast=515ms memcpy=1014ms - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy.h b/contrib/FastMemcpy/FastMemcpy.h deleted file mode 100644 index 5dcbfcf1656..00000000000 --- a/contrib/FastMemcpy/FastMemcpy.h +++ /dev/null @@ -1,694 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - -typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; -typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; -typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - 
_mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -/// Attribute is used to avoid an error with undefined behaviour sanitizer -/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer -/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_unaligned_t*)(dd - 11)) = 
*((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd 
- 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; 
- - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; 
// L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif diff --git 
a/contrib/FastMemcpy/FastMemcpy_Avx.c b/contrib/FastMemcpy/FastMemcpy_Avx.c deleted file mode 100644 index 6538c6b2126..00000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.c +++ /dev/null @@ -1,171 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy_Avx.h" - - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? 
"aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ -#if 1 - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); -#endif - bench(4096, 0x80000); - bench(8192, 0x40000); -#if 1 - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); -#endif - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.h b/contrib/FastMemcpy/FastMemcpy_Avx.h deleted file mode 100644 index 8ba064b0350..00000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.h +++ /dev/null @@ -1,492 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_avx_16(void *dst, const void *src) { -#if 1 - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -#else - *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); - *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); -#endif -} - -static INLINE void memcpy_avx_32(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); -} - -static INLINE void memcpy_avx_64(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); -} - -static INLINE void memcpy_avx_128(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); -} - -static INLINE void memcpy_avx_256(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - __m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - __m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - __m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - __m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); - _mm256_storeu_si256(((__m256i*)dst) + 4, m4); - _mm256_storeu_si256(((__m256i*)dst) + 5, m5); - _mm256_storeu_si256(((__m256i*)dst) + 6, m6); - _mm256_storeu_si256(((__m256i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const 
unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 128: memcpy_avx_128(dd - 128, ss - 128); - case 0: break; - case 129: memcpy_avx_128(dd - 129, ss - 129); - case 1: dd[-1] = ss[-1]; break; - case 130: memcpy_avx_128(dd - 130, ss - 130); - case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 131: memcpy_avx_128(dd - 131, ss - 131); - case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; - case 132: memcpy_avx_128(dd - 132, ss - 132); - case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 133: memcpy_avx_128(dd - 133, ss - 133); - case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; - case 134: memcpy_avx_128(dd - 134, ss - 134); - case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 135: memcpy_avx_128(dd - 135, ss - 135); - case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 136: memcpy_avx_128(dd - 136, ss - 136); - case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 137: memcpy_avx_128(dd - 137, ss - 137); - case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; - case 138: memcpy_avx_128(dd - 138, ss - 138); - case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 139: memcpy_avx_128(dd - 139, ss - 139); - case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 140: memcpy_avx_128(dd - 140, ss - 140); - case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 141: memcpy_avx_128(dd - 141, ss - 141); - case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 142: memcpy_avx_128(dd - 142, ss - 142); - case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 143: memcpy_avx_128(dd - 143, ss - 143); - case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 144: memcpy_avx_128(dd - 144, ss - 144); - case 16: memcpy_avx_16(dd - 16, ss - 16); break; - case 145: memcpy_avx_128(dd - 145, ss - 145); - case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; - case 146: memcpy_avx_128(dd - 146, ss - 146); - case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 147: memcpy_avx_128(dd - 147, ss - 147); - case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 148: memcpy_avx_128(dd - 148, ss - 148); - case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 149: memcpy_avx_128(dd - 149, ss - 149); - case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 150: memcpy_avx_128(dd - 150, ss - 150); - case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 151: memcpy_avx_128(dd - 151, ss - 151); - case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 152: memcpy_avx_128(dd - 152, ss - 152); - case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 153: 
memcpy_avx_128(dd - 153, ss - 153); - case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; - case 154: memcpy_avx_128(dd - 154, ss - 154); - case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; - case 155: memcpy_avx_128(dd - 155, ss - 155); - case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; - case 156: memcpy_avx_128(dd - 156, ss - 156); - case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; - case 157: memcpy_avx_128(dd - 157, ss - 157); - case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; - case 158: memcpy_avx_128(dd - 158, ss - 158); - case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; - case 159: memcpy_avx_128(dd - 159, ss - 159); - case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; - case 160: memcpy_avx_128(dd - 160, ss - 160); - case 32: memcpy_avx_32(dd - 32, ss - 32); break; - case 161: memcpy_avx_128(dd - 161, ss - 161); - case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; - case 162: memcpy_avx_128(dd - 162, ss - 162); - case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 163: memcpy_avx_128(dd - 163, ss - 163); - case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 164: memcpy_avx_128(dd - 164, ss - 164); - case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 165: memcpy_avx_128(dd - 165, ss - 165); - case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 166: memcpy_avx_128(dd - 166, ss - 166); - case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 167: memcpy_avx_128(dd - 167, ss - 167); - case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 168: memcpy_avx_128(dd - 168, ss - 168); - case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 169: memcpy_avx_128(dd - 169, ss - 169); - case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; - case 170: memcpy_avx_128(dd - 170, ss - 170); - case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; - case 171: memcpy_avx_128(dd - 171, ss - 171); - case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; - case 172: memcpy_avx_128(dd - 172, ss - 172); - case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; - case 173: memcpy_avx_128(dd - 173, ss - 173); - case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; - case 174: memcpy_avx_128(dd - 174, ss - 174); - case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; - case 175: memcpy_avx_128(dd - 175, ss - 175); - case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; - case 176: memcpy_avx_128(dd - 176, ss - 176); - case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; - case 177: memcpy_avx_128(dd - 177, ss - 177); - case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; - case 178: memcpy_avx_128(dd - 178, ss - 178); - case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; - case 179: memcpy_avx_128(dd - 179, ss - 179); - case 51: 
memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; - case 180: memcpy_avx_128(dd - 180, ss - 180); - case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; - case 181: memcpy_avx_128(dd - 181, ss - 181); - case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; - case 182: memcpy_avx_128(dd - 182, ss - 182); - case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; - case 183: memcpy_avx_128(dd - 183, ss - 183); - case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; - case 184: memcpy_avx_128(dd - 184, ss - 184); - case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; - case 185: memcpy_avx_128(dd - 185, ss - 185); - case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; - case 186: memcpy_avx_128(dd - 186, ss - 186); - case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; - case 187: memcpy_avx_128(dd - 187, ss - 187); - case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; - case 188: memcpy_avx_128(dd - 188, ss - 188); - case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; - case 189: memcpy_avx_128(dd - 189, ss - 189); - case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; - case 190: memcpy_avx_128(dd - 190, ss - 190); - case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; - case 191: memcpy_avx_128(dd - 191, ss - 191); - case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; - case 192: memcpy_avx_128(dd - 192, ss - 192); - case 64: memcpy_avx_64(dd - 64, ss - 64); break; - case 193: memcpy_avx_128(dd - 193, ss - 193); - case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; - case 194: memcpy_avx_128(dd - 194, ss - 194); - case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 195: memcpy_avx_128(dd - 195, ss - 195); - case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 196: memcpy_avx_128(dd - 196, ss - 196); - case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 197: memcpy_avx_128(dd - 197, ss - 197); - case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 198: memcpy_avx_128(dd - 198, ss - 198); - case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 199: memcpy_avx_128(dd - 199, ss - 199); - case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 200: memcpy_avx_128(dd - 200, ss - 200); - case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 201: memcpy_avx_128(dd - 201, ss - 201); - case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; - case 202: memcpy_avx_128(dd - 202, ss - 202); - case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; - case 203: memcpy_avx_128(dd - 203, ss - 203); - case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; - case 204: memcpy_avx_128(dd - 204, ss - 204); - case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; - case 205: memcpy_avx_128(dd - 205, ss - 205); - case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss 
- 16); break; - case 206: memcpy_avx_128(dd - 206, ss - 206); - case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; - case 207: memcpy_avx_128(dd - 207, ss - 207); - case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; - case 208: memcpy_avx_128(dd - 208, ss - 208); - case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; - case 209: memcpy_avx_128(dd - 209, ss - 209); - case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; - case 210: memcpy_avx_128(dd - 210, ss - 210); - case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; - case 211: memcpy_avx_128(dd - 211, ss - 211); - case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; - case 212: memcpy_avx_128(dd - 212, ss - 212); - case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; - case 213: memcpy_avx_128(dd - 213, ss - 213); - case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; - case 214: memcpy_avx_128(dd - 214, ss - 214); - case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; - case 215: memcpy_avx_128(dd - 215, ss - 215); - case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; - case 216: memcpy_avx_128(dd - 216, ss - 216); - case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; - case 217: memcpy_avx_128(dd - 217, ss - 217); - case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; - case 218: memcpy_avx_128(dd - 218, ss - 218); - case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; - case 219: memcpy_avx_128(dd - 219, ss - 219); - case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; - case 220: memcpy_avx_128(dd - 220, ss - 220); - case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; - case 221: memcpy_avx_128(dd - 221, ss - 221); - case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; - case 222: memcpy_avx_128(dd - 222, ss - 222); - case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; - case 223: memcpy_avx_128(dd - 223, ss - 223); - case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; - case 224: memcpy_avx_128(dd - 224, ss - 224); - case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; - case 225: memcpy_avx_128(dd - 225, ss - 225); - case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; - case 226: memcpy_avx_128(dd - 226, ss - 226); - case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; - case 227: memcpy_avx_128(dd - 227, ss - 227); - case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; - case 228: memcpy_avx_128(dd - 228, ss - 228); - case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; - case 229: memcpy_avx_128(dd - 229, ss - 229); - case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; - case 230: memcpy_avx_128(dd - 230, ss - 230); - case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; - case 231: memcpy_avx_128(dd - 231, ss - 231); - case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; - case 232: memcpy_avx_128(dd - 232, ss - 232); - case 104: memcpy_avx_64(dd - 104, ss - 104); 
memcpy_avx_64(dd - 64, ss - 64); break; - case 233: memcpy_avx_128(dd - 233, ss - 233); - case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; - case 234: memcpy_avx_128(dd - 234, ss - 234); - case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; - case 235: memcpy_avx_128(dd - 235, ss - 235); - case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; - case 236: memcpy_avx_128(dd - 236, ss - 236); - case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; - case 237: memcpy_avx_128(dd - 237, ss - 237); - case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; - case 238: memcpy_avx_128(dd - 238, ss - 238); - case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; - case 239: memcpy_avx_128(dd - 239, ss - 239); - case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; - case 240: memcpy_avx_128(dd - 240, ss - 240); - case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; - case 241: memcpy_avx_128(dd - 241, ss - 241); - case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; - case 242: memcpy_avx_128(dd - 242, ss - 242); - case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; - case 243: memcpy_avx_128(dd - 243, ss - 243); - case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; - case 244: memcpy_avx_128(dd - 244, ss - 244); - case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; - case 245: memcpy_avx_128(dd - 245, ss - 245); - case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; - case 246: memcpy_avx_128(dd - 246, ss - 246); - case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; - case 247: memcpy_avx_128(dd - 247, ss - 247); - case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; - case 248: memcpy_avx_128(dd - 248, ss - 248); - case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; - case 249: memcpy_avx_128(dd - 249, ss - 249); - case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; - case 250: memcpy_avx_128(dd - 250, ss - 250); - case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; - case 251: memcpy_avx_128(dd - 251, ss - 251); - case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; - case 252: memcpy_avx_128(dd - 252, ss - 252); - case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; - case 253: memcpy_avx_128(dd - 253, ss - 253); - case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; - case 254: memcpy_avx_128(dd - 254, ss - 254); - case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; - case 255: memcpy_avx_128(dd - 255, ss - 255); - case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; - case 256: memcpy_avx_256(dd - 256, ss - 256); break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = 
(const unsigned char*)source; - static size_t cachesize = 0x200000; // L3-cache size - size_t padding; - - // small memory copy - if (size <= 256) { - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - return destination; - } - - // align destination to 16 bytes boundary - padding = (32 - (((size_t)dst) & 31)) & 31; - -#if 0 - if (padding > 0) { - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } -#else - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; -#endif - - // medium size copy - if (size <= cachesize) { - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_storeu_si256((((__m256i*)dst) + 0), c0); - _mm256_storeu_si256((((__m256i*)dst) + 1), c1); - _mm256_storeu_si256((((__m256i*)dst) + 2), c2); - _mm256_storeu_si256((((__m256i*)dst) + 3), c3); - _mm256_storeu_si256((((__m256i*)dst) + 4), c4); - _mm256_storeu_si256((((__m256i*)dst) + 5), c5); - _mm256_storeu_si256((((__m256i*)dst) + 6), c6); - _mm256_storeu_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // big memory copy - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 31) == 0) { // source aligned - for (; size >= 256; size -= 256) { - c0 = _mm256_load_si256(((const __m256i*)src) + 0); - c1 = _mm256_load_si256(((const __m256i*)src) + 1); - c2 = _mm256_load_si256(((const __m256i*)src) + 2); - c3 = _mm256_load_si256(((const __m256i*)src) + 3); - c4 = _mm256_load_si256(((const __m256i*)src) + 4); - c5 = _mm256_load_si256(((const __m256i*)src) + 5); - c6 = _mm256_load_si256(((const __m256i*)src) + 6); - c7 = _mm256_load_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // source unaligned - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - 
_mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - - return destination; -} - - -#endif - - - diff --git a/contrib/FastMemcpy/LICENSE b/contrib/FastMemcpy/LICENSE deleted file mode 100644 index c449da6aa8a..00000000000 --- a/contrib/FastMemcpy/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Linwei - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/contrib/FastMemcpy/README.md b/contrib/FastMemcpy/README.md deleted file mode 100644 index e253f6bf5dd..00000000000 --- a/contrib/FastMemcpy/README.md +++ /dev/null @@ -1,20 +0,0 @@ -Internal implementation of `memcpy` function. - -It has the following advantages over `libc`-supplied implementation: -- it is linked statically, so the function is called directly, not through a `PLT` (procedure lookup table of shared library); -- it is linked statically, so the function can have position-dependent code; -- your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library; -- you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions; -- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents). - -Currently it uses the implementation from **Linwei** (skywind3000@163.com). -Look at https://www.zhihu.com/question/35172305 for discussion. - -Drawbacks: -- only use SSE 2, doesn't use wider (AVX, AVX 512) vector registers when available; -- no CPU dispatching; doesn't take into account actual cache size. - -Also worth to look at: -- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S -- implementation from Agner Fog: http://www.agner.org/optimize/ -- glibc source code. 
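For context on what the FastMemcpy files removed above implemented, here is a minimal C sketch of the same tiered-copy idea described in the deleted README: very small copies avoid the SSE path entirely, medium copies run a plain SSE2 loop, and copies past an assumed cache-size threshold switch to non-temporal streaming stores followed by a fence. This is an illustration only — `memcpy_tiered`, the byte-loop tiny path, and the `0x200000` threshold are simplifications and assumptions, not the deleted code itself and not ClickHouse's current `memcpy`.

```c
/* Sketch of the tiered strategy behind the removed FastMemcpy code.
   Names and thresholds are illustrative assumptions. */
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

void *memcpy_tiered(void *dst, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;
    const size_t big_threshold = 0x200000; /* assumed ~cache size, as in the removed code */

    if (n < 16)
    {
        /* Tiny copy: a simple byte loop stands in for the original 0..128-byte jump table. */
        while (n--)
            d[n] = s[n];
        return dst;
    }

    /* Copy a possibly-overlapping 16-byte tail now, so the main loop
       can ignore any remainder smaller than 16 bytes. */
    __m128i tail = _mm_loadu_si128((const __m128i *)(s + n - 16));
    _mm_storeu_si128((__m128i *)(d + n - 16), tail);

    /* Align the destination to 16 bytes so aligned/streaming stores are legal. */
    size_t pad = (16 - ((uintptr_t)d & 15)) & 15;
    if (pad)
    {
        __m128i head = _mm_loadu_si128((const __m128i *)s);
        _mm_storeu_si128((__m128i *)d, head);
        d += pad;
        s += pad;
        n -= pad;
    }

    int streaming = (n > big_threshold);
    for (; n >= 16; n -= 16, s += 16, d += 16)
    {
        __m128i v = _mm_loadu_si128((const __m128i *)s);
        if (streaming)
            _mm_stream_si128((__m128i *)d, v);  /* bypass the cache for huge copies */
        else
            _mm_store_si128((__m128i *)d, v);   /* d is 16-byte aligned here */
    }
    if (streaming)
        _mm_sfence();  /* order the non-temporal stores before returning */

    return dst;
}
```

The removed `memcpy_wrapper.c` (next hunk) exported exactly this kind of routine under the global `memcpy` symbol, which is how the statically linked binary avoided a dependency on a versioned glibc `memcpy`, as the deleted README above explains.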
diff --git a/contrib/FastMemcpy/memcpy_wrapper.c b/contrib/FastMemcpy/memcpy_wrapper.c deleted file mode 100644 index 1f57345980a..00000000000 --- a/contrib/FastMemcpy/memcpy_wrapper.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} diff --git a/contrib/NuRaft b/contrib/NuRaft index 7adf7ae33e7..377f8e77491 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793 +Subproject commit 377f8e77491d9f66ce8e32e88aae19dffe8dc4d7 diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt new file mode 100644 index 00000000000..c8cb512066a --- /dev/null +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -0,0 +1,18 @@ +set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") +if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt") + message(FATAL_ERROR " submodule third_party/abseil-cpp is missing. To fix try run: \n git submodule update --init --recursive") +endif() +add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp") + +add_library(abseil_swiss_tables INTERFACE) + +target_link_libraries(abseil_swiss_tables INTERFACE + absl::flat_hash_map + absl::flat_hash_set +) + +get_target_property(FLAT_HASH_MAP_INCLUDE_DIR absl::flat_hash_map INTERFACE_INCLUDE_DIRECTORIES) +target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_MAP_INCLUDE_DIR}) + +get_target_property(FLAT_HASH_SET_INCLUDE_DIR absl::flat_hash_set INTERFACE_INCLUDE_DIRECTORIES) +target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_SET_INCLUDE_DIR}) diff --git a/contrib/antlr4-runtime b/contrib/antlr4-runtime index a2fa7b76e2e..672643e9a42 160000 --- a/contrib/antlr4-runtime +++ b/contrib/antlr4-runtime @@ -1 +1 @@ -Subproject commit a2fa7b76e2ee16d2ad955e9214a90bbf79da66fc +Subproject commit 672643e9a427ef803abf13bc8cb4989606553d64 diff --git a/contrib/arrow b/contrib/arrow index 744bdfe188f..616b3dc76a0 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4 +Subproject commit 616b3dc76a0c8450b4027ded8a78e9619d7c845f diff --git a/contrib/boost b/contrib/boost index 48f40ebb539..a8d43d3142c 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79 +Subproject commit a8d43d3142cc6b26fc55bec33f7f6edb1156ab7a diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b9298f59f2b..0759935a7db 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -160,6 +160,12 @@ if (NOT EXTERNAL_BOOST_FOUND) enable_language(ASM) SET(ASM_OPTIONS "-x assembler-with-cpp") + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + if (SANITIZE AND (SANITIZE STREQUAL "address" OR SANITIZE STREQUAL "thread")) add_compile_definitions(BOOST_USE_UCONTEXT) @@ -169,39 +175,34 @@ if (NOT EXTERNAL_BOOST_FOUND) add_compile_definitions(BOOST_USE_TSAN) endif() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/fiber.cpp ${LIBRARY_DIR}/libs/context/src/continuation.cpp - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - 
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) - elseif (ARCH_ARM) - set (SRCS_CONTEXT + endif() + if (ARCH_ARM) + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ) + elseif (ARCH_PPC64LE) + set (SRCS_CONTEXT ${SRCS_CONTEXT} + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S ) elseif(OS_DARWIN) - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) else() - set (SRCS_CONTEXT + set (SRCS_CONTEXT ${SRCS_CONTEXT} ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/dummy.cpp - ${LIBRARY_DIR}/libs/context/src/execution_context.cpp - ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp ) endif() diff --git a/contrib/boringssl b/contrib/boringssl index 8b2bf912ba0..83c1cda8a02 160000 --- a/contrib/boringssl +++ b/contrib/boringssl @@ -1 +1 @@ -Subproject commit 8b2bf912ba04823cfe9e7e8f5bb60cb7f6252449 +Subproject commit 83c1cda8a0224dc817cbad2966c7ed4acc35f02a diff --git a/contrib/boringssl-cmake/CMakeLists.txt b/contrib/boringssl-cmake/CMakeLists.txt index 017a8a64c0e..adfee82dda4 100644 --- a/contrib/boringssl-cmake/CMakeLists.txt +++ b/contrib/boringssl-cmake/CMakeLists.txt @@ -16,7 +16,7 @@ endif() if(CMAKE_COMPILER_IS_GNUCXX OR CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fvisibility=hidden -fno-common -fno-exceptions -fno-rtti") - if(APPLE) + if(APPLE AND CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") endif() diff --git a/contrib/brotli b/contrib/brotli index 5805f99a533..63be8a99401 160000 --- a/contrib/brotli +++ b/contrib/brotli @@ -1 +1 @@ -Subproject commit 5805f99a533a8f8118699c0100d8c102f3605f65 +Subproject commit 63be8a99401992075c23e99f7c84de1c653e39e2 diff --git a/contrib/brotli-cmake/CMakeLists.txt b/contrib/brotli-cmake/CMakeLists.txt index e22f4593c02..4c5f584de9d 100644 --- a/contrib/brotli-cmake/CMakeLists.txt +++ b/contrib/brotli-cmake/CMakeLists.txt @@ -2,6 +2,8 @@ set(BROTLI_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/brotli/c) set(BROTLI_BINARY_DIR ${ClickHouse_BINARY_DIR}/contrib/brotli/c) set(SRCS + ${BROTLI_SOURCE_DIR}/enc/command.c + ${BROTLI_SOURCE_DIR}/enc/fast_log.c ${BROTLI_SOURCE_DIR}/dec/bit_reader.c ${BROTLI_SOURCE_DIR}/dec/state.c ${BROTLI_SOURCE_DIR}/dec/huffman.c @@ -26,6 +28,9 @@ set(SRCS ${BROTLI_SOURCE_DIR}/enc/memory.c ${BROTLI_SOURCE_DIR}/common/dictionary.c ${BROTLI_SOURCE_DIR}/common/transform.c + ${BROTLI_SOURCE_DIR}/common/platform.c + ${BROTLI_SOURCE_DIR}/common/context.c + ${BROTLI_SOURCE_DIR}/common/constants.c ) add_library(brotli ${SRCS}) diff --git a/contrib/cassandra b/contrib/cassandra index 
b446d7eb68e..c097fb5c7e6 160000 --- a/contrib/cassandra +++ b/contrib/cassandra @@ -1 +1 @@ -Subproject commit b446d7eb68e6962f431e2b3771313bfe9a2bbd93 +Subproject commit c097fb5c7e63cc430016d9a8b240d8e63fbefa52 diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index 90e33dc9f62..a3869478347 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -97,12 +97,19 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS) set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - add_custom_command(OUTPUT ${TZ_OBJ} - COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} - COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID} + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + else() + add_custom_command(OUTPUT ${TZ_OBJ} + COMMAND cp ${TZDIR}/${TIMEZONE} ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID} + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} --rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ} - COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) - + COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}) + endif() set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(TIMEZONE) diff --git a/contrib/datasketches-cpp b/contrib/datasketches-cpp new file mode 160000 index 00000000000..f915d35b2de --- /dev/null +++ b/contrib/datasketches-cpp @@ -0,0 +1 @@ +Subproject commit f915d35b2de676683493c86c585141a1e1c83334 diff --git a/contrib/flatbuffers b/contrib/flatbuffers index 6df40a24717..22e3ffc66d2 160000 --- a/contrib/flatbuffers +++ b/contrib/flatbuffers @@ -1 +1 @@ -Subproject commit 6df40a2471737b27271bdd9b900ab5f3aec746c7 +Subproject commit 22e3ffc66d2d7d72d1414390aa0f04ffd114a5a1 diff --git a/contrib/googletest b/contrib/googletest index 356f2d264a4..e7e591764ba 160000 --- a/contrib/googletest +++ b/contrib/googletest @@ -1 +1 @@ -Subproject commit 356f2d264a485db2fcc50ec1c672e0d37b6cb39b +Subproject commit e7e591764baba0a0c3c9ad0014430e7a27331d16 diff --git a/contrib/grpc b/contrib/grpc index 7436366ceb3..8d558f03fe3 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit 7436366ceb341ba5c00ea29f1645e02a2b70bf93 +Subproject commit 8d558f03fe370240081424fafa76cdc9301ea14b diff --git a/contrib/grpc-cmake/CMakeLists.txt b/contrib/grpc-cmake/CMakeLists.txt index 97ca3fab4db..b93968f62f9 100644 --- a/contrib/grpc-cmake/CMakeLists.txt +++ b/contrib/grpc-cmake/CMakeLists.txt @@ -39,11 +39,6 @@ set(_gRPC_SSL_LIBRARIES ${OPENSSL_LIBRARIES}) # Use abseil-cpp from ClickHouse contrib, not from gRPC third_party. set(gRPC_ABSL_PROVIDER "clickhouse" CACHE STRING "" FORCE) -set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") -if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt") - message(FATAL_ERROR " grpc: submodule third_party/abseil-cpp is missing. 
To fix try run: \n git submodule update --init --recursive") -endif() -add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp") # Choose to build static or shared library for c-ares. if (MAKE_STATIC_LIBRARIES) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index b8a6474413a..f8cab3e548c 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -1,10 +1,13 @@ -if (SANITIZE OR NOT (ARCH_AMD64 OR ARCH_ARM) OR NOT (OS_LINUX OR OS_FREEBSD OR OS_DARWIN)) +if (SANITIZE OR NOT ( + ((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE)) OR + (OS_DARWIN AND CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") +)) if (ENABLE_JEMALLOC) message (${RECONFIGURE_MESSAGE_LEVEL} - "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64 or aarch64 on linux or freebsd.") - endif() + "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds and RelWithDebInfo macOS builds.") + endif () set (ENABLE_JEMALLOC OFF) -else() +else () option (ENABLE_JEMALLOC "Enable jemalloc allocator" ${ENABLE_LIBRARIES}) endif () @@ -34,9 +37,9 @@ if (OS_LINUX) # avoid spurious latencies and additional work associated with # MADV_DONTNEED. See # https://github.com/ClickHouse/ClickHouse/issues/11121 for motivation. - set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:10000") + set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") else() - set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:10000") + set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000") endif() # CACHE variable is empty, to allow changing defaults without necessity # to purge cache @@ -107,6 +110,8 @@ if (ARCH_AMD64) set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_x86_64") elseif (ARCH_ARM) set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_aarch64") +elseif (ARCH_PPC64LE) + set(JEMALLOC_INCLUDE_PREFIX "${JEMALLOC_INCLUDE_PREFIX}_ppc64le") else () message (FATAL_ERROR "internal jemalloc: This arch is not supported") endif () @@ -119,12 +124,14 @@ target_include_directories(jemalloc SYSTEM PRIVATE target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") - target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_DEBUG=1 -DJEMALLOC_PROF=1) + target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_DEBUG=1) +endif () - if (USE_UNWIND) - target_compile_definitions (jemalloc PRIVATE -DJEMALLOC_PROF_LIBUNWIND=1) - target_link_libraries (jemalloc PRIVATE unwind) - endif () +target_compile_definitions(jemalloc PRIVATE -DJEMALLOC_PROF=1) + +if (USE_UNWIND) + target_compile_definitions (jemalloc PRIVATE -DJEMALLOC_PROF_LIBUNWIND=1) + target_link_libraries (jemalloc PRIVATE unwind) endif () target_compile_options(jemalloc PRIVATE -Wno-redundant-decls) diff --git a/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in index c7c884d0eaa..5c0407db24a 100644 --- a/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/contrib/jemalloc-cmake/include_darwin_aarch64/jemalloc/internal/jemalloc_internal_defs.h.in @@ -42,7 +42,7 @@ * total number 
of bits in a pointer, e.g. on x64, for which the uppermost 16 * bits are the same as bit 47. */ -#define LG_VADDR 48 +#define LG_VADDR 64 /* Defined if C11 atomics are available. */ #define JEMALLOC_C11_ATOMICS 1 @@ -101,11 +101,6 @@ */ #define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME 1 -/* - * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. - */ -#define JEMALLOC_HAVE_CLOCK_REALTIME 1 - /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -181,14 +176,14 @@ /* #undef LG_QUANTUM */ /* One page is 2^LG_PAGE bytes. */ -#define LG_PAGE 16 +#define LG_PAGE 14 /* * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the * system does not explicitly support huge pages; system calls that require * explicit huge page support are separately configured. */ -#define LG_HUGEPAGE 29 +#define LG_HUGEPAGE 21 /* * If defined, adjacent virtual memory mappings with identical attributes @@ -356,7 +351,7 @@ /* #undef JEMALLOC_EXPORT */ /* config.malloc_conf options string. */ -#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" +#define JEMALLOC_CONFIG_MALLOC_CONF "" /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ /* #undef JEMALLOC_IS_MALLOC */ diff --git a/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000000..8068861041f --- /dev/null +++ b/contrib/jemalloc-cmake/include_linux_ppc64le/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,367 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +/* #undef JEMALLOC_PREFIX */ +/* #undef JEMALLOC_CPREFIX */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#define JEMALLOC_OVERRIDE___LIBC_CALLOC +#define JEMALLOC_OVERRIDE___LIBC_FREE +#define JEMALLOC_OVERRIDE___LIBC_MALLOC +#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN +#define JEMALLOC_OVERRIDE___LIBC_REALLOC +#define JEMALLOC_OVERRIDE___LIBC_VALLOC +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 0 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 64 + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS 1 + +/* Defined if GCC __atomic atomics are available. */ +#define JEMALLOC_GCC_ATOMIC_ATOMICS 1 +/* and the 8-bit variant support. 
*/ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS 1 + +/* Defined if GCC __sync atomics are available. */ +#define JEMALLOC_GCC_SYNC_ATOMICS 1 +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_SYNC_ATOMICS 1 + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#define JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +#define JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +// #define JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE 1 + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC 1 + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#define JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("initial-exec"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#define JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). 
+ */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 16 + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#define JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#define JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#define JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs + +/* + * popcount*() functions to use for bitmapping. + */ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#define JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. 
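(Editorial note, not part of the patch.) `LG_PAGE` and `LG_HUGEPAGE` in these generated headers are base-2 logarithms of sizes in bytes, which is why the values differ per platform: the Darwin aarch64 header earlier in this diff drops to the 16 KiB pages used by Apple Silicon, while the new ppc64le header keeps the 64 KiB pages typical of ppc64le Linux, and `LG_HUGEPAGE 21` corresponds to a 2 MiB huge page. A tiny arithmetic check:

    /* Editorial sketch: what the LG_* constants translate to in bytes. */
    #include <stdio.h>

    int main(void)
    {
        printf("LG_PAGE 14     -> %lu bytes (16 KiB, Apple Silicon)\n", 1UL << 14);
        printf("LG_PAGE 16     -> %lu bytes (64 KiB, ppc64le Linux)\n", 1UL << 16);
        printf("LG_HUGEPAGE 21 -> %lu bytes (2 MiB huge page)\n",       1UL << 21);
        return 0;
    }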
+ */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#define JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Define if operating system has alloca.h header. */ +#define JEMALLOC_HAS_ALLOCA_H 1 + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT 1 + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 3 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#define JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#define JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#define JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD 1 + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "@JEMALLOC_CONFIG_MALLOC_CONF@" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#define JEMALLOC_IS_MALLOC 1 + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. 
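(Editorial note, not part of the patch.) The `JEMALLOC_CONFIG_MALLOC_CONF` placeholder above is substituted from the option string set in contrib/jemalloc-cmake/CMakeLists.txt earlier in this diff (`percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000` on Linux). The same `key:value` syntax is understood by jemalloc at run time through the documented `malloc_conf` global and the `MALLOC_CONF` environment variable, which are read after the compiled-in string and can therefore typically be used to experiment with the decay settings without rebuilding. Sketch:

    /* Editorial sketch: supplying jemalloc options from the application via the
     * documented `malloc_conf` global instead of the compiled-in string. */
    const char *malloc_conf = "oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000";

    int main(void)
    {
        return 0; /* jemalloc parses malloc_conf during its own initialization */
    }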
*/ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index f88402df1fa..fce7fbc582a 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -474,13 +474,6 @@ add_custom_command( WORKING_DIRECTORY "${KRB5_SOURCE_DIR}/util/et" ) -add_custom_target( - CREATE_COMPILE_ET ALL - DEPENDS ${KRB5_SOURCE_DIR}/util/et/compile_et - COMMENT "creating compile_et" - VERBATIM -) - file(GLOB_RECURSE ET_FILES "${KRB5_SOURCE_DIR}/*.et" ) @@ -531,7 +524,7 @@ add_custom_command( add_custom_target( - ERROR_MAP_H ALL + ERROR_MAP_H DEPENDS ${KRB5_SOURCE_DIR}/lib/gssapi/krb5/error_map.h COMMENT "generating error_map.h" VERBATIM @@ -544,14 +537,14 @@ add_custom_command( ) add_custom_target( - ERRMAP_H ALL + ERRMAP_H DEPENDS ${KRB5_SOURCE_DIR}/lib/gssapi/generic/errmap.h COMMENT "generating errmap.h" VERBATIM ) add_custom_target( - KRB_5_H ALL + KRB_5_H DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/krb5/krb5.h COMMENT "generating krb5.h" VERBATIM @@ -564,15 +557,19 @@ add_dependencies( ERRMAP_H ERROR_MAP_H KRB_5_H - ) +) preprocess_et(processed_et_files ${ET_FILES}) -add_custom_command( - OUTPUT ${KRB5_SOURCE_DIR}/lib/gssapi/generic/errmap.h - COMMAND perl -w -I../../../util ../../../util/gen.pl bimap errmap.h NAME=mecherrmap LEFT=OM_uint32 RIGHT=struct\ mecherror LEFTPRINT=print_OM_uint32 RIGHTPRINT=mecherror_print LEFTCMP=cmp_OM_uint32 RIGHTCMP=mecherror_cmp - WORKING_DIRECTORY "${KRB5_SOURCE_DIR}/lib/gssapi/generic" -) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/include_private/kcmrpc.h ${CMAKE_CURRENT_BINARY_DIR}/include_private/kcmrpc.c + COMMAND mig -header kcmrpc.h -user kcmrpc.c -sheader /dev/null -server /dev/null -I${KRB5_SOURCE_DIR}/lib/krb5/ccache ${KRB5_SOURCE_DIR}/lib/krb5/ccache/kcmrpc.defs + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include_private" + ) + + list(APPEND ALL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/include_private/kcmrpc.c) +endif() target_sources(${KRB5_LIBRARY} PRIVATE ${ALL_SRCS} @@ -604,6 +601,25 @@ file(COPY ${KRB5_SOURCE_DIR}/util/et/com_err.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/include/ ) +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/osconf.h + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/include_private/ +) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/profile.h + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/include_private/ +) + +string(TOLOWER "${CMAKE_SYSTEM_NAME}" _system_name) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/autoconf_${_system_name}.h + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/include_private/ +) + +file(RENAME + ${CMAKE_CURRENT_BINARY_DIR}/include_private/autoconf_${_system_name}.h + ${CMAKE_CURRENT_BINARY_DIR}/include_private/autoconf.h +) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/krb5 ) @@ -633,7 +649,7 @@ target_include_directories(${KRB5_LIBRARY} PUBLIC ) target_include_directories(${KRB5_LIBRARY} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} #for autoconf.h + ${CMAKE_CURRENT_BINARY_DIR}/include_private # For autoconf.h and other generated headers. ${KRB5_SOURCE_DIR} ${KRB5_SOURCE_DIR}/include ${KRB5_SOURCE_DIR}/lib/gssapi/mechglue diff --git a/contrib/krb5-cmake/autoconf_darwin.h b/contrib/krb5-cmake/autoconf_darwin.h new file mode 100644 index 00000000000..965e2f02997 --- /dev/null +++ b/contrib/krb5-cmake/autoconf_darwin.h @@ -0,0 +1,764 @@ +/* include/autoconf.h. Generated from autoconf.h.in by configure. */ +/* include/autoconf.h.in. 
Generated from configure.in by autoheader. */ + + +#ifndef KRB5_AUTOCONF_H +#define KRB5_AUTOCONF_H + + +/* Define if AES-NI support is enabled */ +/* #undef AESNI */ + +/* Define if socket can't be bound to 0.0.0.0 */ +/* #undef BROKEN_STREAMS_SOCKETS */ + +/* Define if va_list objects can be simply copied by assignment. */ +/* #undef CAN_COPY_VA_LIST */ + +/* Define to reduce code size even if it means more cpu usage */ +/* #undef CONFIG_SMALL */ + +/* Define if __attribute__((constructor)) works */ +#define CONSTRUCTOR_ATTR_WORKS 1 + +/* Define to default ccache name */ +#define DEFCCNAME "FILE:/tmp/krb5cc_%{uid}" + +/* Define to default client keytab name */ +#define DEFCKTNAME "FILE:/etc/krb5/user/%{euid}/client.keytab" + +/* Define to default keytab name */ +#define DEFKTNAME "FILE:/etc/krb5.keytab" + +/* Define if library initialization should be delayed until first use */ +#define DELAY_INITIALIZER 1 + +/* Define if __attribute__((destructor)) works */ +#define DESTRUCTOR_ATTR_WORKS 1 + +/* Define to disable PKINIT plugin support */ +#define DISABLE_PKINIT 1 + +/* Define if LDAP KDB support within the Kerberos library (mainly ASN.1 code) + should be enabled. */ +/* #undef ENABLE_LDAP */ + +/* Define if translation functions should be used. */ +/* #undef ENABLE_NLS */ + +/* Define if thread support enabled */ +#define ENABLE_THREADS 1 + +/* Define as return type of endrpcent */ +#define ENDRPCENT_TYPE void + +/* Define if Fortuna PRNG is selected */ +#define FORTUNA 1 + +/* Define to the type of elements in the array set by `getgroups'. Usually + this is either `int' or `gid_t'. */ +#define GETGROUPS_T gid_t + +/* Define if gethostbyname_r returns int rather than struct hostent * */ +/* #undef GETHOSTBYNAME_R_RETURNS_INT */ + +/* Type of getpeername second argument. */ +#define GETPEERNAME_ARG3_TYPE GETSOCKNAME_ARG3_TYPE + +/* Define if getpwnam_r exists but takes only 4 arguments (e.g., POSIX draft 6 + implementations like some Solaris releases). */ +/* #undef GETPWNAM_R_4_ARGS */ + +/* Define if getpwnam_r returns an int */ +#define GETPWNAM_R_RETURNS_INT 1 + +/* Define if getpwuid_r exists but takes only 4 arguments (e.g., POSIX draft 6 + implementations like some Solaris releases). */ +/* #undef GETPWUID_R_4_ARGS */ + +/* Define if getservbyname_r returns int rather than struct servent * */ +/* #undef GETSERVBYNAME_R_RETURNS_INT */ + +/* Type of pointer target for argument 3 to getsockname */ +#define GETSOCKNAME_ARG3_TYPE socklen_t + +/* Define if gmtime_r returns int instead of struct tm pointer, as on old + HP-UX systems. */ +/* #undef GMTIME_R_RETURNS_INT */ + +/* Define if va_copy macro or function is available. */ +#define HAS_VA_COPY 1 + +/* Define to 1 if you have the `access' function. */ +#define HAVE_ACCESS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the `bswap16' function. */ +/* #undef HAVE_BSWAP16 */ + +/* Define to 1 if you have the `bswap64' function. */ +/* #undef HAVE_BSWAP64 */ + +/* Define to 1 if bswap_16 is available via byteswap.h */ +/* #undef HAVE_BSWAP_16 */ + +/* Define to 1 if bswap_64 is available via byteswap.h */ +/* #undef HAVE_BSWAP_64 */ + +/* Define if bt_rseq is available, for recursive btree traversal. */ +#define HAVE_BT_RSEQ 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BYTESWAP_H */ + +/* Define to 1 if you have the `chmod' function. 
*/ +#define HAVE_CHMOD 1 + +/* Define if cmocka library is available. */ +/* #undef HAVE_CMOCKA */ + +/* Define to 1 if you have the `compile' function. */ +/* #undef HAVE_COMPILE */ + +/* Define if com_err has compatible gettext support */ +#define HAVE_COM_ERR_INTL 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CPUID_H */ + +/* Define to 1 if you have the `daemon' function. */ +#define HAVE_DAEMON 1 + +/* Define to 1 if you have the declaration of `strerror_r', and to 0 if you + don't. */ +#define HAVE_DECL_STRERROR_R 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the `dn_skipname' function. */ +#define HAVE_DN_SKIPNAME 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `fchmod' function. */ +#define HAVE_FCHMOD 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fnmatch' function. */ +#define HAVE_FNMATCH 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FNMATCH_H 1 + +/* Define if you have the getaddrinfo function */ +#define HAVE_GETADDRINFO 1 + +/* Define to 1 if you have the `getcwd' function. */ +#define HAVE_GETCWD 1 + +/* Define to 1 if you have the `getenv' function. */ +#define HAVE_GETENV 1 + +/* Define to 1 if you have the `geteuid' function. */ +#define HAVE_GETEUID 1 + +/* Define if gethostbyname_r exists and its return type is known */ +/* #undef HAVE_GETHOSTBYNAME_R */ + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define if system getopt should be used. */ +#define HAVE_GETOPT 1 + +/* Define if system getopt_long should be used. */ +#define HAVE_GETOPT_LONG 1 + +/* Define if getpwnam_r is available and useful. */ +#define HAVE_GETPWNAM_R 1 + +/* Define if getpwuid_r is available and useful. */ +#define HAVE_GETPWUID_R 1 + +/* Define if getservbyname_r exists and its return type is known */ +/* #undef HAVE_GETSERVBYNAME_R */ + +/* Have the gettimeofday function */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the `getusershell' function. */ +#define HAVE_GETUSERSHELL 1 + +/* Define to 1 if you have the `gmtime_r' function. */ +#define HAVE_GMTIME_R 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_IFADDRS_H 1 + +/* Define to 1 if you have the `inet_ntop' function. */ +#define HAVE_INET_NTOP 1 + +/* Define to 1 if you have the `inet_pton' function. */ +#define HAVE_INET_PTON 1 + +/* Define to 1 if the system has the type `int16_t'. */ +#define HAVE_INT16_T 1 + +/* Define to 1 if the system has the type `int32_t'. */ +#define HAVE_INT32_T 1 + +/* Define to 1 if the system has the type `int8_t'. */ +#define HAVE_INT8_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_KEYUTILS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LBER_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LDAP_H */ + +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#define HAVE_LIBCRYPTO 1 + +/* Define if building with libedit. */ +/* #undef HAVE_LIBEDIT */ + +/* Define to 1 if you have the `nsl' library (-lnsl). 
*/ +/* #undef HAVE_LIBNSL */ + +/* Define to 1 if you have the `resolv' library (-lresolv). */ +#define HAVE_LIBRESOLV 1 + +/* Define to 1 if you have the `socket' library (-lsocket). */ +/* #undef HAVE_LIBSOCKET */ + +/* Define if the util library is available */ +#define HAVE_LIBUTIL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if you have the `localtime_r' function. */ +#define HAVE_LOCALTIME_R 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MACHINE_BYTE_ORDER_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MACHINE_ENDIAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `mkstemp' function. */ +#define HAVE_MKSTEMP 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* Define if netdb.h declares h_errno */ +#define HAVE_NETDB_H_H_ERRNO 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H 1 + +/* Define to 1 if you have the `ns_initparse' function. */ +#define HAVE_NS_INITPARSE 1 + +/* Define to 1 if you have the `ns_name_uncompress' function. */ +#define HAVE_NS_NAME_UNCOMPRESS 1 + +/* Define if OpenSSL supports cms. */ +#define HAVE_OPENSSL_CMS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PATHS_H 1 + +/* Define if persistent keyrings are supported */ +/* #undef HAVE_PERSISTENT_KEYRING */ + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define if #pragma weak references work */ +/* #undef HAVE_PRAGMA_WEAK_REF */ + +/* Define if you have POSIX threads libraries and header files. */ +#define HAVE_PTHREAD 1 + +/* Define to 1 if you have the `pthread_once' function. */ +#define HAVE_PTHREAD_ONCE 1 + +/* Have PTHREAD_PRIO_INHERIT. */ +#define HAVE_PTHREAD_PRIO_INHERIT 1 + +/* Define to 1 if you have the `pthread_rwlock_init' function. */ +#define HAVE_PTHREAD_RWLOCK_INIT 1 + +/* Define if pthread_rwlock_init is provided in the thread library. */ +#define HAVE_PTHREAD_RWLOCK_INIT_IN_THREAD_LIB 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define if building with GNU Readline. */ +/* #undef HAVE_READLINE */ + +/* Define if regcomp exists and functions */ +#define HAVE_REGCOMP 1 + +/* Define to 1 if you have the `regexec' function. */ +#define HAVE_REGEXEC 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_REGEXPR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the `res_nclose' function. */ +#define HAVE_RES_NCLOSE 1 + +/* Define to 1 if you have the `res_ndestroy' function. */ +#define HAVE_RES_NDESTROY 1 + +/* Define to 1 if you have the `res_ninit' function. */ +#define HAVE_RES_NINIT 1 + +/* Define to 1 if you have the `res_nsearch' function. */ +#define HAVE_RES_NSEARCH 1 + +/* Define to 1 if you have the `res_search' function */ +#define HAVE_RES_SEARCH 1 + +/* Define to 1 if you have the `re_comp' function. */ +/* #undef HAVE_RE_COMP */ + +/* Define to 1 if you have the `re_exec' function. */ +/* #undef HAVE_RE_EXEC */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_SASL_H */ + +/* Define if struct sockaddr contains sa_len */ +#define HAVE_SA_LEN 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `setenv' function. 
*/ +#define HAVE_SETENV 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define if setluid provided in OSF/1 security library */ +/* #undef HAVE_SETLUID */ + +/* Define to 1 if you have the `setregid' function. */ +#define HAVE_SETREGID 1 + +/* Define to 1 if you have the `setresgid' function. */ +/* #undef HAVE_SETRESGID */ + +/* Define to 1 if you have the `setresuid' function. */ +/* #undef HAVE_SETRESUID */ + +/* Define to 1 if you have the `setreuid' function. */ +#define HAVE_SETREUID 1 + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setvbuf' function. */ +#define HAVE_SETVBUF 1 + +/* Define if there is a socklen_t type. If not, probably use size_t */ +#define HAVE_SOCKLEN_T 1 + +/* Define to 1 if you have the `srand' function. */ +#define HAVE_SRAND 1 + +/* Define to 1 if you have the `srand48' function. */ +#define HAVE_SRAND48 1 + +/* Define to 1 if you have the `srandom' function. */ +#define HAVE_SRANDOM 1 + +/* Define to 1 if the system has the type `ssize_t'. */ +#define HAVE_SSIZE_T 1 + +/* Define to 1 if you have the `stat' function. */ +#define HAVE_STAT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `step' function. */ +/* #undef HAVE_STEP */ + +/* Define to 1 if you have the `strchr' function. */ +#define HAVE_STRCHR 1 + +/* Define to 1 if you have the `strdup' function. */ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the `strerror_r' function. */ +#define HAVE_STRERROR_R 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcpy' function. */ +#define HAVE_STRLCPY 1 + +/* Define to 1 if you have the `strptime' function. */ +#define HAVE_STRPTIME 1 + +/* Define to 1 if the system has the type `struct cmsghdr'. */ +#define HAVE_STRUCT_CMSGHDR 1 + +/* Define if there is a struct if_laddrconf. */ +/* #undef HAVE_STRUCT_IF_LADDRCONF */ + +/* Define to 1 if the system has the type `struct in6_pktinfo'. */ +#define HAVE_STRUCT_IN6_PKTINFO 1 + +/* Define to 1 if the system has the type `struct in_pktinfo'. */ +#define HAVE_STRUCT_IN_PKTINFO 1 + +/* Define if there is a struct lifconf. */ +/* #undef HAVE_STRUCT_LIFCONF */ + +/* Define to 1 if the system has the type `struct rt_msghdr'. */ +#define HAVE_STRUCT_RT_MSGHDR 1 + +/* Define to 1 if the system has the type `struct sockaddr_storage'. */ +#define HAVE_STRUCT_SOCKADDR_STORAGE 1 + +/* Define to 1 if `st_mtimensec' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_MTIMENSEC */ + +/* Define to 1 if `st_mtimespec.tv_nsec' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC 1 + +/* Define to 1 if `st_mtim.tv_nsec' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_BSWAP_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define if sys_errlist in libc */ +#define HAVE_SYS_ERRLIST 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILIO_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define if tcl.h found */ +/* #undef HAVE_TCL_H */ + +/* Define if tcl/tcl.h found */ +/* #undef HAVE_TCL_TCL_H */ + +/* Define to 1 if you have the `timegm' function. */ +#define HAVE_TIMEGM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `unsetenv' function. */ +#define HAVE_UNSETENV 1 + +/* Define to 1 if the system has the type `u_char'. */ +#define HAVE_U_CHAR 1 + +/* Define to 1 if the system has the type `u_int'. */ +#define HAVE_U_INT 1 + +/* Define to 1 if the system has the type `u_int16_t'. */ +#define HAVE_U_INT16_T 1 + +/* Define to 1 if the system has the type `u_int32_t'. */ +#define HAVE_U_INT32_T 1 + +/* Define to 1 if the system has the type `u_int8_t'. */ +#define HAVE_U_INT8_T 1 + +/* Define to 1 if the system has the type `u_long'. */ +#define HAVE_U_LONG 1 + +/* Define to 1 if you have the `vasprintf' function. */ +#define HAVE_VASPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `vsprintf' function. */ +#define HAVE_VSPRINTF 1 + +/* Define to 1 if the system has the type `__int128_t'. */ +#define HAVE___INT128_T 1 + +/* Define to 1 if the system has the type `__uint128_t'. 
*/ +#define HAVE___UINT128_T 1 + +/* Define if errno.h declares perror */ +/* #undef HDR_HAS_PERROR */ + +/* May need to be defined to enable IPv6 support, for example on IRIX */ +/* #undef INET6 */ + +/* Define if MIT Project Athena default configuration should be used */ +/* #undef KRB5_ATHENA_COMPAT */ + +/* Define for DNS support of locating realms and KDCs */ +#undef KRB5_DNS_LOOKUP + +/* Define to enable DNS lookups of Kerberos realm names */ +/* #undef KRB5_DNS_LOOKUP_REALM */ + +/* Define if the KDC should return only vague error codes to clients */ +/* #undef KRBCONF_VAGUE_ERRORS */ + +/* define if the system header files are missing prototype for daemon() */ +#define NEED_DAEMON_PROTO 1 + +/* Define if in6addr_any is not defined in libc */ +#define NEED_INSIXADDR_ANY 1 + +/* define if the system header files are missing prototype for + ss_execute_command() */ +/* #undef NEED_SS_EXECUTE_COMMAND_PROTO */ + +/* define if the system header files are missing prototype for strptime() */ +/* #undef NEED_STRPTIME_PROTO */ + +/* define if the system header files are missing prototype for swab() */ +/* #undef NEED_SWAB_PROTO */ + +/* Define if need to declare sys_errlist */ +/* #undef NEED_SYS_ERRLIST */ + +/* define if the system header files are missing prototype for vasprintf() */ +/* #undef NEED_VASPRINTF_PROTO */ + +/* Define if the KDC should use no lookaside cache */ +/* #undef NOCACHE */ + +/* Define if references to pthread routines should be non-weak. */ +/* #undef NO_WEAK_PTHREADS */ + +/* Define if lex produes code with yylineno */ +/* #undef NO_YYLINENO */ + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "krb5-bugs@mit.edu" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Kerberos 5" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Kerberos 5 1.17.1" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "krb5" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.17.1" + +/* Define if setjmp indicates POSIX interface */ +#define POSIX_SETJMP 1 + +/* Define if POSIX signal handling is used */ +#define POSIX_SIGNALS 1 + +/* Define if POSIX signal handlers are used */ +#define POSIX_SIGTYPE 1 + +/* Define if termios.h exists and tcsetattr exists */ +#define POSIX_TERMIOS 1 + +/* Define to necessary symbol if this constant uses a non-standard name on + your system. */ +/* #undef PTHREAD_CREATE_JOINABLE */ + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* Define as return type of setrpcent */ +#define SETRPCENT_TYPE void + +/* The size of `size_t', as computed by sizeof. */ +#define SIZEOF_SIZE_T 8 + +/* The size of `time_t', as computed by sizeof. */ +#define SIZEOF_TIME_T 8 + +/* Define to use OpenSSL for SPAKE preauth */ +#define SPAKE_OPENSSL 1 + +/* Define for static plugin linkage */ +/* #undef STATIC_PLUGINS */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if strerror_r returns char *. */ +/* #undef STRERROR_R_CHAR_P */ + +/* Define if sys_errlist is defined in errno.h */ +#define SYS_ERRLIST_DECLARED 1 + +/* Define to 1 if you can safely include both and . 
*/ +#define TIME_WITH_SYS_TIME 1 + +/* Define if no TLS implementation is selected */ +/* #undef TLS_IMPL_NONE */ + +/* Define if TLS implementation is OpenSSL */ +#define TLS_IMPL_OPENSSL 1 + +/* Define if you have dirent.h functionality */ +#define USE_DIRENT_H 1 + +/* Define if dlopen should be used */ +#define USE_DLOPEN 1 + +/* Define if the keyring ccache should be enabled */ +/* #undef USE_KEYRING_CCACHE */ + +/* Define if link-time options for library finalization will be used */ +/* #undef USE_LINKER_FINI_OPTION */ + +/* Define if link-time options for library initialization will be used */ +/* #undef USE_LINKER_INIT_OPTION */ + +/* Define if sigprocmask should be used */ +#define USE_SIGPROCMASK 1 + +/* Define if wait takes int as a argument */ +#define WAIT_USES_INT 1 + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Define to enable extensions in glibc */ +#define _GNU_SOURCE 1 + +/* Define to enable C11 extensions */ +#define __STDC_WANT_LIB_EXT1__ 1 + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define krb5_sigtype to type of signal handler */ +#define krb5_sigtype void + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long int' if does not define. */ +/* #undef off_t */ + +/* Define to `long' if does not define. */ +/* #undef time_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + + +#if defined(__GNUC__) && !defined(inline) +/* Silence gcc pedantic warnings about ANSI C. */ +# define inline __inline__ +#endif +#endif /* KRB5_AUTOCONF_H */ diff --git a/contrib/krb5-cmake/autoconf.h b/contrib/krb5-cmake/autoconf_linux.h similarity index 100% rename from contrib/krb5-cmake/autoconf.h rename to contrib/krb5-cmake/autoconf_linux.h diff --git a/contrib/libcpuid-cmake/CMakeLists.txt b/contrib/libcpuid-cmake/CMakeLists.txt index 8c1be50b4e6..9baebb3ba1b 100644 --- a/contrib/libcpuid-cmake/CMakeLists.txt +++ b/contrib/libcpuid-cmake/CMakeLists.txt @@ -1,11 +1,9 @@ -if (NOT ARCH_ARM) +if(ARCH_AMD64) option (ENABLE_CPUID "Enable libcpuid library (only internal)" ${ENABLE_LIBRARIES}) -endif() - -if (ARCH_ARM AND ENABLE_CPUID) - message (${RECONFIGURE_MESSAGE_LEVEL} "cpuid is not supported on ARM") +elseif(ENABLE_CPUID) + message (${RECONFIGURE_MESSAGE_LEVEL} "libcpuid is only supported on x86_64") set (ENABLE_CPUID 0) -endif () +endif() if (NOT ENABLE_CPUID) add_library (cpuid INTERFACE) diff --git a/contrib/libcxx b/contrib/libcxx index 8b80a151d12..2fa892f69ac 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit 8b80a151d12b98ffe2d0c22f7cec12c3b9ff88d7 +Subproject commit 2fa892f69acbaa40f8a18c6484854a6183a34482 diff --git a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index 3b5d53cd1c0..59d23b2cd9e 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -56,6 +56,11 @@ if (USE_UNWIND) target_compile_definitions(cxx PUBLIC -DSTD_EXCEPTION_HAS_STACK_TRACE=1) endif () +# Override the deduced attribute support that causes error. 
+if (OS_DARWIN AND COMPILER_GCC) + add_compile_definitions(_LIBCPP_INIT_PRIORITY_MAX) +endif () + target_compile_options(cxx PUBLIC $<$:-nostdinc++>) # Third party library may have substandard code. diff --git a/contrib/libdivide/libdivide.h b/contrib/libdivide/libdivide.h index 81057b7b43d..33d210310a1 100644 --- a/contrib/libdivide/libdivide.h +++ b/contrib/libdivide/libdivide.h @@ -18,78 +18,79 @@ #include #if defined(__cplusplus) - #include - #include - #include +#include +#include +#include #else - #include - #include +#include +#include #endif -#if defined(LIBDIVIDE_AVX512) - #include -#elif defined(LIBDIVIDE_AVX2) - #include -#elif defined(LIBDIVIDE_SSE2) - #include +#if defined(LIBDIVIDE_SSE2) +#include +#endif +#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) +#include +#endif +#if defined(LIBDIVIDE_NEON) +#include #endif #if defined(_MSC_VER) - #include - // disable warning C4146: unary minus operator applied - // to unsigned type, result still unsigned - #pragma warning(disable: 4146) - #define LIBDIVIDE_VC +#include +// disable warning C4146: unary minus operator applied +// to unsigned type, result still unsigned +#pragma warning(disable : 4146) +#define LIBDIVIDE_VC #endif #if !defined(__has_builtin) - #define __has_builtin(x) 0 +#define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) - #define HAS_INT128_T - // clang-cl on Windows does not yet support 128-bit division - #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) - #define HAS_INT128_DIV - #endif +#define HAS_INT128_T +// clang-cl on Windows does not yet support 128-bit division +#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) +#define HAS_INT128_DIV +#endif #endif #if defined(__x86_64__) || defined(_M_X64) - #define LIBDIVIDE_X86_64 +#define LIBDIVIDE_X86_64 #endif #if defined(__i386__) - #define LIBDIVIDE_i386 +#define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) - #define LIBDIVIDE_GCC_STYLE_ASM +#define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || defined(LIBDIVIDE_VC) - #define LIBDIVIDE_FUNCTION __FUNCTION__ +#define LIBDIVIDE_FUNCTION __FUNCTION__ #else - #define LIBDIVIDE_FUNCTION __func__ +#define LIBDIVIDE_FUNCTION __func__ #endif -#define LIBDIVIDE_ERROR(msg) \ - do { \ - fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, msg); \ - abort(); \ +#define LIBDIVIDE_ERROR(msg) \ + do { \ + fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ + abort(); \ } while (0) #if defined(LIBDIVIDE_ASSERTIONS_ON) - #define LIBDIVIDE_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, #x); \ - abort(); \ - } \ - } while (0) +#define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ + LIBDIVIDE_FUNCTION, #x); \ + abort(); \ + } \ + } while (0) #else - #define LIBDIVIDE_ASSERT(x) +#define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus @@ -193,25 +194,33 @@ static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uin static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); +static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); static inline uint32_t libdivide_u32_do(uint32_t 
numer, const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); +static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom); +static inline int32_t libdivide_s32_branchfree_do( + int32_t numer, const struct libdivide_s32_branchfree_t *denom); +static inline uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom); +static inline int64_t libdivide_s64_branchfree_do( + int64_t numer, const struct libdivide_s64_branchfree_t *denom); +static inline uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom); +static inline int32_t libdivide_s32_branchfree_recover( + const struct libdivide_s32_branchfree_t *denom); +static inline uint32_t libdivide_u32_branchfree_recover( + const struct libdivide_u32_branchfree_t *denom); +static inline int64_t libdivide_s64_branchfree_recover( + const struct libdivide_s64_branchfree_t *denom); +static inline uint64_t libdivide_u64_branchfree_recover( + const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions @@ -229,8 +238,7 @@ static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { } static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -256,8 +264,7 @@ static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { } static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && \ - defined(LIBDIVIDE_X86_64) +#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -279,8 +286,7 @@ static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { } static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || \ - 
__has_builtin(__builtin_clz) +#if defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -290,8 +296,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { } return 0; #else - if (val == 0) - return 32; + if (val == 0) return 32; int32_t result = 8; uint32_t hi = 0xFFU << 24; while ((val & hi) == 0) { @@ -307,8 +312,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { } static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { -#if defined(__GNUC__) || \ - __has_builtin(__builtin_clzll) +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) @@ -328,14 +332,11 @@ static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r -static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { -#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) +static inline uint32_t libdivide_64_div_32_to_32( + uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; - __asm__("divl %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); + __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); return result; #else uint64_t n = ((uint64_t)u1 << 32) | u0; @@ -349,19 +350,13 @@ static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint3 // uint {v}. The result must fit in 64 bits. // Returns the quotient directly and the remainder in *r static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { -#if defined(LIBDIVIDE_X86_64) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) + // N.B. resist the temptation to use __uint128_t here. + // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than + // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because + // it's not inlined. +#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; - __asm__("divq %[v]" - : "=a"(result), "=d"(*r) - : [v] "r"(v), "a"(u0), "d"(u1) - ); - return result; -#elif defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) - __uint128_t n = ((__uint128_t)u1 << 64) | u0; - uint64_t result = (uint64_t)(n / v); - *r = (uint64_t)(n - result * (__uint128_t)v); + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); return result; #else // Code taken from Hacker's Delight: @@ -369,19 +364,19 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, // License permits inclusion here per: // http://www.hackersdelight.org/permissions.htm - const uint64_t b = (1ULL << 32); // Number base (32 bits) - uint64_t un1, un0; // Norm. dividend LSD's - uint64_t vn1, vn0; // Norm. divisor digits - uint64_t q1, q0; // Quotient digits - uint64_t un64, un21, un10; // Dividend digit pairs - uint64_t rhat; // A remainder - int32_t s; // Shift amount for norm + const uint64_t b = (1ULL << 32); // Number base (32 bits) + uint64_t un1, un0; // Norm. dividend LSD's + uint64_t vn1, vn0; // Norm. 
divisor digits + uint64_t q1, q0; // Quotient digits + uint64_t un64, un21, un10; // Dividend digit pairs + uint64_t rhat; // A remainder + int32_t s; // Shift amount for norm // If overflow, set rem. to an impossible value, // and return the largest possible quotient if (u1 >= v) { - *r = (uint64_t) -1; - return (uint64_t) -1; + *r = (uint64_t)-1; + return (uint64_t)-1; } // count leading zeros @@ -390,7 +385,7 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, // Normalize divisor v = v << s; un64 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; // Shift dividend left + un10 = u0 << s; // Shift dividend left } else { // Avoid undefined behavior of (u0 >> 64). // The behavior is undefined if the right operand is @@ -415,11 +410,10 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, while (q1 >= b || q1 * vn0 > b * rhat + un1) { q1 = q1 - 1; rhat = rhat + vn1; - if (rhat >= b) - break; + if (rhat >= b) break; } - // Multiply and subtract + // Multiply and subtract un21 = un64 * b + un1 - q1 * v; // Compute the second quotient digit @@ -429,8 +423,7 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, while (q0 >= b || q0 * vn0 > b * rhat + un0) { q0 = q0 - 1; rhat = rhat + vn1; - if (rhat >= b) - break; + if (rhat >= b) break; } *r = (un21 * b + un0 - q0 * v) >> s; @@ -445,8 +438,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign *u1 <<= shift; *u1 |= *u0 >> (64 - shift); *u0 <<= shift; - } - else if (signed_shift < 0) { + } else if (signed_shift < 0) { uint32_t shift = -signed_shift; *u0 >>= shift; *u0 |= *u1 << (64 - shift); @@ -455,9 +447,9 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign } // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. -static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { -#if defined(HAS_INT128_T) && \ - defined(HAS_INT128_DIV) +static uint64_t libdivide_128_div_128_to_64( + uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { +#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; __uint128_t vfull = v_hi; ufull = (ufull << 64) | u_lo; @@ -470,7 +462,10 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 #else // Adapted from "Unsigned Doubleword Division" in Hacker's Delight // We want to compute u / v - typedef struct { uint64_t hi; uint64_t lo; } u128_t; + typedef struct { + uint64_t hi; + uint64_t lo; + } u128_t; u128_t u = {u_hi, u_lo}; u128_t v = {v_hi, v_lo}; @@ -490,7 +485,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Normalize the divisor so its MSB is 1 u128_t v1t = v; libdivide_u128_shift(&v1t.hi, &v1t.lo, n); - uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 + uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 // To ensure no overflow u128_t u1 = u; @@ -508,7 +503,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Make q0 correct or too small by 1 // Equivalent to `if (q0 != 0) q0 = q0 - 1;` if (q0.hi != 0 || q0.lo != 0) { - q0.hi -= (q0.lo == 0); // borrow + q0.hi -= (q0.lo == 0); // borrow q0.lo -= 1; } @@ -520,22 +515,21 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Each term is 128 bit // High half of full product (upper 128 bits!) 
are dropped u128_t q0v = {0, 0}; - q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); - q0v.lo = q0.lo*v.lo; + q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); + q0v.lo = q0.lo * v.lo; // Compute u - q0v as u_q0v // This is the remainder u128_t u_q0v = u; - u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow + u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow u_q0v.lo -= q0v.lo; // Check if u_q0v >= v // This checks if our remainder is larger than the divisor - if ((u_q0v.hi > v.hi) || - (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { + if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { // Increment q0 q0.lo += 1; - q0.hi += (q0.lo == 0); // carry + q0.hi += (q0.lo == 0); // carry // Subtract v from remainder u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); @@ -611,7 +605,8 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); - struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; + struct libdivide_u32_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; return ret; } @@ -619,14 +614,12 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); - } - else { + } else { // All upper bits are 0, // don't need to mask them off. return q >> more; @@ -634,7 +627,8 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { } } -uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { +uint32_t libdivide_u32_branchfree_do( + uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -671,7 +665,7 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; @@ -700,7 +694,7 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits - uint32_t full_q = half_q + half_q + ((rem<<1) >= d); + uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; @@ -747,7 +741,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; } result.magic = 1 + proposed_m; result.more = more; @@ -770,7 +764,8 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); - struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; + struct libdivide_u64_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; return ret; } @@ -778,22 +773,21 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; - } - else { + } else { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); - } - else { - // All upper bits are 0, - // don't need to mask them off. + } else { + // All upper bits are 0, + // don't need to mask them off. return q >> more; } } } -uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { +uint64_t libdivide_u64_branchfree_do( + uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -829,13 +823,14 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; @@ -863,13 +858,14 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; - uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); + uint64_t half_q = + libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; - uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry + uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; @@ -1023,8 +1019,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { // the magic number's sign is opposite that of the divisor. // We want to compute the positive magic number. int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { @@ -1033,7 +1028,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { } uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; @@ -1126,7 +1121,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (!denom->magic) { // shift path + if (!denom->magic) { // shift path uint64_t mask = (1ULL << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; @@ -1178,7 +1173,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (denom->magic == 0) { // shift path + if (denom->magic == 0) { // shift path uint64_t absD = 1ULL << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; @@ -1187,8 +1182,7 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { } else { // Unsigned math is much easier int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); - int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) - ? denom->magic > 0 : denom->magic < 0; + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? 
-denom->magic : denom->magic); uint64_t n_hi = 1ULL << shift, n_lo = 0; @@ -1206,30 +1200,305 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t return libdivide_s64_recover((const struct libdivide_s64_t *)denom); } -#if defined(LIBDIVIDE_AVX512) +#if defined(LIBDIVIDE_NEON) -static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom); -static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom); -static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom); -static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom); +static inline uint32x4_t libdivide_u32_do_vec128( + uint32x4_t numers, const struct libdivide_u32_t *denom); +static inline int32x4_t libdivide_s32_do_vec128( + int32x4_t numers, const struct libdivide_s32_t *denom); +static inline uint64x2_t libdivide_u64_do_vec128( + uint64x2_t numers, const struct libdivide_u64_t *denom); +static inline int64x2_t libdivide_s64_do_vec128( + int64x2_t numers, const struct libdivide_s64_t *denom); -static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom); +static inline uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom); +static inline int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom); +static inline uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom); +static inline int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions -static inline __m512i libdivide_s64_signbits(__m512i v) {; +// Logical right shift by runtime value. +// NEON implements right shift as left shifts by negative values. +static inline uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { + int32_t wamt = static_cast<int32_t>(amt); + return vshlq_u32(v, vdupq_n_s32(-wamt)); +} + +static inline uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { + int64_t wamt = static_cast<int64_t>(amt); + return vshlq_u64(v, vdupq_n_s64(-wamt)); +} + +// Arithmetic right shift by runtime value.
+static inline int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { + int32_t wamt = static_cast<int32_t>(amt); + return vshlq_s32(v, vdupq_n_s32(-wamt)); +} + +static inline int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { + int64_t wamt = static_cast<int64_t>(amt); + return vshlq_s64(v, vdupq_n_s64(-wamt)); +} + +static inline int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } + +static inline uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { + // Desire is [x0, x1, x2, x3] + uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] + uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] + return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] +} + +static inline int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { + int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] + int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] + return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] +} + +static inline uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. + + // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. + uint64x2_t y = vdupq_n_u64(sy); + uint32x2_t x0 = vmovn_u64(x); + uint32x2_t y0 = vmovn_u64(y); + uint32x2_t x1 = vshrn_n_u64(x, 32); + uint32x2_t y1 = vshrn_n_u64(y, 32); + + // Compute x0*y0. + uint64x2_t x0y0 = vmull_u32(x0, y0); + uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); + + // Compute other intermediate products. + uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; + // We want to split temp into its low 32 bits and high 32 bits, both + // in the low half of 64 bit registers. + // Use shifts to avoid needing a reg for the mask. + uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; + uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; + + temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y1 + temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 + temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 + uint64x2_t result = vaddq_u64(temp_hi, temp_lo); + return result; +} + +static inline int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { + int64x2_t p = vreinterpretq_s64_u64( + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), static_cast<uint64_t>(sy))); + int64x2_t y = vdupq_n_s64(sy); + int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); + int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); + p = vsubq_s64(p, t1); + p = vsubq_s64(p, t2); + return p; +} + +////////// UINT32 + +uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u32_neon_srl(numers, more); + } else { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // Note we can use halving-subtract to avoid the shift.
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, shift); + } else { + return libdivide_u32_neon_srl(q, more); + } + } +} + +uint32x4_t libdivide_u32_branchfree_do_vec128( + uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { + uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); + uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); + return libdivide_u32_neon_srl(t, denom->more); +} + +////////// UINT64 + +uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + return libdivide_u64_neon_srl(numers, more); + } else { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + // No 64-bit halving subtracts in NEON :( + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, shift); + } else { + return libdivide_u64_neon_srl(q, more); + } + } +} + +uint64x2_t libdivide_u64_branchfree_do_vec128( + uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { + uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); + uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); + return libdivide_u64_neon_srl(t, denom->more); +} + +////////// SINT32 + +int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (!denom->magic) { + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); + q = libdivide_s32_neon_sra(q, shift); + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = vsubq_s32(veorq_s32(q, sign), sign); + return q; + } else { + int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); + } + // q >>= shift + q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); + q = vaddq_s32( + q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) + return q; + } +} + +int32x4_t libdivide_s32_branchfree_do_vec128( + int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); + int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); + q = vaddq_s32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 + int32x4_t mask = vdupq_n_s32((1U << shift) - is_power_of_2); + q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s32_neon_sra(q, shift); // q >>= shift + q = vsubq_s32(veorq_s32(q, sign), sign); // q = 
(q ^ sign) - sign + return q; +} + +////////// SINT64 + +int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { + uint8_t more = denom->more; + int64_t magic = denom->magic; + if (magic == 0) { // shift path + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend + // q = numer + ((numer >> 63) & roundToZeroTweak); + int64x2_t q = + vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_neon_sra(q, shift); + // q = (q ^ sign) - sign; + int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); + q = vsubq_s64(veorq_s64(q, sign), sign); + return q; + } else { + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen + // q += ((numer ^ sign) - sign); + q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); + } + // q >>= denom->mult_path.shift + q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = vaddq_s64( + q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) + return q; + } +} + +int64x2_t libdivide_s64_branchfree_do_vec128( + int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { + int64_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + // must be arithmetic shift + int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend + + // libdivide_mullhi_s64(numers, magic); + int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); + q = vaddq_s64(q, numers); // q += numers + + // If q is non-negative, we have nothing to do. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
+ uint32_t is_power_of_2 = (magic == 0); + int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + int64x2_t mask = vdupq_n_s64((1ULL << shift) - is_power_of_2); + q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_neon_sra(q, shift); // q >>= shift + q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +#endif + +#if defined(LIBDIVIDE_AVX512) + +static inline __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom); +static inline __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom); +static inline __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom); +static inline __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom); + +static inline __m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static inline __m512i libdivide_s64_signbits(__m512i v) { + ; return _mm512_srai_epi64(v, 63); } -static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) { +static inline __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { +static inline __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1238,7 +1507,7 @@ static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { } // b is one 32-bit value repeated. -static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { +static inline __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1247,30 +1516,31 @@ static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { } // Here, y is assumed to contain one 64-bit value repeated. 
-// https://stackoverflow.com/a/28827013 -static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) { - __m512i lomask = _mm512_set1_epi64(0xffffffff); - __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1); - __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1); - __m512i w0 = _mm512_mul_epu32(x, y); - __m512i w1 = _mm512_mul_epu32(x, yh); - __m512i w2 = _mm512_mul_epu32(xh, y); - __m512i w3 = _mm512_mul_epu32(xh, yh); - __m512i w0h = _mm512_srli_epi64(w0, 32); - __m512i s1 = _mm512_add_epi64(w1, w0h); - __m512i s1l = _mm512_and_si512(s1, lomask); - __m512i s1h = _mm512_srli_epi64(s1, 32); - __m512i s2 = _mm512_add_epi64(w2, s1l); - __m512i s2h = _mm512_srli_epi64(s2, 32); - __m512i hi = _mm512_add_epi64(w3, s1h); - hi = _mm512_add_epi64(hi, s2h); +static inline __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { + // see m128i variant for comments. + __m512i x0y0 = _mm512_mul_epu32(x, y); + __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); - return hi; + __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); + + __m512i x0y1 = _mm512_mul_epu32(x, y1); + __m512i x1y0 = _mm512_mul_epu32(x1, y); + __m512i x1y1 = _mm512_mul_epu32(x1, y1); + + __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); + __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); + __m512i temp_lo = _mm512_and_si512(temp, mask); + __m512i temp_hi = _mm512_srli_epi64(temp, 32); + + temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm512_add_epi64(x1y1, temp_hi); + return _mm512_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. -static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { - __m512i p = libdivide_mullhi_u64_vector(x, y); +static inline __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { + __m512i p = libdivide_mullhi_u64_vec512(x, y); __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); p = _mm512_sub_epi64(p, t1); @@ -1280,131 +1550,130 @@ static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { ////////// UINT32 -__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) { +__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi32(numers, more); - } - else { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, shift); - } - else { + } else { return _mm512_srli_epi32(q, more); } } } -__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); +__m512i libdivide_u32_branchfree_do_vec512( + __m512i numers, const struct libdivide_u32_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, denom->more); } ////////// UINT64 -__m512i 
libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) { +__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi64(numers, more); - } - else { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); + } else { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, shift); - } - else { + } else { return _mm512_srli_epi64(q, more); } } } -__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) { - __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); +__m512i libdivide_u64_branchfree_do_vec512( + __m512i numers, const struct libdivide_u64_branchfree_t *denom) { + __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, denom->more); } ////////// SINT32 -__m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom) { +__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); + __m512i q = _mm512_add_epi32( + numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm512_srai_epi32(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic)); + } else { + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); } // q >>= shift q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) + q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) { +__m512i libdivide_s32_branchfree_do_vec512( + __m512i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic)); - q = _mm512_add_epi32(q, numers); // q += numers + __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); + q = _mm512_add_epi32(q, numers); // q += numers // If q is 
non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 + __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2); - q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm512_srai_epi32(q, shift); // q >>= shift - q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm512_srai_epi32(q, shift); // q >>= shift + q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) { +__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m512i q = _mm512_add_epi64( + numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec512(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); return q; - } - else { - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); + } else { + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); @@ -1412,46 +1681,53 @@ __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *de q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) { +__m512i libdivide_s64_branchfree_do_vec512( + __m512i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); - q = _mm512_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); + q = _mm512_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. 
// If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); - q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign + q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift + q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_AVX2) +#endif -static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); -static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); -static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); -static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_AVX2) -static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); +static inline __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom); +static inline __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom); +static inline __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom); +static inline __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom); + +static inline __m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions @@ -1463,7 +1739,7 @@ static inline __m256i libdivide_s64_signbits(__m256i v) { } // Implementation of _mm256_srai_epi64 (from AVX512). -static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { +static inline __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); @@ -1472,7 +1748,7 @@ static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { } // Here, b is assumed to contain one 32-bit value repeated. 
-static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { +static inline __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1481,7 +1757,7 @@ static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { } // b is one 32-bit value repeated. -static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { +static inline __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1490,30 +1766,31 @@ static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { } // Here, y is assumed to contain one 64-bit value repeated. -// https://stackoverflow.com/a/28827013 -static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) { - __m256i lomask = _mm256_set1_epi64x(0xffffffff); - __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m256i w0h = _mm256_srli_epi64(w0, 32); - __m256i s1 = _mm256_add_epi64(w1, w0h); - __m256i s1l = _mm256_and_si256(s1, lomask); - __m256i s1h = _mm256_srli_epi64(s1, 32); - __m256i s2 = _mm256_add_epi64(w2, s1l); - __m256i s2h = _mm256_srli_epi64(s2, 32); - __m256i hi = _mm256_add_epi64(w3, s1h); - hi = _mm256_add_epi64(hi, s2h); +static inline __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { + // see m128i variant for comments. + __m256i x0y0 = _mm256_mul_epu32(x, y); + __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); - return hi; + __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + __m256i x0y1 = _mm256_mul_epu32(x, y1); + __m256i x1y0 = _mm256_mul_epu32(x1, y); + __m256i x1y1 = _mm256_mul_epu32(x1, y1); + + __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); + __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); + __m256i temp_lo = _mm256_and_si256(temp, mask); + __m256i temp_hi = _mm256_srli_epi64(temp, 32); + + temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm256_add_epi64(x1y1, temp_hi); + return _mm256_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { - __m256i p = libdivide_mullhi_u64_vector(x, y); +static inline __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { + __m256i p = libdivide_mullhi_u64_vec256(x, y); __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); p = _mm256_sub_epi64(p, t1); @@ -1523,131 +1800,130 @@ static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { ////////// UINT32 -__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) { +__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi32(numers, more); - } - else { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, shift); - } - else { + } else { return _mm256_srli_epi32(q, more); } } } -__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); +__m256i libdivide_u32_branchfree_do_vec256( + __m256i numers, const struct libdivide_u32_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, denom->more); } ////////// UINT64 -__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) { +__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi64(numers, more); - } - else { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); + } else { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, shift); - } - else { + } else { return _mm256_srli_epi64(q, more); } } } -__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) { - __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); +__m256i libdivide_u64_branchfree_do_vec256( + __m256i numers, const struct libdivide_u64_branchfree_t *denom) { + __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, denom->more); } ////////// SINT32 -__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) { +__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = 
numer + ((numer >> 31) & roundToZeroTweak); - __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); + __m256i q = _mm256_add_epi32( + numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm256_srai_epi32(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic)); + } else { + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) + q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom) { +__m256i libdivide_s32_branchfree_do_vec256( + __m256i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic)); - q = _mm256_add_epi32(q, numers); // q += numers + __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); + q = _mm256_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 + __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); - q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm256_srai_epi32(q, shift); // q >>= shift - q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi32(q, shift); // q >>= shift + q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) { +__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m256i q = _mm256_add_epi64( + numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = 
_mm256_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); return q; - } - else { - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); + } else { + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); @@ -1655,46 +1931,53 @@ __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *de q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) { +__m256i libdivide_s64_branchfree_do_vec256( + __m256i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); - q = _mm256_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); + q = _mm256_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign + q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift + q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } -#elif defined(LIBDIVIDE_SSE2) +#endif -static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); -static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); -static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); -static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); +#if defined(LIBDIVIDE_SSE2) -static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); +static inline __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom); +static inline __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom); +static inline __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom); +static inline __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom); + +static inline __m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom); +static inline __m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom); +static inline __m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom); +static inline __m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions @@ -1706,7 +1989,7 @@ static inline __m128i libdivide_s64_signbits(__m128i v) { } // Implementation of _mm_srai_epi64 (from AVX512). -static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { +static inline __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); @@ -1715,7 +1998,7 @@ static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { } // Here, b is assumed to contain one 32-bit value repeated. 
-static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { +static inline __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); @@ -1726,8 +2009,8 @@ static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. -static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { - __m128i p = libdivide_mullhi_u32_vector(a, b); +static inline __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); @@ -1737,30 +2020,41 @@ static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { } // Here, y is assumed to contain one 64-bit value repeated. -// https://stackoverflow.com/a/28827013 -static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) { - __m128i lomask = _mm_set1_epi64x(0xffffffff); - __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h - __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h - __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l - __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h - __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l - __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h - __m128i w0h = _mm_srli_epi64(w0, 32); - __m128i s1 = _mm_add_epi64(w1, w0h); - __m128i s1l = _mm_and_si128(s1, lomask); - __m128i s1h = _mm_srli_epi64(s1, 32); - __m128i s2 = _mm_add_epi64(w2, s1l); - __m128i s2h = _mm_srli_epi64(s2, 32); - __m128i hi = _mm_add_epi64(w3, s1h); - hi = _mm_add_epi64(hi, s2h); +static inline __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { + // full 128 bits product is: + // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) + // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. - return hi; + // Compute x0*y0. + // Note x1, y1 are ignored by mul_epu32. + __m128i x0y0 = _mm_mul_epu32(x, y); + __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); + + // Get x1, y1 in the low bits. + // We could shuffle or right shift. Shuffles are preferred as they preserve + // the source register for the next computation. + __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); + + // No need to mask off top 32 bits for mul_epu32. + __m128i x0y1 = _mm_mul_epu32(x, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + + // Mask here selects low bits only. + __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + return _mm_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. 
-static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { - __m128i p = libdivide_mullhi_u64_vector(x, y); +static inline __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_vec128(x, y); __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); p = _mm_sub_epi64(p, t1); @@ -1770,131 +2064,130 @@ static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { ////////// UINT32 -__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { +__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi32(numers, more); - } - else { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, shift); - } - else { + } else { return _mm_srli_epi32(q, more); } } } -__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); +__m128i libdivide_u32_branchfree_do_vec128( + __m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, denom->more); } ////////// UINT64 -__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { +__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi64(numers, more); - } - else { - __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); + } else { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, shift); - } - else { + } else { return _mm_srli_epi64(q, more); } } } -__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); +__m128i libdivide_u64_branchfree_do_vec128( + __m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, denom->more); } ////////// SINT32 -__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { +__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); - __m128i q = _mm_add_epi32(numers, 
_mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + __m128i q = + _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm_srai_epi32(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic)); + } else { + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q += ((numer ^ sign) - sign); + // q += ((numer ^ sign) - sign); q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) return q; } } -__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { +__m128i libdivide_s32_branchfree_do_vec128( + __m128i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift + // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic)); - q = _mm_add_epi32(q, numers); // q += numers + __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); - q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm_srai_epi32(q, shift); // q >>= shift - q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 -__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { +__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; - if (magic == 0) { // shift path + if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shift); + __m128i q = + _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // q = (q ^ sign) - sign; + // q = (q ^ sign) - sign; q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); return q; - } - else { - __m128i q = libdivide_mullhi_s64_vector(numers, 
_mm_set1_epi64x(magic)); + } else { + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); @@ -1902,32 +2195,33 @@ __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *de q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift - q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); - q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) + q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); + q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) return q; } } -__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) { +__m128i libdivide_s64_branchfree_do_vec128( + __m128i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); - // libdivide_mullhi_s64(numers, magic); - __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); - q = _mm_add_epi64(q, numers); // q += numers + // libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); + q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 + __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); - q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift - q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift + q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } @@ -1937,143 +2231,273 @@ __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivid #ifdef __cplusplus -// The C++ divider class is templated on both an integer type -// (like uint64_t) and an algorithm type. -// * BRANCHFULL is the default algorithm type. -// * BRANCHFREE is the branchfree algorithm type. -enum { - BRANCHFULL, - BRANCHFREE +enum Branching { + BRANCHFULL, // use branching algorithms + BRANCHFREE // use branchfree algorithms }; -#if defined(LIBDIVIDE_AVX512) - #define LIBDIVIDE_VECTOR_TYPE __m512i -#elif defined(LIBDIVIDE_AVX2) - #define LIBDIVIDE_VECTOR_TYPE __m256i -#elif defined(LIBDIVIDE_SSE2) - #define LIBDIVIDE_VECTOR_TYPE __m128i +#if defined(LIBDIVIDE_NEON) +// Helper to deduce NEON vector type for integral type. +template +struct NeonVecFor {}; + +template <> +struct NeonVecFor { + typedef uint32x4_t type; +}; + +template <> +struct NeonVecFor { + typedef int32x4_t type; +}; + +template <> +struct NeonVecFor { + typedef uint64x2_t type; +}; + +template <> +struct NeonVecFor { + typedef int64x2_t type; +}; #endif -#if !defined(LIBDIVIDE_VECTOR_TYPE) - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) +// Versions of our algorithms for SIMD. 
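As a cross-check on the branchfree signed routines above, the following scalar sketch spells out the same per-lane adjustment. It is an illustration only, written under the assumption of arithmetic right shifts on signed values and the GCC/Clang __int128 extension; the helper names are invented and are not part of the library.

#include <cstdint>

// High 64 bits of a signed 64x64 multiply; __int128 is used purely to keep
// this sketch short.
static inline int64_t mullhi_s64_sketch(int64_t x, int64_t y) {
    return (int64_t)(((__int128)x * y) >> 64);
}

// Scalar rendering of the branchfree signed path applied lane-wise by
// libdivide_s64_branchfree_do_vec128 above.
static inline int64_t s64_branchfree_sketch(int64_t numer, int64_t magic, uint8_t more) {
    uint8_t shift = more & 0x3F;                  // LIBDIVIDE_64_SHIFT_MASK
    int64_t sign = (int8_t)more >> 7;             // all-ones when the divisor is negative
    int64_t q = mullhi_s64_sketch(numer, magic);  // high half of numer * magic
    q += numer;
    // When q is negative, add (2**shift)-1 for power-of-two divisors
    // (magic == 0) or 2**shift otherwise, so the shift rounds toward zero.
    uint64_t is_power_of_2 = (magic == 0);
    uint64_t q_sign = (uint64_t)(q >> 63);        // all-ones when q < 0
    q += (int64_t)(q_sign & ((UINT64_C(1) << shift) - is_power_of_2));
    q >>= shift;                                  // arithmetic shift assumed
    q = (q ^ sign) - sign;                        // negate for negative divisors
    return q;
}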
+#if defined(LIBDIVIDE_NEON) +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + typename NeonVecFor::type divide(typename NeonVecFor::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } #else - #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \ - return libdivide_##ALGO##_do_vector(n, &denom); \ - } +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) +#endif +#if defined(LIBDIVIDE_SSE2) +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + __m128i divide(__m128i n) const { return libdivide_##ALGO##_do_vec128(n, &denom); } +#else +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX2) +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + __m256i divide(__m256i n) const { return libdivide_##ALGO##_do_vec256(n, &denom); } +#else +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) +#endif + +#if defined(LIBDIVIDE_AVX512) +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + __m512i divide(__m512i n) const { return libdivide_##ALGO##_do_vec512(n, &denom); } +#else +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to libdivide's C API. -#define DISPATCHER_GEN(T, ALGO) \ - libdivide_##ALGO##_t denom; \ - dispatcher() { } \ - dispatcher(T d) \ - : denom(libdivide_##ALGO##_gen(d)) \ - { } \ - T divide(T n) const { \ - return libdivide_##ALGO##_do(n, &denom); \ - } \ - LIBDIVIDE_DIVIDE_VECTOR(ALGO) \ - T recover() const { \ - return libdivide_##ALGO##_recover(&denom); \ - } +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + dispatcher() {} \ + dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given // type and ALGO using partial template specialization. -template struct dispatcher { }; +template +struct dispatcher {}; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32) }; -template<> struct dispatcher { DISPATCHER_GEN(int32_t, s32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32) }; -template<> struct dispatcher { DISPATCHER_GEN(uint32_t, u32_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64) }; -template<> struct dispatcher { DISPATCHER_GEN(int64_t, s64_branchfree) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64) }; -template<> struct dispatcher { DISPATCHER_GEN(uint64_t, u64_branchfree) }; +template <> +struct dispatcher { + DISPATCHER_GEN(int32_t, s32) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(int32_t, s32_branchfree) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(uint32_t, u32) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(uint32_t, u32_branchfree) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(int64_t, s64) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(int64_t, s64_branchfree) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(uint64_t, u64) +}; +template <> +struct dispatcher { + DISPATCHER_GEN(uint64_t, u64_branchfree) +}; // This is the main divider class for use by the user (C++ API). // The actual division algorithm is selected using the dispatcher struct // based on the integer and algorithm template parameters. 
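For orientation before the class that follows, this is roughly what DISPATCHER_GEN(uint32_t, u32) expands to inside the matching dispatcher specialization. It is an illustrative reconstruction of the macro shown above, not a verbatim copy; the real expansion also emits the NEON/SSE2/AVX2/AVX512 divide overloads.

// Illustrative expansion of DISPATCHER_GEN(uint32_t, u32):
libdivide_u32_t denom;                                   // precomputed magic + shift
dispatcher() {}
dispatcher(uint32_t d) : denom(libdivide_u32_gen(d)) {}  // compute magic once
uint32_t divide(uint32_t n) const { return libdivide_u32_do(n, &denom); }
uint32_t recover() const { return libdivide_u32_recover(&denom); }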
-template +template class divider { -public: + public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them // later doesn't slow us down. - divider() { } + divider() {} // Constructor that takes the divisor as a parameter - divider(T d) : div(d) { } + divider(T d) : div(d) {} // Divides n by the divisor - T divide(T n) const { - return div.divide(n); - } + T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. - T recover() const { - return div.recover(); + T recover() const { return div.recover(); } + + bool operator==(const divider &other) const { + return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; } - bool operator==(const divider& other) const { - return div.denom.magic == other.denom.magic && - div.denom.more == other.denom.more; - } + bool operator!=(const divider &other) const { return !(*this == other); } - bool operator!=(const divider& other) const { - return !(*this == other); - } - -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Treats the vector as packed integer values with the same type as - // the divider (e.g. s32, u32, s64, u64) and divides each of - // them by the divider, returning the packed quotients. - LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { + // Vector variants treat the input as packed integer values with the same type as the divider + // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed + // quotients. +#if defined(LIBDIVIDE_SSE2) + __m128i divide(__m128i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX2) + __m256i divide(__m256i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_AVX512) + __m512i divide(__m512i n) const { return div.divide(n); } +#endif +#if defined(LIBDIVIDE_NEON) + typename NeonVecFor::type divide(typename NeonVecFor::type n) const { return div.divide(n); } #endif -private: + private: // Storage for the actual divisor - dispatcher::value, - std::is_signed::value, sizeof(T), ALGO> div; + dispatcher::value, std::is_signed::value, sizeof(T), ALGO> div; }; // Overload of operator / for scalar division -template -T operator/(T n, const divider& div) { +template +T operator/(T n, const divider &div) { return div.divide(n); } // Overload of operator /= for scalar division -template -T& operator/=(T& n, const divider& div) { +template +T &operator/=(T &n, const divider &div) { n = div.divide(n); return n; } -#if defined(LIBDIVIDE_VECTOR_TYPE) - // Overload of operator / for vector division - template - LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider& div) { - return div.divide(n); - } - // Overload of operator /= for vector division - template - LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider& div) { - n = div.divide(n); - return n; - } +// Overloads for vector types. 
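A minimal usage sketch, assuming the header is included as "libdivide.h" and SSE2 is enabled at compile time, showing how the scalar and vector operator overloads below are meant to be called; the function names here are made up for illustration.

#include <cstddef>
#include <cstdint>
#include "libdivide.h"

// Divide every element by the same runtime divisor d. The divider
// precomputes the magic number once; operator/ then reuses it per element.
void divide_all(uint64_t *data, size_t n, uint64_t d) {
    libdivide::divider<uint64_t> fast_d(d);
    for (size_t i = 0; i < n; ++i)
        data[i] = data[i] / fast_d;
}

#if defined(LIBDIVIDE_SSE2)
// Same idea with the SSE2 overload: both 64-bit lanes are divided at once.
__m128i divide_lanes(__m128i lanes, const libdivide::divider<uint64_t> &fast_d) {
    return lanes / fast_d;
}
#endif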
+#if defined(LIBDIVIDE_SSE2) +template +__m128i operator/(__m128i n, const divider &div) { + return div.divide(n); +} + +template +__m128i operator/=(__m128i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX2) +template +__m256i operator/(__m256i n, const divider &div) { + return div.divide(n); +} + +template +__m256i operator/=(__m256i &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif +#if defined(LIBDIVIDE_AVX512) +template +__m512i operator/(__m512i n, const divider &div) { + return div.divide(n); +} + +template +__m512i operator/=(__m512i &n, const divider &div) { + n = div.divide(n); + return n; +} #endif -// libdivdie::branchfree_divider +#if defined(LIBDIVIDE_NEON) +template +uint32x4_t operator/(uint32x4_t n, const divider &div) { + return div.divide(n); +} + +template +int32x4_t operator/(int32x4_t n, const divider &div) { + return div.divide(n); +} + +template +uint64x2_t operator/(uint64x2_t n, const divider &div) { + return div.divide(n); +} + +template +int64x2_t operator/(int64x2_t n, const divider &div) { + return div.divide(n); +} + +template +uint32x4_t operator/=(uint32x4_t &n, const divider &div) { + n = div.divide(n); + return n; +} + +template +int32x4_t operator/=(int32x4_t &n, const divider &div) { + n = div.divide(n); + return n; +} + +template +uint64x2_t operator/=(uint64x2_t &n, const divider &div) { + n = div.divide(n); + return n; +} + +template +int64x2_t operator/=(int64x2_t &n, const divider &div) { + n = div.divide(n); + return n; +} +#endif + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +// libdivide::branchfree_divider template using branchfree_divider = divider; +#endif -} // namespace libdivide +} // namespace libdivide -#endif // __cplusplus +#endif // __cplusplus -#endif // LIBDIVIDE_H +#endif // LIBDIVIDE_H diff --git a/contrib/libpq b/contrib/libpq index 1f9c286dba6..c7624588ddd 160000 --- a/contrib/libpq +++ b/contrib/libpq @@ -1 +1 @@ -Subproject commit 1f9c286dba60809edb64e384d6727d80d269b6cf +Subproject commit c7624588ddd84f153dd5990e81b886e4568bddde diff --git a/contrib/librdkafka-cmake/config.h.in b/contrib/librdkafka-cmake/config.h.in index 80b6ea61b6e..9fecb45e42d 100644 --- a/contrib/librdkafka-cmake/config.h.in +++ b/contrib/librdkafka-cmake/config.h.in @@ -66,7 +66,7 @@ #cmakedefine WITH_SASL_OAUTHBEARER 1 #cmakedefine WITH_SASL_CYRUS 1 // crc32chw -#if !defined(__PPC__) && (!defined(__aarch64__) || defined(__ARM_FEATURE_CRC32)) +#if !defined(__PPC__) && (!defined(__aarch64__) || defined(__ARM_FEATURE_CRC32)) && !(defined(__aarch64__) && defined(__APPLE__)) #define WITH_CRC32C_HW 1 #endif // regex @@ -75,6 +75,8 @@ #define HAVE_STRNDUP 1 // strerror_r #define HAVE_STRERROR_R 1 +// rand_r +#define HAVE_RAND_R 1 #ifdef __APPLE__ // pthread_setname_np diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 21f451d4d31..5f4034a3a63 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 21f451d4d3157ffed31ec60a8b76c407190e66bd +Subproject commit 5f4034a3a6376416504f17186c55fe401c6d8e5e diff --git a/contrib/nanodbc b/contrib/nanodbc new file mode 160000 index 00000000000..9fc45967551 --- /dev/null +++ b/contrib/nanodbc @@ -0,0 +1 @@ +Subproject commit 9fc459675515d491401727ec67fca38db721f28c diff --git a/contrib/nanodbc-cmake/CMakeLists.txt b/contrib/nanodbc-cmake/CMakeLists.txt new file mode 100644 index 00000000000..1673b311c49 --- /dev/null +++ 
b/contrib/nanodbc-cmake/CMakeLists.txt @@ -0,0 +1,18 @@ +if (NOT USE_INTERNAL_NANODBC_LIBRARY) + return () +endif () + +set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/nanodbc) + +if (NOT TARGET unixodbc) + message(FATAL_ERROR "Configuration error: unixodbc is not a target") +endif() + +set (SRCS + ${LIBRARY_DIR}/nanodbc/nanodbc.cpp +) + +add_library(nanodbc ${SRCS}) + +target_link_libraries (nanodbc PUBLIC unixodbc) +target_include_directories (nanodbc SYSTEM PUBLIC ${LIBRARY_DIR}/) diff --git a/contrib/openldap-cmake/darwin_aarch64/include/lber_types.h b/contrib/openldap-cmake/darwin_aarch64/include/lber_types.h new file mode 100644 index 00000000000..dbd59430527 --- /dev/null +++ b/contrib/openldap-cmake/darwin_aarch64/include/lber_types.h @@ -0,0 +1,63 @@ +/* include/lber_types.h. Generated from lber_types.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * LBER types + */ + +#ifndef _LBER_TYPES_H +#define _LBER_TYPES_H + +#include + +LDAP_BEGIN_DECL + +/* LBER boolean, enum, integers (32 bits or larger) */ +#define LBER_INT_T int + +/* LBER tags (32 bits or larger) */ +#define LBER_TAG_T long + +/* LBER socket descriptor */ +#define LBER_SOCKET_T int + +/* LBER lengths (32 bits or larger) */ +#define LBER_LEN_T long + +/* ------------------------------------------------------------ */ + +/* booleans, enumerations, and integers */ +typedef LBER_INT_T ber_int_t; + +/* signed and unsigned versions */ +typedef signed LBER_INT_T ber_sint_t; +typedef unsigned LBER_INT_T ber_uint_t; + +/* tags */ +typedef unsigned LBER_TAG_T ber_tag_t; + +/* "socket" descriptors */ +typedef LBER_SOCKET_T ber_socket_t; + +/* lengths */ +typedef unsigned LBER_LEN_T ber_len_t; + +/* signed lengths */ +typedef signed LBER_LEN_T ber_slen_t; + +LDAP_END_DECL + +#endif /* _LBER_TYPES_H */ diff --git a/contrib/openldap-cmake/darwin_aarch64/include/ldap_config.h b/contrib/openldap-cmake/darwin_aarch64/include/ldap_config.h new file mode 100644 index 00000000000..89f7b40b884 --- /dev/null +++ b/contrib/openldap-cmake/darwin_aarch64/include/ldap_config.h @@ -0,0 +1,74 @@ +/* include/ldap_config.h. Generated from ldap_config.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * This file works in conjunction with OpenLDAP configure system. + * If you do no like the values below, adjust your configure options. + */ + +#ifndef _LDAP_CONFIG_H +#define _LDAP_CONFIG_H + +/* directory separator */ +#ifndef LDAP_DIRSEP +#ifndef _WIN32 +#define LDAP_DIRSEP "/" +#else +#define LDAP_DIRSEP "\\" +#endif +#endif + +/* directory for temporary files */ +#if defined(_WIN32) +# define LDAP_TMPDIR "C:\\." 
/* we don't have much of a choice */ +#elif defined( _P_tmpdir ) +# define LDAP_TMPDIR _P_tmpdir +#elif defined( P_tmpdir ) +# define LDAP_TMPDIR P_tmpdir +#elif defined( _PATH_TMPDIR ) +# define LDAP_TMPDIR _PATH_TMPDIR +#else +# define LDAP_TMPDIR LDAP_DIRSEP "tmp" +#endif + +/* directories */ +#ifndef LDAP_BINDIR +#define LDAP_BINDIR "/tmp/ldap-prefix/bin" +#endif +#ifndef LDAP_SBINDIR +#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin" +#endif +#ifndef LDAP_DATADIR +#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap" +#endif +#ifndef LDAP_SYSCONFDIR +#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap" +#endif +#ifndef LDAP_LIBEXECDIR +#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec" +#endif +#ifndef LDAP_MODULEDIR +#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap" +#endif +#ifndef LDAP_RUNDIR +#define LDAP_RUNDIR "/tmp/ldap-prefix/var" +#endif +#ifndef LDAP_LOCALEDIR +#define LDAP_LOCALEDIR "" +#endif + + +#endif /* _LDAP_CONFIG_H */ diff --git a/contrib/openldap-cmake/darwin_aarch64/include/ldap_features.h b/contrib/openldap-cmake/darwin_aarch64/include/ldap_features.h new file mode 100644 index 00000000000..f0cc7c3626f --- /dev/null +++ b/contrib/openldap-cmake/darwin_aarch64/include/ldap_features.h @@ -0,0 +1,61 @@ +/* include/ldap_features.h. Generated from ldap_features.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * LDAP Features + */ + +#ifndef _LDAP_FEATURES_H +#define _LDAP_FEATURES_H 1 + +/* OpenLDAP API version macros */ +#define LDAP_VENDOR_VERSION 20501 +#define LDAP_VENDOR_VERSION_MAJOR 2 +#define LDAP_VENDOR_VERSION_MINOR 5 +#define LDAP_VENDOR_VERSION_PATCH X + +/* +** WORK IN PROGRESS! +** +** OpenLDAP reentrancy/thread-safeness should be dynamically +** checked using ldap_get_option(). +** +** The -lldap implementation is not thread-safe. +** +** The -lldap_r implementation is: +** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety) +** but also be: +** LDAP_API_FEATURE_SESSION_THREAD_SAFE +** LDAP_API_FEATURE_OPERATION_THREAD_SAFE +** +** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE +** can be used to determine if -lldap_r is available at compile +** time. You must define LDAP_THREAD_SAFE if and only if you +** link with -lldap_r. +** +** If you fail to define LDAP_THREAD_SAFE when linking with +** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap, +** provided header definitions and declarations may be incorrect. +** +*/ + +/* is -lldap_r available or not */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* LDAP v2 Referrals */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +#endif /* LDAP_FEATURES */ diff --git a/contrib/openldap-cmake/darwin_aarch64/include/portable.h b/contrib/openldap-cmake/darwin_aarch64/include/portable.h new file mode 100644 index 00000000000..fdf4e89017e --- /dev/null +++ b/contrib/openldap-cmake/darwin_aarch64/include/portable.h @@ -0,0 +1,1169 @@ +/* include/portable.h. Generated from portable.hin by configure. */ +/* include/portable.hin. Generated from configure.in by autoheader. */ + + +/* begin of portable.h.pre */ +/* This work is part of OpenLDAP Software . 
+ * + * Copyright 1998-2020 The OpenLDAP Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _LDAP_PORTABLE_H +#define _LDAP_PORTABLE_H + +/* define this if needed to get reentrant functions */ +#ifndef REENTRANT +#define REENTRANT 1 +#endif +#ifndef _REENTRANT +#define _REENTRANT 1 +#endif + +/* define this if needed to get threadsafe functions */ +#ifndef THREADSAFE +#define THREADSAFE 1 +#endif +#ifndef _THREADSAFE +#define _THREADSAFE 1 +#endif +#ifndef THREAD_SAFE +#define THREAD_SAFE 1 +#endif +#ifndef _THREAD_SAFE +#define _THREAD_SAFE 1 +#endif + +#ifndef _SGI_MP_SOURCE +#define _SGI_MP_SOURCE 1 +#endif + +/* end of portable.h.pre */ + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* define to use both and */ +/* #undef BOTH_STRINGS_H */ + +/* define if cross compiling */ +/* #undef CROSS_COMPILING */ + +/* set to the number of arguments ctime_r() expects */ +#define CTIME_R_NARGS 2 + +/* define if toupper() requires islower() */ +/* #undef C_UPPER_LOWER */ + +/* define if sys_errlist is not declared in stdio.h or errno.h */ +/* #undef DECL_SYS_ERRLIST */ + +/* define to enable slapi library */ +/* #undef ENABLE_SLAPI */ + +/* defined to be the EXE extension */ +#define EXEEXT "" + +/* set to the number of arguments gethostbyaddr_r() expects */ +/* #undef GETHOSTBYADDR_R_NARGS */ + +/* set to the number of arguments gethostbyname_r() expects */ +/* #undef GETHOSTBYNAME_R_NARGS */ + +/* Define to 1 if `TIOCGWINSZ' requires . */ +/* #undef GWINSZ_IN_SYS_IOCTL */ + +/* define if you have AIX security lib */ +/* #undef HAVE_AIX_SECURITY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H 1 + +/* Define to 1 if you have the `bcopy' function. */ +#define HAVE_BCOPY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BITS_TYPES_H */ + +/* Define to 1 if you have the `chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `closesocket' function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* define if crypt(3) is available */ +/* #undef HAVE_CRYPT */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRYPT_H */ + +/* define if crypt_r() is also available */ +/* #undef HAVE_CRYPT_R */ + +/* Define to 1 if you have the `ctime_r' function. */ +#define HAVE_CTIME_R 1 + +/* define if you have Cyrus SASL */ +/* #undef HAVE_CYRUS_SASL */ + +/* define if your system supports /dev/poll */ +/* #undef HAVE_DEVPOLL */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +/* #undef HAVE_DOPRNT */ + +/* define if system uses EBCDIC instead of ASCII */ +/* #undef HAVE_EBCDIC */ + +/* Define to 1 if you have the `endgrent' function. 
*/ +#define HAVE_ENDGRENT 1 + +/* Define to 1 if you have the `endpwent' function. */ +#define HAVE_ENDPWENT 1 + +/* define if your system supports epoll */ +/* #undef HAVE_EPOLL */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `fcntl' function. */ +#define HAVE_FCNTL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* define if you actually have FreeBSD fetch(3) */ +/* #undef HAVE_FETCH */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_FILIO_H */ + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fstat' function. */ +#define HAVE_FSTAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `getaddrinfo' function. */ +#define HAVE_GETADDRINFO 1 + +/* Define to 1 if you have the `getdtablesize' function. */ +#define HAVE_GETDTABLESIZE 1 + +/* Define to 1 if you have the `geteuid' function. */ +#define HAVE_GETEUID 1 + +/* Define to 1 if you have the `getgrgid' function. */ +#define HAVE_GETGRGID 1 + +/* Define to 1 if you have the `gethostbyaddr_r' function. */ +/* #undef HAVE_GETHOSTBYADDR_R */ + +/* Define to 1 if you have the `gethostbyname_r' function. */ +/* #undef HAVE_GETHOSTBYNAME_R */ + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define to 1 if you have the `getopt' function. */ +#define HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getpassphrase' function. */ +/* #undef HAVE_GETPASSPHRASE */ + +/* Define to 1 if you have the `getpeereid' function. */ +#define HAVE_GETPEEREID 1 + +/* Define to 1 if you have the `getpeerucred' function. */ +/* #undef HAVE_GETPEERUCRED */ + +/* Define to 1 if you have the `getpwnam' function. */ +#define HAVE_GETPWNAM 1 + +/* Define to 1 if you have the `getpwuid' function. */ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `getspnam' function. */ +/* #undef HAVE_GETSPNAM */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GMP_H */ + +/* Define to 1 if you have the `gmtime_r' function. */ +#define HAVE_GMTIME_R 1 + +/* define if you have GNUtls */ +/* #undef HAVE_GNUTLS */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GNUTLS_GNUTLS_H */ + +/* if you have GNU Pth */ +/* #undef HAVE_GNU_PTH */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the `hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* define to you inet_aton(3) is available */ +#define HAVE_INET_ATON 1 + +/* Define to 1 if you have the `inet_ntoa_b' function. */ +/* #undef HAVE_INET_NTOA_B */ + +/* Define to 1 if you have the `inet_ntop' function. */ +#define HAVE_INET_NTOP 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `ioctl' function. */ +#define HAVE_IOCTL 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* define if your system supports kqueue */ +#define HAVE_KQUEUE 1 + +/* Define to 1 if you have the `gen' library (-lgen). 
*/ +/* #undef HAVE_LIBGEN */ + +/* Define to 1 if you have the `gmp' library (-lgmp). */ +/* #undef HAVE_LIBGMP */ + +/* Define to 1 if you have the `inet' library (-linet). */ +/* #undef HAVE_LIBINET */ + +/* define if you have libtool -ltdl */ +/* #undef HAVE_LIBLTDL */ + +/* Define to 1 if you have the `net' library (-lnet). */ +/* #undef HAVE_LIBNET */ + +/* Define to 1 if you have the `nsl' library (-lnsl). */ +/* #undef HAVE_LIBNSL */ + +/* Define to 1 if you have the `nsl_s' library (-lnsl_s). */ +/* #undef HAVE_LIBNSL_S */ + +/* Define to 1 if you have the `socket' library (-lsocket). */ +/* #undef HAVE_LIBSOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the `V3' library (-lV3). */ +/* #undef HAVE_LIBV3 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* if you have LinuxThreads */ +/* #undef HAVE_LINUX_THREADS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if you have the `localtime_r' function. */ +#define HAVE_LOCALTIME_R 1 + +/* Define to 1 if you have the `lockf' function. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MALLOC_H */ + +/* Define to 1 if you have the `memcpy' function. */ +#define HAVE_MEMCPY 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +/* #undef HAVE_MEMRCHR */ + +/* Define to 1 if you have the `mkstemp' function. */ +#define HAVE_MKSTEMP 1 + +/* Define to 1 if you have the `mktemp' function. */ +#define HAVE_MKTEMP 1 + +/* define this if you have mkversion */ +#define HAVE_MKVERSION 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* define if strerror_r returns char* instead of int */ +/* #undef HAVE_NONPOSIX_STRERROR_R */ + +/* if you have NT Event Log */ +/* #undef HAVE_NT_EVENT_LOG */ + +/* if you have NT Service Manager */ +/* #undef HAVE_NT_SERVICE_MANAGER */ + +/* if you have NT Threads */ +/* #undef HAVE_NT_THREADS */ + +/* define if you have OpenSSL */ +#define HAVE_OPENSSL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_BN_H 1 + +/* define if you have OpenSSL with CRL checking capability */ +#define HAVE_OPENSSL_CRL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_CRYPTO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_SSL_H 1 + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSAP_H */ + +/* define to pthreads API spec revision */ +#define HAVE_PTHREADS 10 + +/* define if you have pthread_detach function */ +#define HAVE_PTHREAD_DETACH 1 + +/* Define to 1 if you have the `pthread_getconcurrency' function. */ +#define HAVE_PTHREAD_GETCONCURRENCY 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_kill_other_threads_np' function. */ +/* #undef HAVE_PTHREAD_KILL_OTHER_THREADS_NP */ + +/* define if you have pthread_rwlock_destroy function */ +#define HAVE_PTHREAD_RWLOCK_DESTROY 1 + +/* Define to 1 if you have the `pthread_setconcurrency' function. */ +#define HAVE_PTHREAD_SETCONCURRENCY 1 + +/* Define to 1 if you have the `pthread_yield' function. */ +/* #undef HAVE_PTHREAD_YIELD */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PTH_H */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the `read' function. */ +#define HAVE_READ 1 + +/* Define to 1 if you have the `recv' function. */ +#define HAVE_RECV 1 + +/* Define to 1 if you have the `recvfrom' function. */ +#define HAVE_RECVFROM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RESOLV_H */ + +/* define if you have res_query() */ +/* #undef HAVE_RES_QUERY */ + +/* define if OpenSSL needs RSAref */ +/* #undef HAVE_RSAREF */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_SASL_H */ + +/* define if your SASL library has sasl_version() */ +/* #undef HAVE_SASL_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_yield' function. */ +#define HAVE_SCHED_YIELD 1 + +/* Define to 1 if you have the `send' function. */ +#define HAVE_SEND 1 + +/* Define to 1 if you have the `sendmsg' function. */ +#define HAVE_SENDMSG 1 + +/* Define to 1 if you have the `sendto' function. */ +#define HAVE_SENDTO 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define to 1 if you have the `setpwfile' function. */ +/* #undef HAVE_SETPWFILE */ + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SGTTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHADOW_H */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the `sigset' function. */ +#define HAVE_SIGSET 1 + +/* define if you have -lslp */ +/* #undef HAVE_SLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SLP_H */ + +/* Define to 1 if you have the `snprintf' function. */ +#define HAVE_SNPRINTF 1 + +/* if you have spawnlp() */ +/* #undef HAVE_SPAWNLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQLEXT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strdup' function. 
*/ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the `strerror_r' function. */ +#define HAVE_STRERROR_R 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strpbrk' function. */ +#define HAVE_STRPBRK 1 + +/* Define to 1 if you have the `strrchr' function. */ +#define HAVE_STRRCHR 1 + +/* Define to 1 if you have the `strsep' function. */ +#define HAVE_STRSEP 1 + +/* Define to 1 if you have the `strspn' function. */ +#define HAVE_STRSPN 1 + +/* Define to 1 if you have the `strstr' function. */ +#define HAVE_STRSTR 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoll' function. */ +#define HAVE_STRTOLL 1 + +/* Define to 1 if you have the `strtoq' function. */ +#define HAVE_STRTOQ 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `strtoull' function. */ +#define HAVE_STRTOULL 1 + +/* Define to 1 if you have the `strtouq' function. */ +#define HAVE_STRTOUQ 1 + +/* Define to 1 if `msg_accrightslen' is a member of `struct msghdr'. */ +/* #undef HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTSLEN */ + +/* Define to 1 if `msg_control' is a member of `struct msghdr'. */ +/* #undef HAVE_STRUCT_MSGHDR_MSG_CONTROL */ + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_fstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE */ + +/* define to 1 if st_fstype is char * */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_CHAR */ + +/* define to 1 if st_fstype is int */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_INT */ + +/* Define to 1 if `st_vfstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_VFSTYPE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYNCH_H */ + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EPOLL_H */ + +/* define if you actually have sys_errlist in your libs */ +#define HAVE_SYS_ERRLIST 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_ERRNO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EVENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILIO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FSTYP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRIVGRP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UCRED_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UUID_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_VMOUNT_H */ + +/* Define to 1 if you have that is POSIX.1 compatible. */ +#define HAVE_SYS_WAIT_H 1 + +/* define if you have -lwrap */ +/* #undef HAVE_TCPD */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TCPD_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* if you have Solaris LWP (thr) package */ +/* #undef HAVE_THR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_THREAD_H */ + +/* Define to 1 if you have the `thr_getconcurrency' function. */ +/* #undef HAVE_THR_GETCONCURRENCY */ + +/* Define to 1 if you have the `thr_setconcurrency' function. */ +/* #undef HAVE_THR_SETCONCURRENCY */ + +/* Define to 1 if you have the `thr_yield' function. */ +/* #undef HAVE_THR_YIELD */ + +/* define if you have TLS */ +#define HAVE_TLS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* define if you have uuid_generate() */ +/* #undef HAVE_UUID_GENERATE */ + +/* define if you have uuid_to_str() */ +/* #undef HAVE_UUID_TO_STR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UUID_UUID_H */ + +/* Define to 1 if you have the `vprintf' function. */ +#define HAVE_VPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* define if you have winsock */ +/* #undef HAVE_WINSOCK */ + +/* define if you have winsock2 */ +/* #undef HAVE_WINSOCK2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WIREDTIGER_H */ + +/* Define to 1 if you have the `write' function. */ +#define HAVE_WRITE 1 + +/* define if select implicitly yields */ +#define HAVE_YIELDING_SELECT 1 + +/* Define to 1 if you have the `_vsnprintf' function. 
*/ +/* #undef HAVE__VSNPRINTF */ + +/* define to 32-bit or greater integer type */ +#define LBER_INT_T int + +/* define to large integer type */ +#define LBER_LEN_T long + +/* define to socket descriptor type */ +#define LBER_SOCKET_T int + +/* define to large integer type */ +#define LBER_TAG_T long + +/* define to 1 if library is thread safe */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* define to LDAP VENDOR VERSION */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +/* define this to add debugging code */ +/* #undef LDAP_DEBUG */ + +/* define if LDAP libs are dynamic */ +/* #undef LDAP_LIBS_DYNAMIC */ + +/* define to support PF_INET6 */ +#define LDAP_PF_INET6 1 + +/* define to support PF_LOCAL */ +#define LDAP_PF_LOCAL 1 + +/* define this to add SLAPI code */ +/* #undef LDAP_SLAPI */ + +/* define this to add syslog code */ +/* #undef LDAP_SYSLOG */ + +/* Version */ +#define LDAP_VENDOR_VERSION 20501 + +/* Major */ +#define LDAP_VENDOR_VERSION_MAJOR 2 + +/* Minor */ +#define LDAP_VENDOR_VERSION_MINOR 5 + +/* Patch */ +#define LDAP_VENDOR_VERSION_PATCH X + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* define if memcmp is not 8-bit clean or is otherwise broken */ +/* #undef NEED_MEMCMP_REPLACEMENT */ + +/* define if you have (or want) no threads */ +/* #undef NO_THREADS */ + +/* define to use the original debug style */ +/* #undef OLD_DEBUG */ + +/* Package */ +#define OPENLDAP_PACKAGE "OpenLDAP" + +/* Version */ +#define OPENLDAP_VERSION "2.5.X" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "" + +/* define if sched_yield yields the entire process */ +/* #undef REPLACE_BROKEN_YIELD */ + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* Define to the type of arg 1 for `select'. */ +#define SELECT_TYPE_ARG1 int + +/* Define to the type of args 2, 3 and 4 for `select'. */ +#define SELECT_TYPE_ARG234 (fd_set *) + +/* Define to the type of arg 5 for `select'. */ +#define SELECT_TYPE_ARG5 (struct timeval *) + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `wchar_t', as computed by sizeof. 
*/ +#define SIZEOF_WCHAR_T 4 + +/* define to support per-object ACIs */ +/* #undef SLAPD_ACI_ENABLED */ + +/* define to support LDAP Async Metadirectory backend */ +/* #undef SLAPD_ASYNCMETA */ + +/* define to support cleartext passwords */ +/* #undef SLAPD_CLEARTEXT */ + +/* define to support crypt(3) passwords */ +/* #undef SLAPD_CRYPT */ + +/* define to support DNS SRV backend */ +/* #undef SLAPD_DNSSRV */ + +/* define to support LDAP backend */ +/* #undef SLAPD_LDAP */ + +/* define to support MDB backend */ +/* #undef SLAPD_MDB */ + +/* define to support LDAP Metadirectory backend */ +/* #undef SLAPD_META */ + +/* define to support modules */ +/* #undef SLAPD_MODULES */ + +/* dynamically linked module */ +#define SLAPD_MOD_DYNAMIC 2 + +/* statically linked module */ +#define SLAPD_MOD_STATIC 1 + +/* define to support cn=Monitor backend */ +/* #undef SLAPD_MONITOR */ + +/* define to support NDB backend */ +/* #undef SLAPD_NDB */ + +/* define to support NULL backend */ +/* #undef SLAPD_NULL */ + +/* define for In-Directory Access Logging overlay */ +/* #undef SLAPD_OVER_ACCESSLOG */ + +/* define for Audit Logging overlay */ +/* #undef SLAPD_OVER_AUDITLOG */ + +/* define for Automatic Certificate Authority overlay */ +/* #undef SLAPD_OVER_AUTOCA */ + +/* define for Collect overlay */ +/* #undef SLAPD_OVER_COLLECT */ + +/* define for Attribute Constraint overlay */ +/* #undef SLAPD_OVER_CONSTRAINT */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DDS */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DEREF */ + +/* define for Dynamic Group overlay */ +/* #undef SLAPD_OVER_DYNGROUP */ + +/* define for Dynamic List overlay */ +/* #undef SLAPD_OVER_DYNLIST */ + +/* define for Reverse Group Membership overlay */ +/* #undef SLAPD_OVER_MEMBEROF */ + +/* define for Password Policy overlay */ +/* #undef SLAPD_OVER_PPOLICY */ + +/* define for Proxy Cache overlay */ +/* #undef SLAPD_OVER_PROXYCACHE */ + +/* define for Referential Integrity overlay */ +/* #undef SLAPD_OVER_REFINT */ + +/* define for Return Code overlay */ +/* #undef SLAPD_OVER_RETCODE */ + +/* define for Rewrite/Remap overlay */ +/* #undef SLAPD_OVER_RWM */ + +/* define for Sequential Modify overlay */ +/* #undef SLAPD_OVER_SEQMOD */ + +/* define for ServerSideSort/VLV overlay */ +/* #undef SLAPD_OVER_SSSVLV */ + +/* define for Syncrepl Provider overlay */ +/* #undef SLAPD_OVER_SYNCPROV */ + +/* define for Translucent Proxy overlay */ +/* #undef SLAPD_OVER_TRANSLUCENT */ + +/* define for Attribute Uniqueness overlay */ +/* #undef SLAPD_OVER_UNIQUE */ + +/* define for Value Sorting overlay */ +/* #undef SLAPD_OVER_VALSORT */ + +/* define to support PASSWD backend */ +/* #undef SLAPD_PASSWD */ + +/* define to support PERL backend */ +/* #undef SLAPD_PERL */ + +/* define to support relay backend */ +/* #undef SLAPD_RELAY */ + +/* define to support reverse lookups */ +/* #undef SLAPD_RLOOKUPS */ + +/* define to support SHELL backend */ +/* #undef SLAPD_SHELL */ + +/* define to support SOCK backend */ +/* #undef SLAPD_SOCK */ + +/* define to support SASL passwords */ +/* #undef SLAPD_SPASSWD */ + +/* define to support SQL backend */ +/* #undef SLAPD_SQL */ + +/* define to support WiredTiger backend */ +/* #undef SLAPD_WT */ + +/* define to support run-time loadable ACL */ +/* #undef SLAP_DYNACL */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . 
*/ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* set to urandom device */ +#define URANDOM_DEVICE "/dev/urandom" + +/* define to use OpenSSL BIGNUM for MP */ +/* #undef USE_MP_BIGNUM */ + +/* define to use GMP for MP */ +/* #undef USE_MP_GMP */ + +/* define to use 'long' for MP */ +/* #undef USE_MP_LONG */ + +/* define to use 'long long' for MP */ +/* #undef USE_MP_LONG_LONG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to the type of arg 3 for `accept'. */ +#define ber_socklen_t socklen_t + +/* Define to `char *' if does not define. */ +/* #undef caddr_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to `int' if does not define. */ +/* #undef sig_atomic_t */ + +/* Define to `unsigned' if does not define. */ +/* #undef size_t */ + +/* define to snprintf routine */ +/* #undef snprintf */ + +/* Define like ber_socklen_t if does not define. */ +/* #undef socklen_t */ + +/* Define to `signed int' if does not define. */ +/* #undef ssize_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* define as empty if volatile is not supported */ +/* #undef volatile */ + +/* define to snprintf routine */ +/* #undef vsnprintf */ + + +/* begin of portable.h.post */ + +#ifdef _WIN32 +/* don't suck in all of the win32 api */ +# define WIN32_LEAN_AND_MEAN 1 +#endif + +#ifndef LDAP_NEEDS_PROTOTYPES +/* force LDAP_P to always include prototypes */ +#define LDAP_NEEDS_PROTOTYPES 1 +#endif + +#ifndef LDAP_REL_ENG +#if (LDAP_VENDOR_VERSION == 000000) && !defined(LDAP_DEVEL) +#define LDAP_DEVEL +#endif +#if defined(LDAP_DEVEL) && !defined(LDAP_TEST) +#define LDAP_TEST +#endif +#endif + +#ifdef HAVE_STDDEF_H +# include +#endif + +#ifdef HAVE_EBCDIC +/* ASCII/EBCDIC converting replacements for stdio funcs + * vsnprintf and snprintf are used too, but they are already + * checked by the configure script + */ +#define fputs ber_pvt_fputs +#define fgets ber_pvt_fgets +#define printf ber_pvt_printf +#define fprintf ber_pvt_fprintf +#define vfprintf ber_pvt_vfprintf +#define vsprintf ber_pvt_vsprintf +#endif + +#include "ac/fdset.h" + +#include "ldap_cdefs.h" +#include "ldap_features.h" + +#include "ac/assert.h" +#include "ac/localize.h" + +#endif /* _LDAP_PORTABLE_H */ +/* end of portable.h.post */ + diff --git a/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h new file mode 100644 index 00000000000..dbd59430527 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/lber_types.h @@ -0,0 +1,63 @@ +/* include/lber_types.h. Generated from lber_types.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * LBER types + */ + +#ifndef _LBER_TYPES_H +#define _LBER_TYPES_H + +#include + +LDAP_BEGIN_DECL + +/* LBER boolean, enum, integers (32 bits or larger) */ +#define LBER_INT_T int + +/* LBER tags (32 bits or larger) */ +#define LBER_TAG_T long + +/* LBER socket descriptor */ +#define LBER_SOCKET_T int + +/* LBER lengths (32 bits or larger) */ +#define LBER_LEN_T long + +/* ------------------------------------------------------------ */ + +/* booleans, enumerations, and integers */ +typedef LBER_INT_T ber_int_t; + +/* signed and unsigned versions */ +typedef signed LBER_INT_T ber_sint_t; +typedef unsigned LBER_INT_T ber_uint_t; + +/* tags */ +typedef unsigned LBER_TAG_T ber_tag_t; + +/* "socket" descriptors */ +typedef LBER_SOCKET_T ber_socket_t; + +/* lengths */ +typedef unsigned LBER_LEN_T ber_len_t; + +/* signed lengths */ +typedef signed LBER_LEN_T ber_slen_t; + +LDAP_END_DECL + +#endif /* _LBER_TYPES_H */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h new file mode 100644 index 00000000000..89f7b40b884 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_config.h @@ -0,0 +1,74 @@ +/* include/ldap_config.h. Generated from ldap_config.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * This file works in conjunction with OpenLDAP configure system. + * If you do no like the values below, adjust your configure options. + */ + +#ifndef _LDAP_CONFIG_H +#define _LDAP_CONFIG_H + +/* directory separator */ +#ifndef LDAP_DIRSEP +#ifndef _WIN32 +#define LDAP_DIRSEP "/" +#else +#define LDAP_DIRSEP "\\" +#endif +#endif + +/* directory for temporary files */ +#if defined(_WIN32) +# define LDAP_TMPDIR "C:\\." 
/* we don't have much of a choice */ +#elif defined( _P_tmpdir ) +# define LDAP_TMPDIR _P_tmpdir +#elif defined( P_tmpdir ) +# define LDAP_TMPDIR P_tmpdir +#elif defined( _PATH_TMPDIR ) +# define LDAP_TMPDIR _PATH_TMPDIR +#else +# define LDAP_TMPDIR LDAP_DIRSEP "tmp" +#endif + +/* directories */ +#ifndef LDAP_BINDIR +#define LDAP_BINDIR "/tmp/ldap-prefix/bin" +#endif +#ifndef LDAP_SBINDIR +#define LDAP_SBINDIR "/tmp/ldap-prefix/sbin" +#endif +#ifndef LDAP_DATADIR +#define LDAP_DATADIR "/tmp/ldap-prefix/share/openldap" +#endif +#ifndef LDAP_SYSCONFDIR +#define LDAP_SYSCONFDIR "/tmp/ldap-prefix/etc/openldap" +#endif +#ifndef LDAP_LIBEXECDIR +#define LDAP_LIBEXECDIR "/tmp/ldap-prefix/libexec" +#endif +#ifndef LDAP_MODULEDIR +#define LDAP_MODULEDIR "/tmp/ldap-prefix/libexec/openldap" +#endif +#ifndef LDAP_RUNDIR +#define LDAP_RUNDIR "/tmp/ldap-prefix/var" +#endif +#ifndef LDAP_LOCALEDIR +#define LDAP_LOCALEDIR "" +#endif + + +#endif /* _LDAP_CONFIG_H */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h new file mode 100644 index 00000000000..f0cc7c3626f --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/ldap_features.h @@ -0,0 +1,61 @@ +/* include/ldap_features.h. Generated from ldap_features.hin by configure. */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2020 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* + * LDAP Features + */ + +#ifndef _LDAP_FEATURES_H +#define _LDAP_FEATURES_H 1 + +/* OpenLDAP API version macros */ +#define LDAP_VENDOR_VERSION 20501 +#define LDAP_VENDOR_VERSION_MAJOR 2 +#define LDAP_VENDOR_VERSION_MINOR 5 +#define LDAP_VENDOR_VERSION_PATCH X + +/* +** WORK IN PROGRESS! +** +** OpenLDAP reentrancy/thread-safeness should be dynamically +** checked using ldap_get_option(). +** +** The -lldap implementation is not thread-safe. +** +** The -lldap_r implementation is: +** LDAP_API_FEATURE_THREAD_SAFE (basic thread safety) +** but also be: +** LDAP_API_FEATURE_SESSION_THREAD_SAFE +** LDAP_API_FEATURE_OPERATION_THREAD_SAFE +** +** The preprocessor flag LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE +** can be used to determine if -lldap_r is available at compile +** time. You must define LDAP_THREAD_SAFE if and only if you +** link with -lldap_r. +** +** If you fail to define LDAP_THREAD_SAFE when linking with +** -lldap_r or define LDAP_THREAD_SAFE when linking with -lldap, +** provided header definitions and declarations may be incorrect. +** +*/ + +/* is -lldap_r available or not */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* LDAP v2 Referrals */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +#endif /* LDAP_FEATURES */ diff --git a/contrib/openldap-cmake/linux_ppc64le/include/portable.h b/contrib/openldap-cmake/linux_ppc64le/include/portable.h new file mode 100644 index 00000000000..2924b6713a4 --- /dev/null +++ b/contrib/openldap-cmake/linux_ppc64le/include/portable.h @@ -0,0 +1,1169 @@ +/* include/portable.h. Generated from portable.hin by configure. */ +/* include/portable.hin. Generated from configure.in by autoheader. */ + + +/* begin of portable.h.pre */ +/* This work is part of OpenLDAP Software . 
+ * + * Copyright 1998-2020 The OpenLDAP Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#ifndef _LDAP_PORTABLE_H +#define _LDAP_PORTABLE_H + +/* define this if needed to get reentrant functions */ +#ifndef REENTRANT +#define REENTRANT 1 +#endif +#ifndef _REENTRANT +#define _REENTRANT 1 +#endif + +/* define this if needed to get threadsafe functions */ +#ifndef THREADSAFE +#define THREADSAFE 1 +#endif +#ifndef _THREADSAFE +#define _THREADSAFE 1 +#endif +#ifndef THREAD_SAFE +#define THREAD_SAFE 1 +#endif +#ifndef _THREAD_SAFE +#define _THREAD_SAFE 1 +#endif + +#ifndef _SGI_MP_SOURCE +#define _SGI_MP_SOURCE 1 +#endif + +/* end of portable.h.pre */ + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* define to use both and */ +/* #undef BOTH_STRINGS_H */ + +/* define if cross compiling */ +/* #undef CROSS_COMPILING */ + +/* set to the number of arguments ctime_r() expects */ +#define CTIME_R_NARGS 2 + +/* define if toupper() requires islower() */ +/* #undef C_UPPER_LOWER */ + +/* define if sys_errlist is not declared in stdio.h or errno.h */ +/* #undef DECL_SYS_ERRLIST */ + +/* define to enable slapi library */ +/* #undef ENABLE_SLAPI */ + +/* defined to be the EXE extension */ +#define EXEEXT "" + +/* set to the number of arguments gethostbyaddr_r() expects */ +#define GETHOSTBYADDR_R_NARGS 8 + +/* set to the number of arguments gethostbyname_r() expects */ +#define GETHOSTBYNAME_R_NARGS 6 + +/* Define to 1 if `TIOCGWINSZ' requires . */ +#define GWINSZ_IN_SYS_IOCTL 1 + +/* define if you have AIX security lib */ +/* #undef HAVE_AIX_SECURITY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_NAMESER_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASSERT_H 1 + +/* Define to 1 if you have the `bcopy' function. */ +#define HAVE_BCOPY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_BITS_TYPES_H 1 + +/* Define to 1 if you have the `chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `closesocket' function. */ +/* #undef HAVE_CLOSESOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* define if crypt(3) is available */ +/* #undef HAVE_CRYPT */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRYPT_H */ + +/* define if crypt_r() is also available */ +/* #undef HAVE_CRYPT_R */ + +/* Define to 1 if you have the `ctime_r' function. */ +#define HAVE_CTIME_R 1 + +/* define if you have Cyrus SASL */ +/* #undef HAVE_CYRUS_SASL */ + +/* define if your system supports /dev/poll */ +/* #undef HAVE_DEVPOLL */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +/* #undef HAVE_DOPRNT */ + +/* define if system uses EBCDIC instead of ASCII */ +/* #undef HAVE_EBCDIC */ + +/* Define to 1 if you have the `endgrent' function. 
*/ +#define HAVE_ENDGRENT 1 + +/* Define to 1 if you have the `endpwent' function. */ +#define HAVE_ENDPWENT 1 + +/* define if your system supports epoll */ +#define HAVE_EPOLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `fcntl' function. */ +#define HAVE_FCNTL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* define if you actually have FreeBSD fetch(3) */ +/* #undef HAVE_FETCH */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_FILIO_H */ + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fstat' function. */ +#define HAVE_FSTAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `getaddrinfo' function. */ +#define HAVE_GETADDRINFO 1 + +/* Define to 1 if you have the `getdtablesize' function. */ +#define HAVE_GETDTABLESIZE 1 + +/* Define to 1 if you have the `geteuid' function. */ +#define HAVE_GETEUID 1 + +/* Define to 1 if you have the `getgrgid' function. */ +#define HAVE_GETGRGID 1 + +/* Define to 1 if you have the `gethostbyaddr_r' function. */ +#define HAVE_GETHOSTBYADDR_R 1 + +/* Define to 1 if you have the `gethostbyname_r' function. */ +#define HAVE_GETHOSTBYNAME_R 1 + +/* Define to 1 if you have the `gethostname' function. */ +#define HAVE_GETHOSTNAME 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define to 1 if you have the `getopt' function. */ +#define HAVE_GETOPT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_GETOPT_H 1 + +/* Define to 1 if you have the `getpassphrase' function. */ +/* #undef HAVE_GETPASSPHRASE */ + +/* Define to 1 if you have the `getpeereid' function. */ +/* #undef HAVE_GETPEEREID */ + +/* Define to 1 if you have the `getpeerucred' function. */ +/* #undef HAVE_GETPEERUCRED */ + +/* Define to 1 if you have the `getpwnam' function. */ +#define HAVE_GETPWNAM 1 + +/* Define to 1 if you have the `getpwuid' function. */ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `getspnam' function. */ +#define HAVE_GETSPNAM 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GMP_H */ + +/* Define to 1 if you have the `gmtime_r' function. */ +#define HAVE_GMTIME_R 1 + +/* define if you have GNUtls */ +/* #undef HAVE_GNUTLS */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_GNUTLS_GNUTLS_H */ + +/* if you have GNU Pth */ +/* #undef HAVE_GNU_PTH */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the `hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* define to you inet_aton(3) is available */ +#define HAVE_INET_ATON 1 + +/* Define to 1 if you have the `inet_ntoa_b' function. */ +/* #undef HAVE_INET_NTOA_B */ + +/* Define to 1 if you have the `inet_ntop' function. */ +#define HAVE_INET_NTOP 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `ioctl' function. */ +#define HAVE_IOCTL 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* define if your system supports kqueue */ +/* #undef HAVE_KQUEUE */ + +/* Define to 1 if you have the `gen' library (-lgen). 
*/ +/* #undef HAVE_LIBGEN */ + +/* Define to 1 if you have the `gmp' library (-lgmp). */ +/* #undef HAVE_LIBGMP */ + +/* Define to 1 if you have the `inet' library (-linet). */ +/* #undef HAVE_LIBINET */ + +/* define if you have libtool -ltdl */ +/* #undef HAVE_LIBLTDL */ + +/* Define to 1 if you have the `net' library (-lnet). */ +/* #undef HAVE_LIBNET */ + +/* Define to 1 if you have the `nsl' library (-lnsl). */ +/* #undef HAVE_LIBNSL */ + +/* Define to 1 if you have the `nsl_s' library (-lnsl_s). */ +/* #undef HAVE_LIBNSL_S */ + +/* Define to 1 if you have the `socket' library (-lsocket). */ +/* #undef HAVE_LIBSOCKET */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the `V3' library (-lV3). */ +/* #undef HAVE_LIBV3 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* if you have LinuxThreads */ +/* #undef HAVE_LINUX_THREADS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if you have the `localtime_r' function. */ +#define HAVE_LOCALTIME_R 1 + +/* Define to 1 if you have the `lockf' function. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `memcpy' function. */ +#define HAVE_MEMCPY 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +#define HAVE_MEMRCHR 1 + +/* Define to 1 if you have the `mkstemp' function. */ +#define HAVE_MKSTEMP 1 + +/* Define to 1 if you have the `mktemp' function. */ +#define HAVE_MKTEMP 1 + +/* define this if you have mkversion */ +#define HAVE_MKVERSION 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* define if strerror_r returns char* instead of int */ +/* #undef HAVE_NONPOSIX_STRERROR_R */ + +/* if you have NT Event Log */ +/* #undef HAVE_NT_EVENT_LOG */ + +/* if you have NT Service Manager */ +/* #undef HAVE_NT_SERVICE_MANAGER */ + +/* if you have NT Threads */ +/* #undef HAVE_NT_THREADS */ + +/* define if you have OpenSSL */ +#define HAVE_OPENSSL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_BN_H 1 + +/* define if you have OpenSSL with CRL checking capability */ +#define HAVE_OPENSSL_CRL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_CRYPTO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_OPENSSL_SSL_H 1 + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSAP_H */ + +/* define to pthreads API spec revision */ +#define HAVE_PTHREADS 10 + +/* define if you have pthread_detach function */ +#define HAVE_PTHREAD_DETACH 1 + +/* Define to 1 if you have the `pthread_getconcurrency' function. */ +#define HAVE_PTHREAD_GETCONCURRENCY 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_kill_other_threads_np' function. */ +/* #undef HAVE_PTHREAD_KILL_OTHER_THREADS_NP */ + +/* define if you have pthread_rwlock_destroy function */ +#define HAVE_PTHREAD_RWLOCK_DESTROY 1 + +/* Define to 1 if you have the `pthread_setconcurrency' function. */ +#define HAVE_PTHREAD_SETCONCURRENCY 1 + +/* Define to 1 if you have the `pthread_yield' function. */ +#define HAVE_PTHREAD_YIELD 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PTH_H */ + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the `read' function. */ +#define HAVE_READ 1 + +/* Define to 1 if you have the `recv' function. */ +#define HAVE_RECV 1 + +/* Define to 1 if you have the `recvfrom' function. */ +#define HAVE_RECVFROM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RESOLV_H */ + +/* define if you have res_query() */ +/* #undef HAVE_RES_QUERY */ + +/* define if OpenSSL needs RSAref */ +/* #undef HAVE_RSAREF */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SASL_SASL_H */ + +/* define if your SASL library has sasl_version() */ +/* #undef HAVE_SASL_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_yield' function. */ +#define HAVE_SCHED_YIELD 1 + +/* Define to 1 if you have the `send' function. */ +#define HAVE_SEND 1 + +/* Define to 1 if you have the `sendmsg' function. */ +#define HAVE_SENDMSG 1 + +/* Define to 1 if you have the `sendto' function. */ +#define HAVE_SENDTO 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define to 1 if you have the `setpwfile' function. */ +/* #undef HAVE_SETPWFILE */ + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SGTTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHADOW_H */ + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `signal' function. */ +#define HAVE_SIGNAL 1 + +/* Define to 1 if you have the `sigset' function. */ +#define HAVE_SIGSET 1 + +/* define if you have -lslp */ +/* #undef HAVE_SLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SLP_H */ + +/* Define to 1 if you have the `snprintf' function. */ +#define HAVE_SNPRINTF 1 + +/* if you have spawnlp() */ +/* #undef HAVE_SPAWNLP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQLEXT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SQL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strdup' function. 
*/ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strerror' function. */ +#define HAVE_STRERROR 1 + +/* Define to 1 if you have the `strerror_r' function. */ +#define HAVE_STRERROR_R 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strpbrk' function. */ +#define HAVE_STRPBRK 1 + +/* Define to 1 if you have the `strrchr' function. */ +#define HAVE_STRRCHR 1 + +/* Define to 1 if you have the `strsep' function. */ +#define HAVE_STRSEP 1 + +/* Define to 1 if you have the `strspn' function. */ +#define HAVE_STRSPN 1 + +/* Define to 1 if you have the `strstr' function. */ +#define HAVE_STRSTR 1 + +/* Define to 1 if you have the `strtol' function. */ +#define HAVE_STRTOL 1 + +/* Define to 1 if you have the `strtoll' function. */ +#define HAVE_STRTOLL 1 + +/* Define to 1 if you have the `strtoq' function. */ +#define HAVE_STRTOQ 1 + +/* Define to 1 if you have the `strtoul' function. */ +#define HAVE_STRTOUL 1 + +/* Define to 1 if you have the `strtoull' function. */ +#define HAVE_STRTOULL 1 + +/* Define to 1 if you have the `strtouq' function. */ +#define HAVE_STRTOUQ 1 + +/* Define to 1 if `msg_accrightslen' is a member of `struct msghdr'. */ +/* #undef HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTSLEN */ + +/* Define to 1 if `msg_control' is a member of `struct msghdr'. */ +#define HAVE_STRUCT_MSGHDR_MSG_CONTROL 1 + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_fstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE */ + +/* define to 1 if st_fstype is char * */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_CHAR */ + +/* define to 1 if st_fstype is int */ +/* #undef HAVE_STRUCT_STAT_ST_FSTYPE_INT */ + +/* Define to 1 if `st_vfstype' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_VFSTYPE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYNCH_H */ + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EPOLL_H 1 + +/* define if you actually have sys_errlist in your libs */ +#define HAVE_SYS_ERRLIST 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_ERRNO_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FILIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_FSTYP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRIVGRP_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSLOG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UCRED_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_UUID_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_VMOUNT_H */ + +/* Define to 1 if you have that is POSIX.1 compatible. */ +#define HAVE_SYS_WAIT_H 1 + +/* define if you have -lwrap */ +/* #undef HAVE_TCPD */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TCPD_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* if you have Solaris LWP (thr) package */ +/* #undef HAVE_THR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_THREAD_H */ + +/* Define to 1 if you have the `thr_getconcurrency' function. */ +/* #undef HAVE_THR_GETCONCURRENCY */ + +/* Define to 1 if you have the `thr_setconcurrency' function. */ +/* #undef HAVE_THR_SETCONCURRENCY */ + +/* Define to 1 if you have the `thr_yield' function. */ +/* #undef HAVE_THR_YIELD */ + +/* define if you have TLS */ +#define HAVE_TLS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* define if you have uuid_generate() */ +/* #undef HAVE_UUID_GENERATE */ + +/* define if you have uuid_to_str() */ +/* #undef HAVE_UUID_TO_STR */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UUID_UUID_H */ + +/* Define to 1 if you have the `vprintf' function. */ +#define HAVE_VPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. */ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* define if you have winsock */ +/* #undef HAVE_WINSOCK */ + +/* define if you have winsock2 */ +/* #undef HAVE_WINSOCK2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK2_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WINSOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_WIREDTIGER_H */ + +/* Define to 1 if you have the `write' function. */ +#define HAVE_WRITE 1 + +/* define if select implicitly yields */ +#define HAVE_YIELDING_SELECT 1 + +/* Define to 1 if you have the `_vsnprintf' function. 
*/ +/* #undef HAVE__VSNPRINTF */ + +/* define to 32-bit or greater integer type */ +#define LBER_INT_T int + +/* define to large integer type */ +#define LBER_LEN_T long + +/* define to socket descriptor type */ +#define LBER_SOCKET_T int + +/* define to large integer type */ +#define LBER_TAG_T long + +/* define to 1 if library is thread safe */ +#define LDAP_API_FEATURE_X_OPENLDAP_THREAD_SAFE 1 + +/* define to LDAP VENDOR VERSION */ +/* #undef LDAP_API_FEATURE_X_OPENLDAP_V2_REFERRALS */ + +/* define this to add debugging code */ +/* #undef LDAP_DEBUG */ + +/* define if LDAP libs are dynamic */ +/* #undef LDAP_LIBS_DYNAMIC */ + +/* define to support PF_INET6 */ +#define LDAP_PF_INET6 1 + +/* define to support PF_LOCAL */ +#define LDAP_PF_LOCAL 1 + +/* define this to add SLAPI code */ +/* #undef LDAP_SLAPI */ + +/* define this to add syslog code */ +/* #undef LDAP_SYSLOG */ + +/* Version */ +#define LDAP_VENDOR_VERSION 20501 + +/* Major */ +#define LDAP_VENDOR_VERSION_MAJOR 2 + +/* Minor */ +#define LDAP_VENDOR_VERSION_MINOR 5 + +/* Patch */ +#define LDAP_VENDOR_VERSION_PATCH X + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* define if memcmp is not 8-bit clean or is otherwise broken */ +/* #undef NEED_MEMCMP_REPLACEMENT */ + +/* define if you have (or want) no threads */ +/* #undef NO_THREADS */ + +/* define to use the original debug style */ +/* #undef OLD_DEBUG */ + +/* Package */ +#define OPENLDAP_PACKAGE "OpenLDAP" + +/* Version */ +#define OPENLDAP_VERSION "2.5.X" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "" + +/* define if sched_yield yields the entire process */ +/* #undef REPLACE_BROKEN_YIELD */ + +/* Define as the return type of signal handlers (`int' or `void'). */ +#define RETSIGTYPE void + +/* Define to the type of arg 1 for `select'. */ +#define SELECT_TYPE_ARG1 int + +/* Define to the type of args 2, 3 and 4 for `select'. */ +#define SELECT_TYPE_ARG234 (fd_set *) + +/* Define to the type of arg 5 for `select'. */ +#define SELECT_TYPE_ARG5 (struct timeval *) + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `wchar_t', as computed by sizeof. 
*/ +#define SIZEOF_WCHAR_T 4 + +/* define to support per-object ACIs */ +/* #undef SLAPD_ACI_ENABLED */ + +/* define to support LDAP Async Metadirectory backend */ +/* #undef SLAPD_ASYNCMETA */ + +/* define to support cleartext passwords */ +/* #undef SLAPD_CLEARTEXT */ + +/* define to support crypt(3) passwords */ +/* #undef SLAPD_CRYPT */ + +/* define to support DNS SRV backend */ +/* #undef SLAPD_DNSSRV */ + +/* define to support LDAP backend */ +/* #undef SLAPD_LDAP */ + +/* define to support MDB backend */ +/* #undef SLAPD_MDB */ + +/* define to support LDAP Metadirectory backend */ +/* #undef SLAPD_META */ + +/* define to support modules */ +/* #undef SLAPD_MODULES */ + +/* dynamically linked module */ +#define SLAPD_MOD_DYNAMIC 2 + +/* statically linked module */ +#define SLAPD_MOD_STATIC 1 + +/* define to support cn=Monitor backend */ +/* #undef SLAPD_MONITOR */ + +/* define to support NDB backend */ +/* #undef SLAPD_NDB */ + +/* define to support NULL backend */ +/* #undef SLAPD_NULL */ + +/* define for In-Directory Access Logging overlay */ +/* #undef SLAPD_OVER_ACCESSLOG */ + +/* define for Audit Logging overlay */ +/* #undef SLAPD_OVER_AUDITLOG */ + +/* define for Automatic Certificate Authority overlay */ +/* #undef SLAPD_OVER_AUTOCA */ + +/* define for Collect overlay */ +/* #undef SLAPD_OVER_COLLECT */ + +/* define for Attribute Constraint overlay */ +/* #undef SLAPD_OVER_CONSTRAINT */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DDS */ + +/* define for Dynamic Directory Services overlay */ +/* #undef SLAPD_OVER_DEREF */ + +/* define for Dynamic Group overlay */ +/* #undef SLAPD_OVER_DYNGROUP */ + +/* define for Dynamic List overlay */ +/* #undef SLAPD_OVER_DYNLIST */ + +/* define for Reverse Group Membership overlay */ +/* #undef SLAPD_OVER_MEMBEROF */ + +/* define for Password Policy overlay */ +/* #undef SLAPD_OVER_PPOLICY */ + +/* define for Proxy Cache overlay */ +/* #undef SLAPD_OVER_PROXYCACHE */ + +/* define for Referential Integrity overlay */ +/* #undef SLAPD_OVER_REFINT */ + +/* define for Return Code overlay */ +/* #undef SLAPD_OVER_RETCODE */ + +/* define for Rewrite/Remap overlay */ +/* #undef SLAPD_OVER_RWM */ + +/* define for Sequential Modify overlay */ +/* #undef SLAPD_OVER_SEQMOD */ + +/* define for ServerSideSort/VLV overlay */ +/* #undef SLAPD_OVER_SSSVLV */ + +/* define for Syncrepl Provider overlay */ +/* #undef SLAPD_OVER_SYNCPROV */ + +/* define for Translucent Proxy overlay */ +/* #undef SLAPD_OVER_TRANSLUCENT */ + +/* define for Attribute Uniqueness overlay */ +/* #undef SLAPD_OVER_UNIQUE */ + +/* define for Value Sorting overlay */ +/* #undef SLAPD_OVER_VALSORT */ + +/* define to support PASSWD backend */ +/* #undef SLAPD_PASSWD */ + +/* define to support PERL backend */ +/* #undef SLAPD_PERL */ + +/* define to support relay backend */ +/* #undef SLAPD_RELAY */ + +/* define to support reverse lookups */ +/* #undef SLAPD_RLOOKUPS */ + +/* define to support SHELL backend */ +/* #undef SLAPD_SHELL */ + +/* define to support SOCK backend */ +/* #undef SLAPD_SOCK */ + +/* define to support SASL passwords */ +/* #undef SLAPD_SPASSWD */ + +/* define to support SQL backend */ +/* #undef SLAPD_SQL */ + +/* define to support WiredTiger backend */ +/* #undef SLAPD_WT */ + +/* define to support run-time loadable ACL */ +/* #undef SLAP_DYNACL */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if you can safely include both and . 
*/ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* set to urandom device */ +#define URANDOM_DEVICE "/dev/urandom" + +/* define to use OpenSSL BIGNUM for MP */ +/* #undef USE_MP_BIGNUM */ + +/* define to use GMP for MP */ +/* #undef USE_MP_GMP */ + +/* define to use 'long' for MP */ +/* #undef USE_MP_LONG */ + +/* define to use 'long long' for MP */ +/* #undef USE_MP_LONG_LONG */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define to the type of arg 3 for `accept'. */ +#define ber_socklen_t socklen_t + +/* Define to `char *' if does not define. */ +/* #undef caddr_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to `int' if does not define. */ +/* #undef sig_atomic_t */ + +/* Define to `unsigned' if does not define. */ +/* #undef size_t */ + +/* define to snprintf routine */ +/* #undef snprintf */ + +/* Define like ber_socklen_t if does not define. */ +/* #undef socklen_t */ + +/* Define to `signed int' if does not define. */ +/* #undef ssize_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* define as empty if volatile is not supported */ +/* #undef volatile */ + +/* define to snprintf routine */ +/* #undef vsnprintf */ + + +/* begin of portable.h.post */ + +#ifdef _WIN32 +/* don't suck in all of the win32 api */ +# define WIN32_LEAN_AND_MEAN 1 +#endif + +#ifndef LDAP_NEEDS_PROTOTYPES +/* force LDAP_P to always include prototypes */ +#define LDAP_NEEDS_PROTOTYPES 1 +#endif + +#ifndef LDAP_REL_ENG +#if (LDAP_VENDOR_VERSION == 000000) && !defined(LDAP_DEVEL) +#define LDAP_DEVEL +#endif +#if defined(LDAP_DEVEL) && !defined(LDAP_TEST) +#define LDAP_TEST +#endif +#endif + +#ifdef HAVE_STDDEF_H +# include +#endif + +#ifdef HAVE_EBCDIC +/* ASCII/EBCDIC converting replacements for stdio funcs + * vsnprintf and snprintf are used too, but they are already + * checked by the configure script + */ +#define fputs ber_pvt_fputs +#define fgets ber_pvt_fgets +#define printf ber_pvt_printf +#define fprintf ber_pvt_fprintf +#define vfprintf ber_pvt_vfprintf +#define vsprintf ber_pvt_vsprintf +#endif + +#include "ac/fdset.h" + +#include "ldap_cdefs.h" +#include "ldap_features.h" + +#include "ac/assert.h" +#include "ac/localize.h" + +#endif /* _LDAP_PORTABLE_H */ +/* end of portable.h.post */ + diff --git a/contrib/poco b/contrib/poco index fbaaba4a02e..b7d9ec16ee3 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit fbaaba4a02e29987b8c584747a496c79528f125f +Subproject commit b7d9ec16ee33ca76643d5fcd907ea9a33285640a diff --git a/contrib/poco-cmake/Foundation/CMakeLists.txt b/contrib/poco-cmake/Foundation/CMakeLists.txt index f4647461ec0..6476845b4e3 100644 --- a/contrib/poco-cmake/Foundation/CMakeLists.txt +++ b/contrib/poco-cmake/Foundation/CMakeLists.txt @@ -233,3 +233,10 @@ else () message (STATUS "Using Poco::Foundation: ${LIBRARY_POCO_FOUNDATION} ${INCLUDE_POCO_FOUNDATION}") endif () + 
+if(OS_DARWIN AND ARCH_AARCH64) + target_compile_definitions (_poco_foundation + PRIVATE + POCO_NO_STAT64 + ) +endif() diff --git a/contrib/replxx b/contrib/replxx index cdb6e3f2ce4..2b24f14594d 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc +Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7 diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 77a30776a4a..117015ef5c2 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -142,14 +142,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") endif(HAS_ALTIVEC) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) if(HAS_ARMV8_CRC) message(STATUS " HAS_ARMV8_CRC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") endif(HAS_ARMV8_CRC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") include(CheckCXXSourceCompiles) diff --git a/contrib/simdjson b/contrib/simdjson index 3190d66a490..95b4870e20b 160000 --- a/contrib/simdjson +++ b/contrib/simdjson @@ -1 +1 @@ -Subproject commit 3190d66a49059092a1753dc35595923debfc1698 +Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1 diff --git a/contrib/zlib-ng b/contrib/zlib-ng index 6fd1846c8b8..5cc4d232020 160000 --- a/contrib/zlib-ng +++ b/contrib/zlib-ng @@ -1 +1 @@ -Subproject commit 6fd1846c8b8f59436fe2dd752d0f316ddbb64df6 +Subproject commit 5cc4d232020dc66d1d6c5438834457e2a2f6127b diff --git a/debian/changelog b/debian/changelog index 53b36cae114..8b6626416a9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (21.3.1.1) unstable; urgency=low +clickhouse (21.6.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Mon, 01 Feb 2021 12:50:53 +0300 + -- clickhouse-release Tue, 20 Apr 2021 01:48:16 +0300 diff --git a/debian/clickhouse-common-static.install b/debian/clickhouse-common-static.install index f1cbf0848d3..087a6dbba8f 100644 --- a/debian/clickhouse-common-static.install +++ b/debian/clickhouse-common-static.install @@ -1,4 +1,5 @@ usr/bin/clickhouse usr/bin/clickhouse-odbc-bridge +usr/bin/clickhouse-library-bridge usr/bin/clickhouse-extract-from-config -etc/security/limits.d/clickhouse.conf +usr/share/bash-completion/completions diff --git a/debian/clickhouse-server.config b/debian/clickhouse-server.config deleted file mode 100644 index 636ff7f4da7..00000000000 --- a/debian/clickhouse-server.config +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -e - -test -f /usr/share/debconf/confmodule && . /usr/share/debconf/confmodule - -db_fget clickhouse-server/default-password seen || true -password_seen="$RET" - -if [ "$1" = "reconfigure" ]; then - password_seen=false -fi - -if [ "$password_seen" != "true" ]; then - db_input high clickhouse-server/default-password || true - db_go || true -fi -db_go || true diff --git a/debian/clickhouse-server.postinst b/debian/clickhouse-server.postinst index dc876f45954..419c13e3daf 100644 --- a/debian/clickhouse-server.postinst +++ b/debian/clickhouse-server.postinst @@ -23,11 +23,13 @@ if [ ! 
-f "/etc/debian_version" ]; then fi if [ "$1" = configure ] || [ -n "$not_deb_os" ]; then + + ${CLICKHOUSE_GENERIC_PROGRAM} install --user "${CLICKHOUSE_USER}" --group "${CLICKHOUSE_GROUP}" --pid-path "${CLICKHOUSE_PIDDIR}" --config-path "${CLICKHOUSE_CONFDIR}" --binary-path "${CLICKHOUSE_BINDIR}" --log-path "${CLICKHOUSE_LOGDIR}" --data-path "${CLICKHOUSE_DATADIR}" + if [ -x "/bin/systemctl" ] && [ -f /etc/systemd/system/clickhouse-server.service ] && [ -d /run/systemd/system ]; then # if old rc.d service present - remove it if [ -x "/etc/init.d/clickhouse-server" ] && [ -x "/usr/sbin/update-rc.d" ]; then /usr/sbin/update-rc.d clickhouse-server remove - echo "ClickHouse init script has migrated to systemd. Please manually stop old server and restart the service: sudo killall clickhouse-server && sleep 5 && sudo service clickhouse-server restart" fi /bin/systemctl daemon-reload @@ -38,10 +40,8 @@ if [ "$1" = configure ] || [ -n "$not_deb_os" ]; then if [ -x "/usr/sbin/update-rc.d" ]; then /usr/sbin/update-rc.d clickhouse-server defaults 19 19 >/dev/null || exit $? else - echo # TODO [ "$OS" = "rhel" ] || [ "$OS" = "centos" ] || [ "$OS" = "fedora" ] + echo # Other OS fi fi fi - - ${CLICKHOUSE_GENERIC_PROGRAM} install --user "${CLICKHOUSE_USER}" --group "${CLICKHOUSE_GROUP}" --pid-path "${CLICKHOUSE_PIDDIR}" --config-path "${CLICKHOUSE_CONFDIR}" --binary-path "${CLICKHOUSE_BINDIR}" --log-path "${CLICKHOUSE_LOGDIR}" --data-path "${CLICKHOUSE_DATADIR}" fi diff --git a/debian/clickhouse-server.preinst b/debian/clickhouse-server.preinst deleted file mode 100644 index 3529aefa7da..00000000000 --- a/debian/clickhouse-server.preinst +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh - -if [ "$1" = "upgrade" ]; then - # Return etc/cron.d/clickhouse-server to original state - service clickhouse-server disable_cron ||: -fi - -#DEBHELPER# diff --git a/debian/clickhouse-server.prerm b/debian/clickhouse-server.prerm deleted file mode 100644 index 02e855a7125..00000000000 --- a/debian/clickhouse-server.prerm +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -if [ "$1" = "upgrade" ] || [ "$1" = "remove" ]; then - # Return etc/cron.d/clickhouse-server to original state - service clickhouse-server disable_cron ||: -fi diff --git a/debian/clickhouse-server.templates b/debian/clickhouse-server.templates deleted file mode 100644 index dd55824e15c..00000000000 --- a/debian/clickhouse-server.templates +++ /dev/null @@ -1,3 +0,0 @@ -Template: clickhouse-server/default-password -Type: password -Description: Enter password for default user: diff --git a/debian/clickhouse.limits b/debian/clickhouse.limits deleted file mode 100644 index aca44082c4e..00000000000 --- a/debian/clickhouse.limits +++ /dev/null @@ -1,2 +0,0 @@ -clickhouse soft nofile 262144 -clickhouse hard nofile 262144 diff --git a/debian/rules b/debian/rules index 8eb47e95389..73d1f3d3b34 100755 --- a/debian/rules +++ b/debian/rules @@ -113,9 +113,6 @@ override_dh_install: ln -sf clickhouse-server.docs debian/clickhouse-client.docs ln -sf clickhouse-server.docs debian/clickhouse-common-static.docs - mkdir -p $(DESTDIR)/etc/security/limits.d - cp debian/clickhouse.limits $(DESTDIR)/etc/security/limits.d/clickhouse.conf - # systemd compatibility mkdir -p $(DESTDIR)/etc/systemd/system/ cp debian/clickhouse-server.service $(DESTDIR)/etc/systemd/system/ diff --git a/debian/watch b/debian/watch index 7ad4cedf713..ed3cab97ade 100644 --- a/debian/watch +++ b/debian/watch @@ -1,6 +1,6 @@ version=4 opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)-stable\.tar\.gz%clickhouse-$1.tar.gz%" 
\ - https://github.com/yandex/clickhouse/tags \ + https://github.com/ClickHouse/ClickHouse/tags \ (?:.*?/)?v?(\d[\d.]*)-stable\.tar\.gz debian uupdate diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 43921a4d3c4..569025dec1c 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.3.1.* +ARG version=21.6.1.* RUN apt-get update \ && apt-get install --yes --no-install-recommends \ @@ -18,6 +18,7 @@ RUN apt-get update \ clickhouse-client=$version \ clickhouse-common-static=$version \ locales \ + tzdata \ && rm -rf /var/lib/apt/lists/* /var/cache/debconf \ && apt-get clean diff --git a/docker/images.json b/docker/images.json index 303bd159ce4..e2e22468596 100644 --- a/docker/images.json +++ b/docker/images.json @@ -138,7 +138,8 @@ "docker/test/stateless_unbundled", "docker/test/stateless_pytest", "docker/test/integration/base", - "docker/test/fuzzer" + "docker/test/fuzzer", + "docker/test/keeper-jepsen" ] }, "docker/packager/unbundled": { @@ -159,5 +160,9 @@ "docker/test/sqlancer": { "name": "yandex/clickhouse-sqlancer-test", "dependent": [] + }, + "docker/test/keeper-jepsen": { + "name": "yandex/clickhouse-keeper-jepsen-test", + "dependent": [] } } diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 74de1a3e9bd..6948aeb3b18 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -4,14 +4,22 @@ FROM ubuntu:20.04 ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ - && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ + && apt-get install \ + apt-transport-https \ + apt-utils \ + ca-certificates \ + dnsutils \ + gnupg \ + iputils-ping \ + lsb-release \ + wget \ --yes --no-install-recommends --verbose-versions \ && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ && apt-key add /tmp/llvm-snapshot.gpg.key \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] http://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ + && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ /etc/apt/sources.list # initial packages @@ -27,35 +35,38 @@ RUN apt-get update \ RUN apt-get update \ && apt-get install \ bash \ - cmake \ + build-essential \ ccache \ - curl \ - gcc-9 \ - g++-9 \ clang-10 \ - clang-tidy-10 \ - lld-10 \ - llvm-10 \ - llvm-10-dev \ clang-11 \ + clang-tidy-10 \ clang-tidy-11 \ - lld-11 \ - llvm-11 \ - llvm-11-dev \ + cmake \ + cmake \ + curl \ + g++-9 \ + gcc-9 \ + gdb \ + git \ + gperf \ + gperf \ + intel-opencl-icd \ libicu-dev \ libreadline-dev \ + lld-10 \ + lld-11 \ + llvm-10 \ + llvm-10-dev \ + llvm-11 \ + llvm-11-dev \ + moreutils \ ninja-build \ - gperf \ - git \ - opencl-headers \ ocl-icd-libopencl1 \ - intel-opencl-icd \ - tzdata \ - gperf \ - cmake \ - gdb \ + opencl-headers \ + pigz \ + pixz \ rename \ - build-essential \ + tzdata \ --yes --no-install-recommends # This symlink required by gcc to find lld compiler @@ -103,4 +114,4 @@ RUN rm /etc/apt/sources.list.d/proposed-repositories.list && apt-get update COPY build.sh / -CMD ["/bin/bash", 
"/build.sh"] +CMD ["bash", "-c", "/build.sh 2>&1 | ts"] diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index a42789c6186..cf74105fbbb 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -11,17 +11,28 @@ tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build/cmake/toolc mkdir -p build/cmake/toolchain/freebsd-x86_64 tar xJf freebsd-11.3-toolchain.tar.xz -C build/cmake/toolchain/freebsd-x86_64 --strip-components=1 +# Uncomment to debug ccache. Don't put ccache log in /output right away, or it +# will be confusingly packed into the "performance" package. +# export CCACHE_LOGFILE=/build/ccache.log +# export CCACHE_DEBUG=1 + mkdir -p build/build_docker cd build/build_docker -ccache --show-stats ||: -ccache --zero-stats ||: -ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so ||: rm -f CMakeCache.txt # Read cmake arguments into array (possibly empty) read -ra CMAKE_FLAGS <<< "${CMAKE_FLAGS:-}" cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" .. + +ccache --show-config ||: +ccache --show-stats ||: +ccache --zero-stats ||: + # shellcheck disable=SC2086 # No quotes because I want it to expand to nothing if empty. ninja $NINJA_FLAGS clickhouse-bundle + +ccache --show-config ||: +ccache --show-stats ||: + mv ./programs/clickhouse* /output mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds find . -name '*.so' -print -exec mv '{}' /output \; @@ -65,8 +76,21 @@ then cp ../programs/server/config.xml /output/config cp ../programs/server/users.xml /output/config cp -r --dereference ../programs/server/config.d /output/config - tar -czvf "$COMBINED_OUTPUT.tgz" /output + tar -cv -I pigz -f "$COMBINED_OUTPUT.tgz" /output rm -r /output/* mv "$COMBINED_OUTPUT.tgz" /output fi -ccache --show-stats ||: + +if [ "${CCACHE_DEBUG:-}" == "1" ] +then + find . -name '*.ccache-*' -print0 \ + | tar -c -I pixz -f /output/ccache-debug.txz --null -T - +fi + +if [ -n "$CCACHE_LOGFILE" ] +then + # Compress the log as well, or else the CI will try to compress all log + # files in place, and will fail because this directory is not writable. + tar -cv -I pixz -f /output/ccache.log.txz "$CCACHE_LOGFILE" +fi + diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index 8fd89d60f85..902929a2644 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -34,31 +34,32 @@ RUN curl -O https://clickhouse-builds.s3.yandex.net/utils/1/dpkg-deb \ # Libraries from OS are only needed to test the "unbundled" build (this is not used in production). RUN apt-get update \ && apt-get install \ - gcc-9 \ - g++-9 \ - clang-11 \ - clang-tidy-11 \ - lld-11 \ - llvm-11 \ - llvm-11-dev \ + alien \ clang-10 \ + clang-11 \ clang-tidy-10 \ + clang-tidy-11 \ + cmake \ + debhelper \ + devscripts \ + g++-9 \ + gcc-9 \ + gdb \ + git \ + gperf \ lld-10 \ + lld-11 \ llvm-10 \ llvm-10-dev \ + llvm-11 \ + llvm-11-dev \ + moreutils \ ninja-build \ perl \ - pkg-config \ - devscripts \ - debhelper \ - git \ - tzdata \ - gperf \ - alien \ - cmake \ - gdb \ - moreutils \ pigz \ + pixz \ + pkg-config \ + tzdata \ --yes --no-install-recommends # NOTE: For some reason we have outdated version of gcc-10 in ubuntu 20.04 stable. 
diff --git a/docker/packager/deb/build.sh b/docker/packager/deb/build.sh index 6450e21d289..c1a0b27db5d 100755 --- a/docker/packager/deb/build.sh +++ b/docker/packager/deb/build.sh @@ -2,8 +2,14 @@ set -x -e +# Uncomment to debug ccache. +# export CCACHE_LOGFILE=/build/ccache.log +# export CCACHE_DEBUG=1 + +ccache --show-config ||: ccache --show-stats ||: ccache --zero-stats ||: + read -ra ALIEN_PKGS <<< "${ALIEN_PKGS:-}" build/release --no-pbuilder "${ALIEN_PKGS[@]}" | ts '%Y-%m-%d %H:%M:%S' mv /*.deb /output @@ -22,5 +28,19 @@ then mv /build/obj-*/src/unit_tests_dbms /output/binary fi fi + +ccache --show-config ||: ccache --show-stats ||: -ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so ||: + +if [ "${CCACHE_DEBUG:-}" == "1" ] +then + find /build -name '*.ccache-*' -print0 \ + | tar -c -I pixz -f /output/ccache-debug.txz --null -T - +fi + +if [ -n "$CCACHE_LOGFILE" ] +then + # Compress the log as well, or else the CI will try to compress all log + # files in place, and will fail because this directory is not writable. + tar -cv -I pixz -f /output/ccache.log.txz "$CCACHE_LOGFILE" +fi diff --git a/docker/packager/unbundled/build.sh b/docker/packager/unbundled/build.sh index 54575ab977c..99fc34fd9f3 100755 --- a/docker/packager/unbundled/build.sh +++ b/docker/packager/unbundled/build.sh @@ -13,4 +13,3 @@ mv /*.rpm /output ||: # if exists mv /*.tgz /output ||: # if exists ccache --show-stats ||: -ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so ||: diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 8e39af5646c..48c978366c6 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,9 +1,24 @@ FROM ubuntu:20.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.3.1.* +ARG version=21.6.1.* ARG gosu_ver=1.10 +# set non-empty deb_location_url url to create a docker image +# from debs created by CI build, for example: +# docker build . --network host --build-arg version="21.4.1.6282" --build-arg deb_location_url="https://clickhouse-builds.s3.yandex.net/21852/069cfbff388b3d478d1a16dc7060b48073f5d522/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/" -t filimonovq/clickhouse-server:pr21852 +ARG deb_location_url="" + +# set non-empty single_binary_location_url to create docker image +# from a single binary url (useful for non-standard builds - with sanitizers, for arm64). +# for example (run on aarch64 server): +# docker build . --network host --build-arg single_binary_location_url="https://builds.clickhouse.tech/master/aarch64/clickhouse" -t altinity/clickhouse-server:master-testing-arm +# note: clickhouse-odbc-bridge is not supported there. +ARG single_binary_location_url="" + +# see https://github.com/moby/moby/issues/4032#issuecomment-192327844 +ARG DEBIAN_FRONTEND=noninteractive + # user/group precreated explicitly with fixed uid/gid on purpose. # It is especially important for rootless containers: in that case entrypoint # can't do chown and owners of mounted volumes should be configured externally. 
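The packager build scripts above (docker/packager/binary/build.sh and docker/packager/deb/build.sh) gate the extra ccache diagnostics behind two commented-out environment variables and pack the resulting artifacts with pixz, so the CI log compressor never has to rewrite files in the read-only output directory. The following is a condensed, non-authoritative sketch of that flow for a local run; the variables, flags, and /build and /output paths are taken from the scripts themselves, and the build step shown is the ninja invocation used by the binary variant.

    # Hypothetical local debug run: uncomment the same two variables the scripts mention.
    export CCACHE_LOGFILE=/build/ccache.log   # per-invocation ccache log, kept out of /output until the end
    export CCACHE_DEBUG=1                     # ccache drops *.ccache-* debug files next to the object files

    ccache --show-config ||:
    ccache --zero-stats ||:

    ninja clickhouse-bundle                   # build step (binary/build.sh); deb/build.sh calls build/release instead

    ccache --show-stats ||:

    # Collect the debug artifacts the same way the scripts do, compressed with pixz.
    find /build -name '*.ccache-*' -print0 \
        | tar -c -I pixz -f /output/ccache-debug.txz --null -T -
    tar -cv -I pixz -f /output/ccache.log.txz "$CCACHE_LOGFILE"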
@@ -19,19 +34,37 @@ RUN groupadd -r clickhouse --gid=101 \ ca-certificates \ dirmngr \ gnupg \ + locales \ + wget \ + tzdata \ && mkdir -p /etc/apt/sources.list.d \ && apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 \ && echo $repository > /etc/apt/sources.list.d/clickhouse.list \ - && apt-get update \ - && env DEBIAN_FRONTEND=noninteractive \ - apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ - && env DEBIAN_FRONTEND=noninteractive \ - apt-get install --allow-unauthenticated --yes --no-install-recommends \ - clickhouse-common-static=$version \ - clickhouse-client=$version \ - clickhouse-server=$version \ - locales \ - wget \ + && if [ -n "$deb_location_url" ]; then \ + echo "installing from custom url with deb packages: $deb_location_url" \ + rm -rf /tmp/clickhouse_debs \ + && mkdir -p /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-common-static_${version}_amd64.deb" -P /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-client_${version}_all.deb" -P /tmp/clickhouse_debs \ + && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-server_${version}_all.deb" -P /tmp/clickhouse_debs \ + && dpkg -i /tmp/clickhouse_debs/*.deb ; \ + elif [ -n "$single_binary_location_url" ]; then \ + echo "installing from single binary url: $single_binary_location_url" \ + && rm -rf /tmp/clickhouse_binary \ + && mkdir -p /tmp/clickhouse_binary \ + && wget --progress=bar:force:noscroll "$single_binary_location_url" -O /tmp/clickhouse_binary/clickhouse \ + && chmod +x /tmp/clickhouse_binary/clickhouse \ + && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" ; \ + else \ + echo "installing from repository: $repository" \ + && apt-get update \ + && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ + && apt-get install --allow-unauthenticated --yes --no-install-recommends \ + clickhouse-common-static=$version \ + clickhouse-client=$version \ + clickhouse-server=$version ; \ + fi \ + && clickhouse-local -q 'SELECT * FROM system.build_options' \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 0f9de1996ab..cd192c0c9da 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -21,7 +21,9 @@ RUN addgroup -S -g 101 clickhouse \ && chown clickhouse:clickhouse /var/lib/clickhouse \ && chown root:clickhouse /var/log/clickhouse-server \ && chmod +x /entrypoint.sh \ - && apk add --no-cache su-exec bash \ + && apk add --no-cache su-exec bash tzdata \ + && cp /usr/share/zoneinfo/UTC /etc/localtime \ + && echo "UTC" > /etc/timezone \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client # we need to allow "others" access to clickhouse folder, because docker container diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 0138a165505..4486b0d9d7f 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -38,17 +38,16 @@ if ! 
$gosu test -f "$CLICKHOUSE_CONFIG" -a -r "$CLICKHOUSE_CONFIG"; then exit 1 fi -# port is needed to check if clickhouse-server is ready for connections -HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)" - # get CH directories locations DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=path || true)" TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)" USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)" LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)" -LOG_DIR="$(dirname "$LOG_PATH" || true)" +LOG_DIR="" +if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)" -ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)" +ERROR_LOG_DIR="" +if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)" CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" @@ -106,6 +105,9 @@ EOT fi if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then + # port is needed to check if clickhouse-server is ready for connections + HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)" + # Listen only on localhost until the initialization is done $gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 & pid="$!" diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index f151ae8fddf..0e4646386ce 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.3.1.* +ARG version=21.6.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 64be52d8e30..2864f7fc4da 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-fasttest . FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ @@ -43,20 +43,20 @@ RUN apt-get update \ clang-tidy-${LLVM_VERSION} \ cmake \ curl \ - lsof \ expect \ fakeroot \ - git \ gdb \ + git \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ + lsof \ moreutils \ ninja-build \ psmisc \ python3 \ - python3-pip \ python3-lxml \ + python3-pip \ python3-requests \ python3-termcolor \ rename \ diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index b6fcdd7f7d2..a7cc398e5c9 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -8,6 +8,9 @@ trap 'kill $(jobs -pr) ||:' EXIT # that we can run the "everything else" stage from the cloned source. stage=${stage:-} +# Compiler version, normally set by Dockerfile +export LLVM_VERSION=${LLVM_VERSION:-11} + # A variable to pass additional flags to CMake. # Here we explicitly default it to nothing so that bash doesn't complain about # it being undefined. 
Also read it as array so that we can pass an empty list @@ -70,6 +73,7 @@ function start_server --path "$FASTTEST_DATA" --user_files_path "$FASTTEST_DATA/user_files" --top_level_domains_path "$FASTTEST_DATA/top_level_domains" + --keeper_server.log_storage_path "$FASTTEST_DATA/coordination" ) clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" & server_pid=$! @@ -123,22 +127,26 @@ continue function clone_root { - git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" + git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" ( cd "$FASTTEST_SOURCE" if [ "$PULL_REQUEST_NUMBER" != "0" ]; then - if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then + if git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then git checkout FETCH_HEAD - echo 'Clonned merge head' + echo "Checked out pull/$PULL_REQUEST_NUMBER/merge ($(git rev-parse FETCH_HEAD))" else - git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head" + git fetch --depth 1 origin "+refs/pull/$PULL_REQUEST_NUMBER/head" git checkout "$COMMIT_SHA" - echo 'Checked out to commit' + echo "Checked out nominal SHA $COMMIT_SHA for PR $PULL_REQUEST_NUMBER" fi else if [ -v COMMIT_SHA ]; then + git fetch --depth 1 origin "$COMMIT_SHA" git checkout "$COMMIT_SHA" + echo "Checked out nominal SHA $COMMIT_SHA for master" + else + echo "Using default repository head $(git rev-parse HEAD)" fi fi ) @@ -150,6 +158,7 @@ function clone_submodules cd "$FASTTEST_SOURCE" SUBMODULES_TO_UPDATE=( + contrib/abseil-cpp contrib/antlr4-runtime contrib/boost contrib/zlib-ng @@ -179,7 +188,7 @@ function clone_submodules ) git submodule sync - git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}" + git submodule update --depth 1 --init --recursive "${SUBMODULES_TO_UPDATE[@]}" git submodule foreach git reset --hard git submodule foreach git checkout @ -f git submodule foreach git clean -xfd @@ -213,7 +222,7 @@ function run_cmake ( cd "$FASTTEST_BUILD" - cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" + cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER="clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="clang-${LLVM_VERSION}" "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt" ) } @@ -221,7 +230,7 @@ function build { ( cd "$FASTTEST_BUILD" - time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" + time ninja clickhouse-bundle 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt" if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse" fi @@ -290,6 +299,8 @@ function run_tests 01318_decrypt # Depends on OpenSSL 01663_aes_msan # Depends on OpenSSL 01667_aes_args_check # Depends on OpenSSL + 01776_decrypt_aead_size_check # Depends on OpenSSL + 01811_filter_by_null # Depends on OpenSSL 01281_unsucceeded_insert_select_queries_counter 01292_create_user 01294_lazy_database_concurrent @@ -297,10 +308,8 @@ function run_tests 01354_order_by_tuple_collate_const 01355_ilike 01411_bayesian_ab_testing - 01532_collate_in_low_cardinality - 01533_collate_in_nullable - 01542_collate_in_array - 01543_collate_in_tuple + collate + collation 
_orc_ arrow avro @@ -355,7 +364,12 @@ function run_tests # JSON functions 01666_blns - 01674_htm_xml_coarse_parse + + # Requires postgresql-client + 01802_test_postgresql_protocol_with_row_policy + + # Depends on AWS + 01801_s3_cluster ) (time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" @@ -375,7 +389,7 @@ function run_tests stop_server ||: # Clean the data so that there is no interference from the previous test run. - rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||: + rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files,coordination} ||: start_server @@ -418,7 +432,7 @@ case "$stage" in # See the compatibility hacks in `clone_root` stage above. Remove at the same time, # after Nov 1, 2020. cd "$FASTTEST_WORKSPACE" - clone_submodules | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" + clone_submodules 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/submodule_log.txt" ;& "run_cmake") run_cmake @@ -429,7 +443,7 @@ case "$stage" in "configure") # The `install_log.txt` is also needed for compatibility with old CI task -- # if there is no log, it will decide that build failed. - configure | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" + configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" ;& "run_tests") run_tests diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 766fec76179..626bedb453c 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -4,7 +4,9 @@ set -eux set -o pipefail trap "exit" INT TERM -trap 'kill $(jobs -pr) ||:' EXIT +# The watchdog is in the separate process group, so we have to kill it separately +# if the script terminates earlier. +trap 'kill $(jobs -pr) ${watchdog_pid:-} ||:' EXIT stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" @@ -14,35 +16,28 @@ BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-11_debug_none_bundled_unsplitted function clone { -( + # The download() function is dependent on CI binaries anyway, so we can take + # the repo from the CI as well. For local runs, start directly from the "fuzz" + # stage. rm -rf ch ||: - mkdir ch - cd ch - - git init - git remote add origin https://github.com/ClickHouse/ClickHouse - - # Network is unreliable. GitHub neither. 
- for _ in {1..100}; do git fetch --depth=100 origin "$SHA_TO_TEST" && break; sleep 1; done - # Used to obtain the list of modified or added tests - for _ in {1..100}; do git fetch --depth=100 origin master && break; sleep 1; done - - # If not master, try to fetch pull/.../{head,merge} - if [ "$PR_TO_TEST" != "0" ] - then - for _ in {1..100}; do git fetch --depth=100 origin "refs/pull/$PR_TO_TEST/*:refs/heads/pull/$PR_TO_TEST/*" && break; sleep 1; done - fi - - git checkout "$SHA_TO_TEST" -) + mkdir ch ||: + wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz" + tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz + ls -lath ||: } function download { - wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse" + wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse" & + wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/ci-changed-files.txt" & + wait + chmod +x clickhouse ln -s ./clickhouse ./clickhouse-server ln -s ./clickhouse ./clickhouse-client + + # clickhouse-server is in the current dir + export PATH="$PWD:$PATH" } function configure @@ -74,25 +69,38 @@ function watchdog killall -9 clickhouse-client ||: } +function filter_exists +{ + local path + for path in "$@"; do + if [ -e "$path" ]; then + echo "$path" + else + echo "'$path' does not exists" >&2 + fi + done +} + function fuzz { # Obtain the list of newly added tests. They will be fuzzed in more extreme way than other tests. - cd ch - NEW_TESTS=$(git diff --name-only "$(git merge-base origin/master "$SHA_TO_TEST"~)" "$SHA_TO_TEST" | grep -P 'tests/queries/0_stateless/.*\.sql' | sed -r -e 's!^!ch/!' | sort -R) - cd .. + # Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment. + NEW_TESTS="$(grep -P 'tests/queries/0_stateless/.*\.sql' ci-changed-files.txt | sed -r -e 's!^!ch/!' | sort -R)" + # ci-changed-files.txt contains also files that has been deleted/renamed, filter them out. + NEW_TESTS="$(filter_exists $NEW_TESTS)" if [[ -n "$NEW_TESTS" ]] then - NEW_TESTS_OPT="--interleave-queries-file ${NEW_TESTS}" + NEW_TESTS_OPT="${NEW_TESTS_OPT:---interleave-queries-file ${NEW_TESTS}}" else - NEW_TESTS_OPT="" + NEW_TESTS_OPT="${NEW_TESTS_OPT:-}" fi - ./clickhouse-server --config-file db/config.xml -- --path db 2>&1 | tail -100000 > server.log & + clickhouse-server --config-file db/config.xml -- --path db 2>&1 | tail -100000 > server.log & server_pid=$! kill -0 $server_pid - while ! ./clickhouse-client --query "select 1" && kill -0 $server_pid ; do echo . ; sleep 1 ; done - ./clickhouse-client --query "select 1" + while ! clickhouse-client --query "select 1" && kill -0 $server_pid ; do echo . ; sleep 1 ; done + clickhouse-client --query "select 1" kill -0 $server_pid echo Server started @@ -111,14 +119,14 @@ continue # SC2012: Use find instead of ls to better handle non-alphanumeric filenames. They are all alphanumeric. # SC2046: Quote this to prevent word splitting. Actually I need word splitting. 
# shellcheck disable=SC2012,SC2046 - ./clickhouse-client --query-fuzzer-runs=1000 --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) $NEW_TESTS_OPT \ + clickhouse-client --query-fuzzer-runs=1000 --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) $NEW_TESTS_OPT \ > >(tail -n 100000 > fuzzer.log) \ 2>&1 \ || fuzzer_exit_code=$? echo "Fuzzer exit code is $fuzzer_exit_code" - ./clickhouse-client --query "select elapsed, query from system.processes" ||: + clickhouse-client --query "select elapsed, query from system.processes" ||: killall clickhouse-server ||: for _ in {1..10} do @@ -190,7 +198,7 @@ case "$stage" in # Lost connection to the server. This probably means that the server died # with abort. echo "failure" > status.txt - if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log > description.txt + if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: AddressSanitizer:.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log > description.txt then echo "Lost connection to server. See the logs." > description.txt fi diff --git a/docker/test/integration/base/Dockerfile b/docker/test/integration/base/Dockerfile index 4963ff0094d..1c962f1bf8f 100644 --- a/docker/test/integration/base/Dockerfile +++ b/docker/test/integration/base/Dockerfile @@ -18,7 +18,9 @@ RUN apt-get update \ curl \ tar \ krb5-user \ - iproute2 + iproute2 \ + lsof \ + g++ RUN rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index e0e5e36a3d6..783e689ed01 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -31,6 +31,7 @@ RUN apt-get update \ software-properties-common \ libkrb5-dev \ krb5-user \ + g++ \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml new file mode 100644 index 00000000000..d0674362709 --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml @@ -0,0 +1,23 @@ +version: '2.3' +services: + mysql2: + image: mysql:5.7 + restart: always + environment: + MYSQL_ROOT_PASSWORD: clickhouse + ports: + - 3348:3306 + mysql3: + image: mysql:5.7 + restart: always + environment: + MYSQL_ROOT_PASSWORD: clickhouse + ports: + - 3388:3306 + mysql4: + image: mysql:5.7 + restart: always + environment: + MYSQL_ROOT_PASSWORD: clickhouse + ports: + - 3368:3306 diff --git a/docker/test/integration/runner/compose/docker_compose_postgres_cluster.yml b/docker/test/integration/runner/compose/docker_compose_postgres_cluster.yml new file mode 100644 index 00000000000..d04c8a2f3a6 --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_postgres_cluster.yml @@ -0,0 +1,23 @@ +version: '2.3' +services: + postgres2: + image: postgres + restart: always + environment: + POSTGRES_PASSWORD: mysecretpassword + ports: + - 5421:5432 + postgres3: + image: postgres + restart: always + environment: + POSTGRES_PASSWORD: mysecretpassword + ports: + - 5441:5432 + postgres4: + image: postgres + restart: always + environment: + POSTGRES_PASSWORD: mysecretpassword + ports: + - 
5461:5432 diff --git a/docker/test/integration/runner/compose/docker_compose_zookeeper.yml b/docker/test/integration/runner/compose/docker_compose_zookeeper.yml index 49e285b5515..1601d217a25 100644 --- a/docker/test/integration/runner/compose/docker_compose_zookeeper.yml +++ b/docker/test/integration/runner/compose/docker_compose_zookeeper.yml @@ -1,11 +1,11 @@ version: '2.3' services: zoo1: - image: zookeeper:3.4.12 + image: zookeeper:3.6.2 restart: always environment: ZOO_TICK_TIME: 500 - ZOO_SERVERS: server.1=zoo1:2888:3888 server.2=zoo2:2888:3888 server.3=zoo3:2888:3888 + ZOO_SERVERS: server.1=zoo1:2888:3888;2181 server.2=zoo2:2888:3888;2181 server.3=zoo3:2888:3888;2181 ZOO_MY_ID: 1 JVMFLAGS: -Dzookeeper.forceSync=no volumes: @@ -16,11 +16,11 @@ services: source: ${ZK_DATA_LOG1:-} target: /datalog zoo2: - image: zookeeper:3.4.12 + image: zookeeper:3.6.2 restart: always environment: ZOO_TICK_TIME: 500 - ZOO_SERVERS: server.1=zoo1:2888:3888 server.2=zoo2:2888:3888 server.3=zoo3:2888:3888 + ZOO_SERVERS: server.1=zoo1:2888:3888;2181 server.2=zoo2:2888:3888;2181 server.3=zoo3:2888:3888 ZOO_MY_ID: 2 JVMFLAGS: -Dzookeeper.forceSync=no volumes: @@ -31,11 +31,11 @@ services: source: ${ZK_DATA_LOG2:-} target: /datalog zoo3: - image: zookeeper:3.4.12 + image: zookeeper:3.6.2 restart: always environment: ZOO_TICK_TIME: 500 - ZOO_SERVERS: server.1=zoo1:2888:3888 server.2=zoo2:2888:3888 server.3=zoo3:2888:3888 + ZOO_SERVERS: server.1=zoo1:2888:3888;2181 server.2=zoo2:2888:3888;2181 server.3=zoo3:2888:3888;2181 ZOO_MY_ID: 3 JVMFLAGS: -Dzookeeper.forceSync=no volumes: diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index c0255d3d706..bda6f5a719d 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -21,6 +21,7 @@ export CLICKHOUSE_TESTS_SERVER_BIN_PATH=/clickhouse export CLICKHOUSE_TESTS_CLIENT_BIN_PATH=/clickhouse export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=/clickhouse-config export CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH=/clickhouse-odbc-bridge +export CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH=/clickhouse-library-bridge export DOCKER_MYSQL_GOLANG_CLIENT_TAG=${DOCKER_MYSQL_GOLANG_CLIENT_TAG:=latest} export DOCKER_MYSQL_JAVA_CLIENT_TAG=${DOCKER_MYSQL_JAVA_CLIENT_TAG:=latest} diff --git a/docker/test/keeper-jepsen/Dockerfile b/docker/test/keeper-jepsen/Dockerfile new file mode 100644 index 00000000000..1a62d5e793f --- /dev/null +++ b/docker/test/keeper-jepsen/Dockerfile @@ -0,0 +1,39 @@ +# docker build -t yandex/clickhouse-keeper-jepsen-test . 
+FROM yandex/clickhouse-test-base + +ENV DEBIAN_FRONTEND=noninteractive +ENV CLOJURE_VERSION=1.10.3.814 + +# arguments +ENV PR_TO_TEST="" +ENV SHA_TO_TEST="" + +ENV NODES_USERNAME="root" +ENV NODES_PASSWORD="" +ENV TESTS_TO_RUN="30" +ENV TIME_LIMIT="30" + + +# volumes +ENV NODES_FILE_PATH="/nodes.txt" +ENV TEST_OUTPUT="/test_output" + +RUN mkdir "/root/.ssh" +RUN touch "/root/.ssh/known_hosts" + +# install java +RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends + +# install clojure +RUN curl -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \ + chmod +x "linux-install-${CLOJURE_VERSION}.sh" && \ + bash "./linux-install-${CLOJURE_VERSION}.sh" + +# install leiningen +RUN curl -O "https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein" && \ + chmod +x ./lein && \ + mv ./lein /usr/bin + +COPY run.sh / + +CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh new file mode 100644 index 00000000000..352585e16e3 --- /dev/null +++ b/docker/test/keeper-jepsen/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + + +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} + + +if [ -z "$CLICKHOUSE_REPO_PATH" ]; then + CLICKHOUSE_REPO_PATH=ch + rm -rf ch ||: + mkdir ch ||: + wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz" + tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz + ls -lath ||: +fi + +cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse-keeper" + +(lein run test-all --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source "$CLICKHOUSE_PACKAGE" -q --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log" + +mv store "$TEST_OUTPUT/" diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 2b19a5e75a8..4507de16492 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -2,7 +2,9 @@ set -exu set -o pipefail trap "exit" INT TERM -trap 'kill $(jobs -pr) ||:' EXIT +# The watchdog is in the separate process group, so we have to kill it separately +# if the script terminates earlier. +trap 'kill $(jobs -pr) ${watchdog_pid:-} ||:' EXIT stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" @@ -358,6 +360,8 @@ mkdir analyze analyze/tmp ||: build_log_column_definitions # Split the raw test output into files suitable for analysis. +# To debug calculations only for a particular test, substitute a suitable +# wildcard here, e.g. `for test_file in modulo-raw.tsv`. 
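As the preceding comment notes, the analysis below can be narrowed to a single test simply by tightening that glob. A minimal sketch of the idea, assuming the earlier stages have already produced the per-test raw TSVs (the test name here is purely illustrative):

    # debug run: process one test's raw report instead of all of them
    for test_file in modulo-raw.tsv    # instead of *-raw.tsv
    do
        test_name=$(basename "$test_file" "-raw.tsv")
        echo "re-analyzing only $test_name"
    done
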
for test_file in *-raw.tsv do test_name=$(basename "$test_file" "-raw.tsv") @@ -467,7 +471,13 @@ create view broken_queries as create table query_run_metrics_for_stats engine File( TSV, -- do not add header -- will parse with grep 'analyze/query-run-metrics-for-stats.tsv') - as select test, query_index, 0 run, version, metric_values + as select test, query_index, 0 run, version, + -- For debugging, add a filter for a particular metric like this: + -- arrayFilter(m, n -> n = 'client_time', metric_values, metric_names) + -- metric_values + -- Note that further reporting may break, because the metric names are + -- not filtered. + metric_values from query_run_metric_arrays where (test, query_index) not in broken_queries order by test, query_index, run, version @@ -585,8 +595,19 @@ create view query_metric_stats as -- Main statistics for queries -- query time as reported in query log. create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv') as select - abs(diff) > report_threshold and abs(diff) > stat_threshold as changed_fail, - abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show, + -- It is important to have a non-strict inequality with stat_threshold + -- here. The randomization distribution is actually discrete, and when + -- the number of runs is small, the quantile we need (e.g. 0.99) turns + -- out to be the maximum value of the distribution. We can also hit this + -- maximum possible value with our test run, and this obviously means + -- that we have observed the difference to the best precision possible + -- for the given number of runs. If we use a strict equality here, we + -- will miss such cases. This happened in the wild and lead to some + -- uncaught regressions, because for the default 7 runs we do for PRs, + -- the randomization distribution has only 16 values, so the max quantile + -- is actually 0.9375. 
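Spelling out the arithmetic in that comment: with only 16 distinct values the quantile grid has step 1/16, so the maximum of the randomization distribution already sits at level 15/16 = 0.9375; any requested level above that, including 0.99, returns the same maximum, and an observed difference equal to that maximum still has to count, which is why the comparisons below use >= rather than >.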
+ abs(diff) > report_threshold and abs(diff) >= stat_threshold as changed_fail, + abs(diff) > report_threshold - 0.05 and abs(diff) >= stat_threshold as changed_show, not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail, not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show, @@ -739,7 +760,7 @@ create view test_times_view as total_client_time, queries, query_max, - real / queries avg_real_per_query, + real / if(queries > 0, queries, 1) avg_real_per_query, query_min, runs from test_time @@ -760,7 +781,7 @@ create view test_times_view_total as sum(total_client_time), sum(queries), max(query_max), - sum(real) / sum(queries) avg_real_per_query, + sum(real) / if(sum(queries) > 0, sum(queries), 1) avg_real_per_query, min(query_min), -- Totaling the number of runs doesn't make sense, but use the max so -- that the reporting script doesn't complain about queries being too diff --git a/docker/test/performance-comparison/config/config.d/user_files.xml b/docker/test/performance-comparison/config/config.d/user_files.xml new file mode 100644 index 00000000000..9dc8daab66b --- /dev/null +++ b/docker/test/performance-comparison/config/config.d/user_files.xml @@ -0,0 +1,10 @@ + + + /var/lib/clickhouse/user_files/ + + + users.xml + + + access/ + diff --git a/docker/test/performance-comparison/config/config.d/zzz-perf-comparison-tweaks-config.xml b/docker/test/performance-comparison/config/config.d/zzz-perf-comparison-tweaks-config.xml index ee2006201b0..7b941f844de 100644 --- a/docker/test/performance-comparison/config/config.d/zzz-perf-comparison-tweaks-config.xml +++ b/docker/test/performance-comparison/config/config.d/zzz-perf-comparison-tweaks-config.xml @@ -1,9 +1,10 @@ + - + :: diff --git a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml index f3609bcfcdb..63e23d8453c 100644 --- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml +++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml @@ -17,6 +17,14 @@ 12 + + + 64Mi + + + 1 + + diff --git a/docker/test/performance-comparison/eqmed.sql b/docker/test/performance-comparison/eqmed.sql index 139f0758798..d0111550ee6 100644 --- a/docker/test/performance-comparison/eqmed.sql +++ b/docker/test/performance-comparison/eqmed.sql @@ -1,4 +1,6 @@ --- input is table(test text, query text, run UInt32, version int, metrics Array(float)) +-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)). +-- Run like this: +-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv select arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded, arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded, @@ -8,14 +10,19 @@ select from ( -- quantiles of randomization distributions + -- note that for small number of runs, the exact quantile might not make + -- sense, because the last possible value of randomization distribution + -- might take a larger percentage of distirbution (i.e. the distribution + -- actually has discrete values, and the last step can be large). 
select quantileExactForEach(0.99)( arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d ) threshold - ---- uncomment to see what the distribution is really like - --, uniqExact(d.1) u + ---- Uncomment to see what the distribution is really like. This debug + ---- code only works for single (the first) metric. + --, uniqExact(d[1]) u --, arraySort(x->x.1, -- arrayZip( - -- (sumMap([d.1], [1]) as f).1, + -- (sumMap([d[1]], [1]) as f).1, -- f.2)) full_histogram from ( diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index f1c5df146aa..2588b9f4213 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -66,7 +66,12 @@ reportStageEnd('parse') subst_elems = root.findall('substitutions/substitution') available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... } for e in subst_elems: - available_parameters[e.find('name').text] = [v.text for v in e.findall('values/value')] + name = e.find('name').text + values = [v.text for v in e.findall('values/value')] + if not values: + raise Exception(f'No values given for substitution {{{name}}}') + + available_parameters[name] = values # Takes parallel lists of templates, substitutes them with all combos of # parameters. The set of parameters is determined based on the first list. @@ -263,8 +268,16 @@ for query_index in queries_to_run: for conn_index, c in enumerate(all_connections): try: prewarm_id = f'{query_prefix}.prewarm0' - # Will also detect too long queries during warmup stage - res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10}) + + try: + # Will also detect too long queries during warmup stage + res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds}) + except clickhouse_driver.errors.Error as e: + # Add query id to the exception to make debugging easier. + e.args = (prewarm_id, *e.args) + e.message = prewarm_id + ': ' + e.message + raise + print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}') except KeyboardInterrupt: raise @@ -311,8 +324,8 @@ for query_index in queries_to_run: for conn_index, c in enumerate(this_query_connections): try: - res = c.execute(q, query_id = run_id) - except Exception as e: + res = c.execute(q, query_id = run_id, settings = {'max_execution_time': args.max_query_seconds}) + except clickhouse_driver.errors.Error as e: # Add query id to the exception to make debugging easier. e.args = (run_id, *e.args) e.message = run_id + ': ' + e.message @@ -389,7 +402,7 @@ for query_index in queries_to_run: try: res = c.execute(q, query_id = run_id, settings = {'query_profiler_real_time_period_ns': 10000000}) print(f'profile\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}') - except Exception as e: + except clickhouse_driver.errors.Error as e: # Add query id to the exception to make debugging easier. 
e.args = (run_id, *e.args) e.message = run_id + ': ' + e.message diff --git a/docker/test/split_build_smoke_test/Dockerfile b/docker/test/split_build_smoke_test/Dockerfile index c77db1c6c88..54a9eb17868 100644 --- a/docker/test/split_build_smoke_test/Dockerfile +++ b/docker/test/split_build_smoke_test/Dockerfile @@ -2,5 +2,6 @@ FROM yandex/clickhouse-binary-builder COPY run.sh /run.sh +COPY process_split_build_smoke_test_result.py / CMD /run.sh diff --git a/docker/test/split_build_smoke_test/process_split_build_smoke_test_result.py b/docker/test/split_build_smoke_test/process_split_build_smoke_test_result.py new file mode 100755 index 00000000000..58d6ba8c62a --- /dev/null +++ b/docker/test/split_build_smoke_test/process_split_build_smoke_test_result.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv + +RESULT_LOG_NAME = "run.log" + +def process_result(result_folder): + + status = "success" + description = 'Server started and responded' + summary = [("Smoke test", "OK")] + with open(os.path.join(result_folder, RESULT_LOG_NAME), 'r') as run_log: + lines = run_log.read().split('\n') + if not lines or lines[0].strip() != 'OK': + status = "failure" + logging.info("Lines is not ok: %s", str('\n'.join(lines))) + summary = [("Smoke test", "FAIL")] + description = 'Server failed to respond, see result in logs' + + result_logs = [] + server_log_path = os.path.join(result_folder, "clickhouse-server.log") + stderr_log_path = os.path.join(result_folder, "stderr.log") + client_stderr_log_path = os.path.join(result_folder, "clientstderr.log") + + if os.path.exists(server_log_path): + result_logs.append(server_log_path) + + if os.path.exists(stderr_log_path): + result_logs.append(stderr_log_path) + + if os.path.exists(client_stderr_log_path): + result_logs.append(client_stderr_log_path) + + return status, description, summary, result_logs + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of split build smoke test") + parser.add_argument("--in-results-dir", default='/test_output/') + parser.add_argument("--out-results-file", default='/test_output/test_results.tsv') + parser.add_argument("--out-status-file", default='/test_output/check_status.tsv') + args = parser.parse_args() + + state, description, test_results, logs = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") diff --git a/docker/test/split_build_smoke_test/run.sh b/docker/test/split_build_smoke_test/run.sh index eac9848030e..b565d7a481e 100755 --- a/docker/test/split_build_smoke_test/run.sh +++ b/docker/test/split_build_smoke_test/run.sh @@ -5,16 +5,18 @@ set -x install_and_run_server() { mkdir /unpacked tar -xzf /package_folder/shared_build.tgz -C /unpacked --strip 1 - LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-server --config /unpacked/config/config.xml >/var/log/clickhouse-server/stderr.log 2>&1 & + LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-server --config /unpacked/config/config.xml >/test_output/stderr.log 2>&1 & } 
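The process_*_result.py helpers added in this change (for the split build smoke test above, and for SQLancer and the functional test images below) all follow the same output contract: a tab-separated test_results.tsv with one (check name, status[, time]) row per check, plus a one-row check_status.tsv carrying the overall state and description. A rough bash equivalent of what the smoke-test parser emits on success, together with the fallback its caller uses when parsing fails (values taken from the scripts in this change; /test_output is the conventional output directory):

    printf 'Smoke test\tOK\n'                         >  /test_output/test_results.tsv
    printf 'success\tServer started and responded\n'  >  /test_output/check_status.tsv
    # callers fall back to this if the parser itself dies:
    echo -e "failure\tCannot parse results"           >  /test_output/check_status.tsv
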
run_client() { for i in {1..100}; do sleep 1 - LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-client --query "select 'OK'" 2>/var/log/clickhouse-server/clientstderr.log && break + LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-client --query "select 'OK'" > /test_output/run.log 2> /test_output/clientstderr.log && break [[ $i == 100 ]] && echo 'FAIL' done } install_and_run_server run_client +mv /var/log/clickhouse-server/clickhouse-server.log /test_output/clickhouse-server.log +/process_split_build_smoke_test_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 38a773e65ad..253ca1b729a 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -1,8 +1,7 @@ # docker build -t yandex/clickhouse-sqlancer-test . FROM ubuntu:20.04 -RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven --yes --no-install-recommends - +RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip RUN mkdir /sqlancer && \ cd /sqlancer && \ @@ -10,4 +9,5 @@ RUN mkdir /sqlancer && \ RUN cd /sqlancer/sqlancer-master && mvn package -DskipTests COPY run.sh / +COPY process_sqlancer_result.py / CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/sqlancer/process_sqlancer_result.py b/docker/test/sqlancer/process_sqlancer_result.py new file mode 100755 index 00000000000..ede3cabc1c5 --- /dev/null +++ b/docker/test/sqlancer/process_sqlancer_result.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv + + +def process_result(result_folder): + status = "success" + summary = [] + paths = [] + tests = ["TLPWhere", "TLPGroupBy", "TLPHaving", "TLPWhereGroupBy", "TLPDistinct", "TLPAggregate"] + + for test in tests: + err_path = '{}/{}.err'.format(result_folder, test) + out_path = '{}/{}.out'.format(result_folder, test) + if not os.path.exists(err_path): + logging.info("No output err on path %s", err_path) + summary.append((test, "SKIPPED")) + elif not os.path.exists(out_path): + logging.info("No output log on path %s", out_path) + else: + paths.append(err_path) + paths.append(out_path) + with open(err_path, 'r') as f: + if 'AssertionError' in f.read(): + summary.append((test, "FAIL")) + status = 'failure' + else: + summary.append((test, "OK")) + + logs_path = '{}/logs.tar.gz'.format(result_folder) + if not os.path.exists(logs_path): + logging.info("No logs tar on path %s", logs_path) + else: + paths.append(logs_path) + stdout_path = '{}/stdout.log'.format(result_folder) + if not os.path.exists(stdout_path): + logging.info("No stdout log on path %s", stdout_path) + else: + paths.append(stdout_path) + stderr_path = '{}/stderr.log'.format(result_folder) + if not os.path.exists(stderr_path): + logging.info("No stderr log on path %s", stderr_path) + else: + paths.append(stderr_path) + + description = "SQLancer test run. 
See report" + + return status, description, summary, paths + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of sqlancer test") + parser.add_argument("--in-results-dir", default='/test_output/') + parser.add_argument("--out-results-file", default='/test_output/test_results.tsv') + parser.add_argument("--out-status-file", default='/test_output/check_status.tsv') + args = parser.parse_args() + + state, description, test_results, logs = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") diff --git a/docker/test/sqlancer/run.sh b/docker/test/sqlancer/run.sh index ffe0afd98a8..e465ba1c993 100755 --- a/docker/test/sqlancer/run.sh +++ b/docker/test/sqlancer/run.sh @@ -11,7 +11,7 @@ service clickhouse-server start && sleep 5 cd /sqlancer/sqlancer-master -export TIMEOUT=60 +export TIMEOUT=300 export NUM_QUERIES=1000 ( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPWhere | tee /test_output/TLPWhere.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhere.err @@ -29,4 +29,5 @@ tail -n 1000 /var/log/clickhouse-server/stderr.log > /test_output/stderr.log tail -n 1000 /var/log/clickhouse-server/stdout.log > /test_output/stdout.log tail -n 1000 /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log +/process_sqlancer_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv ls /test_output diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 7779f0e9dc2..8d865431570 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -13,6 +13,25 @@ dpkg -i package_folder/clickhouse-test_*.deb function start() { + if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + # NOTE We run "clickhouse server" instead of "clickhouse-server" + # to make "pidof clickhouse-server" return single pid of the main instance. 
+ # We wil run main instance using "service clickhouse-server start" + sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \ + -- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \ + --logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \ + --tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \ + --mysql_port 19004 --postgresql_port 19005 \ + --keeper_server.tcp_port 19181 --keeper_server.server_id 2 + + sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \ + -- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \ + --logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \ + --tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \ + --mysql_port 29004 --postgresql_port 29005 \ + --keeper_server.tcp_port 29181 --keeper_server.server_id 3 + fi + counter=0 until clickhouse-client --query "SELECT 1" do @@ -35,9 +54,8 @@ start /s3downloader --dataset-names $DATASETS chmod 777 -R /var/lib/clickhouse clickhouse-client --query "SHOW DATABASES" -clickhouse-client --query "ATTACH DATABASE datasets ENGINE = Ordinary" -clickhouse-client --query "CREATE DATABASE test" +clickhouse-client --query "ATTACH DATABASE datasets ENGINE = Ordinary" service clickhouse-server restart # Wait for server to start accepting connections @@ -47,21 +65,61 @@ for _ in {1..120}; do done clickhouse-client --query "SHOW TABLES FROM datasets" -clickhouse-client --query "SHOW TABLES FROM test" -clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" -clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" -clickhouse-client --query "SHOW TABLES FROM test" - -if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test ; then - SKIP_LIST_OPT="--use-skip-list" -fi - -# We can have several additional options so we path them as array because it's -# more idiologically correct. -read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}" if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - ADDITIONAL_OPTIONS+=('--replicated-database') + clickhouse-client --query "CREATE DATABASE test ON CLUSTER 'test_cluster_database_replicated' + ENGINE=Replicated('/test/clickhouse/db/test', '{shard}', '{replica}')" + + clickhouse-client --query "CREATE TABLE test.hits AS datasets.hits_v1" + clickhouse-client --query "CREATE TABLE test.visits AS datasets.visits_v1" + + clickhouse-client --query "INSERT INTO test.hits SELECT * FROM datasets.hits_v1" + clickhouse-client --query "INSERT INTO test.visits SELECT * FROM datasets.visits_v1" + + clickhouse-client --query "DROP TABLE datasets.hits_v1" + clickhouse-client --query "DROP TABLE datasets.visits_v1" + + MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours) + MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? 
MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited) +else + clickhouse-client --query "CREATE DATABASE test" + clickhouse-client --query "SHOW TABLES FROM test" + clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" + clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" fi -clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt +clickhouse-client --query "SHOW TABLES FROM test" +clickhouse-client --query "SELECT count() FROM test.hits" +clickhouse-client --query "SELECT count() FROM test.visits" + +function run_tests() +{ + set -x + # We can have several additional options so we path them as array because it's + # more idiologically correct. + read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}" + + if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ADDITIONAL_OPTIONS+=('--replicated-database') + fi + + clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --use-skip-list --print-time "${ADDITIONAL_OPTIONS[@]}" \ + "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt +} + +export -f run_tests +timeout "$MAX_RUN_TIME" bash -c run_tests ||: + +./process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv + +pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz ||: +mv /var/log/clickhouse-server/stderr.log /test_output/ ||: +if [[ -n "$WITH_COVERAGE" ]] && [[ "$WITH_COVERAGE" -eq 1 ]]; then + tar -chf /test_output/clickhouse_coverage.tar.gz /profraw ||: +fi +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + pigz < /var/log/clickhouse-server/clickhouse-server1.log > /test_output/clickhouse-server1.log.gz ||: + pigz < /var/log/clickhouse-server/clickhouse-server2.log > /test_output/clickhouse-server2.log.gz ||: + mv /var/log/clickhouse-server/stderr1.log /test_output/ ||: + mv /var/log/clickhouse-server/stderr2.log /test_output/ ||: +fi diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 2437415d17c..658ae1f27ba 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -28,7 +28,8 @@ RUN apt-get update -y \ tree \ unixodbc \ wget \ - mysql-client=5.7* + mysql-client=5.7* \ + postgresql-client RUN pip3 install numpy scipy pandas @@ -46,4 +47,5 @@ ENV NUM_TRIES=1 ENV MAX_RUN_TIME=0 COPY run.sh / +COPY process_functional_tests_result.py / CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/stateless/process_functional_tests_result.py b/docker/test/stateless/process_functional_tests_result.py new file mode 100755 index 00000000000..02adf108212 --- /dev/null +++ b/docker/test/stateless/process_functional_tests_result.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv + +OK_SIGN = "[ OK " +FAIL_SING = "[ FAIL " +TIMEOUT_SING = "[ Timeout! 
" +UNKNOWN_SIGN = "[ UNKNOWN " +SKIPPED_SIGN = "[ SKIPPED " +HUNG_SIGN = "Found hung queries in processlist" + +NO_TASK_TIMEOUT_SIGN = "All tests have finished" + +def process_test_log(log_path): + total = 0 + skipped = 0 + unknown = 0 + failed = 0 + success = 0 + hung = False + task_timeout = True + test_results = [] + with open(log_path, 'r') as test_file: + for line in test_file: + line = line.strip() + if NO_TASK_TIMEOUT_SIGN in line: + task_timeout = False + if HUNG_SIGN in line: + hung = True + if any(sign in line for sign in (OK_SIGN, FAIL_SING, UNKNOWN_SIGN, SKIPPED_SIGN)): + test_name = line.split(' ')[2].split(':')[0] + + test_time = '' + try: + time_token = line.split(']')[1].strip().split()[0] + float(time_token) + test_time = time_token + except: + pass + + total += 1 + if TIMEOUT_SING in line: + failed += 1 + test_results.append((test_name, "Timeout", test_time)) + elif FAIL_SING in line: + failed += 1 + test_results.append((test_name, "FAIL", test_time)) + elif UNKNOWN_SIGN in line: + unknown += 1 + test_results.append((test_name, "FAIL", test_time)) + elif SKIPPED_SIGN in line: + skipped += 1 + test_results.append((test_name, "SKIPPED", test_time)) + else: + success += int(OK_SIGN in line) + test_results.append((test_name, "OK", test_time)) + return total, skipped, unknown, failed, success, hung, task_timeout, test_results + +def process_result(result_path): + test_results = [] + state = "success" + description = "" + files = os.listdir(result_path) + if files: + logging.info("Find files in result folder %s", ','.join(files)) + result_path = os.path.join(result_path, 'test_result.txt') + else: + result_path = None + description = "No output log" + state = "error" + + if result_path and os.path.exists(result_path): + total, skipped, unknown, failed, success, hung, task_timeout, test_results = process_test_log(result_path) + is_flacky_check = 1 < int(os.environ.get('NUM_TRIES', 1)) + # If no tests were run (success == 0) it indicates an error (e.g. server did not start or crashed immediately) + # But it's Ok for "flaky checks" - they can contain just one test for check which is marked as skipped. 
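The is_flacky_check flag a few lines below is driven by the NUM_TRIES environment variable that the stateless image already defines (ENV NUM_TRIES=1 in its Dockerfile): when tests are deliberately repeated many times, a report consisting only of skipped entries must not be marked as a failure. A hedged usage sketch (the exact invocation is illustrative, not taken from this change):

    # "flaky check" mode: NUM_TRIES > 1 re-runs the tests and tells the
    # report parser to tolerate an all-skipped result
    NUM_TRIES=100 MAX_RUN_TIME=3600 /run.sh
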
+ if failed != 0 or unknown != 0 or (success == 0 and (not is_flacky_check)): + state = "failure" + + if hung: + description = "Some queries hung, " + state = "failure" + elif task_timeout: + description = "Timeout, " + state = "failure" + else: + description = "" + + description += "fail: {}, passed: {}".format(failed, success) + if skipped != 0: + description += ", skipped: {}".format(skipped) + if unknown != 0: + description += ", unknown: {}".format(unknown) + else: + state = "failure" + description = "Output log doesn't exist" + test_results = [] + + return state, description, test_results + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of functional tests") + parser.add_argument("--in-results-dir", default='/test_output/') + parser.add_argument("--out-results-file", default='/test_output/test_results.tsv') + parser.add_argument("--out-status-file", default='/test_output/check_status.tsv') + args = parser.parse_args() + + state, description, test_results = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index d078f3739fd..e6f2d678aa9 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -34,36 +34,61 @@ if [ "$NUM_TRIES" -gt "1" ]; then # simpliest way to forward env variables to server sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon - sleep 5 else - service clickhouse-server start && sleep 5 + service clickhouse-server start fi -if grep -q -- "--use-skip-list" /usr/bin/clickhouse-test; then - SKIP_LIST_OPT="--use-skip-list" +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + + sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \ + -- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \ + --logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \ + --tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \ + --mysql_port 19004 --postgresql_port 19005 \ + --keeper_server.tcp_port 19181 --keeper_server.server_id 2 \ + --macros.replica r2 # It doesn't work :( + + sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \ + -- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \ + --logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \ + --tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \ + --mysql_port 29004 --postgresql_port 29005 \ + --keeper_server.tcp_port 29181 --keeper_server.server_id 3 \ + --macros.shard s2 # It doesn't work :( + + 
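The MAX_RUN_TIME clamp that follows is order-sensitive because 0 means "unlimited" (the Dockerfile defaults MAX_RUN_TIME=0): a plain min() would keep the 0 and the 2.5-hour cap would never apply, so a second substitution converts the 0 into an explicit budget. Tracing the default value through both steps (a sketch using the same bash arithmetic):

    MAX_RUN_TIME=0                                                # default: unlimited
    MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000))   # still 0, since 0 < 9000
    MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000))     # becomes 9000, i.e. 2.5 hours
    timeout "$MAX_RUN_TIME" bash -c run_tests ||:                 # as invoked later in run.sh
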
MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours) + MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited) fi +sleep 5 + function run_tests() { + set -x # We can have several additional options so we path them as array because it's # more idiologically correct. read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}" # Skip these tests, because they fail when we rerun them multiple times if [ "$NUM_TRIES" -gt "1" ]; then + ADDITIONAL_OPTIONS+=('--order=random') ADDITIONAL_OPTIONS+=('--skip') ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip') - ADDITIONAL_OPTIONS+=('--jobs') - ADDITIONAL_OPTIONS+=('4') + # Note that flaky check must be ran in parallel, but for now we run + # everything in parallel except DatabaseReplicated. See below. fi if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then ADDITIONAL_OPTIONS+=('--replicated-database') + else + # Too many tests fail for DatabaseReplicated in parallel. All other + # configurations are OK. + ADDITIONAL_OPTIONS+=('--jobs') + ADDITIONAL_OPTIONS+=('8') fi clickhouse-test --testname --shard --zookeeper --hung-check --print-time \ - --test-runs "$NUM_TRIES" \ - "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ + --use-skip-list --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt } @@ -72,5 +97,34 @@ export -f run_tests timeout "$MAX_RUN_TIME" bash -c run_tests ||: +./process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv + +clickhouse-client -q "system flush logs" ||: + +pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz & +clickhouse-client -q "select * from system.query_log format TSVWithNamesAndTypes" | pigz > /test_output/query-log.tsv.gz & +clickhouse-client -q "select * from system.query_thread_log format TSVWithNamesAndTypes" | pigz > /test_output/query-thread-log.tsv.gz & +clickhouse-client --allow_introspection_functions=1 -q " + WITH + arrayMap(x -> concat(demangle(addressToSymbol(x)), ':', addressToLine(x)), trace) AS trace_array, + arrayStringConcat(trace_array, '\n') AS trace_string + SELECT * EXCEPT(trace), trace_string FROM system.trace_log FORMAT TSVWithNamesAndTypes +" | pigz > /test_output/trace-log.tsv.gz & +wait ||: + +mv /var/log/clickhouse-server/stderr.log /test_output/ ||: +if [[ -n "$WITH_COVERAGE" ]] && [[ "$WITH_COVERAGE" -eq 1 ]]; then + tar -chf /test_output/clickhouse_coverage.tar.gz /profraw ||: +fi tar -chf /test_output/text_log_dump.tar /var/lib/clickhouse/data/system/text_log ||: tar -chf /test_output/query_log_dump.tar /var/lib/clickhouse/data/system/query_log ||: +tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||: + +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + pigz < /var/log/clickhouse-server/clickhouse-server1.log > /test_output/clickhouse-server1.log.gz ||: + pigz < /var/log/clickhouse-server/clickhouse-server2.log > /test_output/clickhouse-server2.log.gz ||: + mv /var/log/clickhouse-server/stderr1.log /test_output/ ||: + mv /var/log/clickhouse-server/stderr2.log /test_output/ ||: + tar -chf /test_output/coordination1.tar /var/lib/clickhouse1/coordination ||: + tar -chf /test_output/coordination2.tar /var/lib/clickhouse2/coordination ||: +fi diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index dc1e4db4477..74a88df21e0 100755 --- 
a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -53,10 +53,14 @@ handle SIGBUS stop print handle SIGABRT stop print continue thread apply all backtrace -continue +detach +quit " > script.gdb - gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" & + # FIXME Hung check may work incorrectly because of attached gdb + # 1. False positives are possible + # 2. We cannot attach another gdb to get stacktraces if some queries hung + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" >> /test_output/gdb.log & } configure @@ -78,9 +82,62 @@ clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" clickhouse-client --query "SHOW TABLES FROM test" -./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" && echo "OK" > /test_output/script_exit_code.txt || echo "FAIL" > /test_output/script_exit_code.txt +./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \ + && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \ + || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv stop start -clickhouse-client --query "SELECT 'Server successfuly started'" > /test_output/alive_check.txt || echo 'Server failed to start' > /test_output/alive_check.txt +clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/test_results.tsv \ + || echo -e 'Server failed to start\tFAIL' >> /test_output/test_results.tsv + +[ -f /var/log/clickhouse-server/clickhouse-server.log ] || echo -e "Server log does not exist\tFAIL" +[ -f /var/log/clickhouse-server/stderr.log ] || echo -e "Stderr log does not exist\tFAIL" + +# Print Fatal log messages to stdout +zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log + +# Grep logs for sanitizer asserts, crashes and other critical errors + +# Sanitizer asserts +zgrep -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp +zgrep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp +zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" > /dev/null \ + && echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No sanitizer asserts\tOK' >> /test_output/test_results.tsv +rm -f /test_output/tmp + +# OOM +zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No OOM messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv + +# Logical errors +zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'Logical error thrown (see clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No logical errors\tOK' >> /test_output/test_results.tsv + +# Crash +zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'Not crashed\tOK' >> /test_output/test_results.tsv + +# It also checks for crash without stacktrace (printed by watchdog) +zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo 
-e 'Fatal message in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv + +zgrep -Fa "########################################" /test_output/* > /dev/null \ + && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv + +# Put logs into /test_output/ +pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz +tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||: +mv /var/log/clickhouse-server/stderr.log /test_output/ +tar -chf /test_output/query_log_dump.tar /var/lib/clickhouse/data/system/query_log ||: +tar -chf /test_output/trace_log_dump.tar /var/lib/clickhouse/data/system/trace_log ||: + +# Write check result into check_status.tsv +clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%') LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv +[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 841556cf090..4fbedceb0b8 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- from multiprocessing import cpu_count -from subprocess import Popen, call, STDOUT +from subprocess import Popen, call, check_output, STDOUT import os import sys import shutil @@ -58,6 +58,54 @@ def run_func_test(cmd, output_prefix, num_processes, skip_tests_option, global_t time.sleep(0.5) return pipes +def prepare_for_hung_check(): + # FIXME this function should not exist, but... + + # We attach gdb to clickhouse-server before running tests + # to print stacktraces of all crashes even if clickhouse cannot print it for some reason. + # However, it obstruct checking for hung queries. + logging.info("Will terminate gdb (if any)") + call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT) + + # Some tests set too low memory limit for default user and forget to reset in back. + # It may cause SYSTEM queries to fail, let's disable memory limit. + call("clickhouse client --max_memory_usage_for_user=0 -q 'SELECT 1 FORMAT Null'", shell=True, stderr=STDOUT) + + # Some tests execute SYSTEM STOP MERGES or similar queries. + # It may cause some ALTERs to hang. + # Possibly we should fix tests and forbid to use such queries without specifying table. 
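The comment above refers to the table-scoped form of these statements: SYSTEM STOP/START MERGES (and several of the related START/STOP commands) accept an optional [db.]table argument, so a test that pauses merges only for its own table cannot stall ALTERs on unrelated tables. A small illustration, with a hypothetical table name:

    # scoped: affects only test.hits and is easy to undo in the test's own cleanup
    clickhouse-client -q "SYSTEM STOP MERGES test.hits"
    clickhouse-client -q "SYSTEM START MERGES test.hits"
    # unscoped: applies server-wide, which is what the hung check issues below
    # to undo whatever the tests left behind
    clickhouse-client -q "SYSTEM START MERGES"
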
+ call("clickhouse client -q 'SYSTEM START MERGES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START DISTRIBUTED SENDS'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START TTL MERGES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START MOVES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START FETCHES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START REPLICATED SENDS'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START REPLICATION QUEUES'", shell=True, stderr=STDOUT) + + # Issue #21004, live views are experimental, so let's just suppress it + call("""clickhouse client -q "KILL QUERY WHERE upper(query) LIKE 'WATCH %'" """, shell=True, stderr=STDOUT) + + # Kill other queries which are known to be slow + # It's a query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'insert into tableB select %'" """, shell=True, stderr=STDOUT) + # Long query from 00084_external_agregation + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" """, shell=True, stderr=STDOUT) + + # Wait for last queries to finish if any, not longer than 300 seconds + call("""clickhouse client -q "select sleepEachRow(( + select maxOrDefault(300 - elapsed) + 1 from system.processes where query not like '%from system.processes%' and elapsed < 300 + ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT) + + # Even if all clickhouse-test processes are finished, there are probably some sh scripts, + # which still run some new queries. Let's ignore them. + try: + query = """clickhouse client -q "SELECT count() FROM system.processes where elapsed > 300" """ + output = check_output(query, shell=True, stderr=STDOUT).decode('utf-8').strip() + if int(output) == 0: + return False + except: + pass + return True if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') @@ -88,11 +136,14 @@ if __name__ == "__main__": logging.info("All processes finished") if args.hung_check: + have_long_running_queries = prepare_for_hung_check() logging.info("Checking if some queries hung") cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1") res = call(cmd, shell=True, stderr=STDOUT) - if res != 0: + hung_check_status = "No queries hung\tOK\n" + if res != 0 and have_long_running_queries: logging.info("Hung check failed with exit code {}".format(res)) - sys.exit(1) + hung_check_status = "Hung check failed\tFAIL\n" + open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) logging.info("Stress test finished") diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index e70f9e05679..86595a77a54 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -10,14 +10,6 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ yamllint \ && pip3 install codespell - -# For |& syntax -SHELL ["bash", "-c"] - -CMD cd /ClickHouse/utils/check-style && \ - ./check-style -n |& tee /test_output/style_output.txt && \ - ./check-typos |& tee /test_output/typos_output.txt && \ - ./check-whitespaces -n |& tee /test_output/whitespaces_output.txt && \ - ./check-duplicate-includes.sh |& tee /test_output/duplicate_output.txt && \ - ./shellcheck-run.sh |& tee /test_output/shellcheck_output.txt && \ -
true +COPY run.sh / +COPY process_style_check_result.py / +CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/style/process_style_check_result.py b/docker/test/style/process_style_check_result.py new file mode 100755 index 00000000000..61b1e0f05c5 --- /dev/null +++ b/docker/test/style/process_style_check_result.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv + + +def process_result(result_folder): + status = "success" + description = "" + test_results = [] + + style_log_path = '{}/style_output.txt'.format(result_folder) + if not os.path.exists(style_log_path): + logging.info("No style check log on path %s", style_log_path) + return "exception", "No style check log", [] + elif os.stat(style_log_path).st_size != 0: + description += "Style check failed. " + test_results.append(("Style check", "FAIL")) + status = "failure" # Disabled for now + else: + test_results.append(("Style check", "OK")) + + typos_log_path = '{}/typos_output.txt'.format(result_folder) + if not os.path.exists(typos_log_path): + logging.info("No typos check log on path %s", typos_log_path) + return "exception", "No typos check log", [] + elif os.stat(typos_log_path).st_size != 0: + description += "Typos check failed. " + test_results.append(("Typos check", "FAIL")) + status = "failure" + else: + test_results.append(("Typos check", "OK")) + + whitespaces_log_path = '{}/whitespaces_output.txt'.format(result_folder) + if not os.path.exists(whitespaces_log_path): + logging.info("No whitespaces check log on path %s", whitespaces_log_path) + return "exception", "No whitespaces check log", [] + elif os.stat(whitespaces_log_path).st_size != 0: + description += "Whitespaces check failed. " + test_results.append(("Whitespaces check", "FAIL")) + status = "failure" + else: + test_results.append(("Whitespaces check", "OK")) + + duplicate_log_path = '{}/duplicate_output.txt'.format(result_folder) + if not os.path.exists(duplicate_log_path): + logging.info("No header duplicates check log on path %s", duplicate_log_path) + return "exception", "No header duplicates check log", [] + elif os.stat(duplicate_log_path).st_size != 0: + description += " Header duplicates check failed. " + test_results.append(("Header duplicates check", "FAIL")) + status = "failure" + else: + test_results.append(("Header duplicates check", "OK")) + + shellcheck_log_path = '{}/shellcheck_output.txt'.format(result_folder) + if not os.path.exists(shellcheck_log_path): + logging.info("No shellcheck log on path %s", shellcheck_log_path) + return "exception", "No shellcheck log", [] + elif os.stat(shellcheck_log_path).st_size != 0: + description += " Shellcheck check failed.
" + test_results.append(("Shellcheck ", "FAIL")) + status = "failure" + else: + test_results.append(("Shellcheck", "OK")) + + if not description: + description += "Style check success" + + return status, description, test_results + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of style check") + parser.add_argument("--in-results-dir", default='/test_output/') + parser.add_argument("--out-results-file", default='/test_output/test_results.tsv') + parser.add_argument("--out-status-file", default='/test_output/check_status.tsv') + args = parser.parse_args() + + state, description, test_results = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") diff --git a/docker/test/style/run.sh b/docker/test/style/run.sh new file mode 100755 index 00000000000..424bfe71b15 --- /dev/null +++ b/docker/test/style/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +cd /ClickHouse/utils/check-style || echo -e "failure\tRepo not found" > /test_output/check_status.tsv +./check-style -n |& tee /test_output/style_output.txt +./check-typos |& tee /test_output/typos_output.txt +./check-whitespaces -n |& tee /test_output/whitespaces_output.txt +./check-duplicate-includes.sh |& tee /test_output/duplicate_output.txt +./shellcheck-run.sh |& tee /test_output/shellcheck_output.txt +/process_style_check_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv diff --git a/docker/test/testflows/runner/Dockerfile b/docker/test/testflows/runner/Dockerfile index 4139fb9e044..bd7eee4c166 100644 --- a/docker/test/testflows/runner/Dockerfile +++ b/docker/test/testflows/runner/Dockerfile @@ -35,7 +35,7 @@ RUN apt-get update \ ENV TZ=Europe/Moscow RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN pip3 install urllib3 testflows==1.6.72 docker-compose docker dicttoxml kazoo tzlocal +RUN pip3 install urllib3 testflows==1.6.74 docker-compose docker dicttoxml kazoo tzlocal ENV DOCKER_CHANNEL stable ENV DOCKER_VERSION 17.09.1-ce @@ -61,6 +61,7 @@ RUN set -eux; \ COPY modprobe.sh /usr/local/bin/modprobe COPY dockerd-entrypoint.sh /usr/local/bin/ +COPY process_testflows_result.py /usr/local/bin/ RUN set -x \ && addgroup --system dockremap \ @@ -72,5 +73,5 @@ RUN set -x \ VOLUME /var/lib/docker EXPOSE 2375 ENTRYPOINT ["dockerd-entrypoint.sh"] -CMD ["sh", "-c", "python3 regression.py --no-color -o classic --local --clickhouse-binary-path ${CLICKHOUSE_TESTS_SERVER_BIN_PATH} --log test.log ${TESTFLOWS_OPTS}; cat test.log | tfs report results --format json > results.json"] +CMD ["sh", "-c", "python3 regression.py --no-color -o classic --local --clickhouse-binary-path ${CLICKHOUSE_TESTS_SERVER_BIN_PATH} --log test.log ${TESTFLOWS_OPTS}; cat test.log | tfs report results --format json > results.json; /usr/local/bin/process_testflows_result.py || echo -e 'failure\tCannot parse results' > check_status.tsv"] diff --git a/docker/test/testflows/runner/dockerd-entrypoint.sh b/docker/test/testflows/runner/dockerd-entrypoint.sh index 
1bac94a9df2..01593488648 100755 --- a/docker/test/testflows/runner/dockerd-entrypoint.sh +++ b/docker/test/testflows/runner/dockerd-entrypoint.sh @@ -16,6 +16,14 @@ while true; do done set -e +echo "Configure to use Yandex dockerhub-proxy" +cat > /etc/docker/daemon.json << EOF +{ + "insecure-registries": ["dockerhub-proxy.sas.yp-c.yandex.net:5000"], + "registry-mirrors": ["dockerhub-proxy.sas.yp-c.yandex.net:5000"] +} +EOF + echo "Start tests" export CLICKHOUSE_TESTS_SERVER_BIN_PATH=/clickhouse export CLICKHOUSE_TESTS_CLIENT_BIN_PATH=/clickhouse diff --git a/docker/test/testflows/runner/process_testflows_result.py b/docker/test/testflows/runner/process_testflows_result.py new file mode 100755 index 00000000000..37d0b6a69d1 --- /dev/null +++ b/docker/test/testflows/runner/process_testflows_result.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv +import json + + +def process_result(result_folder): + json_path = os.path.join(result_folder, "results.json") + if not os.path.exists(json_path): + return "success", "No testflows in branch", None, [] + + test_binary_log = os.path.join(result_folder, "test.log") + with open(json_path) as source: + results = json.loads(source.read()) + + total_tests = 0 + total_ok = 0 + total_fail = 0 + total_other = 0 + test_results = [] + for test in results["tests"]: + test_name = test['test']['test_name'] + test_result = test['result']['result_type'].upper() + test_time = str(test['result']['message_rtime']) + total_tests += 1 + if test_result == "OK": + total_ok += 1 + elif test_result == "FAIL" or test_result == "ERROR": + total_fail += 1 + else: + total_other += 1 + + test_results.append((test_name, test_result, test_time)) + if total_fail != 0: + status = "failure" + else: + status = "success" + + description = "failed: {}, passed: {}, other: {}".format(total_fail, total_ok, total_other) + return status, description, test_results, [json_path, test_binary_log] + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of Testflows tests") + parser.add_argument("--in-results-dir", default='./') + parser.add_argument("--out-results-file", default='./test_results.tsv') + parser.add_argument("--out-status-file", default='./check_status.tsv') + args = parser.parse_args() + + state, description, test_results, logs = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") + diff --git a/docker/test/unit/Dockerfile b/docker/test/unit/Dockerfile index f01ed613918..e2f4a691939 100644 --- a/docker/test/unit/Dockerfile +++ b/docker/test/unit/Dockerfile @@ -5,6 +5,6 @@ ENV TZ=Europe/Moscow RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt-get install gdb -CMD service zookeeper start && sleep 7 && /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 -create create /clickhouse_test ''; \ - gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms 
| tee test_output/test_result.txt - +COPY run.sh / +COPY process_unit_tests_result.py / +CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/unit/process_unit_tests_result.py b/docker/test/unit/process_unit_tests_result.py new file mode 100755 index 00000000000..7219aa13b82 --- /dev/null +++ b/docker/test/unit/process_unit_tests_result.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +import os +import logging +import argparse +import csv + +OK_SIGN = 'OK ]' +FAILED_SIGN = 'FAILED ]' +SEGFAULT = 'Segmentation fault' +SIGNAL = 'received signal SIG' +PASSED = 'PASSED' + +def get_test_name(line): + elements = reversed(line.split(' ')) + for element in elements: + if '(' not in element and ')' not in element: + return element + raise Exception("No test name in line '{}'".format(line)) + +def process_result(result_folder): + summary = [] + total_counter = 0 + failed_counter = 0 + result_log_path = '{}/test_result.txt'.format(result_folder) + if not os.path.exists(result_log_path): + logging.info("No output log on path %s", result_log_path) + return "exception", "No output log", [] + + status = "success" + description = "" + passed = False + with open(result_log_path, 'r') as test_result: + for line in test_result: + if OK_SIGN in line: + logging.info("Found ok line: '%s'", line) + test_name = get_test_name(line.strip()) + logging.info("Test name: '%s'", test_name) + summary.append((test_name, "OK")) + total_counter += 1 + elif FAILED_SIGN in line and 'listed below' not in line and 'ms)' in line: + logging.info("Found fail line: '%s'", line) + test_name = get_test_name(line.strip()) + logging.info("Test name: '%s'", test_name) + summary.append((test_name, "FAIL")) + total_counter += 1 + failed_counter += 1 + elif SEGFAULT in line: + logging.info("Found segfault line: '%s'", line) + status = "failure" + description += "Segmentation fault. " + break + elif SIGNAL in line: + logging.info("Received signal line: '%s'", line) + status = "failure" + description += "Exit on signal. " + break + elif PASSED in line: + logging.info("PASSED record found: '%s'", line) + passed = True + + if not passed: + status = "failure" + description += "PASSED record not found. 
" + + if failed_counter != 0: + status = "failure" + + if not description: + description += "fail: {}, passed: {}".format(failed_counter, total_counter - failed_counter) + + return status, description, summary + + +def write_results(results_file, status_file, results, status): + with open(results_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerows(results) + with open(status_file, 'w') as f: + out = csv.writer(f, delimiter='\t') + out.writerow(status) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + parser = argparse.ArgumentParser(description="ClickHouse script for parsing results of unit tests") + parser.add_argument("--in-results-dir", default='/test_output/') + parser.add_argument("--out-results-file", default='/test_output/test_results.tsv') + parser.add_argument("--out-status-file", default='/test_output/check_status.tsv') + args = parser.parse_args() + + state, description, test_results = process_result(args.in_results_dir) + logging.info("Result parsed") + status = (state, description) + write_results(args.out_results_file, args.out_status_file, test_results, status) + logging.info("Result written") + diff --git a/docker/test/unit/run.sh b/docker/test/unit/run.sh new file mode 100644 index 00000000000..abc35fa40d2 --- /dev/null +++ b/docker/test/unit/run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -x + +service zookeeper start && sleep 7 && /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 -create create /clickhouse_test ''; +gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms | tee test_output/test_result.txt +./process_unit_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000000..378eac25d31 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +build diff --git a/docs/README.md b/docs/README.md index 8b3066501bf..a4df023a6ad 100644 --- a/docs/README.md +++ b/docs/README.md @@ -126,7 +126,13 @@ Contribute all new information in English language. Other languages are translat ### Adding a New File -When adding a new file: +When you add a new file, it should end with a link like: + +`[Original article](https://clickhouse.tech/docs/) ` + +and there should be **a new empty line** after it. + +{## When adding a new file: - Make symbolic links for all other languages. You can use the following commands: @@ -134,7 +140,7 @@ When adding a new file: $ cd /ClickHouse/clone/directory/docs $ ln -sr en/new/file.md lang/new/file.md ``` - +##} ### Adding a New Language @@ -195,8 +201,11 @@ Templates: - [Function](_description_templates/template-function.md) - [Setting](_description_templates/template-setting.md) +- [Server Setting](_description_templates/template-server-setting.md) - [Database or Table engine](_description_templates/template-engine.md) - [System table](_description_templates/template-system-table.md) +- [Data type](_description_templates/data-type.md) +- [Statement](_description_templates/statement.md) diff --git a/docs/_description_templates/template-data-type.md b/docs/_description_templates/template-data-type.md index edb6586ee7d..5e560b9325d 100644 --- a/docs/_description_templates/template-data-type.md +++ b/docs/_description_templates/template-data-type.md @@ -26,4 +26,4 @@ The name of an additional section can be any, for example, **Usage**. 
- [link](#) -[Original article](https://clickhouse.tech/docs/en/data_types//) +[Original article](https://clickhouse.tech/docs/en/data-types//) diff --git a/docs/_description_templates/template-engine.md b/docs/_description_templates/template-engine.md index 35181881134..490f490fc4e 100644 --- a/docs/_description_templates/template-engine.md +++ b/docs/_description_templates/template-engine.md @@ -58,6 +58,6 @@ Result: Follow up with any text to clarify the example. -## See Also {#see-also} +**See Also** - [link](#) diff --git a/docs/_description_templates/template-function.md b/docs/_description_templates/template-function.md index a0074a76ef6..3d4d921898a 100644 --- a/docs/_description_templates/template-function.md +++ b/docs/_description_templates/template-function.md @@ -14,12 +14,12 @@ More text (Optional). **Arguments** (Optional) -- `x` — Description. [Type name](relative/path/to/type/dscr.md#type). -- `y` — Description. [Type name](relative/path/to/type/dscr.md#type). +- `x` — Description. Optional (only for optional arguments). Possible values: . Default value: . [Type name](relative/path/to/type/dscr.md#type). +- `y` — Description. Optional (only for optional arguments). Possible values: .Default value: . [Type name](relative/path/to/type/dscr.md#type). **Parameters** (Optional, only for parametric aggregate functions) -- `z` — Description. [Type name](relative/path/to/type/dscr.md#type). +- `z` — Description. Optional (only for optional parameters). Possible values: . Default value: . [Type name](relative/path/to/type/dscr.md#type). **Returned value(s)** diff --git a/docs/_description_templates/template-server-setting.md b/docs/_description_templates/template-server-setting.md index 36a2bcacfba..0b37d46cf41 100644 --- a/docs/_description_templates/template-server-setting.md +++ b/docs/_description_templates/template-server-setting.md @@ -8,14 +8,14 @@ Possible value: ... Default value: ... -Settings: (Optional) +**Settings** (Optional) If the section contains several settings, list them here. Specify possible values and default values: - setting_1 — Description. - setting_2 — Description. -**Example:** +**Example** ```xml diff --git a/docs/_description_templates/template-statement.md b/docs/_description_templates/template-statement.md index 62ea51edf83..bca015a2ac6 100644 --- a/docs/_description_templates/template-statement.md +++ b/docs/_description_templates/template-statement.md @@ -1,14 +1,14 @@ -# Statement name (for example, SHOW USER) +# Statement name (for example, SHOW USER) {#statement-name-in-lower-case} Brief description of what the statement does. -Syntax: +**Syntax** ```sql Syntax of the statement. ``` -## Other necessary sections of the description (Optional) +## Other necessary sections of the description (Optional) {#anchor} Examples of descriptions with a complicated structure: @@ -17,7 +17,7 @@ Examples of descriptions with a complicated structure: - https://clickhouse.tech/docs/en/sql-reference/statements/select/join/ -## See Also (Optional) +**See Also** (Optional) Links to related topics as a list. diff --git a/docs/en/commercial/cloud.md b/docs/en/commercial/cloud.md index 0490881c622..953a0ab5748 100644 --- a/docs/en/commercial/cloud.md +++ b/docs/en/commercial/cloud.md @@ -29,6 +29,18 @@ toc_title: Cloud - Cross-AZ scaling for performance and high availability - Built-in monitoring and SQL query editor +## Alibaba Cloud {#alibaba-cloud} + +Alibaba Cloud Managed Service for ClickHouse. 
[China Site](https://www.aliyun.com/product/clickhouse) (will be available at the international site in May 2021). Provides the following key features: + +- Highly reliable cloud disk storage engine based on [Alibaba Cloud Apsara](https://www.alibabacloud.com/product/apsara-stack) distributed system +- Expand capacity on-demand without manual data migration +- Support single-node, single-replica, multi-node, and multi-replica architectures, and support hot and cold data tiering +- Support access allow-list, one-key recovery, multi-layer network security protection, cloud disk encryption +- Seamless integration with cloud log systems, databases, and data application tools +- Built-in monitoring and database management platform +- Professional database expert technical support and service + ## Tencent Cloud {#tencent-cloud} [Tencent Managed Service for ClickHouse](https://cloud.tencent.com/product/cdwch) provides the following key features: diff --git a/docs/en/commercial/support.md b/docs/en/commercial/support.md index 37bc54e3e8b..1a3d1b71869 100644 --- a/docs/en/commercial/support.md +++ b/docs/en/commercial/support.md @@ -7,6 +7,10 @@ toc_title: Support !!! info "Info" If you have launched a ClickHouse commercial support service, feel free to [open a pull-request](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/support.md) adding it to the following list. + +## Yandex.Cloud + +ClickHouse worldwide support from the authors of ClickHouse. Supports on-premise and cloud deployments. For details, contact clickhouse-support@yandex-team.com. ## Altinity {#altinity} diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index e0b1be710f1..24ecbdc1c2c 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -5,44 +5,80 @@ toc_title: Build on Mac OS X # How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x} -Build should work on Mac OS X 10.15 (Catalina). +Build should work on x86_64 (Intel) and arm64 (Apple Silicon) based macOS 10.15 (Catalina) and higher with recent Xcode's native AppleClang, or Homebrew's vanilla Clang or GCC compilers. ## Install Homebrew {#install-homebrew} ``` bash -$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +# ...and follow the printed instructions on any additional steps required to complete the installation. ``` +## Install Xcode and Command Line Tools {#install-xcode-and-command-line-tools} + +Install the latest [Xcode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store. + +Open it at least once to accept the end-user license agreement and automatically install the required components. + +Then, make sure that the latest Command Line Tools are installed and selected in the system: + +``` bash +sudo rm -rf /Library/Developer/CommandLineTools +sudo xcode-select --install +``` + +Reboot.
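Before moving on, it can be worth double-checking that the toolchain is actually selected, since a stale or missing Command Line Tools installation is a common cause of later CMake failures. The snippet below is an optional, editor-added sanity check rather than part of the official instructions; it only assumes the stock `xcode-select` and `clang` binaries are on `PATH`:

``` bash
# Optional sanity check: confirm which developer toolchain is currently selected.
xcode-select -p     # expected: /Library/Developer/CommandLineTools or .../Xcode.app/Contents/Developer
clang --version     # should report Apple clang; if it errors, re-run `sudo xcode-select --install`
```

If either command fails after the reinstall above, reboot and repeat the check before configuring the build.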
+ ## Install Required Compilers, Tools, and Libraries {#install-required-compilers-tools-and-libraries} ``` bash -$ brew install cmake ninja libtool gettext llvm +brew update +brew install cmake ninja libtool gettext llvm gcc ``` ## Checkout ClickHouse Sources {#checkout-clickhouse-sources} ``` bash -$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git -``` - -or - -``` bash -$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git - -$ cd ClickHouse +git clone --recursive git@github.com:ClickHouse/ClickHouse.git +# ...alternatively, you can use https://github.com/ClickHouse/ClickHouse.git as the repo URL. ``` ## Build ClickHouse {#build-clickhouse} -> Please note: ClickHouse doesn't support build with native Apple Clang compiler, we need use clang from LLVM. +To build using Xcode's native AppleClang compiler: ``` bash -$ mkdir build -$ cd build -$ cmake .. -DCMAKE_C_COMPILER=`brew --prefix llvm`/bin/clang -DCMAKE_CXX_COMPILER=`brew --prefix llvm`/bin/clang++ -DCMAKE_PREFIX_PATH=`brew --prefix llvm` -$ ninja -$ cd .. +cd ClickHouse +rm -rf build +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --config RelWithDebInfo +cd .. +``` + +To build using Homebrew's vanilla Clang compiler: + +``` bash +cd ClickHouse +rm -rf build +mkdir build +cd build +cmake -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang -DCMAKE_CXX_COMPILER=$(brew --prefix llvm)/bin/clang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --config RelWithDebInfo +cd .. +``` + +To build using Homebrew's vanilla GCC compiler: + +``` bash +cd ClickHouse +rm -rf build +mkdir build +cd build +cmake -DCMAKE_C_COMPILER=$(brew --prefix gcc)/bin/gcc-10 -DCMAKE_CXX_COMPILER=$(brew --prefix gcc)/bin/g++-10 -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --config RelWithDebInfo +cd .. ``` ## Caveats {#caveats} @@ -81,7 +117,7 @@ To do so, create the `/Library/LaunchDaemons/limit.maxfiles.plist` file with the Execute the following command: ``` bash -$ sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist +sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist ``` Reboot. diff --git a/docs/en/development/build.md b/docs/en/development/build.md index f98329e748f..3181f26800d 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -170,7 +170,7 @@ $ ./release Normally all tools of the ClickHouse bundle, such as `clickhouse-server`, `clickhouse-client` etc., are linked into a single static executable, `clickhouse`. This executable must be re-linked on every change, which might be slow. Two common ways to improve linking time are to use `lld` linker, and use the 'split' build configuration, which builds a separate binary for every tool, and further splits the code into serveral shared libraries. 
To enable these tweaks, pass the following flags to `cmake`: ``` --DCMAKE_C_FLAGS="-fuse-ld=lld" -DCMAKE_CXX_FLAGS="-fuse-ld=lld" -DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1 +-DCMAKE_C_FLAGS="--ld-path=lld" -DCMAKE_CXX_FLAGS="--ld-path=lld" -DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1 ``` ## You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse} diff --git a/docs/en/development/cmake-in-clickhouse.md b/docs/en/development/cmake-in-clickhouse.md new file mode 100644 index 00000000000..6e6ac825587 --- /dev/null +++ b/docs/en/development/cmake-in-clickhouse.md @@ -0,0 +1,284 @@ +# CMake in ClickHouse + +## TL; DR How to make ClickHouse compile and link faster? + +Developer only! This command will likely fulfill most of your needs. Run before calling `ninja`. + +```cmake +cmake .. \ + -DCMAKE_C_COMPILER=/bin/clang-10 \ + -DCMAKE_CXX_COMPILER=/bin/clang++-10 \ + -DCMAKE_BUILD_TYPE=Debug \ + -DENABLE_CLICKHOUSE_ALL=OFF \ + -DENABLE_CLICKHOUSE_SERVER=ON \ + -DENABLE_CLICKHOUSE_CLIENT=ON \ + -DUSE_STATIC_LIBRARIES=OFF \ + -DSPLIT_SHARED_LIBRARIES=ON \ + -DENABLE_LIBRARIES=OFF \ + -DUSE_UNWIND=ON \ + -DENABLE_UTILS=OFF \ + -DENABLE_TESTS=OFF +``` + +## CMake files types + +1. ClickHouse's source CMake files (located in the root directory and in `/src`). +2. Arch-dependent CMake files (located in `/cmake/*os_name*`). +3. Libraries finders (search for contrib libraries, located in `/cmake/find`). +3. Contrib build CMake files (used instead of libraries' own CMake files, located in `/cmake/modules`) + +## List of CMake flags + +* This list is auto-generated by [this Python script](https://github.com/clickhouse/clickhouse/blob/master/docs/tools/cmake_in_clickhouse_generator.py). +* The flag name is a link to its position in the code. +* If an option's default value is itself an option, it's also a link to its position in this list. +### ClickHouse modes + +| Name | Default value | Description | Comment | +|------|---------------|-------------|---------| +| [`ENABLE_CLICKHOUSE_ALL`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L8) | `ON` | Enable all ClickHouse modes by default | The `clickhouse` binary is a multi purpose tool that contains multiple execution modes (client, server, etc.), each of them may be built and linked as a separate library. If you do not know what modes you need, turn this option OFF and enable SERVER and CLIENT only. 
| +| [`ENABLE_CLICKHOUSE_BENCHMARK`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L18) | `ENABLE_CLICKHOUSE_ALL` | Queries benchmarking mode | https://clickhouse.tech/docs/en/operations/utilities/clickhouse-benchmark/ | +| [`ENABLE_CLICKHOUSE_CLIENT`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L11) | `ENABLE_CLICKHOUSE_ALL` | Client mode (interactive tui/shell that connects to the server) | | +| [`ENABLE_CLICKHOUSE_COMPRESSOR`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L23) | `ENABLE_CLICKHOUSE_ALL` | Data compressor and decompressor | https://clickhouse.tech/docs/en/operations/utilities/clickhouse-compressor/ | +| [`ENABLE_CLICKHOUSE_COPIER`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L26) | `ENABLE_CLICKHOUSE_ALL` | Inter-cluster data copying mode | https://clickhouse.tech/docs/en/operations/utilities/clickhouse-copier/ | +| [`ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L20) | `ENABLE_CLICKHOUSE_ALL` | Configs processor (extract values etc.) | | +| [`ENABLE_CLICKHOUSE_FORMAT`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L28) | `ENABLE_CLICKHOUSE_ALL` | Queries pretty-printer and formatter with syntax highlighting | | +| [`ENABLE_CLICKHOUSE_GIT_IMPORT`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L40) | `ENABLE_CLICKHOUSE_ALL` | A tool to analyze Git repositories | https://presentations.clickhouse.tech/matemarketing_2020/ | +| [`ENABLE_CLICKHOUSE_INSTALL`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L44) | `OFF` | Install ClickHouse without .deb/.rpm/.tgz packages (having the binary only) | | +| [`ENABLE_CLICKHOUSE_LOCAL`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L15) | `ENABLE_CLICKHOUSE_ALL` | Local files fast processing mode | https://clickhouse.tech/docs/en/operations/utilities/clickhouse-local/ | +| [`ENABLE_CLICKHOUSE_OBFUSCATOR`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L32) | `ENABLE_CLICKHOUSE_ALL` | Table data obfuscator (convert real data to benchmark-ready one) | https://clickhouse.tech/docs/en/operations/utilities/clickhouse-obfuscator/ | +| [`ENABLE_CLICKHOUSE_ODBC_BRIDGE`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L36) | `ENABLE_CLICKHOUSE_ALL` | HTTP-server working like a proxy to ODBC driver | https://clickhouse.tech/docs/en/operations/utilities/odbc-bridge/ | +| [`ENABLE_CLICKHOUSE_SERVER`](https://github.com/clickhouse/clickhouse/blob/master/programs/CMakeLists.txt#L10) | `ENABLE_CLICKHOUSE_ALL` | Server mode (main mode) | | + +### External libraries +Note that ClickHouse uses forks of these libraries, see https://github.com/ClickHouse-Extras. 
+ +| Name | Default value | Description | Comment | +|------|---------------|-------------|---------| +| [`ENABLE_AMQPCPP`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/amqpcpp.cmake#L1) | `ENABLE_LIBRARIES` | Enalbe AMQP-CPP | | +| [`ENABLE_AVRO`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/avro.cmake#L2) | `ENABLE_LIBRARIES` | Enable Avro | Needed when using Apache Avro serialization format | +| [`ENABLE_BASE64`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/base64.cmake#L1) | `ENABLE_LIBRARIES` | Enable base64 | | +| [`ENABLE_BROTLI`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/brotli.cmake#L1) | `ENABLE_LIBRARIES` | Enable brotli | | +| [`ENABLE_CAPNP`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/capnp.cmake#L1) | `ENABLE_LIBRARIES` | Enable Cap'n Proto | | +| [`ENABLE_CASSANDRA`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/cassandra.cmake#L1) | `ENABLE_LIBRARIES` | Enable Cassandra | | +| [`ENABLE_CCACHE`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/ccache.cmake#L22) | `ENABLE_CCACHE_BY_DEFAULT` | Speedup re-compilations using ccache (external tool) | https://ccache.dev/ | +| [`ENABLE_CLANG_TIDY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/analysis.cmake#L2) | `OFF` | Use clang-tidy static analyzer | https://clang.llvm.org/extra/clang-tidy/ | +| [`ENABLE_CURL`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/curl.cmake#L1) | `ENABLE_LIBRARIES` | Enable curl | | +| [`ENABLE_EMBEDDED_COMPILER`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/llvm.cmake#L5) | `ENABLE_LIBRARIES` | Set to TRUE to enable support for 'compile_expressions' option for query execution | | +| [`ENABLE_FASTOPS`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/fastops.cmake#L2) | `ENABLE_LIBRARIES` | Enable fast vectorized mathematical functions library by Mikhail Parakhin | | +| [`ENABLE_GPERF`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/gperf.cmake#L5) | `ENABLE_LIBRARIES` | Use gperf function hash generator tool | | +| [`ENABLE_GRPC`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/grpc.cmake#L8) | `ENABLE_GRPC_DEFAULT` | Use gRPC | | +| [`ENABLE_GSASL_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/libgsasl.cmake#L1) | `ENABLE_LIBRARIES` | Enable gsasl library | | +| [`ENABLE_H3`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/h3.cmake#L1) | `ENABLE_LIBRARIES` | Enable H3 | | +| [`ENABLE_HDFS`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/hdfs3.cmake#L2) | `ENABLE_LIBRARIES` | Enable HDFS | | +| [`ENABLE_ICU`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/icu.cmake#L2) | `ENABLE_LIBRARIES` | Enable ICU | | +| [`ENABLE_LDAP`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/ldap.cmake#L5) | `ENABLE_LIBRARIES` | Enable LDAP | | +| [`ENABLE_LIBPQXX`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/libpqxx.cmake#L1) | `ENABLE_LIBRARIES` | Enalbe libpqxx | | +| [`ENABLE_MSGPACK`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/msgpack.cmake#L1) | `ENABLE_LIBRARIES` | Enable msgpack library | | +| [`ENABLE_MYSQL`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/mysqlclient.cmake#L2) | `ENABLE_LIBRARIES` | Enable MySQL | | +| [`ENABLE_NURAFT`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/nuraft.cmake#L1) | `ENABLE_LIBRARIES` 
| Enable NuRaft | | +| [`ENABLE_ODBC`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/odbc.cmake#L1) | `ENABLE_LIBRARIES` | Enable ODBC library | | +| [`ENABLE_ORC`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/orc.cmake#L1) | `ENABLE_LIBRARIES` | Enable ORC | | +| [`ENABLE_PARQUET`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/parquet.cmake#L2) | `ENABLE_LIBRARIES` | Enable parquet | | +| [`ENABLE_PROTOBUF`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/protobuf.cmake#L1) | `ENABLE_LIBRARIES` | Enable protobuf | | +| [`ENABLE_RAPIDJSON`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rapidjson.cmake#L1) | `ENABLE_LIBRARIES` | Use rapidjson | | +| [`ENABLE_RDKAFKA`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rdkafka.cmake#L1) | `ENABLE_LIBRARIES` | Enable kafka | | +| [`ENABLE_ROCKSDB`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rocksdb.cmake#L1) | `ENABLE_LIBRARIES` | Enable ROCKSDB | | +| [`ENABLE_S3`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/s3.cmake#L2) | `ENABLE_LIBRARIES` | Enable S3 | | +| [`ENABLE_SSL`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/ssl.cmake#L3) | `ENABLE_LIBRARIES` | Enable ssl | Needed when securely connecting to an external server, e.g. clickhouse-client --host ... --secure | +| [`ENABLE_STATS`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/stats.cmake#L1) | `ENABLE_LIBRARIES` | Enalbe StatsLib library | | + + +### External libraries system/bundled mode + +| Name | Default value | Description | Comment | +|------|---------------|-------------|---------| +| [`USE_INTERNAL_AVRO_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/avro.cmake#L11) | `ON` | Set to FALSE to use system avro library instead of bundled | | +| [`USE_INTERNAL_AWS_S3_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/s3.cmake#L14) | `ON` | Set to FALSE to use system S3 instead of bundled (experimental set to OFF on your own risk) | | +| [`USE_INTERNAL_BROTLI_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/brotli.cmake#L12) | `USE_STATIC_LIBRARIES` | Set to FALSE to use system libbrotli library instead of bundled | Many system ship only dynamic brotly libraries, so we back off to bundled by default | +| [`USE_INTERNAL_CAPNP_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/capnp.cmake#L10) | `NOT_UNBUNDLED` | Set to FALSE to use system capnproto library instead of bundled | | +| [`USE_INTERNAL_CURL`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/curl.cmake#L10) | `NOT_UNBUNDLED` | Use internal curl library | | +| [`USE_INTERNAL_GRPC_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/grpc.cmake#L25) | `NOT_UNBUNDLED` | Set to FALSE to use system gRPC library instead of bundled. (Experimental. Set to OFF on your own risk) | Normally we use the internal gRPC framework. You can set USE_INTERNAL_GRPC_LIBRARY to OFF to force using the external gRPC framework, which should be installed in the system in this case. 
The external gRPC framework can be installed in the system by running sudo apt-get install libgrpc++-dev protobuf-compiler-grpc | +| [`USE_INTERNAL_GTEST_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/gtest.cmake#L3) | `NOT_UNBUNDLED` | Set to FALSE to use system Google Test instead of bundled | | +| [`USE_INTERNAL_H3_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/h3.cmake#L9) | `ON` | Set to FALSE to use system h3 library instead of bundled | | +| [`USE_INTERNAL_HDFS3_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/hdfs3.cmake#L14) | `ON` | Set to FALSE to use system HDFS3 instead of bundled (experimental - set to OFF on your own risk) | | +| [`USE_INTERNAL_ICU_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/icu.cmake#L15) | `NOT_UNBUNDLED` | Set to FALSE to use system ICU library instead of bundled | | +| [`USE_INTERNAL_LDAP_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/ldap.cmake#L14) | `NOT_UNBUNDLED` | Set to FALSE to use system *LDAP library instead of bundled | | +| [`USE_INTERNAL_LIBCXX_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/cxx.cmake#L15) | `USE_INTERNAL_LIBCXX_LIBRARY_DEFAULT` | Disable to use system libcxx and libcxxabi libraries instead of bundled | | +| [`USE_INTERNAL_LIBGSASL_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/libgsasl.cmake#L12) | `USE_STATIC_LIBRARIES` | Set to FALSE to use system libgsasl library instead of bundled | when USE_STATIC_LIBRARIES we usually need to pick up hell a lot of dependencies for libgsasl | +| [`USE_INTERNAL_LIBXML2_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/libxml2.cmake#L1) | `NOT_UNBUNDLED` | Set to FALSE to use system libxml2 library instead of bundled | | +| [`USE_INTERNAL_LLVM_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/llvm.cmake#L8) | `NOT_UNBUNDLED` | Use bundled or system LLVM library. | | +| [`USE_INTERNAL_MSGPACK_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/msgpack.cmake#L10) | `NOT_UNBUNDLED` | Set to FALSE to use system msgpack library instead of bundled | | +| [`USE_INTERNAL_MYSQL_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/mysqlclient.cmake#L15) | `NOT_UNBUNDLED` | Set to FALSE to use system mysqlclient library instead of bundled | | +| [`USE_INTERNAL_ODBC_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/odbc.cmake#L22) | `NOT_UNBUNDLED` | Use internal ODBC library | | +| [`USE_INTERNAL_ORC_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/orc.cmake#L11) | `ON` | Set to FALSE to use system ORC instead of bundled (experimental set to OFF on your own risk) | | +| [`USE_INTERNAL_PARQUET_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/parquet.cmake#L16) | `NOT_UNBUNDLED` | Set to FALSE to use system parquet library instead of bundled | | +| [`USE_INTERNAL_POCO_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/poco.cmake#L1) | `ON` | Use internal Poco library | | +| [`USE_INTERNAL_PROTOBUF_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/protobuf.cmake#L14) | `NOT_UNBUNDLED` | Set to FALSE to use system protobuf instead of bundled. (Experimental. Set to OFF on your own risk) | Normally we use the internal protobuf library. 
You can set USE_INTERNAL_PROTOBUF_LIBRARY to OFF to force using the external protobuf library, which should be installed in the system in this case. The external protobuf library can be installed in the system by running sudo apt-get install libprotobuf-dev protobuf-compiler libprotoc-dev | +| [`USE_INTERNAL_RAPIDJSON_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rapidjson.cmake#L9) | `NOT_UNBUNDLED` | Set to FALSE to use system rapidjson library instead of bundled | | +| [`USE_INTERNAL_RDKAFKA_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rdkafka.cmake#L10) | `NOT_UNBUNDLED` | Set to FALSE to use system librdkafka instead of the bundled | | +| [`USE_INTERNAL_RE2_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/re2.cmake#L1) | `NOT_UNBUNDLED` | Set to FALSE to use system re2 library instead of bundled [slower] | | +| [`USE_INTERNAL_ROCKSDB_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/rocksdb.cmake#L10) | `NOT_UNBUNDLED` | Set to FALSE to use system ROCKSDB library instead of bundled | | +| [`USE_INTERNAL_SNAPPY_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/snappy.cmake#L10) | `NOT_UNBUNDLED` | Set to FALSE to use system snappy library instead of bundled | | +| [`USE_INTERNAL_SPARSEHASH_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/sparsehash.cmake#L1) | `ON` | Set to FALSE to use system sparsehash library instead of bundled | | +| [`USE_INTERNAL_SSL_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/ssl.cmake#L12) | `NOT_UNBUNDLED` | Set to FALSE to use system *ssl library instead of bundled | | +| [`USE_INTERNAL_ZLIB_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/zlib.cmake#L1) | `NOT_UNBUNDLED` | Set to FALSE to use system zlib library instead of bundled | | +| [`USE_INTERNAL_ZSTD_LIBRARY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/zstd.cmake#L1) | `NOT_UNBUNDLED` | Set to FALSE to use system zstd library instead of bundled | | + + +### Other flags + +| Name | Default value | Description | Comment | +|------|---------------|-------------|---------| +| [`ADD_GDB_INDEX_FOR_GOLD`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L195) | `OFF` | Add .gdb-index to resulting binaries for gold linker. | Ignored if `lld` is used | +| [`ARCH_NATIVE`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L248) | `OFF` | Add -march=native compiler flag | | +| [`CLICKHOUSE_SPLIT_BINARY`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L98) | `OFF` | Make several binaries (clickhouse-server, clickhouse-client etc.) instead of one bundled | | +| [`COMPILER_PIPE`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L235) | `ON` | -pipe compiler option | Less `/tmp` usage, more RAM usage. | +| [`ENABLE_CHECK_HEAVY_BUILDS`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L69) | `OFF` | Don't allow C++ translation units to compile too long or to take too much memory while compiling | | +| [`ENABLE_FUZZING`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L115) | `OFF` | Fuzzy testing using libfuzzer | Implies `WITH_COVERAGE` | +| [`ENABLE_LIBRARIES`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L357) | `ON` | Enable all external libraries by default | Turns on all external libs like s3, kafka, ODBC, ... 
| +| [`ENABLE_MULTITARGET_CODE`](https://github.com/clickhouse/clickhouse/blob/master/src/Functions/CMakeLists.txt#L100) | `ON` | Enable platform-dependent code | ClickHouse developers may use platform-dependent code under some macro (e.g. `ifdef ENABLE_MULTITARGET`). If turned ON, this option defines such macro. See `src/Functions/TargetSpecific.h` | +| [`ENABLE_TESTS`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L154) | `ON` | Provide unit_test_dbms target with Google.Test unit tests | If turned `ON`, assumes the user has either the system GTest library or the bundled one. | +| [`ENABLE_THINLTO`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L313) | `ON` | Clang-specific link time optimization | https://clang.llvm.org/docs/ThinLTO.html Applies to clang only. Disabled when building with tests or sanitizers. | +| [`FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L32) | `ON` | Stop/Fail CMake configuration if some ENABLE_XXX option is defined (either ON or OFF) but is not possible to satisfy | If turned off: e.g. when ENABLE_FOO is ON, but FOO tool was not found, the CMake will continue. | +| [`GLIBC_COMPATIBILITY`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L159) | `ON` | Enable compatibility with older glibc libraries. | Only for Linux, x86_64. Implies `ENABLE_FASTMEMCPY` | +| [`LINKER_NAME`](https://github.com/clickhouse/clickhouse/blob/master/cmake/tools.cmake#L44) | `OFF` | Linker name or full path | Example values: `lld-10`, `gold`. | +| [`LLVM_HAS_RTTI`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/llvm.cmake#L40) | `ON` | Enable if LLVM was build with RTTI enabled | | +| [`MAKE_STATIC_LIBRARIES`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L91) | `USE_STATIC_LIBRARIES` | Disable to make shared libraries | | +| [`PARALLEL_COMPILE_JOBS`](https://github.com/clickhouse/clickhouse/blob/master/cmake/limit_jobs.cmake#L10) | `""` | Maximum number of concurrent compilation jobs | 1 if not set | +| [`PARALLEL_LINK_JOBS`](https://github.com/clickhouse/clickhouse/blob/master/cmake/limit_jobs.cmake#L13) | `""` | Maximum number of concurrent link jobs | 1 if not set | +| [`SANITIZE`](https://github.com/clickhouse/clickhouse/blob/master/cmake/sanitize.cmake#L7) | `""` | Enable one of the code sanitizers | Possible values: - `address` (ASan) - `memory` (MSan) - `thread` (TSan) - `undefined` (UBSan) - "" (no sanitizing) | +| [`SPLIT_SHARED_LIBRARIES`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L96) | `OFF` | Keep all internal libraries as separate .so files | DEVELOPER ONLY. Faster linking if turned on. | +| [`STRIP_DEBUG_SYMBOLS_FUNCTIONS`](https://github.com/clickhouse/clickhouse/blob/master/src/Functions/CMakeLists.txt#L49) | `STRIP_DSF_DEFAULT` | Do not generate debugger info for ClickHouse functions | Provides faster linking and lower binary size. Tradeoff is the inability to debug some source files with e.g. gdb (empty stack frames and no local variables)." | +| [`UNBUNDLED`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L363) | `OFF` | Use system libraries instead of ones in contrib/ | We recommend avoiding this mode for production builds because we can't guarantee all needed libraries exist in your system. This mode exists for enthusiastic developers who are searching for trouble. Useful for maintainers of OS packages. 
| +| [`USE_INCLUDE_WHAT_YOU_USE`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L418) | `OFF` | Automatically reduce unneeded includes in source code (external tool) | https://github.com/include-what-you-use/include-what-you-use | +| [`USE_LIBCXX`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/cxx.cmake#L1) | `NOT_UNBUNDLED` | Use libc++ and libc++abi instead of libstdc++ | | +| [`USE_SENTRY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/sentry.cmake#L13) | `ENABLE_LIBRARIES` | Use Sentry | | +| [`USE_SIMDJSON`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/simdjson.cmake#L1) | `ENABLE_LIBRARIES` | Use simdjson | | +| [`USE_SNAPPY`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/snappy.cmake#L1) | `ENABLE_LIBRARIES` | Enable snappy library | | +| [`USE_STATIC_LIBRARIES`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L90) | `ON` | Disable to use shared libraries | | +| [`USE_UNWIND`](https://github.com/clickhouse/clickhouse/blob/master/cmake/find/unwind.cmake#L1) | `ENABLE_LIBRARIES` | Enable libunwind (better stacktraces) | | +| [`WERROR`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L373) | `OFF` | Enable -Werror compiler option | Using system libs can cause a lot of warnings in includes (on macro expansion). | +| [`WEVERYTHING`](https://github.com/clickhouse/clickhouse/blob/master/cmake/warnings.cmake#L22) | `ON` | Enable -Weverything option with some exceptions. | Add some warnings that are not available even with -Wall -Wextra -Wpedantic. Intended for exploration of new compiler warnings that may be found useful. Applies to clang only | +| [`WITH_COVERAGE`](https://github.com/clickhouse/clickhouse/blob/master/CMakeLists.txt#L274) | `OFF` | Profile the resulting binary/binaries | Compiler-specific coverage flags e.g. -fcoverage-mapping for gcc | + +## Developer's guide for adding new CMake options + +### Don't be obvious. Be informative. + +Bad: +```cmake +option (ENABLE_TESTS "Enables testing" OFF) +``` + +This description is quite useless as is neither gives the viewer any additional information nor explains the option purpose. + +Better: + +```cmake +option(ENABLE_TESTS "Provide unit_test_dbms target with Google.test unit tests" OFF) +``` + +If the option's purpose can't be guessed by its name, or the purpose guess may be misleading, or option has some +pre-conditions, leave a comment above the `option()` line and explain what it does. +The best way would be linking the docs page (if it exists). +The comment is parsed into a separate column (see below). + +Even better: + +```cmake +# implies ${TESTS_ARE_ENABLED} +# see tests/CMakeLists.txt for implementation detail. +option(ENABLE_TESTS "Provide unit_test_dbms target with Google.test unit tests" OFF) +``` + +### If the option's state could produce unwanted (or unusual) result, explicitly warn the user. + +Suppose you have an option that may strip debug symbols from the ClickHouse's part. +This can speed up the linking process, but produces a binary that cannot be debugged. +In that case, prefer explicitly raising a warning telling the developer that he may be doing something wrong. +Also, such options should be disabled if applies. + +Bad: +```cmake +option(STRIP_DEBUG_SYMBOLS_FUNCTIONS + "Do not generate debugger info for ClickHouse functions. 
+ ${STRIP_DSF_DEFAULT}) + +if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) + target_compile_options(clickhouse_functions PRIVATE "-g0") +endif() + +``` +Better: + +```cmake +# Provides faster linking and lower binary size. +# Tradeoff is the inability to debug some source files with e.g. gdb +# (empty stack frames and no local variables)." +option(STRIP_DEBUG_SYMBOLS_FUNCTIONS + "Do not generate debugger info for ClickHouse functions." + ${STRIP_DSF_DEFAULT}) + +if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) + message(WARNING "Not generating debugger info for ClickHouse functions") + target_compile_options(clickhouse_functions PRIVATE "-g0") +endif() +``` + +### In the option's description, explain WHAT the option does rather than WHY it does something. + +The WHY explanation should be placed in the comment. +You may find that the option's name is self-descriptive. + +Bad: + +```cmake +option(ENABLE_THINLTO "Enable Thin LTO. Only applicable for clang. It's also suppressed when building with tests or sanitizers." ON) +``` + +Better: + +```cmake +# Only applicable for clang. +# Turned off when building with tests or sanitizers. +option(ENABLE_THINLTO "Clang-specific link time optimisation" ON). +``` + +### Don't assume other developers know as much as you do. + +In ClickHouse, there are many tools used that an ordinary developer may not know. If you are in doubt, give a link to +the tool's docs. It won't take much of your time. + +Bad: + +```cmake +option(ENABLE_THINLTO "Enable Thin LTO. Only applicable for clang. It's also suppressed when building with tests or sanitizers." ON) +``` + +Better (combined with the above hint): + +```cmake +# https://clang.llvm.org/docs/ThinLTO.html +# Only applicable for clang. +# Turned off when building with tests or sanitizers. +option(ENABLE_THINLTO "Clang-specific link time optimisation" ON). +``` + +Other example, bad: + +```cmake +option (USE_INCLUDE_WHAT_YOU_USE "Use 'include-what-you-use' tool" OFF) +``` + +Better: + +```cmake +# https://github.com/include-what-you-use/include-what-you-use +option (USE_INCLUDE_WHAT_YOU_USE "Reduce unneeded #include s (external tool)" OFF) +``` + +### Prefer consistent default values. + +CMake allows you to pass a plethora of values representing boolean `true/false`, e.g. `1, ON, YES, ...`. +Prefer the `ON/OFF` values, if possible. 
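Because the flag table above is generated from the CMake sources, it can drift from what a given checkout actually exposes. As a hedged, editor-added illustration (not an official workflow), the current values of these options can be read straight from the CMake cache of an already-configured build directory; the `build` path, the variable-name pattern, and the `ENABLE_TESTS` example below are assumptions for the sketch:

```bash
# Sketch: list the ClickHouse-related options recorded in an existing build directory.
grep -E '^(ENABLE_|USE_|WITH_)[A-Z0-9_]*:' build/CMakeCache.txt | sort

# Flip a single option and reconfigure incrementally instead of starting from scratch:
cd build && cmake -DENABLE_TESTS=OFF . && ninja
```

This is handy for inspecting or toggling one flag without retyping the full command from the TL;DR section.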
diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 76a2f647231..64ca2387029 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -5,36 +5,87 @@ toc_title: Third-Party Libraries Used # Third-Party Libraries Used {#third-party-libraries-used} -| Library | License | -|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------| -| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | -| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | -| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | -| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | -| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | -| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | -| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | -| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | -| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) | -| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | -| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | -| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | -| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | -| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | -| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | -| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | -| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | -| libwidechar_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | -| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | -| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | -| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | -| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | -| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | -| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | -| protobuf | [BSD 3-Clause 
License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | -| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | -| sentry-native | [MIT License](https://github.com/getsentry/sentry-native/blob/master/LICENSE) | -| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | -| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | -| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | +The list of third-party libraries can be obtained by the following query: + +``` +SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en' +``` + +[Example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) + +| library_name | license_type | license_path | +|:-|:-|:-| +| abseil-cpp | Apache | /contrib/abseil-cpp/LICENSE | +| AMQP-CPP | Apache | /contrib/AMQP-CPP/LICENSE | +| arrow | Apache | /contrib/arrow/LICENSE.txt | +| avro | Apache | /contrib/avro/LICENSE.txt | +| aws | Apache | /contrib/aws/LICENSE.txt | +| aws-c-common | Apache | /contrib/aws-c-common/LICENSE | +| aws-c-event-stream | Apache | /contrib/aws-c-event-stream/LICENSE | +| aws-checksums | Apache | /contrib/aws-checksums/LICENSE | +| base64 | BSD 2-clause | /contrib/base64/LICENSE | +| boost | Boost | /contrib/boost/LICENSE_1_0.txt | +| boringssl | BSD | /contrib/boringssl/LICENSE | +| brotli | MIT | /contrib/brotli/LICENSE | +| capnproto | MIT | /contrib/capnproto/LICENSE | +| cassandra | Apache | /contrib/cassandra/LICENSE.txt | +| cctz | Apache | /contrib/cctz/LICENSE.txt | +| cityhash102 | MIT | /contrib/cityhash102/COPYING | +| cppkafka | BSD 2-clause | /contrib/cppkafka/LICENSE | +| croaring | Apache | /contrib/croaring/LICENSE | +| curl | Apache | /contrib/curl/docs/LICENSE-MIXING.md | +| cyrus-sasl | BSD 2-clause | /contrib/cyrus-sasl/COPYING | +| double-conversion | BSD 3-clause | /contrib/double-conversion/LICENSE | +| dragonbox | Apache | /contrib/dragonbox/LICENSE-Apache2-LLVM | +| fast_float | Apache | /contrib/fast_float/LICENSE | +| fastops | MIT | /contrib/fastops/LICENSE | +| flatbuffers | Apache | /contrib/flatbuffers/LICENSE.txt | +| fmtlib | Unknown | /contrib/fmtlib/LICENSE.rst | +| gcem | Apache | /contrib/gcem/LICENSE | +| googletest | BSD 3-clause | /contrib/googletest/LICENSE | +| grpc | Apache | /contrib/grpc/LICENSE | +| h3 | Apache | /contrib/h3/LICENSE | +| hyperscan | Boost | /contrib/hyperscan/LICENSE | +| icu | Public Domain | /contrib/icu/icu4c/LICENSE | +| icudata | Public Domain | /contrib/icudata/LICENSE | +| jemalloc | BSD 2-clause | /contrib/jemalloc/COPYING | +| krb5 | MIT | /contrib/krb5/src/lib/gssapi/LICENSE | +| libc-headers | LGPL | /contrib/libc-headers/LICENSE | +| libcpuid | BSD 2-clause | /contrib/libcpuid/COPYING | +| libcxx | Apache | /contrib/libcxx/LICENSE.TXT | +| libcxxabi | Apache | /contrib/libcxxabi/LICENSE.TXT | +| libdivide | zLib | /contrib/libdivide/LICENSE.txt | +| libfarmhash | MIT | /contrib/libfarmhash/COPYING | +| libgsasl | LGPL | /contrib/libgsasl/LICENSE | +| libhdfs3 | Apache | /contrib/libhdfs3/LICENSE.txt | +| libmetrohash | Apache | /contrib/libmetrohash/LICENSE | +| libpq | Unknown | /contrib/libpq/COPYRIGHT | +| libpqxx | BSD 3-clause | 
/contrib/libpqxx/COPYING | +| librdkafka | MIT | /contrib/librdkafka/LICENSE.murmur2 | +| libunwind | Apache | /contrib/libunwind/LICENSE.TXT | +| libuv | BSD | /contrib/libuv/LICENSE | +| llvm | Apache | /contrib/llvm/llvm/LICENSE.TXT | +| lz4 | BSD | /contrib/lz4/LICENSE | +| mariadb-connector-c | LGPL | /contrib/mariadb-connector-c/COPYING.LIB | +| miniselect | Boost | /contrib/miniselect/LICENSE_1_0.txt | +| msgpack-c | Boost | /contrib/msgpack-c/LICENSE_1_0.txt | +| murmurhash | Public Domain | /contrib/murmurhash/LICENSE | +| NuRaft | Apache | /contrib/NuRaft/LICENSE | +| openldap | Unknown | /contrib/openldap/LICENSE | +| orc | Apache | /contrib/orc/LICENSE | +| poco | Boost | /contrib/poco/LICENSE | +| protobuf | BSD 3-clause | /contrib/protobuf/LICENSE | +| rapidjson | MIT | /contrib/rapidjson/bin/jsonschema/LICENSE | +| re2 | BSD 3-clause | /contrib/re2/LICENSE | +| replxx | BSD 3-clause | /contrib/replxx/LICENSE.md | +| rocksdb | BSD 3-clause | /contrib/rocksdb/LICENSE.leveldb | +| sentry-native | MIT | /contrib/sentry-native/LICENSE | +| simdjson | Apache | /contrib/simdjson/LICENSE | +| snappy | Public Domain | /contrib/snappy/COPYING | +| sparsehash-c11 | BSD 3-clause | /contrib/sparsehash-c11/LICENSE | +| stats | Apache | /contrib/stats/LICENSE | +| thrift | Apache | /contrib/thrift/LICENSE | +| unixodbc | LGPL | /contrib/unixodbc/COPYING | +| xz | Public Domain | /contrib/xz/COPYING | +| zlib-ng | zLib | /contrib/zlib-ng/LICENSE.md | +| zstd | BSD | /contrib/zstd/LICENSE | diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index fb453e55417..7547497b9af 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -233,7 +233,7 @@ Google OSS-Fuzz can be found at `docker/fuzz`. We also use simple fuzz test to generate random SQL queries and to check that the server doesn’t die executing them. You can find it in `00746_sql_fuzzy.pl`. This test should be run continuously (overnight and longer). -We also use sophisticated AST-based query fuzzer that is able to find huge amount of corner cases. It does random permutations and substitutions in queries AST. It remembers AST nodes from previous tests to use them for fuzzing of subsequent tests while processing them in random order. +We also use sophisticated AST-based query fuzzer that is able to find huge amount of corner cases. It does random permutations and substitutions in queries AST. It remembers AST nodes from previous tests to use them for fuzzing of subsequent tests while processing them in random order. You can learn more about this fuzzer in [this blog article](https://clickhouse.tech/blog/en/2021/fuzzing-clickhouse/). ## Stress test diff --git a/docs/en/engines/database-engines/atomic.md b/docs/en/engines/database-engines/atomic.md index f019b94a00b..d897631dd6e 100644 --- a/docs/en/engines/database-engines/atomic.md +++ b/docs/en/engines/database-engines/atomic.md @@ -3,15 +3,52 @@ toc_priority: 32 toc_title: Atomic --- - # Atomic {#atomic} -It is supports non-blocking `DROP` and `RENAME TABLE` queries and atomic `EXCHANGE TABLES t1 AND t2` queries. Atomic database engine is used by default. +It supports non-blocking [DROP TABLE](#drop-detach-table) and [RENAME TABLE](#rename-table) queries and atomic [EXCHANGE TABLES t1 AND t2](#exchange-tables) queries. `Atomic` database engine is used by default. 
## Creating a Database {#creating-a-database}

-```sql
-CREATE DATABASE test ENGINE = Atomic;
+``` sql
+CREATE DATABASE test [ENGINE = Atomic];
```

-[Original article](https://clickhouse.tech/docs/en/engines/database_engines/atomic/)
+## Specifics and recommendations {#specifics-and-recommendations}
+
+### Table UUID {#table-uuid}
+
+All tables in database `Atomic` have a persistent [UUID](../../sql-reference/data-types/uuid.md) and store their data in the directory `/clickhouse_path/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`, where `xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy` is the UUID of the table.
+Usually, the UUID is generated automatically, but the user can also specify it explicitly when creating the table (this is not recommended). To display the `SHOW CREATE` query with the UUID you can use the setting [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil). For example:
+
+```sql
+CREATE TABLE name UUID '28f1c61c-2970-457a-bffe-454156ddcfef' (n UInt64) ENGINE = ...;
+```
+
+### RENAME TABLE {#rename-table}
+
+`RENAME` queries are performed without changing the UUID or moving table data. These queries do not wait for the completion of queries using the table and are executed instantly.
+
+### DROP/DETACH TABLE {#drop-detach-table}
+
+On `DROP TABLE` no data is removed immediately: database `Atomic` just marks the table as dropped by moving its metadata to `/clickhouse_path/metadata_dropped/` and notifies a background thread. The delay before the final deletion of table data is specified by the [database_atomic_delay_before_drop_table_sec](../../operations/server-configuration-parameters/settings.md#database_atomic_delay_before_drop_table_sec) setting.
+You can specify synchronous mode using the `SYNC` modifier. Use the [database_atomic_wait_for_drop_and_detach_synchronously](../../operations/settings/settings.md#database_atomic_wait_for_drop_and_detach_synchronously) setting to do this. In this case `DROP` waits for running `SELECT`, `INSERT` and other queries which are using the table to finish. The table is actually removed only when it is no longer in use.
+
+### EXCHANGE TABLES {#exchange-tables}
+
+The `EXCHANGE` query swaps tables atomically. So instead of this non-atomic operation:
+
+```sql
+RENAME TABLE new_table TO tmp, old_table TO new_table, tmp TO old_table;
+```
+you can use one atomic query:
+
+``` sql
+EXCHANGE TABLES new_table AND old_table;
+```
+
+### ReplicatedMergeTree in Atomic Database {#replicatedmergetree-in-atomic-database}
+
+For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables, it is recommended not to specify the engine parameters - the path in ZooKeeper and the replica name. In this case the configuration parameters [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name) will be used. If you want to specify the engine parameters explicitly, it is recommended to use the `{uuid}` macro. This ensures that unique paths are automatically generated for each table in ZooKeeper.
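For example, a minimal sketch of both variants (the database and table names are placeholders, and the explicit ZooKeeper path is only one possible layout; the defaults are assumed to contain `{uuid}`):

```sql
-- Recommended: omit the engine parameters, so that the configured
-- default_replica_path / default_replica_name are used.
CREATE TABLE test.events (d Date, id UInt64)
ENGINE = ReplicatedMergeTree
ORDER BY id;

-- If the parameters are written out explicitly, the {uuid} macro still
-- gives every table its own unique path in ZooKeeper.
CREATE TABLE test.events_explicit (d Date, id UInt64)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
ORDER BY id;
```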
+ +## See Also + +- [system.databases](../../operations/system-tables/databases.md) system table diff --git a/docs/en/engines/database-engines/index.md b/docs/en/engines/database-engines/index.md index 2db11998483..b6892099378 100644 --- a/docs/en/engines/database-engines/index.md +++ b/docs/en/engines/database-engines/index.md @@ -18,4 +18,8 @@ You can also use the following database engines: - [Lazy](../../engines/database-engines/lazy.md) +- [Atomic](../../engines/database-engines/atomic.md) + +- [PostgreSQL](../../engines/database-engines/postgresql.md) + [Original article](https://clickhouse.tech/docs/en/database_engines/) diff --git a/docs/en/engines/database-engines/materialize-mysql.md b/docs/en/engines/database-engines/materialize-mysql.md index 2e361cc82f0..69d3122c268 100644 --- a/docs/en/engines/database-engines/materialize-mysql.md +++ b/docs/en/engines/database-engines/materialize-mysql.md @@ -69,7 +69,7 @@ MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ - MySQL `INSERT` query is converted into `INSERT` with `_sign=1`. -- MySQl `DELETE` query is converted into `INSERT` with `_sign=-1`. +- MySQL `DELETE` query is converted into `INSERT` with `_sign=-1`. - MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1`. diff --git a/docs/en/engines/database-engines/postgresql.md b/docs/en/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..1fa86b7ac21 --- /dev/null +++ b/docs/en/engines/database-engines/postgresql.md @@ -0,0 +1,138 @@ +--- +toc_priority: 35 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL. + +Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries. + +Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) it set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries. + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]); +``` + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `user` — PostgreSQL user. +- `password` — User password. +- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`. 
+ +## Data Types Support {#data_types-support} + +| PostgerSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Examples of Use {#examples-of-use} + +Database in ClickHouse, exchanging data with the PostgreSQL server: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Reading data from the PostgreSQL table: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Writing data to the PostgreSQL table: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Consider the table structure was modified in PostgreSQL: + +``` sql +postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +After detaching the table and attaching it again, the structure was updated: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Original article](https://clickhouse.tech/docs/en/database-engines/postgresql/) diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md index 546557beb57..eb4fc583f88 100644 --- a/docs/en/engines/table-engines/index.md +++ b/docs/en/engines/table-engines/index.md @@ -47,12 +47,17 @@ Engines for communicating with other data storage and processing systems. 
Engines in the family: -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) -- [MySQL](../../engines/table-engines/integrations/mysql.md#mysql) -- [ODBC](../../engines/table-engines/integrations/odbc.md#table-engine-odbc) -- [JDBC](../../engines/table-engines/integrations/jdbc.md#table-engine-jdbc) -- [HDFS](../../engines/table-engines/integrations/hdfs.md#hdfs) -- [S3](../../engines/table-engines/integrations/s3.md#table_engines-s3) + +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [S3](../../engines/table-engines/integrations/s3.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) ### Special Engines {#special-engines} diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 6e864751cc3..88c8973eeab 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- @@ -39,4 +39,4 @@ ENGINE = EmbeddedRocksDB PRIMARY KEY key ``` -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/embedded-rocksdb/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/embedded-rocksdb/) diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 5c36e3f1c21..cf4bb5ecbf7 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -1,11 +1,11 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- # HDFS {#table_engines-hdfs} -This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)via ClickHouse. This engine is similar +This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. This engine is similar to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features. ## Usage {#usage} @@ -174,7 +174,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us | dfs\_domain\_socket\_path | "" | -[HDFS Configuration Reference ](https://hawq.apache.org/docs/userguide/2.3.0.0-incubating/reference/HDFSConfigurationParameterReference.html) might explain some parameters. +[HDFS Configuration Reference](https://hawq.apache.org/docs/userguide/2.3.0.0-incubating/reference/HDFSConfigurationParameterReference.html) might explain some parameters. 
#### ClickHouse extras {#clickhouse-extras} @@ -185,7 +185,6 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us |hadoop\_kerberos\_kinit\_command | kinit | #### Limitations {#limitations} - * hadoop\_security\_kerberos\_ticket\_cache\_path can be global only, not user specific ## Kerberos support {#kerberos-support} @@ -207,4 +206,4 @@ If hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal or hadoop\_kerberos\_ki - [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/hdfs/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/hdfs/) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index 288c9c3cd56..eb1c5411e18 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Integrations -toc_priority: 30 +toc_priority: 1 --- # Table Engines for Integrations {#table-engines-for-integrations} @@ -18,3 +18,4 @@ List of supported integrations: - [Kafka](../../../engines/table-engines/integrations/kafka.md) - [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) - [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) diff --git a/docs/en/engines/table-engines/integrations/jdbc.md b/docs/en/engines/table-engines/integrations/jdbc.md index 2144be9f1e3..82efb842ae7 100644 --- a/docs/en/engines/table-engines/integrations/jdbc.md +++ b/docs/en/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- @@ -85,4 +85,4 @@ FROM jdbc_table - [JDBC table function](../../../sql-reference/table-functions/jdbc.md). -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/jdbc/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/jdbc/) diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index c519d6bb136..2eebf5bdb92 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- @@ -38,20 +38,20 @@ SETTINGS Required parameters: -- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`). -- `kafka_topic_list` – A list of Kafka topics. -- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere. -- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. +- `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`). +- `kafka_topic_list` — A list of Kafka topics. +- `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere. +- `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. 
For more information, see the [Formats](../../../interfaces/formats.md) section. Optional parameters: -- `kafka_row_delimiter` – Delimiter character, which ends the message. -- `kafka_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. -- `kafka_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. -- `kafka_max_block_size` - The maximum batch size (in messages) for poll (default: `max_block_size`). -- `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). -- `kafka_commit_every_batch` - Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`). -- `kafka_thread_per_consumer` - Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise - rows from several consumers squashed to form one block). +- `kafka_row_delimiter` — Delimiter character, which ends the message. +- `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. +- `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. +- `kafka_max_block_size` — The maximum batch size (in messages) for poll (default: `max_block_size`). +- `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). +- `kafka_commit_every_batch` — Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`). +- `kafka_thread_per_consumer` — Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise — rows from several consumers squashed to form one block). 
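For instance, a sketch of how several of the optional settings above might be combined in one table definition (the broker address, topic and consumer group names are placeholders):

```sql
CREATE TABLE queue
(
    `timestamp` UInt64,
    `level` String,
    `message` String
)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'localhost:9092',
         kafka_topic_list = 'topic1',
         kafka_group_name = 'group1',
         kafka_format = 'JSONEachRow',
         -- optional settings described above
         kafka_num_consumers = 2,          -- two consumers for this table
         kafka_skip_broken_messages = 10,  -- tolerate up to 10 unparsable messages per block
         kafka_thread_per_consumer = 1;    -- each consumer flushes independently
```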
Examples: @@ -194,4 +194,4 @@ Example: - [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) - [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/kafka/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/kafka/) diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index e648a13b5e0..a378ab03f55 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- @@ -54,4 +54,4 @@ SELECT COUNT() FROM mongo_table; └─────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/integrations/mongodb/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/mongodb/) diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 2cb1facce91..3847e7a9e0e 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- @@ -24,6 +24,7 @@ The table structure can differ from the original MySQL table structure: - Column names should be the same as in the original MySQL table, but you can use just some of these columns and in any order. - Column types may differ from those in the original MySQL table. ClickHouse tries to [cast](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Setting `external_table_functions_use_nulls` defines how to handle Nullable columns. Default is true, if false - table function will not make nullable columns and will insert default values instead of nulls. This is also applicable for null values inside array data types. **Engine Parameters** @@ -100,4 +101,4 @@ SELECT * FROM mysql_table - [The ‘mysql’ table function](../../../sql-reference/table-functions/mysql.md) - [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/mysql/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index fffc125b0ff..26bfb6aeb0d 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- @@ -29,6 +29,7 @@ The table structure can differ from the source table structure: - Column names should be the same as in the source table, but you can use just some of these columns and in any order. - Column types may differ from those in the source table. ClickHouse tries to [cast](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Setting `external_table_functions_use_nulls` defines how to handle Nullable columns. 
Default is true, if false - table function will not make nullable columns and will insert default values instead of nulls. This is also applicable for null values inside array data types. **Engine Parameters** @@ -127,4 +128,4 @@ SELECT * FROM odbc_t - [ODBC external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table function](../../../sql-reference/table-functions/odbc.md) -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/odbc/) +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/odbc/) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md new file mode 100644 index 00000000000..4474b764d2e --- /dev/null +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -0,0 +1,145 @@ +--- +toc_priority: 11 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +The table structure can differ from the original PostgreSQL table structure: + +- Column names should be the same as in the original PostgreSQL table, but you can use just some of these columns and in any order. +- Column types may differ from those in the original PostgreSQL table. ClickHouse tries to [cast](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Setting `external_table_functions_use_nulls` defines how to handle Nullable columns. Default is 1, if 0 - table function will not make nullable columns and will insert default values instead of nulls. This is also applicable for null values inside array data types. + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `table` — Remote table name. +- `user` — PostgreSQL user. +- `password` — User password. +- `schema` — Non-default table schema. Optional. + +## Implementation Details {#implementation-details} + +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. + +All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. + +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. + +PostgreSQL `Array` types are converted into ClickHouse arrays. + +!!! info "Note" + Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. 
But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. + +Replicas priority for PostgreSQL dictionary source is supported. The bigger the number in map, the less the priority. The highest priority is `0`. + +In the example below replica `example01-1` has the highest priority: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` + +## Usage Example {#usage-example} + +Table in PostgreSQL: + +``` text +postgres=# CREATE TABLE "public"."test" ( +"int_id" SERIAL, +"int_nullable" INT NULL DEFAULT NULL, +"float" FLOAT NOT NULL, +"str" VARCHAR(100) NOT NULL DEFAULT '', +"float_nullable" FLOAT NULL DEFAULT NULL, +PRIMARY KEY (int_id)); + +CREATE TABLE + +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); +INSERT 0 1 + +postgresql> SELECT * FROM test; + int_id | int_nullable | float | str | float_nullable + --------+--------------+-------+------+---------------- + 1 | | 2 | test | + (1 row) +``` + +Table in ClickHouse, retrieving data from the PostgreSQL table created above: + +``` sql +CREATE TABLE default.postgresql_table +( + `float_nullable` Nullable(Float32), + `str` String, + `int_id` Int32 +) +ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgres_password'); +``` + +``` sql +SELECT * FROM postgresql_table WHERE str IN ('test'); +``` + +``` text +┌─float_nullable─┬─str──┬─int_id─┐ +│ ᴺᵁᴸᴸ │ test │ 1 │ +└────────────────┴──────┴────────┘ +``` + +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**See Also** + +- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) +- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 4a0550275ca..5fb9ce5b151 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 10 toc_title: RabbitMQ --- @@ -163,3 +163,5 @@ Example: - `_redelivered` - `redelivered` flag of the message. - `_message_id` - messageID of the received message; non-empty if was set, when message was published. - `_timestamp` - timestamp of the received message; non-empty if was set, when message was published. + +[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/rabbitmq/) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 5858a0803e6..6592f8b9752 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -1,52 +1,58 @@ --- -toc_priority: 4 +toc_priority: 7 toc_title: S3 --- -# S3 {#table_engines-s3} +# S3 Table Engine {#table-engine-s3} -This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ecosystem. This engine is similar -to the [HDFS](../../../engines/table-engines/special/file.md#table_engines-hdfs) engine, but provides S3-specific features. +This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ecosystem. This engine is similar to the [HDFS](../../../engines/table-engines/special/file.md#table_engines-hdfs) engine, but provides S3-specific features. 
-## Usage {#usage} +## Create Table {#creating-a-table} ``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) ``` -**Input parameters** +**Engine parameters** -- `path` — Bucket url with path to file. Supports following wildcards in readonly mode: *, ?, {abc,def} and {N..M} where N, M — numbers, `’abc’, ‘def’ — strings. +- `path` — Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [below](#wildcards-in-path). - `format` — The [format](../../../interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — Parameter is optional. Supported values: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. By default, it will autodetect compression by file extension. +- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension. -**Example:** +**Example** -**1.** Set up the `s3_engine_table` table: +1. Set up the `s3_engine_table` table: ``` sql -CREATE TABLE s3_engine_table (name String, value UInt32) ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') +CREATE TABLE s3_engine_table (name String, value UInt32) ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip'); ``` -**2.** Fill file: +2. Fill file: ``` sql -INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3); ``` -**3.** Query the data: +3. Query the data: ``` sql -SELECT * FROM s3_engine_table LIMIT 2 +SELECT * FROM s3_engine_table LIMIT 2; ``` -``` text +```text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ └──────┴───────┘ ``` +## Virtual columns {#virtual-columns} + +- `_path` — Path to the file. +- `_file` — Name of the file. + +For more information about virtual columns see [here](../../../engines/table-engines/index.md#table_engines-virtual_columns). ## Implementation Details {#implementation-details} @@ -56,9 +62,9 @@ SELECT * FROM s3_engine_table LIMIT 2 - Indexes. - Replication. -**Globs in path** +## Wildcards In Path {#wildcards-in-path} -Multiple path components can have globs. For being processed file should exist and match to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). +`path` argument can specify multiple files using bash-like wildcards. For being processed file should exist and match to the whole path pattern. Listing of files is determined during `SELECT` (not at `CREATE` moment). - `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. @@ -69,7 +75,7 @@ Constructions with `{}` are similar to the [remote](../../../sql-reference/table **Example** -1. Suppose we have several files in TSV format with the following URIs on HDFS: +1. 
Suppose we have several files in CSV format with the following URIs on S3: - ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv’ - ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv’ @@ -78,35 +84,34 @@ Constructions with `{}` are similar to the [remote](../../../sql-reference/table - ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv’ - ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv’ -2. There are several ways to make a table consisting of all six files: +There are several ways to make a table consisting of all six files: - +The first way: ``` sql -CREATE TABLE table_with_range (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}', 'CSV') +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}', 'CSV'); ``` -3. Another way: +Another way: ``` sql -CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_?', 'CSV') +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_?', 'CSV'); ``` -4. Table consists of all the files in both directories (all files should satisfy format and schema described in query): +Table consists of all the files in both directories (all files should satisfy format and schema described in query): ``` sql -CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV') +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV'); ``` -!!! warning "Warning" - If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: ``` sql -CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV') +CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); ``` ## Virtual Columns {#virtual-columns} @@ -122,35 +127,82 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage. The following settings can be set before query execution or placed into configuration file. -- `s3_max_single_part_upload_size` — Default value is `64Mb`. The maximum size of object to upload using singlepart upload to S3. -- `s3_min_upload_part_size` — Default value is `512Mb`. The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). -- `s3_max_redirects` — Default value is `10`. Max number of S3 redirects hops allowed. +- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`. 
+- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. +- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. -### Endpoint-based settings {#endpointsettings} +## Endpoint-based Settings {#endpoint-settings} The following settings can be specified in configuration file for given endpoint (which will be matched by exact prefix of a URL): -- `endpoint` — Mandatory. Specifies prefix of an endpoint. -- `access_key_id` and `secret_access_key` — Optional. Specifies credentials to use with given endpoint. -- `use_environment_credentials` — Optional, default value is `false`. If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint. -- `header` — Optional, can be speficied multiple times. Adds specified HTTP header to a request to given endpoint. -- `server_side_encryption_customer_key_base64` — Optional. If specified, required headers for accessing S3 objects with SSE-C encryption will be set. +- `endpoint` — Specifies prefix of an endpoint. Mandatory. +- `access_key_id` and `secret_access_key` — Specifies credentials to use with given endpoint. Optional. +- `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint. Optional, default value is `false`. +- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Optional, default value is `false`. +- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be speficied multiple times. +- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. -Example: +**Example:** -``` +``` xml https://storage.yandexcloud.net/my-test-bucket-768/ + ``` -[Original article](https://clickhouse.tech/docs/en/operations/table_engines/s3/) +## Usage {#usage-examples} + +Suppose we have several files in TSV format with the following URIs on HDFS: + +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv' + + +1. There are several ways to make a table consisting of all six files: + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}', 'CSV'); +``` + +2. Another way: + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_?', 'CSV'); +``` + +3. 
Table consists of all the files in both directories (all files should satisfy format and schema described in query): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV'); +``` + +!!! warning "Warning" + If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. + +4. Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: + +``` sql +CREATE TABLE big_table (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); +``` + +## See also + +- [S3 table function](../../../sql-reference/table-functions/s3.md) diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index 1a997b6b237..818830646cb 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -3,7 +3,7 @@ toc_priority: 35 toc_title: AggregatingMergeTree --- -# Aggregatingmergetree {#aggregatingmergetree} +# AggregatingMergeTree {#aggregatingmergetree} The engine inherits from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree), altering the logic for data parts merging. ClickHouse replaces all rows with the same primary key (or more accurately, with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md)) with a single row (within a one data part) that stores a combination of states of aggregate functions. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 753859b46d2..9874e87be78 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -353,7 +353,7 @@ The `set` index can be used with all functions. Function subsets for other index | Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter | |------------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| | [equals (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notEquals(!=, \<\>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | | [like](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | | [notLike](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | | [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | @@ -361,10 +361,10 @@ The `set` index can be used with all functions. 
Function subsets for other index | [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | | [in](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | | [notIn](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [less (\<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [greater (\>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [lessOrEquals (\<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [greaterOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [less (<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greater (>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [lessOrEquals (<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greaterOrEquals (>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | | [empty](../../../sql-reference/functions/array-functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | | [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | | hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | @@ -529,7 +529,7 @@ CREATE TABLE table_for_aggregation y Int ) ENGINE = MergeTree -ORDER BY k1, k2 +ORDER BY (k1, k2) TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y); ``` @@ -701,6 +701,32 @@ The `default` storage policy implies using only one volume, which consists of on The number of threads performing background moves of data parts can be changed by [background_move_pool_size](../../../operations/settings/settings.md#background_move_pool_size) setting. +### Details {#details} + +In the case of `MergeTree` tables, data is getting to disk in different ways: + +- As a result of an insert (`INSERT` query). +- During background merges and [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations). +- When downloading from another replica. +- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter/partition.md#alter_freeze-partition). + +In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: + +1. The first volume (in the order of definition) that has enough disk space for storing a part (`unreserved_space > current_part_size`) and allows for storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen. +2. Within this volume, that disk is chosen that follows the one, which was used for storing the previous chunk of data, and that has free space more than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`). + +Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones. 
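A quick way to check where the parts of a table actually ended up is to query `system.parts`; this is just a sketch, with `my_table` as a placeholder name:

```sql
-- Which disk and path does each active part of the table occupy?
SELECT name, disk_name, path
FROM system.parts
WHERE database = currentDatabase() AND table = 'my_table' AND active;
```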
+ +In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. +Data is never transferred from the last one and into the first one. One may use system tables [system.part_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. + +User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter/partition.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. + +Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. + +After the completion of background merges and mutations, old parts are removed only after a certain amount of time (`old_parts_lifetime`). +During this time, they are not moved to other volumes or disks. Therefore, until the parts are finally removed, they are still taken into account for evaluation of the occupied disk space. + ## Using S3 for Data Storage {#table_engine-mergetree-s3} `MergeTree` family table engines is able to store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`. @@ -722,7 +748,6 @@ Configuration markup: 10000 5000 - 100 10 1000 /var/lib/clickhouse/disks/s3/ @@ -742,10 +767,10 @@ Required parameters: Optional parameters: - `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`. +- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`. - `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL. - `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`. - `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`. -- `max_connections` — S3 connections pool size. Default value is `100`. - `retry_attempts` — Number of retry attempts in case of failed request. Default value is `10`. - `min_bytes_for_seek` — Minimal number of bytes to use seek operation instead of sequential read. Default value is `1 Mb`. - `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks//`. @@ -793,30 +818,4 @@ S3 disk can be configured as `main` or `cold` storage: In case of `cold` option a data can be moved to S3 if local disk free size will be smaller than `move_factor * disk_size` or by TTL move rule. -### Details {#details} - -In the case of `MergeTree` tables, data is getting to disk in different ways: - -- As a result of an insert (`INSERT` query). -- During background merges and [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations). -- When downloading from another replica. 
-- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter/partition.md#alter_freeze-partition). - -In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: - -1. The first volume (in the order of definition) that has enough disk space for storing a part (`unreserved_space > current_part_size`) and allows for storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen. -2. Within this volume, that disk is chosen that follows the one, which was used for storing the previous chunk of data, and that has free space more than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`). - -Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones. - -In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. -Data is never transferred from the last one and into the first one. One may use system tables [system.part_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. - -User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter/partition.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. - -Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. - -After the completion of background merges and mutations, old parts are removed only after a certain amount of time (`old_parts_lifetime`). -During this time, they are not moved to other volumes or disks. Therefore, until the parts are finally removed, they are still taken into account for evaluation of the occupied disk space. - [Original article](https://clickhouse.tech/docs/ru/operations/table_engines/mergetree/) diff --git a/docs/en/engines/table-engines/special/buffer.md b/docs/en/engines/table-engines/special/buffer.md index bf6c08f8f6c..8245cd19e8c 100644 --- a/docs/en/engines/table-engines/special/buffer.md +++ b/docs/en/engines/table-engines/special/buffer.md @@ -18,11 +18,17 @@ Engine parameters: - `num_layers` – Parallelism layer. Physically, the table will be represented as `num_layers` of independent buffers. Recommended value: 16. - `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, and `max_bytes` – Conditions for flushing data from the buffer. +Optional engine parameters: + +- `flush_time`, `flush_rows`, `flush_bytes` – Conditions for flushing data from the buffer that are applied only in the background (omitted or zero means no `flush*` conditions). See the sketch below.
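For illustration only, here is a hedged sketch of a `Buffer` table declaration that also passes the optional `flush*` parameters; the `merge.hits` source table and all numeric values are made up, and the first nine arguments are the standard parameters described above, followed by `flush_time`, `flush_rows` and `flush_bytes`:

``` sql
CREATE TABLE merge.hits_buffer AS merge.hits
ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000, 60, 1000000, 100000000)
```

With such a declaration, a background flush would additionally be triggered after 60 seconds, 1,000,000 rows or 100,000,000 bytes, without delaying `INSERT` queries.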
+ Data is flushed from the buffer and written to the destination table if all the `min*` conditions or at least one `max*` condition are met. -- `min_time`, `max_time` – Condition for the time in seconds from the moment of the first write to the buffer. -- `min_rows`, `max_rows` – Condition for the number of rows in the buffer. -- `min_bytes`, `max_bytes` – Condition for the number of bytes in the buffer. +Also, if at least one `flush*` condition is met, a flush is initiated in the background. This differs from `max*`, since `flush*` lets you configure background flushes separately, so that `INSERT` (into `Buffer`) queries do not incur extra latency. + +- `min_time`, `max_time`, `flush_time` – Condition for the time in seconds from the moment of the first write to the buffer. +- `min_rows`, `max_rows`, `flush_rows` – Condition for the number of rows in the buffer. +- `min_bytes`, `max_bytes`, `flush_bytes` – Condition for the number of bytes in the buffer. During the write operation, data is inserted to a `num_layers` number of random buffers. Or, if the data part to insert is large enough (greater than `max_rows` or `max_bytes`), it is written directly to the destination table, omitting the buffer. diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index 7fffa962480..c47e0c27cd2 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -31,6 +31,12 @@ Also it accept the following settings: - `fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to asynchronous inserts on Distributed table (after insert, after sending the data to shard, etc). +- `bytes_to_throw_insert` - if more than this number of compressed bytes is pending for async INSERT, an exception is thrown. 0 - do not throw. Default 0. + +- `bytes_to_delay_insert` - if more than this number of compressed bytes is pending for async INSERT, the query is delayed. 0 - do not delay. Default 0. + +- `max_delay_to_insert` - max delay in seconds for inserting data into a Distributed table, if there are a lot of pending bytes for async send. Default 60. + !!! note "Note" **Durability settings** (`fsync_...`): @@ -39,6 +45,12 @@ Also it accept the following settings: - May significantly decrease the inserts' performance - Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**.
If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` + For **Insert limit settings** (`..._insert`) see also: + + - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting + - [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting + - `bytes_to_throw_insert` is handled before `bytes_to_delay_insert`, so you should not set it to a value less than `bytes_to_delay_insert` + Example: ``` sql @@ -61,19 +73,18 @@ Clusters are set like this: ``` xml + + - - - 1 diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index cda16c2a226..2acec40ef02 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -66,7 +66,8 @@ SELECT * FROM file_engine_table ## Usage in ClickHouse-local {#usage-in-clickhouse-local} -In [clickhouse-local](../../../operations/utilities/clickhouse-local.md) File engine accepts file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`. +In [clickhouse-local](../../../operations/utilities/clickhouse-local.md) File engine accepts file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`. It is possible to read and write compressed files based on an additional engine parameter or file extension (`gz`, `br` or `xz`). + **Example:** ``` bash diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md index 7038cc539d2..3fa026c794a 100644 --- a/docs/en/faq/integration/json-import.md +++ b/docs/en/faq/integration/json-import.md @@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test Using [CLI interface](../../interfaces/cli.md): ``` bash -$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow" +$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" ``` Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md index b5ca23eddb9..c9b74a84a54 100644 --- a/docs/en/getting-started/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -MgBench - A new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` is a new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). Download the data: ``` @@ -153,7 +153,7 @@ ORDER BY dt, hr; --- Q1.4: Over a 1-month period, how often was each server blocked on disk I/O? +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? SELECT machine_name, COUNT(*) AS spikes @@ -301,7 +301,7 @@ WHERE event_type = 'temperature' AND log_time >= '2019-11-29 17:00:00.000'; --- Q3.4: Over the past 6 months, how frequently was each door opened? +-- Q3.4: Over the past 6 months, how frequently was each door opened?
SELECT device_name, device_floor, @@ -412,3 +412,5 @@ ORDER BY yr, ``` The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.tech/play?user=play), [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/brown-benchmark/) diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md new file mode 100644 index 00000000000..7028b650ad1 --- /dev/null +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -0,0 +1,132 @@ +--- +toc_priority: 21 +toc_title: Cell Towers +--- + +# Cell Towers {#cell-towers} + +This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. + +As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc). + +OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License, and we redistribute a snapshot of this dataset under the terms of the same license. The up-to-date version of the dataset is available to download after sign in. + + +## Get the Dataset {#get-the-dataset} + +1. Download the snapshot of the dataset from February 2021: [https://datasets.clickhouse.tech/cell_towers.csv.xz] (729 MB). + +2. Validate the integrity (optional step): +``` +md5sum cell_towers.csv.xz +8cf986f4a0d9f12c6f384a0e9192c908 cell_towers.csv.xz +``` + +3. Decompress it with the following command: +``` +xz -d cell_towers.csv.xz +``` + +4. Create a table: + +``` +CREATE TABLE cell_towers +( + radio Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5), + mcc UInt16, + net UInt16, + area UInt16, + cell UInt64, + unit Int16, + lon Float64, + lat Float64, + range UInt32, + samples UInt32, + changeable UInt8, + created DateTime, + updated DateTime, + averageSignal UInt8 +) +ENGINE = MergeTree ORDER BY (radio, mcc, net, created); +``` + +5. Insert the dataset: +``` +clickhouse-client --query "INSERT INTO cell_towers FORMAT CSVWithNames" < cell_towers.csv +``` + +## Examples {#examples} + +1. A number of cell towers by type: + +``` +SELECT radio, count() AS c FROM cell_towers GROUP BY radio ORDER BY c DESC + +┌─radio─┬────────c─┐ +│ UMTS │ 20686487 │ +│ LTE │ 12101148 │ +│ GSM │ 9931312 │ +│ CDMA │ 556344 │ +│ NR │ 867 │ +└───────┴──────────┘ + +5 rows in set. Elapsed: 0.011 sec. Processed 43.28 million rows, 43.28 MB (3.83 billion rows/s., 3.83 GB/s.) +``` + +2. 
Cell towers by [mobile country code (MCC)](https://en.wikipedia.org/wiki/Mobile_country_code): + +``` +SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 + +┌─mcc─┬─count()─┐ +│ 310 │ 5024650 │ +│ 262 │ 2622423 │ +│ 250 │ 1953176 │ +│ 208 │ 1891187 │ +│ 724 │ 1836150 │ +│ 404 │ 1729151 │ +│ 234 │ 1618924 │ +│ 510 │ 1353998 │ +│ 440 │ 1343355 │ +│ 311 │ 1332798 │ +└─────┴─────────┘ + +10 rows in set. Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.) +``` + +So, the top countries are: the USA, Germany, and Russia. + +You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. + + +## Use case {#use-case} + +Using `pointInPolygon` function. + +1. Create a table where we will store polygons: + +``` +CREATE TEMPORARY TABLE moscow (polygon Array(Tuple(Float64, Float64))); +``` + +2. This is a rough shape of Moscow (without "new Moscow"): + +``` +INSERT INTO moscow VALUES ([(37.84172564285271, 55.78000432402266), (37.8381207618713, 55.775874525970494), (37.83979446823122, 55.775626746008065), (37.84243326983639, 55.77446586811748), (37.84262672750849, 55.771974101091104), (37.84153238623039, 55.77114545193181), (37.841124690460184, 55.76722010265554), (37.84239076983644, 55.76654891107098), (37.842283558197025, 55.76258709833121), (37.8421759312134, 55.758073999993734), (37.84198330422974, 55.75381499999371), (37.8416827275085, 55.749277102484484), (37.84157576190186, 55.74794544108413), (37.83897929098507, 55.74525257875241), (37.83739676451868, 55.74404373042019), (37.838732481460525, 55.74298009816793), (37.841183997352545, 55.743060321833575), (37.84097476190185, 55.73938799999373), (37.84048155819702, 55.73570799999372), (37.840095812164286, 55.73228210777237), (37.83983814285274, 55.73080491981639), (37.83846476321406, 55.729799917464675), (37.83835745269769, 55.72919751082619), (37.838636380279524, 55.72859509486539), (37.8395161005249, 55.727705075632784), (37.83897964285276, 55.722727886185154), (37.83862557539366, 55.72034817326636), (37.83559735744853, 55.71944437307499), (37.835370708803126, 55.71831419154461), (37.83738169402022, 55.71765218986692), (37.83823396494291, 55.71691750159089), (37.838056931213345, 55.71547311301385), (37.836812846557606, 55.71221445615604), (37.83522525396725, 55.709331054395555), (37.83269301586908, 55.70953687463627), (37.829667367706236, 55.70903403789297), (37.83311126588435, 55.70552351822608), (37.83058993121339, 55.70041317726053), (37.82983872750851, 55.69883771404813), (37.82934501586913, 55.69718947487017), (37.828926414016685, 55.69504441658371), (37.82876530422971, 55.69287499999378), (37.82894754100031, 55.690759754047335), (37.827697554878185, 55.68951421135665), (37.82447346292115, 55.68965045405069), (37.83136543914793, 55.68322046195302), (37.833554015869154, 55.67814012759211), (37.83544184655761, 55.67295011628339), (37.837480388885474, 55.6672498719639), (37.838960677246064, 55.66316274139358), (37.83926093121332, 55.66046999999383), (37.839025050262435, 55.65869897264431), (37.83670784390257, 55.65794084879904), (37.835656529083245, 55.65694309303843), (37.83704060449217, 55.65689306460552), (37.83696819873806, 55.65550363526252), (37.83760389616388, 55.65487847246661), (37.83687972750851, 55.65356745541324), (37.83515216004943, 55.65155951234079), (37.83312418518067, 55.64979413590619), (37.82801726983639, 55.64640836412121), (37.820614174591, 55.64164525405531), 
(37.818908190475426, 55.6421883258084), (37.81717543386075, 55.64112490388471), (37.81690987037274, 55.63916106913107), (37.815099354492155, 55.637925371757085), (37.808769150787356, 55.633798276884455), (37.80100123544311, 55.62873670012244), (37.79598013491824, 55.62554336109055), (37.78634567724606, 55.62033499605651), (37.78334147619623, 55.618768681480326), (37.77746201055901, 55.619855533402706), (37.77527329626457, 55.61909966711279), (37.77801986242668, 55.618770300976294), (37.778212973541216, 55.617257701952106), (37.77784818518065, 55.61574504433011), (37.77016867724609, 55.61148576294007), (37.760191219573976, 55.60599579539028), (37.75338926983641, 55.60227892751446), (37.746329965606634, 55.59920577639331), (37.73939925396728, 55.59631430313617), (37.73273665739439, 55.5935318803559), (37.7299954450912, 55.59350760316188), (37.7268679946899, 55.59469840523759), (37.72626726983634, 55.59229549697373), (37.7262673598022, 55.59081598950582), (37.71897193121335, 55.5877595845419), (37.70871550793456, 55.58393177431724), (37.700497489410374, 55.580917323756644), (37.69204305026244, 55.57778089778455), (37.68544477378839, 55.57815154690915), (37.68391050793454, 55.57472945079756), (37.678803592590306, 55.57328235936491), (37.6743402539673, 55.57255251445782), (37.66813862698363, 55.57216388774464), (37.617927457672096, 55.57505691895805), (37.60443099999999, 55.5757737568051), (37.599683515869145, 55.57749105910326), (37.59754177842709, 55.57796291823627), (37.59625834786988, 55.57906686095235), (37.59501783265684, 55.57746616444403), (37.593090671936025, 55.57671634534502), (37.587018007904, 55.577944600233785), (37.578692203704804, 55.57982895000019), (37.57327546607398, 55.58116294118248), (37.57385012109279, 55.581550362779), (37.57399562266922, 55.5820107079112), (37.5735356072979, 55.58226289171689), (37.57290393054962, 55.582393529795155), (37.57037722355653, 55.581919415056234), (37.5592298306885, 55.584471614867844), (37.54189249206543, 55.58867650795186), (37.5297256269836, 55.59158133551745), (37.517837865081766, 55.59443656218868), (37.51200186508174, 55.59635625174229), (37.506808949737554, 55.59907823904434), (37.49820432275389, 55.6062944994944), (37.494406071441674, 55.60967103463367), (37.494760001358024, 55.61066689753365), (37.49397137107085, 55.61220931698269), (37.49016528606031, 55.613417718449064), (37.48773249206542, 55.61530616333343), (37.47921386508177, 55.622640129112334), (37.470652153442394, 55.62993723476164), (37.46273446298218, 55.6368075123157), (37.46350692265317, 55.64068225239439), (37.46050283203121, 55.640794546982576), (37.457627470916734, 55.64118904154646), (37.450718034393326, 55.64690488145138), (37.44239252645875, 55.65397824729769), (37.434587576721185, 55.66053543155961), (37.43582144975277, 55.661693766520735), (37.43576786245721, 55.662755031737014), (37.430982915344174, 55.664610641628116), (37.428547447097685, 55.66778515273695), (37.42945134592044, 55.668633314343566), (37.42859571562949, 55.66948145750025), (37.4262836402282, 55.670813882451405), (37.418709037048295, 55.6811141674414), (37.41922139651101, 55.68235377885389), (37.419218771842885, 55.68359335082235), (37.417196501327446, 55.684375235224735), (37.41607020370478, 55.68540557585352), (37.415640857147146, 55.68686637150793), (37.414632153442334, 55.68903015131686), (37.413344899475064, 55.690896881757396), (37.41171432275391, 55.69264232162232), (37.40948282275393, 55.69455101638112), (37.40703674603271, 55.69638690385348), (37.39607169577025, 55.70451821283731), 
(37.38952706878662, 55.70942491932811), (37.387778313491815, 55.71149057784176), (37.39049275399779, 55.71419814298992), (37.385557272491454, 55.7155489617061), (37.38388335714726, 55.71849856042102), (37.378368238098155, 55.7292763261685), (37.37763597123337, 55.730845879211614), (37.37890062088197, 55.73167906388319), (37.37750451918789, 55.734703664681774), (37.375610832015965, 55.734851959522246), (37.3723813571472, 55.74105626086403), (37.37014935714723, 55.746115620904355), (37.36944173016362, 55.750883999993725), (37.36975304365541, 55.76335905525834), (37.37244070571134, 55.76432079697595), (37.3724259757175, 55.76636979670426), (37.369922155757884, 55.76735417953104), (37.369892695770275, 55.76823419316575), (37.370214730163575, 55.782312184391266), (37.370493611114505, 55.78436801120489), (37.37120164550783, 55.78596427165359), (37.37284851456452, 55.7874378183096), (37.37608325135799, 55.7886695054807), (37.3764587460632, 55.78947647305964), (37.37530000265506, 55.79146512926804), (37.38235915344241, 55.79899647809345), (37.384344043655396, 55.80113596939471), (37.38594269577028, 55.80322699999366), (37.38711208598329, 55.804919036911976), (37.3880239841309, 55.806610999993666), (37.38928977249147, 55.81001864976979), (37.39038389947512, 55.81348641242801), (37.39235781481933, 55.81983538336746), (37.393709457672124, 55.82417822811877), (37.394685720901464, 55.82792275755836), (37.39557615344238, 55.830447148154136), (37.39844478226658, 55.83167107969975), (37.40019761214057, 55.83151823557964), (37.400398790382326, 55.83264967594742), (37.39659544313046, 55.83322180909622), (37.39667059524539, 55.83402792148566), (37.39682089947515, 55.83638877400216), (37.39643489154053, 55.83861656112751), (37.3955338994751, 55.84072348043264), (37.392680272491454, 55.84502158126453), (37.39241188227847, 55.84659117913199), (37.392529730163616, 55.84816071336481), (37.39486835714723, 55.85288092980303), (37.39873052645878, 55.859893456073635), (37.40272161111449, 55.86441833633205), (37.40697072750854, 55.867579567544375), (37.410007082016016, 55.868369880337), (37.4120992989502, 55.86920843741314), (37.412668021163924, 55.87055369615854), (37.41482461111453, 55.87170587948249), (37.41862266137694, 55.873183961039565), (37.42413732540892, 55.874879126654704), (37.4312182698669, 55.875614937236705), (37.43111093783558, 55.8762723478417), (37.43332105622856, 55.87706546369396), (37.43385747619623, 55.87790681284802), (37.441303050262405, 55.88027084462084), (37.44747234260555, 55.87942070143253), (37.44716141796871, 55.88072960917233), (37.44769797085568, 55.88121221323979), (37.45204320500181, 55.882080694420715), (37.45673176190186, 55.882346110794586), (37.463383999999984, 55.88252729504517), (37.46682797486874, 55.88294937719063), (37.470014457672086, 55.88361266759345), (37.47751410450743, 55.88546991372396), (37.47860317658232, 55.88534929207307), (37.48165826025772, 55.882563306475106), (37.48316434442331, 55.8815803226785), (37.483831555817645, 55.882427612793315), (37.483182967125686, 55.88372791409729), (37.483092277908824, 55.88495581062434), (37.4855716508179, 55.8875561994203), (37.486440636245746, 55.887827444039566), (37.49014203439328, 55.88897899871799), (37.493210285705544, 55.890208937135604), (37.497512451065035, 55.891342397444696), (37.49780744510645, 55.89174030252967), (37.49940333499519, 55.89239745507079), (37.50018383334346, 55.89339220941865), (37.52421672750851, 55.903869074155224), (37.52977457672118, 55.90564076517974), (37.53503220370484, 55.90661661218259), 
(37.54042858064267, 55.90714113744566), (37.54320461007303, 55.905645048442985), (37.545686966066306, 55.906608607018505), (37.54743976120755, 55.90788552162358), (37.55796999999999, 55.90901557907218), (37.572711542327866, 55.91059395704873), (37.57942799999998, 55.91073854155573), (37.58502865872187, 55.91009969268444), (37.58739968913264, 55.90794809960554), (37.59131567193598, 55.908713267595054), (37.612687423278814, 55.902866854295375), (37.62348079629517, 55.90041967242986), (37.635797880950896, 55.898141151686396), (37.649487626983664, 55.89639275532968), (37.65619302513125, 55.89572360207488), (37.66294133862307, 55.895295577183965), (37.66874564418033, 55.89505457604897), (37.67375601586915, 55.89254677027454), (37.67744661901856, 55.8947775867987), (37.688347, 55.89450045676125), (37.69480554232789, 55.89422926332761), (37.70107096560668, 55.89322256101114), (37.705962965606716, 55.891763491662616), (37.711885134918205, 55.889110234998974), (37.71682005026245, 55.886577568759876), (37.7199315476074, 55.88458159806678), (37.72234560316464, 55.882281005794134), (37.72364385977171, 55.8809452036196), (37.725371142837474, 55.8809722706006), (37.727870902099546, 55.88037213862385), (37.73394330422971, 55.877941504088696), (37.745339592590376, 55.87208120378722), (37.75525267724611, 55.86703807949492), (37.76919976190188, 55.859821640197474), (37.827835219574, 55.82962968399116), (37.83341438888553, 55.82575289922351), (37.83652584655761, 55.82188784027888), (37.83809213491821, 55.81612575504693), (37.83605359521481, 55.81460347077685), (37.83632178569025, 55.81276696067908), (37.838623105812026, 55.811486181656385), (37.83912198147584, 55.807329380532785), (37.839079078033414, 55.80510270463816), (37.83965844708251, 55.79940712529036), (37.840581150787344, 55.79131399999368), (37.84172564285271, 55.78000432402266)]); +``` + +3. Check how many cell towers are in Moscow: + +``` +SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) + +┌─count()─┐ +│ 310463 │ +└─────────┘ + +1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) +``` + +The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.tech/play?user=play), [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). + +Although you cannot create temporary tables there. 
\ No newline at end of file diff --git a/docs/en/getting-started/example-datasets/index.md b/docs/en/getting-started/example-datasets/index.md index 72f44d8caf1..53007c33306 100644 --- a/docs/en/getting-started/example-datasets/index.md +++ b/docs/en/getting-started/example-datasets/index.md @@ -20,5 +20,6 @@ The list of documented datasets: - [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md) - [AMPLab Big Data Benchmark](../../getting-started/example-datasets/amplab-benchmark.md) - [Brown University Benchmark](../../getting-started/example-datasets/brown-benchmark.md) +- [Cell Towers](../../getting-started/example-datasets/cell-towers.md) [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets) diff --git a/docs/en/getting-started/example-datasets/ontime.md b/docs/en/getting-started/example-datasets/ontime.md index 6e46cddba52..83673cdceb6 100644 --- a/docs/en/getting-started/example-datasets/ontime.md +++ b/docs/en/getting-started/example-datasets/ontime.md @@ -15,17 +15,9 @@ This dataset can be obtained in two ways: Downloading data: ``` bash -for s in `seq 1987 2018` -do -for m in `seq 1 12` -do -wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip -done -done +echo https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip | xargs -P10 wget --no-check-certificate --continue ``` -(from https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) - Creating a table: ``` sql @@ -145,12 +137,14 @@ ORDER BY (Carrier, FlightDate) SETTINGS index_granularity = 8192; ``` -Loading data: +Loading data with multiple threads: ``` bash -$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done +ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'" ``` +(if you will have memory shortage or other issues on your server, remove the `-P $(nproc)` part) + ## Download of Prepared Partitions {#download-of-prepared-partitions} ``` bash diff --git a/docs/en/getting-started/example-datasets/recipes.md b/docs/en/getting-started/example-datasets/recipes.md index b3c7d82f485..0f4e81c8470 100644 --- a/docs/en/getting-started/example-datasets/recipes.md +++ b/docs/en/getting-started/example-datasets/recipes.md @@ -7,15 +7,17 @@ toc_title: Recipes Dataset RecipeNLG dataset is available for download [here](https://recipenlg.cs.put.poznan.pl/dataset). It contains 2.2 million recipes. The size is slightly less than 1 GB. -## Download and unpack the dataset +## Download and Unpack the Dataset -Accept Terms and Conditions and download it [here](https://recipenlg.cs.put.poznan.pl/dataset). Unpack the zip file with `unzip`. You will get the `full_dataset.csv` file. +1. Go to the download page [https://recipenlg.cs.put.poznan.pl/dataset](https://recipenlg.cs.put.poznan.pl/dataset). +1. Accept Terms and Conditions and download zip file. +1. Unpack the zip file with `unzip`. You will get the `full_dataset.csv` file. 
-## Create a table +## Create a Table Run clickhouse-client and execute the following CREATE query: -``` +``` sql CREATE TABLE recipes ( title String, @@ -27,11 +29,11 @@ CREATE TABLE recipes ) ENGINE = MergeTree ORDER BY title; ``` -## Insert the data +## Insert the Data Run the following command: -``` +``` bash clickhouse-client --query " INSERT INTO recipes SELECT @@ -49,32 +51,41 @@ clickhouse-client --query " This is a showcase how to parse custom CSV, as it requires multiple tunes. Explanation: -- the dataset is in CSV format, but it requires some preprocessing on insertion; we use table function [input](../../sql-reference/table-functions/input/) to perform preprocessing; -- the structure of CSV file is specified in the argument of the table function `input`; -- the field `num` (row number) is unneeded - we parse it from file and ignore; -- we use `FORMAT CSVWithNames` but the header in CSV will be ignored (by command line parameter `--input_format_with_names_use_header 0`), because the header does not contain the name for the first field; -- file is using only double quotes to enclose CSV strings; some strings are not enclosed in double quotes, and single quote must not be parsed as the string enclosing - that's why we also add the `--format_csv_allow_single_quote 0` parameter; -- some strings from CSV cannot parse, because they contain `\M/` sequence at the beginning of the value; the only value starting with backslash in CSV can be `\N` that is parsed as SQL NULL. We add `--input_format_allow_errors_num 10` parameter and up to ten malformed records can be skipped; -- there are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../../sql-reference/functions/json-functions/) function to transform it to Array. +- The dataset is in CSV format, but it requires some preprocessing on insertion; we use the table function [input](../../sql-reference/table-functions/input.md) to perform the preprocessing; +- The structure of the CSV file is specified in the argument of the table function `input`; +- The field `num` (row number) is unneeded - we parse it from the file and ignore it; +- We use `FORMAT CSVWithNames`, but the header in CSV is ignored (via the command line parameter `--input_format_with_names_use_header 0`), because the header does not contain the name of the first field; +- The file uses only double quotes to enclose CSV strings; some strings are not enclosed in double quotes, and a single quote must not be parsed as a string delimiter - that's why we also add the `--format_csv_allow_single_quote 0` parameter; +- Some strings from the CSV cannot be parsed, because they contain the `\M/` sequence at the beginning of the value; the only value that can start with a backslash in CSV is `\N`, which is parsed as SQL NULL. We add the `--input_format_allow_errors_num 10` parameter so that up to ten malformed records can be skipped; +- There are arrays for the ingredients, directions and NER fields; these arrays are represented in an unusual form: they are serialized into a string as JSON and then placed in CSV - we parse them as String and then use the [JSONExtract](../../sql-reference/functions/json-functions/) function to transform them to Array (see the short example below).
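As a minimal illustration of the last point (the array literal here is made up, not taken from the dataset), `JSONExtract` converts such a JSON-serialized string into a typed array:

``` sql
SELECT JSONExtract('["milk", "sugar", "vanilla"]', 'Array(String)') AS parsed;
```

This is the same transformation that the insertion command applies to the `ingredients`, `directions` and `NER` fields.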
-## Validate the inserted data +## Validate the Inserted Data By checking the row count: -``` -SELECT count() FROM recipes +Query: +``` sq; +SELECT count() FROM recipes; +``` + +Result: + +``` text ┌─count()─┐ │ 2231141 │ └─────────┘ ``` +## Example Queries -## Example queries +### Top Components by the Number of Recipes: -### Top components by the number of recipes: +In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join/) function to expand an array into a set of rows. -``` +Query: + +``` sql SELECT arrayJoin(NER) AS k, count() AS c @@ -82,7 +93,11 @@ FROM recipes GROUP BY k ORDER BY c DESC LIMIT 50 +``` +Result: + +``` text ┌─k────────────────────┬──────c─┐ │ salt │ 890741 │ │ sugar │ 620027 │ @@ -139,11 +154,9 @@ LIMIT 50 50 rows in set. Elapsed: 0.112 sec. Processed 2.23 million rows, 361.57 MB (19.99 million rows/s., 3.24 GB/s.) ``` -In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join/) function to multiply data by array elements. +### The Most Complex Recipes with Strawberry -### The most complex recipes with strawberry - -``` +``` sql SELECT title, length(NER), @@ -152,7 +165,11 @@ FROM recipes WHERE has(NER, 'strawberry') ORDER BY length(directions) DESC LIMIT 10 +``` +Result: + +``` text ┌─title────────────────────────────────────────────────────────────┬─length(NER)─┬─length(directions)─┐ │ Chocolate-Strawberry-Orange Wedding Cake │ 24 │ 126 │ │ Strawberry Cream Cheese Crumble Tart │ 19 │ 47 │ @@ -171,15 +188,19 @@ LIMIT 10 In this example, we involve [has](../../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. -There is a wedding cake that requires the whole 126 steps to produce! +There is a wedding cake that requires the whole 126 steps to produce! Show that directions: -Show that directions: +Query: -``` +``` sql SELECT arrayJoin(directions) FROM recipes WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake' +``` +Result: + +``` text ┌─arrayJoin(directions)───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Position 1 rack in center and 1 rack in bottom third of oven and preheat to 350F. │ │ Butter one 5-inch-diameter cake pan with 2-inch-high sides, one 8-inch-diameter cake pan with 2-inch-high sides and one 12-inch-diameter cake pan with 2-inch-high sides. │ @@ -312,6 +333,8 @@ WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake' 126 rows in set. Elapsed: 0.011 sec. Processed 8.19 thousand rows, 5.34 MB (737.75 thousand rows/s., 480.59 MB/s.) ``` -### Online playground +### Online Playground -The dataset is also available in the [Playground](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). +The dataset is also available in the [Online Playground](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). 
+ +[Original article](https://clickhouse.tech/docs/en/getting-started/example-datasets/recipes/) diff --git a/docs/en/getting-started/playground.md b/docs/en/getting-started/playground.md index 7838dad14ea..9adf0423cf3 100644 --- a/docs/en/getting-started/playground.md +++ b/docs/en/getting-started/playground.md @@ -38,10 +38,10 @@ The queries are executed as a read-only user. It implies some limitations: The following settings are also enforced: -- [max_result_bytes=10485760](../operations/settings/query_complexity/#max-result-bytes) -- [max_result_rows=2000](../operations/settings/query_complexity/#setting-max_result_rows) -- [result_overflow_mode=break](../operations/settings/query_complexity/#result-overflow-mode) -- [max_execution_time=60000](../operations/settings/query_complexity/#max-execution-time) +- [max_result_bytes=10485760](../operations/settings/query-complexity/#max-result-bytes) +- [max_result_rows=2000](../operations/settings/query-complexity/#setting-max_result_rows) +- [result_overflow_mode=break](../operations/settings/query-complexity/#result-overflow-mode) +- [max_execution_time=60000](../operations/settings/query-complexity/#max-execution-time) ## Examples {#examples} diff --git a/docs/en/guides/apply-catboost-model.md b/docs/en/guides/apply-catboost-model.md index f614b121714..7c2c8a575ec 100644 --- a/docs/en/guides/apply-catboost-model.md +++ b/docs/en/guides/apply-catboost-model.md @@ -159,6 +159,9 @@ The fastest way to evaluate a CatBoost model is compile `libcatboostmodel./home/catboost/models/*_model.xml ``` +!!! note "Note" + You can change the path to the CatBoost model configuration later without restarting the server. + ## 4. Run the Model Inference from SQL {#run-model-inference} For test model run the ClickHouse client `$ clickhouse client`. diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 33bf90a8b52..5987ba0f676 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -50,7 +50,7 @@ The supported formats are: | [Parquet](#data-format-parquet) | ✔ | ✔ | | [Arrow](#data-format-arrow) | ✔ | ✔ | | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | +| [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | @@ -1254,7 +1254,7 @@ ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query Unsupported Parquet data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Data types of ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. +Data types of ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
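As a hedged sketch of this implicit conversion (the `events` table, its column types and the `data.parquet` file are all hypothetical, and the file would need to be readable by the server, e.g. placed in its `user_files` directory):

``` sql
-- Suppose events.id is declared as UInt32 while the Parquet file stores an INT64 column:
-- the value is read as Int64 according to the table above and then cast to UInt32 on insert.
INSERT INTO events
SELECT id, name
FROM file('data.parquet', 'Parquet', 'id Int64, name String');
```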
### Inserting and Selecting Data {#inserting-and-selecting-data} @@ -1284,32 +1284,33 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e ## ORC {#data-format-orc} -[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse. +[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem. ### Data Types Matching {#data_types-matching-3} -The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries. +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. -| ORC data type (`INSERT`) | ClickHouse data type | -|--------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | +|--------------------------|-----------------------------------------------------|--------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. -Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. 
+Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. @@ -1321,6 +1322,14 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +### Selecting Data {#selecting-data-2} + +You can select data from a ClickHouse table and save them into some file in the ORC format by the following command: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md). ## LineAsString {#lineasstring} @@ -1359,15 +1368,15 @@ When working with the `Regexp` format, you can use the following settings: - Escaped (similarly to [TSV](#tabseparated)) - Quoted (similarly to [Values](#data-format-values)) - Raw (extracts subpatterns as a whole, no escaping rules) -- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines the need to throw an exeption in case the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`. +- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines the need to throw an exeption in case the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`. -**Usage** +**Usage** -The regular expression from `format_regexp` setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in imported dataset. +The regular expression from `format_regexp` setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in imported dataset. -Lines of the imported data must be separated by newline character `'\n'` or DOS-style newline `"\r\n"`. +Lines of the imported data must be separated by newline character `'\n'` or DOS-style newline `"\r\n"`. -The content of every matched subpattern is parsed with the method of corresponding data type, according to `format_regexp_escaping_rule` setting. +The content of every matched subpattern is parsed with the method of corresponding data type, according to `format_regexp_escaping_rule` setting. If the regular expression does not match the line and `format_regexp_skip_unmatched` is set to 1, the line is silently skipped. If `format_regexp_skip_unmatched` is set to 0, exception is thrown. diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 310286e3d44..18533cfc6c2 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -148,28 +148,48 @@ $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- For successful requests that don’t return a data table, an empty response body is returned. -You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). 
To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -If you specified `compress=1` in the URL, the server compresses the data it sends you. -If you specified `decompress=1` in the URL, the server decompresses the same data that you pass in the `POST` method. +## Compression {#compression} -You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, you must append `Accept-Encoding: compression_method`. ClickHouse supports `gzip`, `br`, and `deflate` [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). To enable HTTP compression, you must use the ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the data compression level in the [http_zlib_compression_level](#settings-http_zlib_compression_level) setting for all the compression methods. +You can use compression to reduce network traffic when transmitting a large amount of data or for creating dumps that are immediately compressed. -You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you need `clickhouse-compressor` program to work with it. It is installed with the `clickhouse-client` package. To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -Examples of sending data with compression: +If you specify `compress=1` in the URL, the server will compress the data it sends to you. If you specify `decompress=1` in the URL, the server will decompress the data which you pass in the `POST` method. -``` bash -#Sending data to the server: -$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' +You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). ClickHouse supports the following [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): -#Sending data to the client: -$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' -``` +- `gzip` +- `br` +- `deflate` +- `xz` + +To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. +In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. !!! 
note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. +**Examples** + +``` bash +# Sending compressed data to the server +$ echo "SELECT 1" | gzip -c | \ + curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +``` + +``` bash +# Receiving compressed data from the server +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' +$ zcat result.gz +0 +1 +2 +``` + +## Default Database {#default-database} + You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. ``` bash diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index c08eec61b1c..f5c85289171 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -23,6 +23,7 @@ toc_title: Client Libraries - [SeasClick C++ client](https://github.com/SeasX/SeasClick) - [one-ck](https://github.com/lizhichao/one-ck) - [glushkovds/phpclickhouse-laravel](https://packagist.org/packages/glushkovds/phpclickhouse-laravel) + - [kolya7k ClickHouse PHP extension](https://github.com//kolya7k/clickhouse-php) - Go - [clickhouse](https://github.com/kshvakov/clickhouse/) - [go-clickhouse](https://github.com/roistat/go-clickhouse) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index fa123d8b23d..e54e40441ca 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -167,4 +167,23 @@ Features: [How to configure ClickHouse in Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) -[Original article](https://clickhouse.tech/docs/en/interfaces/third-party/gui/) +### SeekTable {#seektable} + +[SeekTable](https://www.seektable.com) is a self-service BI tool for data exploration and operational reporting. It is available both as a cloud service and a self-hosted version. Reports from SeekTable may be embedded into any web-app. + +Features: + +- Business users-friendly reports builder. +- Powerful report parameters for SQL filtering and report-specific query customizations. +- Can connect to ClickHouse both with a native TCP/IP endpoint and a HTTP(S) interface (2 different drivers). +- It is possible to use all power of ClickHouse SQL dialect in dimensions/measures definitions. +- [Web API](https://www.seektable.com/help/web-api-integration) for automated reports generation. +- Supports reports development flow with account data [backup/restore](https://www.seektable.com/help/self-hosted-backup-restore); data models (cubes) / reports configuration is a human-readable XML and can be stored under version control system. + +SeekTable is [free](https://www.seektable.com/help/cloud-pricing) for personal/individual usage. + +[How to configure ClickHouse connection in SeekTable.](https://www.seektable.com/help/clickhouse-pivot-table) + +### Chadmin {#chadmin} + +[Chadmin](https://github.com/bun4uk/chadmin) is a simple UI where you can visualize your currently running queries on your ClickHouse cluster and info about them and kill them if you want. 
diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 454d856f779..fa257a84173 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -12,9 +12,13 @@ toc_title: Adopters |---------|----------|---------|--------------|------------------------------------------------------------------------------|-----------| | 2gis | Maps | Monitoring | — | — | [Talk in Russian, July 2019](https://youtu.be/58sPkXfq6nw) | | Admiral | Martech | Engagement Management | — | — | [Webinar Slides, June 2020](https://altinity.com/presentations/2020/06/16/big-data-in-real-time-how-clickhouse-powers-admirals-visitor-relationships-for-publishers) | +| AdScribe | Ads | TV Analytics | — | — | [A quote from CTO](https://altinity.com/24x7-support/) | +| Ahrefs | SEO | Analytics | — | — | [Job listing](https://ahrefs.com/jobs/data-scientist-search) | | Alibaba Cloud | Cloud | Managed Service | — | — | [Official Website](https://help.aliyun.com/product/144466.html) | | Aloha Browser | Mobile App | Browser backend | — | — | [Slides in Russian, May 2019](https://presentations.clickhouse.tech/meetup22/aloha.pdf) | +| Altinity | Cloud, SaaS | Main product | — | — | [Official Website](https://altinity.com/) | | Amadeus | Travel | Analytics | — | — | [Press Release, April 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | +| ApiRoad | API marketplace | Analytics | — | — | [Blog post, Nov 2018, Mar 2020](https://pixeljets.com/blog/clickhouse-vs-elasticsearch/) | | Appsflyer | Mobile analytics | Main product | — | — | [Talk in Russian, July 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | | ArenaData | Data Platform | Main product | — | — | [Slides in Russian, December 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | | Avito | Classifieds | Monitoring | — | — | [Meetup, April 2020](https://www.youtube.com/watch?v=n1tm4j4W8ZQ) | @@ -37,23 +41,27 @@ toc_title: Adopters | CraiditX 氪信 | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | | Crazypanda | Games | | — | — | Live session on ClickHouse meetup | | Criteo | Retail | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | +| Cryptology | Digital Assets Trading Platform | — | — | — | [Job advertisement, March 2021](https://career.habr.com/companies/cryptology/vacancies) | | Dataliance for China Telecom | Telecom | Analytics | — | — | [Slides in Chinese, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | | Deutsche Bank | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | | Deeplay | Gaming Analytics | — | — | — | [Job advertisement, 2020](https://career.habr.com/vacancies/1000062568) | | Diva-e | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | | Ecwid | E-commerce SaaS | Metrics, Logging | — | — | [Slides in Russian, April 2019](https://nastachku.ru/var/files/1/presentation/backend/2_Backend_6.pdf) | | eBay | E-commerce | Logs, Metrics and Events | 
— | — | [Official website, Sep 2020](https://tech.ebayinc.com/engineering/ou-online-analytical-processing/) | -| Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | +| Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | +| EventBunker.io | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) | | FastNetMon | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) | | Flipkart | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) | | FunCorp | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | | Geniee | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | | Genotek | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) | +| Glaber | Monitoring | Main product | — | — | [Website](https://glaber.io/) | | HUYA | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | | ICA | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) | | Idealista | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.tech/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | | Infovista | Networks | Analytics | — | — | [Slides in English, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | | InnoGames | Games | Metrics, Logging | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | +| Instabug | APM Platform | Main product | — | — | [A quote from Co-Founder](https://altinity.com/) | | Instana | APM Platform | Main product | — | — | [Twitter post](https://twitter.com/mieldonkers/status/1248884119158882304) | | Integros | Platform for video services | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | | Ippon Technologies | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) | @@ -65,15 +73,20 @@ toc_title: Adopters | Lawrence Berkeley National Laboratory | Research | Traffic analysis | 1 server | 11.8 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | | LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | | Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | +| MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | | Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 
2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | | Mello | Marketing | Analytics | 1 server | — | [Article, Oct 2020](https://vc.ru/marketing/166180-razrabotka-tipovogo-otcheta-skvoznoy-analitiki) | | MessageBird | Telecommunications | Statistics | — | — | [Slides in English, November 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | -| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) |x +| Microsoft | Web Analytics | Clarity (Main Product) | — | — | [A question on GitHub](https://github.com/ClickHouse/ClickHouse/issues/21556) | +| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) | | MUX | Online Video | Video Analytics | — | — | [Talk in English, August 2019](https://altinity.com/presentations/2019/8/13/how-clickhouse-became-the-default-analytics-database-for-mux/) | | MGID | Ad network | Web-analytics | — | — | [Blog post in Russian, April 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | +| Netskope | Network Security | — | — | — | [Job advertisement, March 2021](https://www.mendeley.com/careers/job/senior-software-developer-backend-developer-1346348) | +| NIC Labs | Network Monitoring | RaTA-DNS | — | — | [Blog post, March 2021](https://niclabs.cl/ratadns/2021/03/Clickhouse) | | NOC Project | Network Monitoring | Analytics | Main Product | — | [Official Website](https://getnoc.com/features/big-data/) | | Nuna Inc. | Health Data Analytics | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=170) | | OneAPM | Monitorings and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| OZON | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) | | Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) | | Percent 百分点 | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | | Percona | Performance analysis | Percona Monitoring and Management | — | — | [Official website, Mar 2020](https://www.percona.com/blog/2020/03/30/advanced-query-analysis-in-percona-monitoring-and-management-with-direct-clickhouse-access/) | @@ -90,14 +103,17 @@ toc_title: Adopters | Rspamd | Antispam | Analytics | — | — | [Official Website](https://rspamd.com/doc/modules/clickhouse.html) | | RuSIEM | SIEM | Main Product | — | — | [Official Website](https://rusiem.com/en/products/architecture) | | S7 Airlines | Airlines | Metrics, Logging | — | — | [Talk in Russian, March 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | +| Sber | Banking, Fintech, Retail, Cloud, Media | — | — | — | [Job advertisement, March 2021](https://career.habr.com/vacancies/1000073536) | | scireum GmbH | e-Commerce | Main product | — | — | [Talk in German, February 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | | Segment | Data processing | Main product | 9 * i3en.3xlarge nodes 7.5TB NVME SSDs, 96GB Memory, 12 vCPUs | — | [Slides, 2019](https://slides.com/abraithwaite/segment-clickhouse) | +| sembot.io | Shopping Ads | — | — | — | A comment on LinkedIn, 2020 | | 
SEMrush | Marketing | Main product | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | | Sentry | Software Development | Main product | — | — | [Blog Post in English, May 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | | seo.do | Analytics | Main product | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | | SGK | Goverment Social Security | Analytics | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | | Sina | News | — | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | | SMI2 | News | Analytics | — | — | [Blog Post in Russian, November 2017](https://habr.com/ru/company/smi2/blog/314558/) | +| Spark New Zealand | Telecommunications | Security Operations | — | — | [Blog Post, Feb 2020](https://blog.n0p.me/2020/02/2020-02-05-dnsmonster/) | | Splunk | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | | Spotify | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | | Staffcop | Information Security | Main Product | — | — | [Official website, Documentation](https://www.staffcop.ru/sce43) | @@ -106,22 +122,31 @@ toc_title: Adopters | Tencent | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | | Tencent | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | | Tencent Music Entertainment (TME) | BigData | Data processing | — | — | [Blog in Chinese, June 2020](https://cloud.tencent.com/developer/article/1637840) | +| Tinybird | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) | | Traffic Stars | AD network | — | — | — | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | | Uber | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/uber.pdf) | +| UTMSTAT | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) | | VKontakte | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | +| VMWare | Cloud | VeloCloud, SDN | — | — | [Product documentation](https://docs.vmware.com/en/vRealize-Operations-Manager/8.3/com.vmware.vcom.metrics.doc/GUID-A9AD72E1-C948-4CA2-971B-919385AB3CA8.html) | | Walmart Labs | Internet, Retail | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=144) | | Wargaming | Games | | — | — | [Interview](https://habr.com/en/post/496954/) | +| Wildberries | E-commerce | | — | — | [Official website](https://it.wildberries.ru/) | | Wisebits | IT Solutions | Analytics | — | — | [Slides in Russian, May 
2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | | Workato | Automation Software | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=334) | +| Xenoss | Marketing, Advertising | — | — | — | [Instagram, March 2021](https://www.instagram.com/p/CNATV7qBgB1/) | | Xiaoxin Tech | Education | Common purpose | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | | Ximalaya | Audio sharing | OLAP | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | | Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | | Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | | Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | | Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | +| Yotascale | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) | | ЦВТ | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | | МКБ | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | | ЦФТ | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) | +| Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | | kakaocorp | Internet company | — | — | — | [if(kakao)2020 conference](https://if.kakao.com/session/117) | +| ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | +| Tesla | Electric vehicle and clean energy company | — | — | — | [Vacancy description, March 2021](https://news.ycombinator.com/item?id=26306170) | [Original article](https://clickhouse.tech/docs/en/introduction/adopters/) diff --git a/docs/en/introduction/distinctive-features.md b/docs/en/introduction/distinctive-features.md index 4a8f31b8d56..be7c2d2e7c1 100644 --- a/docs/en/introduction/distinctive-features.md +++ b/docs/en/introduction/distinctive-features.md @@ -7,9 +7,9 @@ toc_title: Distinctive Features ## True Column-Oriented Database Management System {#true-column-oriented-dbms} -In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. As an example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. 
+In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. -It is worth noting because there are systems that can store values of different columns separately, but that can’t effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you would get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second. +It is worth noting because there are systems that can store values of different columns separately, but that can’t effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second. It’s also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 7b096b76f75..ec7e4239a9d 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -8,18 +8,21 @@ toc_title: Caches When performing queries, ClichHouse uses different caches. Main cache types: + - `mark_cache` — Cache of marks used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. - `uncompressed_cache` — Cache of uncompressed data used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. Additional cache types: -- DNS cache -- [regexp](../interfaces/formats.md#data-format-regexp) cache -- compiled expressions cache -- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache -- [dictionaries data cache](../sql-reference/dictionaries/index.md) + +- DNS cache. +- [Regexp](../interfaces/formats.md#data-format-regexp) cache. +- Compiled expressions cache. +- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache. +- [Dictionaries](../sql-reference/dictionaries/index.md) data cache. Indirectly used: -- OS page cache + +- OS page cache. To drop cache, use [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md) statements. diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index 95f80f192f5..aa220f50ef8 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -11,3 +11,6 @@ ClickHouse supports authenticating and managing users using external services. 
The following external authenticators and directories are supported: - [LDAP](./ldap.md#external-authenticators-ldap) [Authenticator](./ldap.md#ldap-external-authenticator) and [Directory](./ldap.md#ldap-external-user-directory) +- Kerberos [Authenticator](./kerberos.md#external-authenticators-kerberos) + +[Original article](https://clickhouse.tech/docs/en/operations/external-authenticators/index/) diff --git a/docs/en/operations/external-authenticators/kerberos.md b/docs/en/operations/external-authenticators/kerberos.md new file mode 100644 index 00000000000..5fe0b2bfc37 --- /dev/null +++ b/docs/en/operations/external-authenticators/kerberos.md @@ -0,0 +1,115 @@ +# Kerberos {#external-authenticators-kerberos} + +Existing and properly configured ClickHouse users can be authenticated via Kerberos authentication protocol. + +Currently, Kerberos can only be used as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths. Those users may only use HTTP requests and must be able to authenticate using GSS-SPNEGO mechanism. + +For this approach, Kerberos must be configured in the system and must be enabled in ClickHouse config. + + +## Enabling Kerberos in ClickHouse {#enabling-kerberos-in-clickhouse} + +To enable Kerberos, one should include `kerberos` section in `config.xml`. This section may contain additional parameters. + +#### Parameters: + +- `principal` - canonical service principal name that will be acquired and used when accepting security contexts. + - This parameter is optional, if omitted, the default principal will be used. + + +- `realm` - a realm, that will be used to restrict authentication to only those requests whose initiator's realm matches it. + - This parameter is optional, if omitted, no additional filtering by realm will be applied. + +Example (goes into `config.xml`): + +```xml + + + + +``` + +With principal specification: + +```xml + + + + HTTP/clickhouse.example.com@EXAMPLE.COM + + +``` + +With filtering by realm: + +```xml + + + + EXAMPLE.COM + + +``` + +!!! warning "Note" + You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. + +!!! warning "Note" + `principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. + + +## Kerberos as an external authenticator for existing users {#kerberos-as-an-external-authenticator-for-existing-users} + +Kerberos can be used as a method for verifying the identity of locally defined users (users defined in `users.xml` or in local access control paths). Currently, **only** requests over the HTTP interface can be *kerberized* (via GSS-SPNEGO mechanism). + +Kerberos principal name format usually follows this pattern: + +- *primary/instance@REALM* + +The */instance* part may occur zero or more times. **The *primary* part of the canonical principal name of the initiator is expected to match the kerberized user name for authentication to succeed**. + +### Enabling Kerberos in `users.xml` {#enabling-kerberos-in-users-xml} + +In order to enable Kerberos authentication for the user, specify `kerberos` section instead of `password` or similar sections in the user definition. + +Parameters: + +- `realm` - a realm that will be used to restrict authentication to only those requests whose initiator's realm matches it. 
+ - This parameter is optional, if omitted, no additional filtering by realm will be applied. + +Example (goes into `users.xml`): + +```xml + + + + + + + + EXAMPLE.COM + + + + +``` + +!!! warning "Warning" + Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. + +!!! info "Reminder" + Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. + +### Enabling Kerberos using SQL {#enabling-kerberos-using-sql} + +When [SQL-driven Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users identified by Kerberos can also be created using SQL statements. + +```sql +CREATE USER my_user IDENTIFIED WITH kerberos REALM 'EXAMPLE.COM' +``` + +...or, without filtering by realm: + +```sql +CREATE USER my_user IDENTIFIED WITH kerberos +``` diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md index 36a13227852..1b65ecc968b 100644 --- a/docs/en/operations/external-authenticators/ldap.md +++ b/docs/en/operations/external-authenticators/ldap.md @@ -1,15 +1,17 @@ -# LDAP {#external-authenticators-ldap} +# LDAP {#external-authenticators-ldap} LDAP server can be used to authenticate ClickHouse users. There are two different approaches for doing this: -- use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths -- use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server +- Use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths. +- Use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server. -For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of config are able to refer to it. +For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of the config can refer to it. ## LDAP Server Definition {#ldap-server-definition} -To define LDAP server you must add `ldap_servers` section to the `config.xml`. For example, +To define LDAP server you must add `ldap_servers` section to the `config.xml`. + +**Example** ```xml @@ -35,38 +37,35 @@ To define LDAP server you must add `ldap_servers` section to the `config.xml`. F Note, that you can define multiple LDAP servers inside the `ldap_servers` section using distinct names. -Parameters: +**Parameters** -- `host` - LDAP server hostname or IP, this parameter is mandatory and cannot be empty. -- `port` - LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. -- `bind_dn` - template used to construct the DN to bind to. - - The resulting DN will be constructed by replacing all `{user_name}` substrings of the - template with the actual user name during each authentication attempt. -- `verification_cooldown` - a period of time, in seconds, after a successful bind attempt, - during which the user will be assumed to be successfully authenticated for all consecutive - requests without contacting the LDAP server. +- `host` — LDAP server hostname or IP, this parameter is mandatory and cannot be empty. 
+- `port` — LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. +- `bind_dn` — Template used to construct the DN to bind to. + - The resulting DN will be constructed by replacing all `{user_name}` substrings of the template with the actual user name during each authentication attempt. +- `verification_cooldown` — A period of time, in seconds, after a successful bind attempt, during which the user will be assumed to be successfully authenticated for all consecutive requests without contacting the LDAP server. - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. -- `enable_tls` - flag to trigger use of secure connection to the LDAP server. +- `enable_tls` — A flag to trigger the use of the secure connection to the LDAP server. - Specify `no` for plain text `ldap://` protocol (not recommended). - Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default). - Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS). -- `tls_minimum_protocol_version` - the minimum protocol version of SSL/TLS. +- `tls_minimum_protocol_version` — The minimum protocol version of SSL/TLS. - Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default). -- `tls_require_cert` - SSL/TLS peer certificate verification behavior. +- `tls_require_cert` — SSL/TLS peer certificate verification behavior. - Accepted values are: `never`, `allow`, `try`, `demand` (the default). -- `tls_cert_file` - path to certificate file. -- `tls_key_file` - path to certificate key file. -- `tls_ca_cert_file` - path to CA certificate file. -- `tls_ca_cert_dir` - path to the directory containing CA certificates. -- `tls_cipher_suite` - allowed cipher suite (in OpenSSL notation). +- `tls_cert_file` — Path to certificate file. +- `tls_key_file` — Path to certificate key file. +- `tls_ca_cert_file` — Path to CA certificate file. +- `tls_ca_cert_dir` — Path to the directory containing CA certificates. +- `tls_cipher_suite` — Allowed cipher suite (in OpenSSL notation). ## LDAP External Authenticator {#ldap-external-authenticator} -A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. +A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). To achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. -At each login attempt, ClickHouse will try to "bind" to the specified DN defined by the `bind_dn` parameter in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user will be considered authenticated. This is often called a "simple bind" method. +At each login attempt, ClickHouse tries to "bind" to the specified DN defined by the `bind_dn` parameter in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user is considered authenticated. This is often called a "simple bind" method. -For example, +**Example** ```xml @@ -85,20 +84,24 @@ For example, Note, that user `my_user` refers to `my_ldap_server`. 
This LDAP server must be configured in the main `config.xml` file as described previously. -When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users that are authenticated by LDAP servers can also be created using the [CRATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement. +When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled, users that are authenticated by LDAP servers can also be created using the [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement. +Query: + ```sql -CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server' +CREATE USER my_user IDENTIFIED WITH ldap SERVER 'my_ldap_server'; ``` ## LDAP Exernal User Directory {#ldap-external-user-directory} -In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section of the `config.xml` file. +In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. To achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section of the `config.xml` file. -At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. +At each login attempt, ClickHouse tries to find the user definition locally and authenticate it as usual. If the user is not defined, ClickHouse will assume the definition exists in the external LDAP directory and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. -Example (goes into `config.xml`): +**Example** + +Goes into `config.xml`. 
```xml @@ -123,34 +126,24 @@ Example (goes into `config.xml`): ``` -Note that `my_ldap_server` referred in the `ldap` section inside the `user_directories` section must be a previously -defined LDAP server that is configured in the `config.xml` (see [LDAP Server Definition](#ldap-server-definition)). +Note that `my_ldap_server` referred in the `ldap` section inside the `user_directories` section must be a previously defined LDAP server that is configured in the `config.xml` (see [LDAP Server Definition](#ldap-server-definition)). -Parameters: +**Parameters** -- `server` - one of LDAP server names defined in the `ldap_servers` config section above. - This parameter is mandatory and cannot be empty. -- `roles` - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. - - If no roles are specified here or assigned during role mapping (below), user will not be able - to perform any actions after authentication. -- `role_mapping` - section with LDAP search parameters and mapping rules. - - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` - and the name of the logged in user. For each entry found during that search, the value of the specified - attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, - and the rest of the value becomes the name of a local role defined in ClickHouse, - which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. +- `server` — One of LDAP server names defined in the `ldap_servers` config section above. This parameter is mandatory and cannot be empty. +- `roles` — Section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. + - If no roles are specified here or assigned during role mapping (below), user will not be able to perform any actions after authentication. +- `role_mapping` — Section with LDAP search parameters and mapping rules. + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged-in user. For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. - - `base_dn` - template used to construct the base DN for the LDAP search. - - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` - substrings of the template with the actual user name and bind DN during each LDAP search. - - `scope` - scope of the LDAP search. + - `base_dn` — Template used to construct the base DN for the LDAP search. + - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` substrings of the template with the actual user name and bind DN during each LDAP search. + - `scope` — Scope of the LDAP search. - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). - - `search_filter` - template used to construct the search filter for the LDAP search. 
- - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` - substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. + - `search_filter` — Template used to construct the search filter for the LDAP search. + - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}` and `{base_dn}` substrings of the template with the actual user name, bind DN and base DN during each LDAP search. - Note, that the special characters must be escaped properly in XML. - - `attribute` - attribute name whose values will be returned by the LDAP search. - - `prefix` - prefix, that will be expected to be in front of each string in the original - list of strings returned by the LDAP search. Prefix will be removed from the original - strings and resulting strings will be treated as local role names. Empty, by default. + - `attribute` — Attribute name whose values will be returned by the LDAP search. + - `prefix` — Prefix, that will be expected to be in front of each string in the original list of strings returned by the LDAP search. The prefix will be removed from the original strings and the resulting strings will be treated as local role names. Empty by default. +[Original article](https://clickhouse.tech/docs/en/operations/external-authenticators/ldap/) diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index 2afeabc7956..74c9c6dd18f 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -5,74 +5,40 @@ toc_title: OpenTelemetry Support # [experimental] OpenTelemetry Support -[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting -traces and metrics from distributed application. ClickHouse has some support -for OpenTelemetry. +[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from the distributed application. ClickHouse has some support for OpenTelemetry. !!! warning "Warning" -This is an experimental feature that will change in backwards-incompatible ways in the future releases. - + This is an experimental feature that will change in backwards-incompatible ways in future releases. ## Supplying Trace Context to ClickHouse -ClickHouse accepts trace context HTTP headers, as described by -the [W3C recommendation](https://www.w3.org/TR/trace-context/). -It also accepts trace context over native protocol that is used for -communication between ClickHouse servers or between the client and server. -For manual testing, trace context headers conforming to the Trace Context -recommendation can be supplied to `clickhouse-client` using -`--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags. - -If no parent trace context is supplied, ClickHouse can start a new trace, with -probability controlled by the `opentelemetry_start_trace_probability` setting. +ClickHouse accepts trace context HTTP headers, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts trace context over a native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, trace context headers conforming to the Trace Context recommendation can be supplied to `clickhouse-client` using `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags. 
+If no parent trace context is supplied, ClickHouse can start a new trace, with probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting. ## Propagating the Trace Context The trace context is propagated to downstream services in the following cases: -* Queries to remote ClickHouse servers, such as when using `Distributed` table - engine. - -* `URL` table function. Trace context information is sent in HTTP headers. +* Queries to remote ClickHouse servers, such as when using [Distributed](../engines/table-engines/special/distributed.md) table engine. +* [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers. ## Tracing the ClickHouse Itself -ClickHouse creates _trace spans_ for each query and some of the query execution -stages, such as query planning or distributed queries. +ClickHouse creates `trace spans` for each query and some of the query execution stages, such as query planning or distributed queries. -To be useful, the tracing information has to be exported to a monitoring system -that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids -a dependency on a particular monitoring system, instead only providing the -tracing data through a system table. OpenTelemetry trace span information -[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) -is stored in the `system.opentelemetry_span_log` table. +To be useful, the tracing information has to be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse avoids a dependency on a particular monitoring system, instead only providing the tracing data through a system table. OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table. -The table must be enabled in the server configuration, see the `opentelemetry_span_log` -element in the default config file `config.xml`. It is enabled by default. +The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default. -The table has the following columns: - -- `trace_id` -- `span_id` -- `parent_span_id` -- `operation_name` -- `start_time` -- `finish_time` -- `finish_date` -- `attribute.name` -- `attribute.values` - -The tags or attributes are saved as two parallel arrays, containing the keys -and values. Use `ARRAY JOIN` to work with them. +The tags or attributes are saved as two parallel arrays, containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them. ## Integration with monitoring systems -At the moment, there is no ready tool that can export the tracing data from -ClickHouse to a monitoring system. +At the moment, there is no ready tool that can export the tracing data from ClickHouse to a monitoring system. -For testing, it is possible to setup the export using a materialized view with the URL engine over the `system.opentelemetry_span_log` table, which would push the arriving log data to an HTTP endpoint of a trace collector. 
For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format: +For testing, it is possible to setup the export using a materialized view with the [URL](../engines/table-engines/special/url.md) engine over the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format: ```sql CREATE MATERIALIZED VIEW default.zipkin_spans @@ -94,3 +60,5 @@ FROM system.opentelemetry_span_log ``` In case of any errors, the part of the log data for which the error has occurred will be silently lost. Check the server log for error messages if the data does not arrive. + +[Original article](https://clickhouse.tech/docs/en/operations/opentelemetry/) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 89fcbafe663..f86e9668f00 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -100,6 +100,11 @@ Default value: `1073741824` (1 GB). 1073741824 ``` +## database_atomic_delay_before_drop_table_sec {#database_atomic_delay_before_drop_table_sec} + +Sets the delay before remove table data in seconds. If the query has `SYNC` modifier, this setting is ignored. + +Default value: `480` (8 minute). ## default_database {#default-database} @@ -125,6 +130,25 @@ Settings profiles are located in the file specified in the parameter `user_confi default ``` +## default_replica_path {#default_replica_path} + +The path to the table in ZooKeeper. + +**Example** + +``` xml +/clickhouse/tables/{uuid}/{shard} +``` +## default_replica_name {#default_replica_name} + + The replica name in ZooKeeper. + +**Example** + +``` xml +{replica} +``` + ## dictionaries_config {#server_configuration_parameters-dictionaries_config} The path to the config file for external dictionaries. @@ -321,7 +345,8 @@ Similar to `interserver_http_host`, except that this hostname can be used by oth The username and password used to authenticate during [replication](../../engines/table-engines/mergetree-family/replication.md) with the Replicated\* engines. These credentials are used only for communication between replicas and are unrelated to credentials for ClickHouse clients. The server is checking these credentials for connecting replicas and use the same credentials when connecting to other replicas. So, these credentials should be set the same for all replicas in a cluster. By default, the authentication is not used. -**Note:** These credentials are common for replication through `HTTP` and `HTTPS`. +!!! note "Note" + These credentials are common for replication through `HTTP` and `HTTPS`. This section contains the following parameters: @@ -502,7 +527,15 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed requests. +The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! 
info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — Disabled. **Example** @@ -530,6 +563,21 @@ Default value: `0` that means no limit. - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting. + +Possible values: + +- Positive integer. +- 0 — Disabled. + +**Example** + +``` xml +10 +``` + ## max_connections {#max-connections} The maximum number of inbound connections. diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 77b68715ba9..b2470207dcc 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -56,6 +56,26 @@ Default value: 150. ClickHouse artificially executes `INSERT` longer (adds ‘sleep’) so that the background merge process can merge parts faster than they are added. +## inactive_parts_to_throw_insert {#inactive-parts-to-throw-insert} + +If the number of inactive parts in a single partition more than the `inactive_parts_to_throw_insert` value, `INSERT` is interrupted with the "Too many inactive parts (N). Parts cleaning are processing significantly slower than inserts" exception. + +Possible values: + +- Any positive integer. + +Default value: 0 (unlimited). + +## inactive_parts_to_delay_insert {#inactive-parts-to-delay-insert} + +If the number of inactive parts in a single partition in the table at least that many the `inactive_parts_to_delay_insert` value, an `INSERT` artificially slows down. It is useful when a server fails to clean up parts quickly enough. + +Possible values: + +- Any positive integer. + +Default value: 0 (unlimited). + ## max_delay_to_insert {#max-delay-to-insert} The value in seconds, which is used to calculate the `INSERT` delay, if the number of active parts in a single partition exceeds the [parts_to_delay_insert](#parts-to-delay-insert) value. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 6440f09bb40..6a1b25982a0 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -769,6 +769,38 @@ Example: log_query_threads=1 ``` +## log_comment {#settings-log-comment} + +Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log. + +It can be used to improve the readability of server logs. Additionally, it helps to select queries related to the test from the `system.query_log` after running [clickhouse-test](../../development/tests.md). + +Possible values: + +- Any string no longer than [max_query_size](#settings-max_query_size). If length is exceeded, the server throws an exception. + +Default value: empty string. 
+ +**Example** + +Query: + +``` sql +SET log_comment = 'log_comment test', log_queries = 1; +SELECT 1; +SYSTEM FLUSH LOGS; +SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; +``` + +Result: + +``` text +┌─type────────┬─query─────┐ +│ QueryStart │ SELECT 1; │ +│ QueryFinish │ SELECT 1; │ +└─────────────┴───────────┘ +``` + ## max_insert_block_size {#settings-max_insert_block_size} The size of blocks (in a count of rows) to form for insertion into a table. @@ -1097,14 +1129,25 @@ See the section “WITH TOTALS modifier”. ## max_parallel_replicas {#settings-max_parallel_replicas} -The maximum number of replicas for each shard when executing a query. In limited circumstances, this can make a query faster by executing it on more servers. This setting is only useful for replicated tables with a sampling key. There are cases where performance will not improve or even worsen: +The maximum number of replicas for each shard when executing a query. -- the position of the sampling key in the partitioning key's order doesn't allow efficient range scans -- adding a sampling key to the table makes filtering by other columns less efficient -- the sampling key is an expression that is expensive to calculate -- the cluster's latency distribution has a long tail, so that querying more servers increases the query's overall latency +Possible values: -In addition, this setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain conditions. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details. +- Positive integer. + +Default value: `1`. + +**Additional Info** + +This setting is useful for replicated tables with a sampling key. A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases: + +- The position of the sampling key in the partitioning key doesn't allow efficient range scans. +- Adding a sampling key to the table makes filtering by other columns less efficient. +- The sampling key is an expression that is expensive to calculate. +- The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency. + +!!! warning "Warning" + This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries) for more details. ## compile {#compile} @@ -1503,6 +1546,14 @@ FORMAT PrettyCompactMonoBlock Default value: 0 +## optimize_skip_unused_shards_limit {#optimize-skip-unused-shards-limit} + +Limit for number of sharding key values, turns off `optimize_skip_unused_shards` if the limit is reached. + +Too many values may require significant amount for processing, while the benefit is doubtful, since if you have huge number of values in `IN (...)`, then most likely the query will be sent to all shards anyway. + +Default value: 1000 + ## optimize_skip_unused_shards {#optimize-skip-unused-shards} Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by sharding key, otherwise does nothing). 
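As a minimal sketch of how the two settings above can work together (assuming a `Distributed` table `dist_hits` sharded by `user_id`; both names are hypothetical and not taken from the documentation), a query might pin the sharding key with a short `IN` list:

``` sql
-- Hypothetical Distributed table `dist_hits` sharded by `user_id`.
-- With a short IN list on the sharding key, only the matching shards are queried.
-- If the number of values exceeded optimize_skip_unused_shards_limit,
-- shard skipping would be turned off and the query would be sent to all shards.
SELECT count()
FROM dist_hits
WHERE user_id IN (1, 2, 3)
SETTINGS optimize_skip_unused_shards = 1, optimize_skip_unused_shards_limit = 1000;
```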
@@ -1514,6 +1565,17 @@ Possible values: Default value: 0 +## optimize_skip_unused_shards_rewrite_in {#optimize-skip-unused-shardslrewrite-in} + +Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1 (since it requires `optimize_skip_unused_shards` anyway, which `0` by default) + ## allow_nondeterministic_optimize_skip_unused_shards {#allow-nondeterministic-optimize-skip-unused-shards} Allow nondeterministic (like `rand` or `dictGet`, since later has some caveats with updates) functions in sharding key. @@ -1863,7 +1925,7 @@ Default value: `0`. Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key. -By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. Possible values: @@ -1872,6 +1934,53 @@ Possible values: Default value: `0`. +## insert_shard_id {#insert_shard_id} + +If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md#distributed) table into which the data will be inserted synchronously. + +If `insert_shard_id` value is incorrect, the server will throw an exception. + +To get the number of shards on `requested_cluster`, you can check server config or use this query: + +``` sql +SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; +``` + +Possible values: + +- 0 — Disabled. +- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md#distributed) table. + +Default value: `0`. + +**Example** + +Query: + +```sql +CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; +CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); +INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; +SELECT * FROM x_dist ORDER BY number ASC; +``` + +Result: + +``` text +┌─number─┐ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +│ 2 │ +│ 2 │ +│ 3 │ +│ 3 │ +│ 4 │ +│ 4 │ +└────────┘ +``` + ## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names} Uses compact format for storing blocks for async (`insert_distributed_sync`) INSERT into tables with `Distributed` engine. @@ -1956,8 +2065,8 @@ Default value: 16. **See Also** -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine -- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine +- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine. +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine. 
## validate_polygons {#validate_polygons} @@ -2658,8 +2767,6 @@ Result: Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour. -[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) - ## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} Allows to select data from a file engine table without file. @@ -2672,10 +2779,91 @@ Default value: `0`. ## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} -Enables or disables truncate before insert in file engine tables. +Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` replaces existing content of the file with the new data. Default value: `0`. + +## allow_experimental_geo_types {#allow-experimental-geo-types} + +Allows working with experimental [geo data types](../../sql-reference/data-types/geo.md). + +Possible values: + +- 0 — Working with geo data types is disabled. +- 1 — Working with geo data types is enabled. + +Default value: `0`. + +## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} + +Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. + +Possible values: + +- 0 — Queries will be executed with delay. +- 1 — Queries will be executed without delay. + +Default value: `0`. + +## show_table_uuid_in_table_create_query_if_not_nil {#show_table_uuid_in_table_create_query_if_not_nil} + +Sets the `SHOW TABLE` query display. + +Possible values: + +- 0 — The query will be displayed without table UUID. +- 1 — The query will be displayed with table UUID. + +Default value: `0`. + +## allow_experimental_live_view {#allow-experimental-live-view} + +Allows creation of experimental [live views](../../sql-reference/statements/create/view.md#live-view). + +Possible values: + +- 0 — Working with live views is disabled. +- 1 — Working with live views is enabled. + +Default value: `0`. + +## live_view_heartbeat_interval {#live-view-heartbeat-interval} + +Sets the heartbeat interval in seconds to indicate [live view](../../sql-reference/statements/create/view.md#live-view) is alive . + +Default value: `15`. + +## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} + +Sets the maximum number of inserted blocks after which mergeable blocks are dropped and query for [live view](../../sql-reference/statements/create/view.md#live-view) is re-executed. + +Default value: `64`. + +## temporary_live_view_timeout {#temporary-live-view-timeout} + +Sets the interval in seconds after which [live view](../../sql-reference/statements/create/view.md#live-view) with timeout is deleted. + +Default value: `5`. + +## periodic_live_view_refresh {#periodic-live-view-refresh} + +Sets the interval in seconds after which periodically refreshed [live view](../../sql-reference/statements/create/view.md#live-view) is forced to refresh. + +Default value: `60`. + +## check_query_single_value_result {#check_query_single_value_result} + +Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md#checking-mergetree-tables) query result for `MergeTree` family engines . 
+ +Possible values: + +- 0 — the query shows a check status for every individual data part of a table. +- 1 — the query shows the general table check status. + +Default value: `0`. + +[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index 92a6315d06b..9160dca9a1a 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -4,7 +4,9 @@ Contains information about columns in all the tables. You can use this table to get information similar to the [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) query, but for multiple tables at once. -The `system.columns` table contains the following columns (the column type is shown in brackets): +Columns from [temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.columns` only in those session where they have been created. They are shown with the empty `database` field. + +Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. - `table` ([String](../../sql-reference/data-types/string.md)) — Table name. @@ -26,7 +28,7 @@ The `system.columns` table contains the following columns (the column type is sh **Example** ```sql -:) select * from system.columns LIMIT 2 FORMAT Vertical; +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; ``` ```text @@ -65,8 +67,6 @@ is_in_sorting_key: 0 is_in_primary_key: 0 is_in_sampling_key: 0 compression_codec: - -2 rows in set. Elapsed: 0.002 sec. ``` [Original article](https://clickhouse.tech/docs/en/operations/system_tables/columns) diff --git a/docs/en/operations/system-tables/data_type_families.md b/docs/en/operations/system-tables/data_type_families.md index ddda91ed151..4e439f13aa5 100644 --- a/docs/en/operations/system-tables/data_type_families.md +++ b/docs/en/operations/system-tables/data_type_families.md @@ -1,6 +1,6 @@ # system.data_type_families {#system_tables-data_type_families} -Contains information about supported [data types](../../sql-reference/data-types/). +Contains information about supported [data types](../../sql-reference/data-types/index.md). Columns: diff --git a/docs/en/operations/system-tables/distributed_ddl_queue.md b/docs/en/operations/system-tables/distributed_ddl_queue.md index c252458af8a..fa871d215b5 100644 --- a/docs/en/operations/system-tables/distributed_ddl_queue.md +++ b/docs/en/operations/system-tables/distributed_ddl_queue.md @@ -14,7 +14,7 @@ Columns: - `initiator` ([String](../../sql-reference/data-types/string.md)) — Node that executed the query. - `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query start time. - `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query finish time. -- `query_duration_ms` ([UInt64](../../sql-reference/data-types/datetime64.md)) — Duration of query execution (in milliseconds). +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration of query execution (in milliseconds). - `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ZooKeeper](../../operations/tips.md#zookeeper). 
**Example** diff --git a/docs/en/operations/system-tables/errors.md b/docs/en/operations/system-tables/errors.md index ec874efd711..583cce88ca4 100644 --- a/docs/en/operations/system-tables/errors.md +++ b/docs/en/operations/system-tables/errors.md @@ -7,11 +7,15 @@ Columns: - `name` ([String](../../sql-reference/data-types/string.md)) — name of the error (`errorCodeToName`). - `code` ([Int32](../../sql-reference/data-types/int-uint.md)) — code number of the error. - `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened. +- `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened. +- `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error. +- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored. +- `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query). **Example** ``` sql -SELECT * +SELECT name, code, value FROM system.errors WHERE value > 0 ORDER BY code ASC @@ -21,3 +25,12 @@ LIMIT 1 │ CANNOT_OPEN_FILE │ 76 │ 1 │ └──────────────────┴──────┴───────┘ ``` + +``` sql +WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all +SELECT name, arrayStringConcat(all, '\n') AS res +FROM system.errors +LIMIT 1 +SETTINGS allow_introspection_functions=1\G +``` + diff --git a/docs/en/operations/system-tables/index.md b/docs/en/operations/system-tables/index.md index 5dc23aee686..e66f082167e 100644 --- a/docs/en/operations/system-tables/index.md +++ b/docs/en/operations/system-tables/index.md @@ -20,7 +20,7 @@ System tables: Most of system tables store their data in RAM. A ClickHouse server creates such system tables at the start. -Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), crash_log and [text_log](../../operations/system-tables/text_log.md) are served by [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a storage filesystem by default. If you remove a table from a filesystem, the ClickHouse server creates the empty one again at the time of the next data writing. If system table schema changed in a new release, then ClickHouse renames the current table and creates a new one. +Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) and [text_log](../../operations/system-tables/text_log.md) are served by [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a filesystem by default. If you remove a table from a filesystem, the ClickHouse server creates the empty one again at the time of the next data writing. 
If system table schema changed in a new release, then ClickHouse renames the current table and creates a new one. System log tables can be customized by creating a config file with the same name as the table under `/etc/clickhouse-server/config.d/`, or setting corresponding elements in `/etc/clickhouse-server/config.xml`. Elements can be customized are: @@ -33,7 +33,7 @@ System log tables can be customized by creating a config file with the same name An example: -``` +```xml system diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md index 579fdaefb0a..3f9110349dd 100644 --- a/docs/en/operations/system-tables/part_log.md +++ b/docs/en/operations/system-tables/part_log.md @@ -17,7 +17,6 @@ The `system.part_log` table contains the following columns: - `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. - `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision. - - `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration. - `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in. - `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in. diff --git a/docs/en/operations/system-tables/query_log.md b/docs/en/operations/system-tables/query_log.md index 32b2bdf2133..6cf87ee1f17 100644 --- a/docs/en/operations/system-tables/query_log.md +++ b/docs/en/operations/system-tables/query_log.md @@ -44,9 +44,15 @@ Columns: - `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of rows in a result of the `SELECT` query, or a number of rows in the `INSERT` query. - `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — RAM volume in bytes used to store a query result. - `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Memory consumption by the query. +- `current_database` ([String](../../sql-reference/data-types/string.md)) — Name of the current database. - `query` ([String](../../sql-reference/data-types/string.md)) — Query string. -- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message. +- `normalized_query_hash` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Identical hash value without the values of literals for similar queries. +- `query_kind` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Type of the query. +- `databases` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the databases present in the query. +- `tables` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the tables present in the query. +- `columns` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the columns present in the query. - `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — Code of an exception. +- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message. - `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [Stack trace](https://en.wikipedia.org/wiki/Stack_trace). 
An empty string, if the query was completed successfully. - `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Query type. Possible values: - 1 — Query was initiated by the client. @@ -73,69 +79,98 @@ Columns: - 0 — The query was launched from the TCP interface. - 1 — `GET` method was used. - 2 — `POST` method was used. -- `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — The `UserAgent` header passed in the HTTP request. -- `quota_key` ([String](../../sql-reference/data-types/string.md)) — The “quota key” specified in the [quotas](../../operations/quotas.md) setting (see `keyed`). +- `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — HTTP header `UserAgent` passed in the HTTP query. +- `http_referer` ([String](../../sql-reference/data-types/string.md)) — HTTP header `Referer` passed in the HTTP query (contains an absolute or partial address of the page making the query). +- `forwarded_for` ([String](../../sql-reference/data-types/string.md)) — HTTP header `X-Forwarded-For` passed in the HTTP query. +- `quota_key` ([String](../../sql-reference/data-types/string.md)) — The `quota key` specified in the [quotas](../../operations/quotas.md) setting (see `keyed`). - `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse revision. -- `thread_numbers` ([Array(UInt32)](../../sql-reference/data-types/array.md)) — Number of threads that are participating in query execution. +- `log_comment` ([String](../../sql-reference/data-types/string.md)) — Log comment. It can be set to arbitrary string no longer than [max_query_size](../../operations/settings/settings.md#settings-max_query_size). An empty string if it is not defined. +- `thread_ids` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Thread ids that are participating in query execution. - `ProfileEvents.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — Counters that measure different metrics. The description of them could be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events) - `ProfileEvents.Values` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Values of metrics that are listed in the `ProfileEvents.Names` column. - `Settings.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. - `Settings.Values` ([Array(String)](../../sql-reference/data-types/array.md)) — Values of settings that are listed in the `Settings.Names` column. +- `used_aggregate_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `aggregate functions`, which were used during query execution. +- `used_aggregate_function_combinators` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `aggregate functions combinators`, which were used during query execution. +- `used_database_engines` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `database engines`, which were used during query execution. +- `used_data_type_families` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `data type families`, which were used during query execution. +- `used_dictionaries` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `dictionaries`, which were used during query execution. 
+- `used_formats` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `formats`, which were used during query execution. +- `used_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `functions`, which were used during query execution. +- `used_storages` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `storages`, which were used during query execution. +- `used_table_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `table functions`, which were used during query execution. **Example** ``` sql -SELECT * FROM system.query_log LIMIT 1 \G +SELECT * FROM system.query_log WHERE type = 'QueryFinish' AND (query LIKE '%toDate(\'2000-12-05\')%') ORDER BY query_start_time DESC LIMIT 1 FORMAT Vertical; ``` ``` text Row 1: ────── -type: QueryStart -event_date: 2020-09-11 -event_time: 2020-09-11 10:08:17 -event_time_microseconds: 2020-09-11 10:08:17.063321 -query_start_time: 2020-09-11 10:08:17 -query_start_time_microseconds: 2020-09-11 10:08:17.063321 -query_duration_ms: 0 -read_rows: 0 -read_bytes: 0 -written_rows: 0 -written_bytes: 0 -result_rows: 0 -result_bytes: 0 -memory_usage: 0 -current_database: default -query: INSERT INTO test1 VALUES -exception_code: 0 -exception: -stack_trace: -is_initial_query: 1 -user: default -query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef -address: ::ffff:127.0.0.1 -port: 33452 -initial_user: default -initial_query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef -initial_address: ::ffff:127.0.0.1 -initial_port: 33452 -interface: 1 -os_user: bharatnc -client_hostname: tower -client_name: ClickHouse -client_revision: 54437 -client_version_major: 20 -client_version_minor: 7 -client_version_patch: 2 -http_method: 0 -http_user_agent: -quota_key: -revision: 54440 -thread_ids: [] -ProfileEvents.Names: [] -ProfileEvents.Values: [] -Settings.Names: ['use_uncompressed_cache','load_balancing','log_queries','max_memory_usage','allow_introspection_functions'] -Settings.Values: ['0','random','1','10000000000','1'] +type: QueryFinish +event_date: 2021-03-18 +event_time: 2021-03-18 20:54:18 +event_time_microseconds: 2021-03-18 20:54:18.676686 +query_start_time: 2021-03-18 20:54:18 +query_start_time_microseconds: 2021-03-18 20:54:18.673934 +query_duration_ms: 2 +read_rows: 100 +read_bytes: 800 +written_rows: 0 +written_bytes: 0 +result_rows: 2 +result_bytes: 4858 +memory_usage: 0 +current_database: default +query: SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), CAST(arrayJoin([NULL, NULL]) AS Nullable(TEXT)), avgOrDefaultIf(number, number % 2), sumOrNull(number), toTypeName(sumOrNull(number)), countIf(toDate('2000-12-05') + number as d, toDayOfYear(d) % 2) FROM numbers(100) +normalized_query_hash: 17858008518552525706 +query_kind: Select +databases: ['_table_function'] +tables: ['_table_function.numbers'] +columns: ['_table_function.numbers.number'] +exception_code: 0 +exception: +stack_trace: +is_initial_query: 1 +user: default +query_id: 58f3d392-0fa0-4663-ae1d-29917a1a9c9c +address: ::ffff:127.0.0.1 +port: 37486 +initial_user: default +initial_query_id: 58f3d392-0fa0-4663-ae1d-29917a1a9c9c +initial_address: ::ffff:127.0.0.1 +initial_port: 37486 +interface: 1 +os_user: sevirov +client_hostname: clickhouse.ru-central1.internal +client_name: ClickHouse +client_revision: 54447 +client_version_major: 21 +client_version_minor: 4 +client_version_patch: 
1 +http_method: 0 +http_user_agent: +http_referer: +forwarded_for: +quota_key: +revision: 54449 +log_comment: +thread_ids: [587,11939] +ProfileEvents.Names: ['Query','SelectQuery','ReadCompressedBytes','CompressedReadBufferBlocks','CompressedReadBufferBytes','IOBufferAllocs','IOBufferAllocBytes','ArenaAllocChunks','ArenaAllocBytes','FunctionExecute','TableFunctionExecute','NetworkSendElapsedMicroseconds','SelectedRows','SelectedBytes','ContextLock','RWLockAcquiredReadLocks','RealTimeMicroseconds','UserTimeMicroseconds','SystemTimeMicroseconds','SoftPageFaults','OSCPUVirtualTimeMicroseconds','OSWriteBytes'] +ProfileEvents.Values: [1,1,36,1,10,2,1048680,1,4096,36,1,110,100,800,77,1,3137,1476,1101,8,2577,8192] +Settings.Names: ['load_balancing','max_memory_usage'] +Settings.Values: ['random','10000000000'] +used_aggregate_functions: ['groupBitAnd','avg','sum','count','uniq'] +used_aggregate_function_combinators: ['OrDefault','If','OrNull','Array'] +used_database_engines: [] +used_data_type_families: ['String','Array','Int32','Nullable'] +used_dictionaries: [] +used_formats: [] +used_functions: ['toWeek','CAST','arrayFlatten','toTypeName','toDayOfYear','addDays','array','toDate','modulo','substring','plus'] +used_storages: [] +used_table_functions: ['numbers'] ``` **See Also** @@ -143,4 +178,3 @@ Settings.Values: ['0','random','1','10000000000','1'] - [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — This table contains information about each query execution thread. [Original article](https://clickhouse.tech/docs/en/operations/system_tables/query_log) - diff --git a/docs/en/operations/system-tables/quota_limits.md b/docs/en/operations/system-tables/quota_limits.md index c2dcb4db34d..11616990206 100644 --- a/docs/en/operations/system-tables/quota_limits.md +++ b/docs/en/operations/system-tables/quota_limits.md @@ -17,5 +17,3 @@ Columns: - `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions participated in queries. - `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of bytes read from all tables and table functions participated in queries. - `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum of the query execution time, in seconds. 
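As a usage sketch, the configured limits can be inspected directly from this table; the quota name `default` is only an assumption, and the output depends on the quotas defined on the server:

``` sql
SELECT quota_name, duration, max_queries, max_read_rows, max_execution_time
FROM system.quota_limits
WHERE quota_name = 'default';
```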
- -[Original article](https://clickhouse.tech/docs/en/operations/system_tables/quota_limits) diff --git a/docs/en/operations/system-tables/quota_usage.md b/docs/en/operations/system-tables/quota_usage.md index 17af9ad9a30..89fdfe70069 100644 --- a/docs/en/operations/system-tables/quota_usage.md +++ b/docs/en/operations/system-tables/quota_usage.md @@ -28,5 +28,3 @@ Columns: ## See Also {#see-also} - [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) - -[Original article](https://clickhouse.tech/docs/en/operations/system_tables/quota_usage) diff --git a/docs/en/operations/system-tables/quotas_usage.md b/docs/en/operations/system-tables/quotas_usage.md index 31aafd3e697..04cf91cb990 100644 --- a/docs/en/operations/system-tables/quotas_usage.md +++ b/docs/en/operations/system-tables/quotas_usage.md @@ -30,6 +30,4 @@ Columns: ## See Also {#see-also} -- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) - -[Original article](https://clickhouse.tech/docs/en/operations/system_tables/quotas_usage) +- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) \ No newline at end of file diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index aa379caa46c..f3e3a35f13b 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -14,7 +14,17 @@ Columns: - `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ZooKeeper. -- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue: `GET_PARTS`, `MERGE_PARTS`, `DETACH_PARTS`, `DROP_PARTS`, or `MUTATE_PARTS`. +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue, one of: + - `GET_PART` - Get the part from another replica. + - `ATTACH_PART` - Attach the part, possibly from our own replica (if found in `detached` folder). + You may think of it as a `GET_PART` with some optimisations as they're nearly identical. + - `MERGE_PARTS` - Merge the parts. + - `DROP_RANGE` - Delete the parts in the specified partition in the specified number range. + - `CLEAR_COLUMN` - NOTE: Deprecated. Drop specific column from specified partition. + - `CLEAR_INDEX` - NOTE: Deprecated. Drop specific index from specified partition. + - `REPLACE_RANGE` - Drop certain range of partitions and replace them by new ones + - `MUTATE_PART` - Apply one or several mutations to the part. + - `ALTER_METADATA` - Apply alter modification according to global /metadata and /columns paths - `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. 
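To make the task types listed above concrete, here is a sketch of a query that counts pending replication tasks per type (the actual types and counts depend on the cluster state):

``` sql
SELECT type, count() AS entries
FROM system.replication_queue
GROUP BY type
ORDER BY entries DESC;
```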
@@ -70,12 +80,12 @@ num_tries: 36 last_exception: Code: 226, e.displayText() = DB::Exception: Marks file '/opt/clickhouse/data/merge/visits_v2/tmp_fetch_20201130_121373_121384_2/CounterID.mrk' doesn't exist (version 20.8.7.15 (official build)) last_attempt_time: 2020-12-08 17:35:54 num_postponed: 0 -postpone_reason: +postpone_reason: last_postpone_time: 1970-01-01 03:00:00 ``` **See Also** -- [Managing ReplicatedMergeTree Tables](../../sql-reference/statements/system.md/#query-language-system-replicated) +- [Managing ReplicatedMergeTree Tables](../../sql-reference/statements/system.md#query-language-system-replicated) [Original article](https://clickhouse.tech/docs/en/operations/system_tables/replication_queue) diff --git a/docs/en/operations/system-tables/settings.md b/docs/en/operations/system-tables/settings.md index a1db0a3d558..7034fe1204f 100644 --- a/docs/en/operations/system-tables/settings.md +++ b/docs/en/operations/system-tables/settings.md @@ -48,5 +48,6 @@ SELECT * FROM system.settings WHERE changed AND name='load_balancing' - [Settings](../../operations/settings/index.md#session-settings-intro) - [Permissions for Queries](../../operations/settings/permissions-for-queries.md#settings_readonly) - [Constraints on Settings](../../operations/settings/constraints-on-settings.md) +- [SHOW SETTINGS](../../sql-reference/statements/show.md#show-settings) statement [Original article](https://clickhouse.tech/docs/en/operations/system_tables/settings) diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 6ad1425e032..ccc9ab94f8b 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -1,59 +1,65 @@ # system.tables {#system-tables} -Contains metadata of each table that the server knows about. Detached tables are not shown in `system.tables`. +Contains metadata of each table that the server knows about. -This table contains the following columns (the column type is shown in brackets): +[Detached](../../sql-reference/statements/detach.md) tables are not shown in `system.tables`. -- `database` (String) — The name of the database the table is in. +[Temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.tables` only in those session where they have been created. They are shown with the empty `database` field and with the `is_temporary` flag switched on. -- `name` (String) — Table name. +Columns: -- `engine` (String) — Table engine name (without parameters). +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. -- `is_temporary` (UInt8) - Flag that indicates whether the table is temporary. +- `name` ([String](../../sql-reference/data-types/string.md)) — Table name. -- `data_path` (String) - Path to the table data in the file system. +- `engine` ([String](../../sql-reference/data-types/string.md)) — Table engine name (without parameters). -- `metadata_path` (String) - Path to the table metadata in the file system. +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - Flag that indicates whether the table is temporary. -- `metadata_modification_time` (DateTime) - Time of latest modification of the table metadata. +- `data_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table data in the file system. -- `dependencies_database` (Array(String)) - Database dependencies. 
+- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table metadata in the file system. -- `dependencies_table` (Array(String)) - Table dependencies ([MaterializedView](../../engines/table-engines/special/materializedview.md) tables based on the current table). +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - Time of latest modification of the table metadata. -- `create_table_query` (String) - The query that was used to create the table. +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Database dependencies. -- `engine_full` (String) - Parameters of the table engine. +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Table dependencies ([MaterializedView](../../engines/table-engines/special/materializedview.md) tables based on the current table). -- `partition_key` (String) - The partition key expression specified in the table. +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - The query that was used to create the table. -- `sorting_key` (String) - The sorting key expression specified in the table. +- `engine_full` ([String](../../sql-reference/data-types/string.md)) - Parameters of the table engine. -- `primary_key` (String) - The primary key expression specified in the table. +- `partition_key` ([String](../../sql-reference/data-types/string.md)) - The partition key expression specified in the table. -- `sampling_key` (String) - The sampling key expression specified in the table. +- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - The sorting key expression specified in the table. -- `storage_policy` (String) - The storage policy: +- `primary_key` ([String](../../sql-reference/data-types/string.md)) - The primary key expression specified in the table. + +- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - The sampling key expression specified in the table. + +- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - The storage policy: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` (Nullable(UInt64)) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `Null` (including underying `Buffer` table). +- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `NULL` (including underying `Buffer` table). -- `total_bytes` (Nullable(UInt64)) - Total number of bytes, if it is possible to quickly determine exact number of bytes for the table on storage, otherwise `Null` (**does not** includes any underlying storage). +- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes, if it is possible to quickly determine exact number of bytes for the table on storage, otherwise `NULL` (does not includes any underlying storage). - If the table stores data on disk, returns used space on disk (i.e. compressed). - If the table stores data in memory, returns approximated number of used bytes in memory. 
-- `lifetime_rows` (Nullable(UInt64)) - Total number of rows INSERTed since server start (only for `Buffer` tables). +- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows INSERTed since server start (only for `Buffer` tables). -- `lifetime_bytes` (Nullable(UInt64)) - Total number of bytes INSERTed since server start (only for `Buffer` tables). +- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes INSERTed since server start (only for `Buffer` tables). The `system.tables` table is used in `SHOW TABLES` query implementation. +**Example** + ```sql -:) SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; ``` ```text @@ -100,8 +106,6 @@ sampling_key: storage_policy: total_rows: ᴺᵁᴸᴸ total_bytes: ᴺᵁᴸᴸ - -2 rows in set. Elapsed: 0.004 sec. ``` [Original article](https://clickhouse.tech/docs/en/operations/system_tables/tables) diff --git a/docs/en/operations/system-tables/trace_log.md b/docs/en/operations/system-tables/trace_log.md index 2903e0d3bd7..e4c01a65d9d 100644 --- a/docs/en/operations/system-tables/trace_log.md +++ b/docs/en/operations/system-tables/trace_log.md @@ -20,10 +20,12 @@ Columns: When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server. -- `timer_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Timer type: +- `trace_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Trace type: - - `Real` represents wall-clock time. - - `CPU` represents CPU time. + - `Real` represents collecting stack traces by wall-clock time. + - `CPU` represents collecting stack traces by CPU time. + - `Memory` represents collecting allocations and deallocations when memory allocation exceeds the subsequent watermark. + - `MemorySample` represents collecting random allocations and deallocations. - `thread_number` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Thread identifier. @@ -52,4 +54,5 @@ trace: [371912858,371912789,371798468,371799717,371801313,3717 size: 5244400 ``` - [Original article](https://clickhouse.tech/docs/en/operations/system_tables/trace_log) \ No newline at end of file + [Original article](https://clickhouse.tech/docs/en/operations/system-tables/trace_log) + diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index e62dea0b04e..865fe58d7cd 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -191,8 +191,9 @@ dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo. Java version: ``` text -Java(TM) SE Runtime Environment (build 1.8.0_25-b17) -Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) +openjdk 11.0.5-shenandoah 2019-10-15 +OpenJDK Runtime Environment (build 11.0.5-shenandoah+10-adhoc.heretic.src) +OpenJDK 64-Bit Server VM (build 11.0.5-shenandoah+10-adhoc.heretic.src, mixed mode) ``` JVM parameters: @@ -204,7 +205,7 @@ ZOOCFGDIR=/etc/$NAME/conf # TODO this is really ugly # How to find out, which jars are needed? 
# seems, that log4j requires the log4j.properties file to be in the classpath -CLASSPATH="$ZOOCFGDIR:/usr/build/classes:/usr/build/lib/*.jar:/usr/share/zookeeper/zookeeper-3.5.1-metrika.jar:/usr/share/zookeeper/slf4j-log4j12-1.7.5.jar:/usr/share/zookeeper/slf4j-api-1.7.5.jar:/usr/share/zookeeper/servlet-api-2.5-20081211.jar:/usr/share/zookeeper/netty-3.7.0.Final.jar:/usr/share/zookeeper/log4j-1.2.16.jar:/usr/share/zookeeper/jline-2.11.jar:/usr/share/zookeeper/jetty-util-6.1.26.jar:/usr/share/zookeeper/jetty-6.1.26.jar:/usr/share/zookeeper/javacc.jar:/usr/share/zookeeper/jackson-mapper-asl-1.9.11.jar:/usr/share/zookeeper/jackson-core-asl-1.9.11.jar:/usr/share/zookeeper/commons-cli-1.2.jar:/usr/src/java/lib/*.jar:/usr/etc/zookeeper" +CLASSPATH="$ZOOCFGDIR:/usr/build/classes:/usr/build/lib/*.jar:/usr/share/zookeeper-3.6.2/lib/audience-annotations-0.5.0.jar:/usr/share/zookeeper-3.6.2/lib/commons-cli-1.2.jar:/usr/share/zookeeper-3.6.2/lib/commons-lang-2.6.jar:/usr/share/zookeeper-3.6.2/lib/jackson-annotations-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/jackson-core-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/jackson-databind-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/javax.servlet-api-3.1.0.jar:/usr/share/zookeeper-3.6.2/lib/jetty-http-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-io-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-security-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-server-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-servlet-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-util-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jline-2.14.6.jar:/usr/share/zookeeper-3.6.2/lib/json-simple-1.1.1.jar:/usr/share/zookeeper-3.6.2/lib/log4j-1.2.17.jar:/usr/share/zookeeper-3.6.2/lib/metrics-core-3.2.5.jar:/usr/share/zookeeper-3.6.2/lib/netty-buffer-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-codec-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-common-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-handler-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-resolver-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-native-epoll-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-native-unix-common-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_common-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_hotspot-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_servlet-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/slf4j-api-1.7.25.jar:/usr/share/zookeeper-3.6.2/lib/slf4j-log4j12-1.7.25.jar:/usr/share/zookeeper-3.6.2/lib/snappy-java-1.1.7.jar:/usr/share/zookeeper-3.6.2/lib/zookeeper-3.6.2.jar:/usr/share/zookeeper-3.6.2/lib/zookeeper-jute-3.6.2.jar:/usr/share/zookeeper-3.6.2/lib/zookeeper-prometheus-metrics-3.6.2.jar:/usr/share/zookeeper-3.6.2/etc" ZOOCFG="$ZOOCFGDIR/zoo.cfg" ZOO_LOG_DIR=/var/log/$NAME @@ -213,27 +214,17 @@ GROUP=zookeeper PIDDIR=/var/run/$NAME PIDFILE=$PIDDIR/$NAME.pid SCRIPTNAME=/etc/init.d/$NAME -JAVA=/usr/bin/java +JAVA=/usr/local/jdk-11/bin/java ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" ZOO_LOG4J_PROP="INFO,ROLLINGFILE" JMXLOCALONLY=false JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ - -Xloggc:/var/log/$NAME/zookeeper-gc.log \ - -XX:+UseGCLogFileRotation \ - -XX:NumberOfGCLogFiles=16 \ - -XX:GCLogFileSize=16M \ + 
-Xlog:safepoint,gc*=info,age*=debug:file=/var/log/$NAME/zookeeper-gc.log:time,level,tags:filecount=16,filesize=16M -verbose:gc \ - -XX:+PrintGCTimeStamps \ - -XX:+PrintGCDateStamps \ - -XX:+PrintGCDetails - -XX:+PrintTenuringDistribution \ - -XX:+PrintGCApplicationStoppedTime \ - -XX:+PrintGCApplicationConcurrentTime \ - -XX:+PrintSafepointStatistics \ - -XX:+UseParNewGC \ - -XX:+UseConcMarkSweepGC \ --XX:+CMSParallelRemarkEnabled" + -XX:+UseG1GC \ + -Djute.maxbuffer=8388608 \ + -XX:MaxGCPauseMillis=50" ``` Salt init: diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md index 9fa9c44e130..dbcf9ae2b3e 100644 --- a/docs/en/operations/update.md +++ b/docs/en/operations/update.md @@ -15,7 +15,8 @@ $ sudo service clickhouse-server restart If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method. -ClickHouse does not support a distributed update. The operation should be performed consecutively on each separate server. Do not update all the servers on a cluster simultaneously, or the cluster will be unavailable for some time. +!!! note "Note" + You can update multiple servers at once as soon as there is no moment when all replicas of one shard are offline. The upgrade of older version of ClickHouse to specific version: @@ -28,7 +29,3 @@ $ sudo apt-get update $ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b $ sudo service clickhouse-server restart ``` - - - - diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index 04f9f3660b5..cfabf42bff1 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -91,6 +91,8 @@ $ clickhouse-local --query " Now let’s output memory user for each Unix user: +Query: + ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ | clickhouse-local --structure "user String, mem Float64" \ @@ -98,6 +100,8 @@ $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` +Result: + ``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index 015c90e90c7..259202805d3 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -27,7 +27,37 @@ Example 2: `uniqArray(arr)` – Counts the number of unique elements in all ‘a ## -SimpleState {#agg-functions-combinator-simplestate} -If you apply this combinator, the aggregate function returns the same value but with a different type. This is an `SimpleAggregateFunction(...)` that can be stored in a table to work with [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engines. +If you apply this combinator, the aggregate function returns the same value but with a different type. This is a [SimpleAggregateFunction(...)](../../sql-reference/data-types/simpleaggregatefunction.md) that can be stored in a table to work with [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) tables. + +**Syntax** + +``` sql +SimpleState(x) +``` + +**Arguments** + +- `x` — Aggregate function parameters. 
+ +**Returned values** + +The value of an aggregate function with the `SimpleAggregateFunction(...)` type. + +**Example** + +Query: + +``` sql +WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); +``` + +Result: + +``` text +┌─toTypeName(c)────────────────────────┬─c─┐ +│ SimpleAggregateFunction(any, UInt64) │ 0 │ +└──────────────────────────────────────┴───┘ +``` ## -State {#agg-functions-combinator-state} @@ -249,5 +279,3 @@ FROM people └────────┴───────────────────────────┘ ``` - -[Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/combinators/) diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 543a5d3fed8..d2b46f6de53 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -59,4 +59,3 @@ SELECT groupArray(y) FROM t_null_big `groupArray` does not include `NULL` in the resulting array. -[Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/) diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 035bc91b9ed..b9d504241db 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -243,7 +243,7 @@ The function works according to the algorithm: **Syntax** ``` sql -windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) +windowFunnel(window, [mode, [mode, ... ]])(timestamp, cond1, cond2, ..., condN) ``` **Arguments** @@ -253,9 +253,11 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) **Parameters** -- `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`. -- `mode` - It is an optional parameter. - - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values. +- `window` — Length of the sliding window, it is the time interval between first condition and last condition. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond1 <= timestamp of cond2 <= ... <= timestamp of condN <= timestamp of cond1 + window`. +- `mode` — It is an optional argument. One or more modes can be set. + - `'strict'` — If same condition holds for sequence of events then such non-unique events would be skipped. + - `'strict_order'` — Don't allow interventions of other events. E.g. in the case of `A->B->D->C`, it stops finding `A->B->C` at the `D` and the max event level is 2. + - `'strict_increase'` — Apply conditions only to events with strictly increasing timestamps. **Returned value** @@ -336,14 +338,14 @@ retention(cond1, cond2, ..., cond32); **Arguments** -- `cond` — an expression that returns a `UInt8` result (1 or 0). +- `cond` — An expression that returns a `UInt8` result (1 or 0). **Returned value** The array of 1 or 0. -- 1 — condition was met for the event. -- 0 — condition wasn’t met for the event. +- 1 — Condition was met for the event. +- 0 — Condition wasn’t met for the event. Type: `UInt8`. 
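As an illustration of the `windowFunnel` modes described above, a minimal sketch; the table `funnel_events` and its columns are hypothetical:

``` sql
SELECT
    user_id,
    windowFunnel(3600, 'strict_order')(
        event_time,
        event = 'page_view',
        event = 'add_to_cart',
        event = 'purchase'
    ) AS level
FROM funnel_events
GROUP BY user_id;
```

Here `level` is the length of the longest consecutive prefix of the conditions matched within the 3600-second window, and `'strict_order'` stops the chain as soon as an unrelated event intervenes.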
@@ -500,7 +502,6 @@ Problem: Generate a report that shows only keywords that produced at least 5 uni Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` -[Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/parametric_functions/) ## sumMapFiltered(keys_to_keep)(keys, values) {#summapfilteredkeys-to-keepkeys-values} diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 7639117042f..72aa607a751 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -52,15 +52,15 @@ Input table: Query: ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary; +SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary; ``` Result: ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ -│ director │ ('director',5000) │ -└──────────────────────┴─────────────────────────────┘ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ ('director',5000) │ +└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘ ``` [Original article](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/argmax/) diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md index 12dc4ac1e9d..cbd409ccab6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md @@ -9,31 +9,24 @@ Calculates the arithmetic mean. **Syntax** ``` sql -avgWeighted(x) +avg(x) ``` **Arguments** -- `x` — Values. - -`x` must be -[Integer](../../../sql-reference/data-types/int-uint.md), -[floating-point](../../../sql-reference/data-types/float.md), or -[Decimal](../../../sql-reference/data-types/decimal.md). +- `x` — input values, must be [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md). **Returned value** -- `NaN` if the supplied parameter is empty. -- Mean otherwise. - -**Return type** is always [Float64](../../../sql-reference/data-types/float.md). +- The arithmetic mean, always as [Float64](../../../sql-reference/data-types/float.md). +- `NaN` if the input parameter `x` is empty. **Example** Query: ``` sql -SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5) +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); ``` Result: @@ -46,11 +39,20 @@ Result: **Example** +Create a temp table: + Query: ``` sql CREATE table test (t UInt8) ENGINE = Memory; -SELECT avg(t) FROM test +``` + +Get the arithmetic mean: + +Query: + +``` +SELECT avg(t) FROM test; ``` Result: @@ -60,3 +62,5 @@ Result: │ nan │ └────────┘ ``` + +[Original article](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/avg/) diff --git a/docs/en/sql-reference/aggregate-functions/reference/count.md b/docs/en/sql-reference/aggregate-functions/reference/count.md index 0a5aef2fe97..48c6f3f8c05 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/count.md +++ b/docs/en/sql-reference/aggregate-functions/reference/count.md @@ -7,8 +7,9 @@ toc_priority: 1 Counts the number of rows or not-NULL values. 
ClickHouse supports the following syntaxes for `count`: -- `count(expr)` or `COUNT(DISTINCT expr)`. -- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific. + +- `count(expr)` or `COUNT(DISTINCT expr)`. +- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific. **Arguments** diff --git a/docs/en/sql-reference/aggregate-functions/reference/deltasum.md b/docs/en/sql-reference/aggregate-functions/reference/deltasum.md index bb6f802ccaf..e0c74576bb6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/deltasum.md +++ b/docs/en/sql-reference/aggregate-functions/reference/deltasum.md @@ -4,16 +4,66 @@ toc_priority: 141 # deltaSum {#agg_functions-deltasum} -Syntax: `deltaSum(value)` +Sums the arithmetic difference between consecutive rows. If the difference is negative, it is ignored. -Adds the differences between consecutive rows. If the difference is negative, it is ignored. -`value` must be some integer or floating point type. +**Syntax** -Example: - -```sql -select deltaSum(arrayJoin([1, 2, 3])); -- => 2 -select deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); -- => 7 -select deltaSum(arrayJoin([2.25, 3, 4.5])); -- => 2.25 +``` sql +deltaSum(value) ``` +**Arguments** + +- `value` — Input values, must be [Integer](../../data-types/int-uint.md) or [Float](../../data-types/float.md) type. + +**Returned value** + +- A gained arithmetic difference of the `Integer` or `Float` type. + +**Examples** + +Query: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3]))─┐ +│ 2 │ +└────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3]))─┐ +│ 7 │ +└───────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([2.25, 3, 4.5]))─┐ +│ 2.25 │ +└─────────────────────────────────────┘ +``` + +## See Also {#see-also} + +- [runningDifference](../../functions/other-functions.md#other_functions-runningdifference) diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index 68456bf7844..d29550b007e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -9,7 +9,7 @@ Inserts a value into the array at the specified position. **Syntax** ``` sql -groupArrayInsertAt(default_x, size)(x, pos); +groupArrayInsertAt(default_x, size)(x, pos) ``` If in one query several values are inserted into the same position, the function behaves in the following ways: @@ -21,8 +21,8 @@ If in one query several values are inserted into the same position, the function - `x` — Value to be inserted. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../../sql-reference/data-types/index.md). - `pos` — Position at which the specified element `x` is to be inserted. Index numbering in the array starts from zero. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). -- `default_x`— Default value for substituting in empty positions. Optional parameter. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in the data type configured for the `x` parameter. 
If `default_x` is not defined, the [default values](../../../sql-reference/statements/create/table.md#create-default-values) are used. -- `size`— Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` must be specified. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `default_x` — Default value for substituting in empty positions. Optional parameter. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in the data type configured for the `x` parameter. If `default_x` is not defined, the [default values](../../../sql-reference/statements/create/table.md#create-default-values) are used. +- `size` — Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` must be specified. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). **Returned value** diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md index a4d99fd29e3..d3f40f63f65 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md @@ -14,7 +14,7 @@ groupBitmapOr(expr) `expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type. -**Return value** +**Returned value** Value of the `UInt64` type. diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md index 834f088d02f..cbe01e08145 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md @@ -14,7 +14,7 @@ groupBitmapOr(expr) `expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type. -**Return value** +**Returned value** Value of the `UInt64` type. diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md index e427a9ad970..24077de0adc 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md @@ -14,7 +14,7 @@ groupBitOr(expr) `expr` – An expression that results in `UInt*` type. -**Return value** +**Returned value** Value of the `UInt*` type. diff --git a/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md index 313d6bf81f5..c8fb535089b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md +++ b/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md @@ -10,7 +10,7 @@ Use it for tests or to process columns of types `AggregateFunction` and `Aggrega **Syntax** ``` sql -initializeAggregation (aggregate_function, column_1, column_2); +initializeAggregation (aggregate_function, column_1, column_2) ``` **Arguments** diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md index db402c99663..c51c4b92e74 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md @@ -21,5 +21,5 @@ The kurtosis of the given distribution. 
Type — [Float64](../../../sql-referenc **Example** ``` sql -SELECT kurtPop(value) FROM series_with_value_column +SELECT kurtPop(value) FROM series_with_value_column; ``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md index 4bb9f76763b..0ee40138adc 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -23,5 +23,5 @@ The kurtosis of the given distribution. Type — [Float64](../../../sql-referenc **Example** ``` sql -SELECT kurtSamp(value) FROM series_with_value_column +SELECT kurtSamp(value) FROM series_with_value_column; ``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index dc5fc45b878..34e8188299c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -27,7 +27,7 @@ The null hypothesis is that two populations are stochastically equal. Also one-s - `'two-sided'`; - `'greater'`; - `'less'`. -- `continuity_correction` - if not 0 then continuity correction in the normal approximation for the p-value is applied. (Optional, default: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md). +- `continuity_correction` — if not 0 then continuity correction in the normal approximation for the p-value is applied. (Optional, default: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md index dcc665a68af..dd0d59978d1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -6,7 +6,7 @@ toc_priority: 207 Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. -The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. +Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md index b9dfc390f9d..f84f8897a35 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md @@ -21,5 +21,5 @@ The skewness of the given distribution. 
Type — [Float64](../../../sql-referenc **Example** ``` sql -SELECT skewPop(value) FROM series_with_value_column +SELECT skewPop(value) FROM series_with_value_column; ``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md index f7a6df8f507..48a049ca69d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md @@ -23,5 +23,5 @@ The skewness of the given distribution. Type — [Float64](../../../sql-referenc **Example** ``` sql -SELECT skewSamp(value) FROM series_with_value_column +SELECT skewSamp(value) FROM series_with_value_column; ``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md index a1d7ae33fe1..3398fc1ca8c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md @@ -18,8 +18,8 @@ The null hypothesis is that means of populations are equal. Normal distribution **Arguments** -- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). -- `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md). +- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — Sample index. [Integer](../../../sql-reference/data-types/int-uint.md). **Returned values** diff --git a/docs/en/sql-reference/aggregate-functions/reference/topk.md b/docs/en/sql-reference/aggregate-functions/reference/topk.md index b3e79803ba1..b9bea013ea8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topk.md @@ -18,13 +18,13 @@ We recommend using the `N < 10` value; performance is reduced with large `N` val **Arguments** -- ‘N’ is the number of elements to return. +- `N` – The number of elements to return. If the parameter is omitted, default value 10 is used. **Arguments** -- ’ x ’ – The value to calculate frequency. +- `x` – The value to calculate frequency. **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md index 02b9f77ea6f..8562336c829 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md @@ -18,7 +18,7 @@ topKWeighted(N)(x, weight) **Arguments** -- `x` – The value. +- `x` — The value. - `weight` — The weight. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md index 5b23ea81eae..4983220ed7f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -26,7 +26,7 @@ Function: - Uses the HyperLogLog algorithm to approximate the number of different argument values. - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. 
The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). + 2^12 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). - Provides the determinate result (it doesn’t depend on the query processing order). diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index b391fb1d979..02238de42ef 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -18,8 +18,8 @@ The null hypothesis is that means of populations are equal. Normal distribution **Arguments** -- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). -- `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md). +- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — Sample index. [Integer](../../../sql-reference/data-types/int-uint.md). **Returned values** diff --git a/docs/en/sql-reference/data-types/date.md b/docs/en/sql-reference/data-types/date.md index 886e93f433c..0cfac4d59fe 100644 --- a/docs/en/sql-reference/data-types/date.md +++ b/docs/en/sql-reference/data-types/date.md @@ -5,7 +5,7 @@ toc_title: Date # Date {#data_type-date} -A date. Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2106, but the final fully-supported year is 2105). +A date. Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2149, but the final fully-supported year is 2148). The date value is stored without the time zone. diff --git a/docs/en/sql-reference/data-types/datetime.md b/docs/en/sql-reference/data-types/datetime.md index 07a6e38a5fa..d95abe57510 100644 --- a/docs/en/sql-reference/data-types/datetime.md +++ b/docs/en/sql-reference/data-types/datetime.md @@ -19,15 +19,17 @@ Resolution: 1 second. ## Usage Remarks {#usage-remarks} -The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. 
Additionally, the `DateTime` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01’). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. -A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones). -The `tzdata` package, containing [IANA Time Zone Database](https://www.iana.org/time-zones), should be installed in the system. Use the `timedatectl list-timezones` command to list timezones known by a local system. +The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. The time zone affects how `DateTime` values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01’). -You can explicitly set a time zone for `DateTime`-type columns when creating a table. If the time zone isn’t set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start. +A time zone-agnostic Unix timestamp is stored in tables, and the time zone is used to transform it to text format and back during data import/export, or to make calendar calculations on the values (for example, the `toDate` and `toHour` functions). The time zone is not stored in the rows of the table (or in the resultset), but is stored in the column metadata. + +A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones) and also can be queried by `SELECT * FROM system.time_zones`. + +You can explicitly set a time zone for `DateTime`-type columns when creating a table. Example: `DateTime('UTC')`. If the time zone isn’t set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start. The [clickhouse-client](../../interfaces/cli.md) applies the server time zone by default if a time zone isn’t explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter. -ClickHouse outputs values depending on the value of the [date\_time\_output\_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionaly you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function. +ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. The `YYYY-MM-DD hh:mm:ss` text format is used by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function. When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) setting.
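As a minimal illustration (the sample timestamp `1546300800`, i.e. `2019-01-01 00:00:00` UTC, is an arbitrary value chosen here), the same stored value is rendered differently depending on the time zone attached to the type:

``` sql
-- One Unix timestamp, rendered under two different time zones.
SELECT
    toDateTime(1546300800, 'UTC') AS utc_time,
    toDateTime(1546300800, 'Europe/Moscow') AS moscow_time,
    toUnixTimestamp(utc_time) = toUnixTimestamp(moscow_time) AS same_stored_value;
```

Both columns are produced from the same stored number; only the textual representation differs.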
@@ -114,6 +116,24 @@ FROM dt └─────────────────────┴─────────────────────┘ ``` +As timezone conversion only changes the metadata, the operation has no computational cost. + + +## Limitations on timezones support + +Some timezones may not be supported completely. There are a few cases: + +If the offset from UTC is not a multiple of 15 minutes, the calculation of hours and minutes can be incorrect. For example, the time zone in Monrovia, Liberia had the offset UTC-0:44:30 before 7 Jan 1972. If you are doing calculations on historical time in the Monrovia timezone, the time processing functions may give incorrect results. The results after 7 Jan 1972 will be correct nevertheless. + +If the time transition (due to daylight saving time or for other reasons) was performed at a point of time that is not a multiple of 15 minutes, you can also get incorrect results on that specific day. + +Non-monotonic calendar dates. For example, in Happy Valley - Goose Bay the clocks were moved one hour backwards at 00:01:00 on 7 Nov 2010 (one minute after midnight). So after 6 Nov had ended, people observed a whole minute of 7 Nov, then the time was changed back to 23:01 on 6 Nov, and after another 59 minutes 7 Nov started again. ClickHouse does not (yet) support this kind of fun. During these days the results of time processing functions may be slightly incorrect. + +A similar issue exists for the Casey Antarctic station in 2010: the time was changed three hours back on 5 Mar at 02:00. If you are working at an Antarctic station, don't be afraid to use ClickHouse. Just make sure you set the timezone to UTC or be aware of the inaccuracies. + +Time shifts for multiple days. Some Pacific islands changed their timezone offset from UTC+14 to UTC-12. That's alright, but some inaccuracies may be present if you do calculations with their timezone for historical time points around the days of the conversion. + + ## See Also {#see-also} - [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md) diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index 5cba8315090..1d3725b9fb3 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -9,7 +9,7 @@ Allows to store an instant in time, that can be expressed as a calendar date and Tick size (precision): 10-precision seconds -Syntax: +**Syntax:** ``` sql DateTime64(precision, [timezone]) @@ -17,9 +17,11 @@ DateTime64(precision, [timezone]) Internally, stores data as a number of ‘ticks’ since epoch start (1970-01-01 00:00:00 UTC) as Int64. The tick resolution is determined by the precision parameter. Additionally, the `DateTime64` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime64` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01.000’). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. See details in [DateTime](../../sql-reference/data-types/datetime.md). +The supported range is from January 1, 1925 to December 31, 2283. + ## Examples {#examples} -**1.** Creating a table with `DateTime64`-type column and inserting data into it: +1. 
Creating a table with `DateTime64`-type column and inserting data into it: ``` sql CREATE TABLE dt @@ -27,15 +29,15 @@ CREATE TABLE dt `timestamp` DateTime64(3, 'Europe/Moscow'), `event_id` UInt8 ) -ENGINE = TinyLog +ENGINE = TinyLog; ``` ``` sql -INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) +INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2); ``` ``` sql -SELECT * FROM dt +SELECT * FROM dt; ``` ``` text @@ -45,13 +47,13 @@ SELECT * FROM dt └─────────────────────────┴──────────┘ ``` -- When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as a string the value will be shown as `'2019-01-01 03:00:00'` +- When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as a string the value will be shown as `'2019-01-01 03:00:00'`. - When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and stored as `1546290000000`. -**2.** Filtering on `DateTime64` values +2. Filtering on `DateTime64` values ``` sql -SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') +SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow'); ``` ``` text @@ -60,12 +62,12 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ └─────────────────────────┴──────────┘ ``` -Unlike `DateTime`, `DateTime64` values are not converted from `String` automatically +Unlike `DateTime`, `DateTime64` values are not converted from `String` automatically. -**3.** Getting a time zone for a `DateTime64`-type value: +3. Getting a time zone for a `DateTime64`-type value: ``` sql -SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x +SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x; ``` ``` text @@ -74,13 +76,13 @@ SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS └─────────────────────────┴────────────────────────────────┘ ``` -**4.** Timezone conversion +4. Timezone conversion ``` sql SELECT toDateTime64(timestamp, 3, 'Europe/London') as lon_time, toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time -FROM dt +FROM dt; ``` ``` text @@ -90,7 +92,7 @@ FROM dt └─────────────────────────┴─────────────────────────┘ ``` -## See Also {#see-also} +**See Also** - [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md) - [Functions for working with dates and times](../../sql-reference/functions/date-time-functions.md) diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md new file mode 100644 index 00000000000..9ed328e0de6 --- /dev/null +++ b/docs/en/sql-reference/data-types/geo.md @@ -0,0 +1,106 @@ +--- +toc_priority: 62 +toc_title: Geo +--- + +# Geo Data Types {#geo-data-types} + +Clickhouse supports data types for representing geographical objects — locations, lands, etc. + +!!! warning "Warning" + Currently geo data types are an experimental feature. To work with them you must set `allow_experimental_geo_types = 1`. 
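A minimal sketch of enabling the feature for the current session (the check against `system.settings` is only an illustration and is not required):

``` sql
-- Enable the experimental geo types and verify that the setting is active.
SET allow_experimental_geo_types = 1;
SELECT name, value FROM system.settings WHERE name = 'allow_experimental_geo_types';
```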
+ +**See Also** +- [Representing simple geographical features](https://en.wikipedia.org/wiki/GeoJSON). +- [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types) setting. + +## Point {#point-data-type} + +`Point` is represented by its X and Y coordinates, stored as a [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)). + +**Example** + +Query: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_point (p Point) ENGINE = Memory(); +INSERT INTO geo_point VALUES((10, 10)); +SELECT p, toTypeName(p) FROM geo_point; +``` +Result: + +``` text +┌─p─────┬─toTypeName(p)─┐ +│ (10,10) │ Point │ +└───────┴───────────────┘ +``` + +## Ring {#ring-data-type} + +`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)). + +**Example** + +Query: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_ring (r Ring) ENGINE = Memory(); +INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]); +SELECT r, toTypeName(r) FROM geo_ring; +``` +Result: + +``` text +┌─r─────────────────────────────┬─toTypeName(r)─┐ +│ [(0,0),(10,0),(10,10),(0,10)] │ Ring │ +└───────────────────────────────┴───────────────┘ +``` + +## Polygon {#polygon-data-type} + +`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). First element of outer array is the outer shape of polygon and all the following elements are holes. + +**Example** + +This is a polygon with one hole: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory(); +INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]); +SELECT pg, toTypeName(pg) FROM geo_polygon; +``` + +Result: + +``` text +┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐ +│ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon │ +└───────────────────────────────────────────────────────────────┴────────────────┘ +``` + +## MultiPolygon {#multipolygon-data-type} + +`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)). 
+ +**Example** + +This multipolygon consists of two separate polygons — the first one without holes, and the second with one hole: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory(); +INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]); +SELECT mpg, toTypeName(mpg) FROM geo_multipolygon; +``` +Result: + +``` text +┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐ +│ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon │ +└─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘ +``` + +[Original article](https://clickhouse.tech/docs/en/data-types/geo/) diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 2d2746f85d3..af12a03ab51 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -2,6 +2,8 @@ `SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we don’t have to store and process any extra data. +The common way to produce an aggregate function value is by calling the aggregate function with the [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate) suffix. + The following aggregate functions are supported: - [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) @@ -21,7 +23,11 @@ The following aggregate functions are supported: - [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md) - [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md) -Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. + +!!! note "Note" + Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. + + `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. **Parameters** @@ -31,11 +37,7 @@ Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way **Example** ``` sql -CREATE TABLE t -( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... 
+CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` [Original article](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md index a5e105d2e13..08d3b8d8ad0 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md @@ -65,4 +65,3 @@ For our example, the structure of dictionary can be the following: ``` -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_hierarchical/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index efef91b4b09..de6a780235f 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM. 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM. or ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} @@ -445,4 +443,3 @@ Other types are not supported yet. The function returns the attribute for the pr Data must completely fit into RAM. -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_layout/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index 20486ebbcc8..081cc5b0b69 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -19,6 +19,8 @@ Example of settings: ``` +or + ``` sql CREATE DICTIONARY (...) ... @@ -58,7 +60,7 @@ When upgrading the dictionaries, the ClickHouse server applies different logic d - For MySQL source, the time of modification is checked using a `SHOW TABLE STATUS` query (in case of MySQL 8 you need to disable meta-information caching in MySQL by `set global information_schema_stats_expiry=0`. - Dictionaries from other sources are updated every time by default. -For other sources (ODBC, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: +For other sources (ODBC, PostgreSQL, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: - The dictionary table must have a field that always changes when the source data is updated. - The settings of the source must specify a query that retrieves the changing field. 
The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md). @@ -84,4 +86,3 @@ SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source wher ... ``` -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 7cd26a9dffb..dc0b6e17198 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -65,9 +65,12 @@ Types of sources (`source_type`): - DBMS - [ODBC](#dicts-external_dicts_dict_sources-odbc) - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [Cassandra](#dicts-external_dicts_dict_sources-cassandra) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Local File {#dicts-external_dicts_dict_sources-local_file} @@ -659,7 +662,7 @@ Example of settings: Setting fields: - `host` – The Cassandra host or comma-separated list of hosts. -- `port` – The port on the Cassandra servers. If not specified, default port is used. +- `port` – The port on the Cassandra servers. If not specified, default port 9042 is used. - `user` – Name of the Cassandra user. - `password` – Password of the Cassandra user. - `keyspace` – Name of the keyspace (database). @@ -673,4 +676,52 @@ Default value is 1 (the first key column is a partition key and other key column - `where` – Optional selection criteria. - `max_threads` – The maximum number of threads to use for loading data from multiple partitions in compose key dictionaries. -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_sources/) +### PosgreSQL {#dicts-external_dicts_dict_sources-postgresql} + +Example of settings: + +``` xml + + + 5432 + clickhouse + qwerty + db_name + table_name
+        <where>id=10</where>
+        <invalidate_query>SQL_QUERY</invalidate_query>
+    </postgresql>
+ +``` + +or + +``` sql +SOURCE(POSTGRESQL( + port 5432 + host 'postgresql-hostname' + user 'postgres_user' + password 'postgres_password' + db 'db_name' + table 'table_name' + replica(host 'example01-1' port 5432 priority 1) + replica(host 'example01-2' port 5432 priority 2) + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + +Setting fields: + +- `host` – The host on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). +- `port` – The port on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). +- `user` – Name of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). +- `password` – Password of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). +- `replica` – Section of replica configurations. There can be multiple sections. + - `replica/host` – The PostgreSQL host. + - `replica/port` – The PostgreSQL port. + - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. +- `db` – Name of the database. +- `table` – Name of the table. +- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL, for example, `id > 10 AND id < 20`. Optional parameter. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index e25b3ab78c3..f22d2a0b59e 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -159,15 +159,14 @@ Configuration fields: | Tag | Description | Required | |------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| | `name` | Column name. | Yes | -| `type` | ClickHouse data type.
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../../sql-reference/data-types/nullable.md) is not supported. | Yes | -| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. You cannot use `NULL` in this field. | Yes | +| `type` | ClickHouse data type.
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md) dictionaries. In [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache), [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes | +| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../../syntax.md#null-literal) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | | `expression` | [Expression](../../../sql-reference/syntax.md#syntax-expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | | `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md).

Default value: `false`. | No | | `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.

Default value: `false`. | No | | `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. | No | -## See Also {#see-also} +**See Also** - [Functions for working with external dictionaries](../../../sql-reference/functions/ext-dict-functions.md). -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md index 17ad110aa19..e15d944130e 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md @@ -48,4 +48,3 @@ LIFETIME(...) -- Lifetime of dictionary in memory - [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. - [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) — Frequency of dictionary updates. -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict/) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md index 99a62002822..8217fb8da3a 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -57,4 +57,3 @@ You can [configure](../../../sql-reference/dictionaries/external-dictionaries/ex - [Dictionary Key and Fields](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) - [Functions for Working with External Dictionaries](../../../sql-reference/functions/ext-dict-functions.md) -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts/) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 420182642bb..22f4182a1c0 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -10,11 +10,8 @@ A dictionary is a mapping (`key -> attributes`) that is convenient for various t ClickHouse supports special functions for working with dictionaries that can be used in queries. It is easier and more efficient to use dictionaries with functions than a `JOIN` with reference tables. -[NULL](../../sql-reference/syntax.md#null-literal) values can’t be stored in a dictionary. - ClickHouse supports: - [Built-in dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). - [Plug-in (external) dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/) diff --git a/docs/en/sql-reference/dictionaries/internal-dicts.md b/docs/en/sql-reference/dictionaries/internal-dicts.md index 7d657d4177f..472351a19a4 100644 --- a/docs/en/sql-reference/dictionaries/internal-dicts.md +++ b/docs/en/sql-reference/dictionaries/internal-dicts.md @@ -50,4 +50,3 @@ We recommend periodically updating the dictionaries with the geobase. 
During an There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn’t be used. -[Original article](https://clickhouse.tech/docs/en/query_language/dicts/internal_dicts/) diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index c4b151f59ce..faa03dfc9d3 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -82,4 +82,3 @@ An exception is thrown when dividing by zero or when dividing a minimal negative Returns the least common multiple of the numbers. An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/arithmetic_functions/) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index c9c418d57a4..499376a70d4 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -245,7 +245,7 @@ Elements set to `NULL` are handled as normal values. Returns the number of elements in the arr array for which func returns something other than 0. If ‘func’ is not specified, it returns the number of non-zero elements in the array. -Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. +Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. ## countEqual(arr, x) {#countequalarr-x} @@ -376,7 +376,7 @@ arrayPopBack(array) **Example** ``` sql -SELECT arrayPopBack([1, 2, 3]) AS res +SELECT arrayPopBack([1, 2, 3]) AS res; ``` ``` text @@ -400,7 +400,7 @@ arrayPopFront(array) **Example** ``` sql -SELECT arrayPopFront([1, 2, 3]) AS res +SELECT arrayPopFront([1, 2, 3]) AS res; ``` ``` text @@ -425,7 +425,7 @@ arrayPushBack(array, single_value) **Example** ``` sql -SELECT arrayPushBack(['a'], 'b') AS res +SELECT arrayPushBack(['a'], 'b') AS res; ``` ``` text @@ -450,7 +450,7 @@ arrayPushFront(array, single_value) **Example** ``` sql -SELECT arrayPushFront(['b'], 'a') AS res +SELECT arrayPushFront(['b'], 'a') AS res; ``` ``` text @@ -482,7 +482,7 @@ An array of length `size`. **Examples of calls** ``` sql -SELECT arrayResize([1], 3) +SELECT arrayResize([1], 3); ``` ``` text @@ -492,7 +492,7 @@ SELECT arrayResize([1], 3) ``` ``` sql -SELECT arrayResize([1], 3, NULL) +SELECT arrayResize([1], 3, NULL); ``` ``` text @@ -513,12 +513,12 @@ arraySlice(array, offset[, length]) - `array` – Array of data. - `offset` – Indent from the edge of the array. A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the array items begins with 1. -- `length` - The length of the required slice. If you specify a negative value, the function returns an open slice `[offset, array_length - length)`. If you omit the value, the function returns the slice `[offset, the_end_of_array]`. +- `length` – The length of the required slice. If you specify a negative value, the function returns an open slice `[offset, array_length - length)`. If you omit the value, the function returns the slice `[offset, the_end_of_array]`. 
**Example** ``` sql -SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res +SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res; ``` ``` text @@ -766,7 +766,7 @@ Type: [UInt\*](https://clickhouse.tech/docs/en/data_types/int_uint/#uint-ranges) Query: ``` sql -SELECT arrayDifference([1, 2, 3, 4]) +SELECT arrayDifference([1, 2, 3, 4]); ``` Result: @@ -782,7 +782,7 @@ Example of the overflow due to result type Int64: Query: ``` sql -SELECT arrayDifference([0, 10000000000000000000]) +SELECT arrayDifference([0, 10000000000000000000]); ``` Result: @@ -816,7 +816,7 @@ Returns an array containing the distinct elements. Query: ``` sql -SELECT arrayDistinct([1, 2, 2, 3, 1]) +SELECT arrayDistinct([1, 2, 2, 3, 1]); ``` Result: @@ -883,7 +883,7 @@ arrayReduce(agg_func, arr1, arr2, ..., arrN) Query: ``` sql -SELECT arrayReduce('max', [1, 2, 3]) +SELECT arrayReduce('max', [1, 2, 3]); ``` Result: @@ -899,7 +899,7 @@ If an aggregate function takes multiple arguments, then this function must be ap Query: ``` sql -SELECT arrayReduce('maxIf', [3, 5], [1, 0]) +SELECT arrayReduce('maxIf', [3, 5], [1, 0]); ``` Result: @@ -915,7 +915,7 @@ Example with a parametric aggregate function: Query: ``` sql -SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); ``` Result: @@ -1014,7 +1014,7 @@ Alias: `flatten`. **Examples** ``` sql -SELECT flatten([[[1]], [[2], [3]]]) +SELECT flatten([[[1]], [[2], [3]]]); ``` ``` text @@ -1048,7 +1048,7 @@ Type: `Array`. Query: ``` sql -SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) +SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]); ``` Result: @@ -1086,7 +1086,7 @@ Type: [Array](../../sql-reference/data-types/array.md). Query: ``` sql -SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]) +SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]); ``` Result: @@ -1108,17 +1108,20 @@ arrayAUC(arr_scores, arr_labels) ``` **Arguments** + - `arr_scores` — scores prediction model gives. - `arr_labels` — labels of samples, usually 1 for positive sample and 0 for negtive sample. **Returned value** + Returns AUC value with type Float64. **Example** + Query: ``` sql -select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]) +select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]); ``` Result: @@ -1226,7 +1229,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, └────────────────────────────────────┘ ``` -Note that the `arrayReverseFilter` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. +Note that the `arrayReverseFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. ## arraySplit(func, arr1, …) {#array-split} @@ -1290,7 +1293,7 @@ Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference ## arrayMin {#array-min} -Returns the minimum of elements in the source array. +Returns the minimum of elements in the source array. If the `func` function is specified, returns the mininum of elements converted by this function. @@ -1309,9 +1312,9 @@ arrayMin([func,] arr) **Returned value** -- The minimum of function values (or the array minimum). +- The minimum of function values (or the array minimum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. 
+Type: if `func` is specified, matches `func` return value type, else matches the array elements type. **Examples** @@ -1345,7 +1348,7 @@ Result: ## arrayMax {#array-max} -Returns the maximum of elements in the source array. +Returns the maximum of elements in the source array. If the `func` function is specified, returns the maximum of elements converted by this function. @@ -1364,9 +1367,9 @@ arrayMax([func,] arr) **Returned value** -- The maximum of function values (or the array maximum). +- The maximum of function values (or the array maximum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. +Type: if `func` is specified, matches `func` return value type, else matches the array elements type. **Examples** @@ -1400,7 +1403,7 @@ Result: ## arraySum {#array-sum} -Returns the sum of elements in the source array. +Returns the sum of elements in the source array. If the `func` function is specified, returns the sum of elements converted by this function. @@ -1415,7 +1418,7 @@ arraySum([func,] arr) **Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). **Returned value** @@ -1455,7 +1458,7 @@ Result: ## arrayAvg {#array-avg} -Returns the average of elements in the source array. +Returns the average of elements in the source array. If the `func` function is specified, returns the average of elements converted by this function. @@ -1470,7 +1473,7 @@ arrayAvg([func,] arr) **Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). **Returned value** @@ -1541,4 +1544,3 @@ SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res ``` Note that the `arraySumNonNegative` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/array_functions/) diff --git a/docs/en/sql-reference/functions/array-join.md b/docs/en/sql-reference/functions/array-join.md index f1f9a545366..f35e0d10117 100644 --- a/docs/en/sql-reference/functions/array-join.md +++ b/docs/en/sql-reference/functions/array-join.md @@ -32,4 +32,3 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src └─────┴───────────┴─────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/array_join/) diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index a3d0c82d8ab..e07f28c0f24 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -37,8 +37,8 @@ SELECT bitTest(number, index) **Arguments** -- `number` – integer number. -- `index` – position of bit. +- `number` – Integer number. +- `index` – Position of bit. **Returned values** @@ -53,7 +53,7 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. Query: ``` sql -SELECT bitTest(43, 1) +SELECT bitTest(43, 1); ``` Result: @@ -69,7 +69,7 @@ Another example: Query: ``` sql -SELECT bitTest(43, 2) +SELECT bitTest(43, 2); ``` Result: @@ -102,8 +102,8 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) 
**Arguments** -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). +- `number` – Integer number. +- `index1`, `index2`, `index3`, `index4` – Positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). **Returned values** @@ -118,7 +118,7 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. Query: ``` sql -SELECT bitTestAll(43, 0, 1, 3, 5) +SELECT bitTestAll(43, 0, 1, 3, 5); ``` Result: @@ -134,7 +134,7 @@ Another example: Query: ``` sql -SELECT bitTestAll(43, 0, 1, 3, 5, 2) +SELECT bitTestAll(43, 0, 1, 3, 5, 2); ``` Result: @@ -167,8 +167,8 @@ SELECT bitTestAny(number, index1, index2, index3, index4, ...) **Arguments** -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. +- `number` – Integer number. +- `index1`, `index2`, `index3`, `index4` – Positions of bit. **Returned values** @@ -183,7 +183,7 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. Query: ``` sql -SELECT bitTestAny(43, 0, 2) +SELECT bitTestAny(43, 0, 2); ``` Result: @@ -199,7 +199,7 @@ Another example: Query: ``` sql -SELECT bitTestAny(43, 4, 2) +SELECT bitTestAny(43, 4, 2); ``` Result: @@ -239,7 +239,7 @@ Take for example the number 333. Its binary representation: 0000000101001101. Query: ``` sql -SELECT bitCount(333) +SELECT bitCount(333); ``` Result: @@ -250,4 +250,53 @@ Result: └───────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/bit_functions/) +## bitHammingDistance {#bithammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between the bit representations of two integer values. Can be used with [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) functions for detection of semi-duplicate strings. The smaller is the distance, the more likely those strings are the same. + +**Syntax** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Arguments** + +- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Examples** + +Query: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Result: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +With [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Result: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index bfff70576f2..4875532605e 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -23,17 +23,17 @@ bitmapBuild(array) **Arguments** -- `array` – unsigned integer array. +- `array` – Unsigned integer array. 
**Example** ``` sql -SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) +SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res); ``` ``` text ┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ -│  │ AggregateFunction(groupBitmap, UInt8) │ +│ │ AggregateFunction(groupBitmap, UInt8) │ └─────┴──────────────────────────────────────────────┘ ``` @@ -47,12 +47,12 @@ bitmapToArray(bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` ``` text @@ -72,13 +72,13 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – range start point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `range_end` – range end point(excluded). Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_start` – Range start point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_end` – Range end point (excluded). Type: [UInt32](../../sql-reference/data-types/int-uint.md). **Example** ``` sql -SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res; ``` ``` text @@ -114,7 +114,7 @@ Type: `Bitmap object`. Query: ``` sql -SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res; ``` Result: @@ -148,7 +148,7 @@ Type: `UInt8`. **Example** ``` sql -SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res +SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res; ``` ``` text @@ -169,7 +169,7 @@ If you are sure that `bitmap2` contains strictly one element, consider using the **Arguments** -- `bitmap*` – bitmap object. +- `bitmap*` – Bitmap object. **Return values** @@ -179,7 +179,7 @@ If you are sure that `bitmap2` contains strictly one element, consider using the **Example** ``` sql -SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` ``` text @@ -199,12 +199,12 @@ bitmapHasAll(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` ``` text @@ -223,12 +223,12 @@ bitmapCardinality(bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` ``` text @@ -245,17 +245,19 @@ Retrun the smallest value of type UInt64 in the set, UINT32_MAX if the set is em **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. 
**Example** ``` sql -SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` - ┌─res─┐ - │ 1 │ - └─────┘ +``` text + ┌─res─┐ + │ 1 │ + └─────┘ +``` ## bitmapMax {#bitmapmax} @@ -265,17 +267,19 @@ Retrun the greatest value of type UInt64 in the set, 0 if the set is empty. **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` - ┌─res─┐ - │ 5 │ - └─────┘ +``` text + ┌─res─┐ + │ 5 │ + └─────┘ +``` ## bitmapTransform {#bitmaptransform} @@ -285,19 +289,21 @@ Transform an array of values in a bitmap to another array of values, the result **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. - `from_array` – UInt32 array. For idx in range \[0, from_array.size()), if bitmap contains from_array\[idx\], then replace it with to_array\[idx\]. Note that the result depends on array ordering if there are common elements between from_array and to_array. - `to_array` – UInt32 array, its size shall be the same to from_array. **Example** ``` sql -SELECT bitmapToArray(bitmapTransform(bitmapBuild([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), cast([5,999,2] as Array(UInt32)), cast([2,888,20] as Array(UInt32)))) AS res +SELECT bitmapToArray(bitmapTransform(bitmapBuild([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), cast([5,999,2] as Array(UInt32)), cast([2,888,20] as Array(UInt32)))) AS res; ``` - ┌─res───────────────────┐ - │ [1,3,4,6,7,8,9,10,20] │ - └───────────────────────┘ +``` text + ┌─res───────────────────┐ + │ [1,3,4,6,7,8,9,10,20] │ + └───────────────────────┘ +``` ## bitmapAnd {#bitmapand} @@ -309,12 +315,12 @@ bitmapAnd(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -333,12 +339,12 @@ bitmapOr(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -357,12 +363,12 @@ bitmapXor(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -381,12 +387,12 @@ bitmapAndnot(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** ``` sql -SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -405,7 +411,7 @@ bitmapAndCardinality(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** @@ -429,7 +435,7 @@ bitmapOrCardinality(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** @@ -453,7 +459,7 @@ bitmapXorCardinality(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. **Example** @@ -477,7 +483,7 @@ bitmapAndnotCardinality(bitmap,bitmap) **Arguments** -- `bitmap` – bitmap object. +- `bitmap` – Bitmap object. 
**Example** @@ -491,4 +497,3 @@ SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res └─────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/bitmap_functions/) diff --git a/docs/en/sql-reference/functions/comparison-functions.md b/docs/en/sql-reference/functions/comparison-functions.md index 0b6d8b6e36e..edaf0a01c73 100644 --- a/docs/en/sql-reference/functions/comparison-functions.md +++ b/docs/en/sql-reference/functions/comparison-functions.md @@ -32,4 +32,3 @@ Strings are compared by bytes. A shorter string is smaller than all strings that ## greaterOrEquals, \>= operator {#function-greaterorequals} -[Original article](https://clickhouse.tech/docs/en/query_language/functions/comparison_functions/) diff --git a/docs/en/sql-reference/functions/conditional-functions.md b/docs/en/sql-reference/functions/conditional-functions.md index 2d57cbb3bd5..a23da82a9c6 100644 --- a/docs/en/sql-reference/functions/conditional-functions.md +++ b/docs/en/sql-reference/functions/conditional-functions.md @@ -20,8 +20,8 @@ If the condition `cond` evaluates to a non-zero value, returns the result of the **Arguments** - `cond` – The condition for evaluation that can be zero or not. The type is UInt8, Nullable(UInt8) or NULL. -- `then` - The expression to return if condition is met. -- `else` - The expression to return if condition is not met. +- `then` – The expression to return if condition is met. +- `else` – The expression to return if condition is not met. **Returned values** @@ -32,7 +32,7 @@ The function executes `then` and `else` expressions and returns its result, depe Query: ``` sql -SELECT if(1, plus(2, 2), plus(2, 6)) +SELECT if(1, plus(2, 2), plus(2, 6)); ``` Result: @@ -46,7 +46,7 @@ Result: Query: ``` sql -SELECT if(0, plus(2, 2), plus(2, 6)) +SELECT if(0, plus(2, 2), plus(2, 6)); ``` Result: @@ -202,4 +202,3 @@ FROM LEFT_RIGHT └──────┴───────┴──────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/conditional_functions/) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index f26e1bee6c9..b0636b0305e 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -61,40 +61,58 @@ int32samoa: 1546300800 Converts a date or date with time to a UInt16 number containing the year number (AD). +Alias: `YEAR`. + ## toQuarter {#toquarter} Converts a date or date with time to a UInt8 number containing the quarter number. +Alias: `QUARTER`. + ## toMonth {#tomonth} Converts a date or date with time to a UInt8 number containing the month number (1-12). +Alias: `MONTH`. + ## toDayOfYear {#todayofyear} Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366). +Alias: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). +Aliases: `DAYOFMONTH`, `DAY`. + ## toDayOfWeek {#todayofweek} Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). +Alias: `DAYOFWEEK`. + ## toHour {#tohour} Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. 
(which is not always true – even in Moscow the clocks were twice changed at a different time). +Alias: `HOUR`. + ## toMinute {#tominute} Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). +Alias: `MINUTE`. + ## toSecond {#tosecond} Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). Leap seconds are not accounted for. +Alias: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} For DateTime argument: converts value to the number with type UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). @@ -129,6 +147,9 @@ Result: └────────────────┘ ``` +!!! attention "Attention" + The return type `toStartOf*` functions described below is `Date` or `DateTime`. Though these functions can take `DateTime64` as an argument, passing them a `DateTime64` that is out of normal range (years 1970 - 2105) will give incorrect result. + ## toStartOfYear {#tostartofyear} Rounds down a date or date with time to the first day of the year. @@ -370,13 +391,13 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d Truncates date and time data to the specified part of date. -**Syntax** +**Syntax** ``` sql date_trunc(unit, value[, timezone]) ``` -Alias: `dateTrunc`. +Alias: `dateTrunc`. **Arguments** @@ -431,41 +452,55 @@ Result: └─────────────────────┴────────────────────────────────────────────┘ ``` -**See also** +**See Also** - [toStartOfInterval](#tostartofintervaltime-or-data-interval-x-unit-time-zone) ## date\_add {#date_add} -Adds specified date/time interval to the provided date. +Adds the time interval or date interval to the provided date or date with time. -**Syntax** +**Syntax** ``` sql date_add(unit, value, date) ``` -Aliases: `dateAdd`, `DATE_ADD`. +Aliases: `dateAdd`, `DATE_ADD`. **Arguments** - `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). + Possible values: - Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) -- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` +- `value` — Value of interval to add. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — The date or date with time to which `value` is added. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Returned value** -Returns Date or DateTime with `value` expressed in `unit` added to `date`. +Date or date with time obtained by adding `value`, expressed in `unit`, to `date`. + +Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Example** +Query: + ```sql -select date_add(YEAR, 3, toDate('2018-01-01')); +SELECT date_add(YEAR, 3, toDate('2018-01-01')); ``` +Result: + ```text ┌─plus(toDate('2018-01-01'), toIntervalYear(3))─┐ │ 2021-01-01 │ @@ -474,7 +509,7 @@ select date_add(YEAR, 3, toDate('2018-01-01')); ## date\_diff {#date_diff} -Returns the difference between two Date or DateTime values. +Returns the difference between two dates or dates with time values. **Syntax** @@ -482,25 +517,33 @@ Returns the difference between two Date or DateTime values. date_diff('unit', startdate, enddate, [timezone]) ``` -Aliases: `dateDiff`, `DATE_DIFF`. +Aliases: `dateDiff`, `DATE_DIFF`. 
**Arguments** -- `unit` — The type of interval for result [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). + Possible values: - Supported values: second, minute, hour, day, week, month, quarter, year. + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` - `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). -- `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). **Returned value** Difference between `enddate` and `startdate` expressed in `unit`. -Type: `int`. +Type: [Int](../../sql-reference/data-types/int-uint.md). **Example** @@ -520,7 +563,7 @@ Result: ## date\_sub {#date_sub} -Subtracts a time/date interval from the provided date. +Subtracts the time interval or date interval from the provided date or date with time. **Syntax** @@ -528,19 +571,30 @@ Subtracts a time/date interval from the provided date. date_sub(unit, value, date) ``` -Aliases: `dateSub`, `DATE_SUB`. +Aliases: `dateSub`, `DATE_SUB`. **Arguments** - `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md). + Possible values: - Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) -- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md) to subtract value from. + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — The date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Returned value** -Returns Date or DateTime with `value` expressed in `unit` subtracted from `date`. +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. + +Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -562,32 +616,46 @@ Result: Adds the specified time value with the provided date or date time value. -**Syntax** +**Syntax** ``` sql timestamp_add(date, INTERVAL value unit) ``` -Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. +Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. **Arguments** - -- `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
-- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) -- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). - Supported values: second, minute, hour, day, week, month, quarter, year. +- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `value` — Value of interval to add. [Int](../../sql-reference/data-types/int-uint.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). + Possible values: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` **Returned value** -Returns Date or DateTime with the specified `value` expressed in `unit` added to `date`. - +Date or date with time with the specified `value` expressed in `unit` added to `date`. + +Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + **Example** +Query: + ```sql select timestamp_add(toDate('2018-01-01'), INTERVAL 3 MONTH); ``` +Result: + ```text ┌─plus(toDate('2018-01-01'), toIntervalMonth(3))─┐ │ 2018-04-01 │ @@ -596,45 +664,60 @@ select timestamp_add(toDate('2018-01-01'), INTERVAL 3 MONTH); ## timestamp\_sub {#timestamp_sub} -Returns the difference between two dates in the specified unit. +Subtracts the time interval from the provided date or date with time. -**Syntax** +**Syntax** ``` sql timestamp_sub(unit, value, date) ``` -Aliases: `timeStampSub`, `TIMESTAMP_SUB`. +Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Arguments** -- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md). + Possible values: - Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md). -- `date`- [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Returned value** -Difference between `date` and the specified `value` expressed in `unit`. +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. + +Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Example** +Query: + ```sql select timestamp_sub(MONTH, 5, toDateTime('2018-12-18 01:02:03')); ``` +Result: + ```text ┌─minus(toDateTime('2018-12-18 01:02:03'), toIntervalMonth(5))─┐ │ 2018-07-18 01:02:03 │ └──────────────────────────────────────────────────────────────┘ ``` - + ## now {#now} -Returns the current date and time. +Returns the current date and time. -**Syntax** +**Syntax** ``` sql now([timezone]) @@ -753,7 +836,7 @@ This is necessary for searching for pageviews in the corresponding session. ## formatDateTime {#formatdatetime} -Function formats a Time according given Format string. N.B.: Format is a constant expression, e.g. you can not have multiple formats for single result column. +Formats a Time according to the given Format string. 
Format is a constant expression, so you cannot have multiple formats for a single result column. **Syntax** @@ -773,7 +856,7 @@ Using replacement fields, you can define a pattern for the resulting string. “ | %C | year divided by 100 and truncated to integer (00-99) | 20 | | %d | day of the month, zero-padded (01-31) | 02 | | %D | Short MM/DD/YY date, equivalent to %m/%d/%y | 01/02/18 | -| %e | day of the month, space-padded ( 1-31) | 2 | +| %e | day of the month, space-padded ( 1-31) |   2 | | %F | short YYYY-MM-DD date, equivalent to %Y-%m-%d | 2018-01-02 | | %G | four-digit year format for ISO week number, calculated from the week-based year [defined by the ISO 8601](https://en.wikipedia.org/wiki/ISO_8601#Week_dates) standard, normally useful only with %V | 2018 | | %g | two-digit year format, aligned to ISO 8601, abbreviated from four-digit notation | 18 | @@ -812,31 +895,32 @@ Result: └────────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) - ## FROM\_UNIXTIME {#fromunixfime} -When there is only single argument of integer type, it act in the same way as `toDateTime` and return [DateTime](../../sql-reference/data-types/datetime.md). -type. +Function converts Unix timestamp to a calendar date and a time of a day. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. -For example: +**Example:** + +Query: ```sql -SELECT FROM_UNIXTIME(423543535) +SELECT FROM_UNIXTIME(423543535); ``` +Result: + ```text ┌─FROM_UNIXTIME(423543535)─┐ │ 1983-06-04 10:58:55 │ └──────────────────────────┘ ``` -When there are two arguments, first is integer or DateTime, second is constant format string, it act in the same way as `formatDateTime` and return `String` type. +When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. For example: ```sql -SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime +SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime; ``` ```text diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index 31e84c08b39..6b72d3c2269 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -30,7 +30,7 @@ Type: `String`. Query: ``` sql -SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello +SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello; ``` Result: @@ -75,6 +75,8 @@ Result: Returns a string containing the argument’s hexadecimal representation. +Alias: `HEX`. + **Syntax** ``` sql @@ -170,4 +172,3 @@ Accepts an integer. Returns a string containing the list of powers of two that t Accepts an integer. Returns an array of UInt64 numbers containing the list of powers of two that total the source number when summed. Numbers in the array are in ascending order. 
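As a short illustration of the power-of-two decomposition described above (assuming the two descriptions correspond to the `bitmaskToList` and `bitmaskToArray` functions, which is how this file names them), 50 decomposes into 2 + 16 + 32:

``` sql
-- Hedged sketch: 50 = 2 + 16 + 32, so both forms should list those powers of two,
-- the first as a string, the second as an ascending array.
SELECT
    bitmaskToList(50) AS as_string,
    bitmaskToArray(50) AS as_array;
```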
-[Original article](https://clickhouse.tech/docs/en/query_language/functions/encoding_functions/) diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index 0dd7469b25e..df27685dcb3 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -55,7 +55,7 @@ CREATE TABLE encryption_test `comment` String, `secret` String ) -ENGINE = Memory +ENGINE = Memory; ``` Insert some data (please avoid storing the keys/ivs in the database as this undermines the whole concept of encryption), also storing 'hints' is unsafe too and used only for illustrative purposes: @@ -110,7 +110,7 @@ Result: Compatible with mysql encryption and resulting ciphertext can be decrypted with [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function. -Will produce same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `IV`. +Will produce the same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `iv`. Supported encryption modes: @@ -132,13 +132,12 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) - `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). - `plaintext` — Text that needs to be encrypted. [String](../../sql-reference/data-types/string.md#string). - `key` — Encryption key. If key is longer than required by mode, MySQL-specific key folding is performed. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Optinal, only first 16 bytes are taken into account [String](../../sql-reference/data-types/string.md#string). +- `iv` — Initialization vector. Optional, only first 16 bytes are taken into account [String](../../sql-reference/data-types/string.md#string). **Returned value** - Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string). - **Examples** Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext: @@ -157,7 +156,6 @@ Result: └───────────────────┘ ``` - But `encrypt` fails when `key` or `iv` is longer than expected: Query: @@ -252,7 +250,7 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) **Examples** -Re-using table from [encrypt](./encryption-functions.md#encrypt). +Re-using table from [encrypt](#encrypt). Query: @@ -284,6 +282,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 ``` Result: + ``` text ┌─comment─────────────────────────────┬─plaintext─┐ │ aes-256-cfb128 no IV │ Secret │ @@ -294,7 +293,7 @@ Result: └─────────────────────────────────────┴───────────┘ ``` -Notice how only portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption. +Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption. 
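To make the dependency on `mode`, `key` and `iv` concrete, here is a minimal round-trip sketch; the 32-character key is a placeholder used only for illustration. Encrypting and decrypting with identical parameters should return the original plaintext, while changing any of them produces gibberish, as shown above:

``` sql
-- Hedged round-trip sketch with an illustrative 32-byte key (aes-256-* modes require 32 bytes).
SELECT decrypt(
    'aes-256-cfb128',
    encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212'),
    '12345678910121314151617181920212'
) AS plaintext;
```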
## aes_decrypt_mysql {#aes_decrypt_mysql} @@ -331,6 +330,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) **Examples** Let's decrypt data we've previously encrypted with MySQL: + ``` sql mysql> SET block_encryption_mode='aes-256-cfb128'; Query OK, 0 rows affected (0.00 sec) @@ -345,11 +345,13 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv ``` Query: + ``` sql SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext ``` Result: + ``` text ┌─plaintext─┐ │ Secret │ diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 834fcdf8282..5fc146f603f 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -203,4 +203,3 @@ dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn’t match the attribute data type. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/ext_dict_functions/) diff --git a/docs/en/sql-reference/functions/files.md b/docs/en/sql-reference/functions/files.md new file mode 100644 index 00000000000..9cbf8932465 --- /dev/null +++ b/docs/en/sql-reference/functions/files.md @@ -0,0 +1,35 @@ +--- +toc_priority: 43 +toc_title: Files +--- + +# Functions for Working with Files {#functions-for-working-with-files} + +## file {#file} + +Reads file as a String. The file content is not parsed, so any information is read as one string and placed into the specified column. + +**Syntax** + +``` sql +file(path) +``` + +**Arguments** + +- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following wildcards: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. + +**Example** + +Inserting data from files a.txt and b.txt into a table as strings: + +Query: + +``` sql +INSERT INTO table SELECT file('a.txt'), file('b.txt'); +``` + +**See Also** + +- [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path) +- [file](../table-functions/file.md) diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index df75e96c8fb..5cc95fe298a 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -13,6 +13,8 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal isNull(x) ``` +Alias: `ISNULL`. + **Arguments** - `x` — A value with a non-compound data type. 
@@ -36,7 +38,7 @@ Input table Query ``` sql -SELECT x FROM t_null WHERE isNull(y) +SELECT x FROM t_null WHERE isNull(y); ``` ``` text @@ -76,7 +78,7 @@ Input table Query ``` sql -SELECT x FROM t_null WHERE isNotNull(y) +SELECT x FROM t_null WHERE isNotNull(y); ``` ``` text @@ -118,7 +120,7 @@ The `mail` and `phone` fields are of type String, but the `icq` field is `UInt32 Get the first available contact method for the customer from the contact list: ``` sql -SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook +SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook; ``` ``` text @@ -149,7 +151,7 @@ ifNull(x,alt) **Example** ``` sql -SELECT ifNull('a', 'b') +SELECT ifNull('a', 'b'); ``` ``` text @@ -159,7 +161,7 @@ SELECT ifNull('a', 'b') ``` ``` sql -SELECT ifNull(NULL, 'b') +SELECT ifNull(NULL, 'b'); ``` ``` text @@ -188,7 +190,7 @@ nullIf(x, y) **Example** ``` sql -SELECT nullIf(1, 1) +SELECT nullIf(1, 1); ``` ``` text @@ -198,7 +200,7 @@ SELECT nullIf(1, 1) ``` ``` sql -SELECT nullIf(1, 2) +SELECT nullIf(1, 2); ``` ``` text @@ -229,7 +231,7 @@ assumeNotNull(x) Consider the `t_null` table. ``` sql -SHOW CREATE TABLE t_null +SHOW CREATE TABLE t_null; ``` ``` text @@ -248,7 +250,7 @@ SHOW CREATE TABLE t_null Apply the `assumeNotNull` function to the `y` column. ``` sql -SELECT assumeNotNull(y) FROM t_null +SELECT assumeNotNull(y) FROM t_null; ``` ``` text @@ -259,7 +261,7 @@ SELECT assumeNotNull(y) FROM t_null ``` ``` sql -SELECT toTypeName(assumeNotNull(y)) FROM t_null +SELECT toTypeName(assumeNotNull(y)) FROM t_null; ``` ``` text @@ -288,7 +290,7 @@ toNullable(x) **Example** ``` sql -SELECT toTypeName(10) +SELECT toTypeName(10); ``` ``` text @@ -298,7 +300,7 @@ SELECT toTypeName(10) ``` ``` sql -SELECT toTypeName(toNullable(10)) +SELECT toTypeName(toNullable(10)); ``` ``` text @@ -307,4 +309,3 @@ SELECT toTypeName(toNullable(10)) └────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/functions_for_nulls/) diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index c27eab0b421..cfe35746809 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -29,7 +29,7 @@ geohashEncode(longitude, latitude, [precision]) **Example** ``` sql -SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res +SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res; ``` ``` text @@ -53,7 +53,7 @@ Decodes any [geohash](#geohash)-encoded string into longitude and latitude. **Example** ``` sql -SELECT geohashDecode('ezs42') AS res +SELECT geohashDecode('ezs42') AS res; ``` ``` text @@ -98,8 +98,9 @@ Type: [Array](../../../sql-reference/data-types/array.md)([String](../../../sql- Query: ``` sql -SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos +SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos; ``` + Result: ``` text diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 9dda947b3a7..20dc7b29902 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -40,8 +40,9 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT h3IsValid(630814730351855103) as h3IsValid +SELECT h3IsValid(630814730351855103) as h3IsValid; ``` + Result: ``` text @@ -76,8 +77,9 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md). 
Query: ``` sql -SELECT h3GetResolution(639821929606596015) as resolution +SELECT h3GetResolution(639821929606596015) as resolution; ``` + Result: ``` text @@ -109,8 +111,9 @@ h3EdgeAngle(resolution) Query: ``` sql -SELECT h3EdgeAngle(10) as edgeAngle +SELECT h3EdgeAngle(10) as edgeAngle; ``` + Result: ``` text @@ -142,8 +145,9 @@ h3EdgeLengthM(resolution) Query: ``` sql -SELECT h3EdgeLengthM(15) as edgeLengthM +SELECT h3EdgeLengthM(15) as edgeLengthM; ``` + Result: ``` text @@ -180,7 +184,7 @@ Type: [UInt64](../../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index +SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index; ``` Result: @@ -217,8 +221,9 @@ Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql- Query: ``` sql -SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index +SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index; ``` + Result: ``` text diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 6bf1bebabaa..0ea4cfd6fbe 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -7,9 +7,11 @@ toc_title: Hash Hash functions can be used for the deterministic pseudo-random shuffling of elements. +Simhash is a hash function, which returns close hash values for close (similar) arguments. + ## halfMD5 {#hash-functions-halfmd5} -[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. +[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. ``` sql halfMD5(par1, ...) @@ -29,7 +31,7 @@ A [UInt64](../../sql-reference/data-types/int-uint.md) data type hash value. **Example** ``` sql -SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type +SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type; ``` ``` text @@ -54,7 +56,7 @@ sipHash64(par1,...) This is a cryptographic hash function. It works at least three times faster than the [MD5](#hash_functions-md5) function. -Function [interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. Then combines hashes by the following algorithm: +Function [interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. Then combines hashes by the following algorithm: 1. After hashing all the input parameters, the function gets the array of hashes. 2. Function takes the first and the second elements and calculates a hash for the array of them. 
@@ -72,7 +74,7 @@ A [UInt64](../../sql-reference/data-types/int-uint.md) data type hash value. **Example** ``` sql -SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type +SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type; ``` ``` text @@ -110,7 +112,7 @@ A [UInt64](../../sql-reference/data-types/int-uint.md) data type hash value. Call example: ``` sql -SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type +SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type; ``` ``` text @@ -177,7 +179,7 @@ A [UInt64](../../sql-reference/data-types/int-uint.md) data type hash value. **Example** ``` sql -SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type +SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type; ``` ``` text @@ -193,7 +195,7 @@ Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add97 **Syntax** ``` sql -SELECT javaHash(''); +SELECT javaHash('') ``` **Returned value** @@ -241,7 +243,7 @@ Correct query with UTF-16LE encoded string. Query: ``` sql -SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) +SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')); ``` Result: @@ -257,7 +259,7 @@ Result: Calculates `HiveHash` from a string. ``` sql -SELECT hiveHash(''); +SELECT hiveHash('') ``` This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. This hash function is neither fast nor having a good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result. @@ -303,7 +305,7 @@ A [UInt64](../../sql-reference/data-types/int-uint.md) data type hash value. **Example** ``` sql -SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type +SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type; ``` ``` text @@ -339,7 +341,7 @@ Both functions take a variable number of input parameters. Arguments can be any **Example** ``` sql -SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type +SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type; ``` ``` text @@ -355,7 +357,7 @@ Calculates a 64-bit [MurmurHash2](https://github.com/aappleby/smhasher) hash val **Syntax** ``` sql -gccMurmurHash(par1, ...); +gccMurmurHash(par1, ...) ``` **Arguments** @@ -407,7 +409,7 @@ Both functions take a variable number of input parameters. 
Arguments can be any **Example** ``` sql -SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type +SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type; ``` ``` text @@ -435,13 +437,13 @@ A [FixedString(16)](../../sql-reference/data-types/fixedstring.md) data type has **Example** ``` sql -SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type +SELECT hex(murmurHash3_128('example_string')) AS MurmurHash3, toTypeName(MurmurHash3) AS type; ``` ``` text -┌─MurmurHash3──────┬─type────────────┐ -│ 6�1�4"S5KT�~~q │ FixedString(16) │ -└──────────────────┴─────────────────┘ +┌─MurmurHash3──────────────────────┬─type───┐ +│ 368A1A311CB7342253354B548E7E7E71 │ String │ +└──────────────────────────────────┴────────┘ ``` ## xxHash32, xxHash64 {#hash-functions-xxhash32} @@ -449,11 +451,11 @@ SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) Calculates `xxHash` from a string. It is proposed in two flavors, 32 and 64 bits. ``` sql -SELECT xxHash32(''); +SELECT xxHash32('') OR -SELECT xxHash64(''); +SELECT xxHash64('') ``` **Returned value** @@ -482,4 +484,938 @@ Result: - [xxHash](http://cyan4973.github.io/xxHash/). -[Original article](https://clickhouse.tech/docs/en/query_language/functions/hash_functions/) +## ngramSimHash {#ngramsimhash} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
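Since `simhash` values are meant to be compared, a brief sketch of the comparison step may be useful; the sample strings are arbitrary and the resulting distances are implementation-specific, so no result is shown:

``` sql
-- Hedged sketch: near-duplicate strings should yield a small Hamming distance,
-- unrelated strings a noticeably larger one.
SELECT
    bitHammingDistance(ngramSimHash('ClickHouse is fast'), ngramSimHash('ClickHouse is fast!')) AS similar,
    bitHammingDistance(ngramSimHash('ClickHouse is fast'), ngramSimHash('completely different text')) AS different;
```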
+ +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
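The word-shingle variants hash sequences of words rather than character n-grams. A brief sketch contrasting the two granularities on the same arbitrary input (hash values omitted, as they are implementation-specific):

``` sql
-- Hedged sketch: character-level vs. word-level simhash of the same text.
SELECT
    ngramSimHash('ClickHouse is a column-oriented DBMS') AS ngram_hash,
    wordShingleSimHash('ClickHouse is a column-oriented DBMS', 2) AS shingle_hash;
```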
+ +**Example** + +Query: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optinal. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. + +**Syntax** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). 
+- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Hash value. + +Type: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Result: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. 
+ +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). 
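As with the other `ngramMinHash*` variants, the returned tuple is intended to be fed into `tupleHammingDistance`. A hedged sketch of that comparison on arbitrary inputs (results omitted, since the hash values are implementation-specific):

``` sql
-- Hedged sketch: a distance of 0 or 1 over the (min, max) hash tuple
-- suggests the two strings are near-duplicates.
SELECT tupleHammingDistance(
    ngramMinHashCaseInsensitiveUTF8('ClickHouse is a fast DBMS'),
    ngramMinHashCaseInsensitiveUTF8('CLICKHOUSE IS A FAST DBMS')
) AS distance;
```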
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHash](#ngramminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
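The `*Arg` variants are mostly useful for inspecting which n-grams produced the extreme hashes. A sketch querying the plain and the `Arg` form side by side on the same arbitrary input:

``` sql
-- Hedged sketch: the (min, max) hashes and the n-grams behind them, from the same string.
SELECT
    ngramMinHashCaseInsensitive('ClickHouse') AS hashes,
    ngramMinHashArgCaseInsensitive('ClickHouse') AS ngrams;
```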
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashUTF8](#ngramminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-grams with minimum and maximum hashes, calculated by the [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` n-grams each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Example** + +Query: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. 
+ +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. + +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. + +**Syntax** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. 
[UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two hashes — the minimum and the maximum. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordshingleMinHash](#wordshingleminhash) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Splits a ASCII string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. 
+ +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashUTF8](#wordshingleminhashutf8) function with the same input. Is case sensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. + +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Splits a UTF-8 string into parts (shingles) of `shinglesize` words each and returns the shingles with minimum and maximum word hashes, calculated by the [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) function with the same input. Is case insensitive. + +**Syntax** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Arguments** + +- `string` — String. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Tuple with two tuples with `hashnum` word shingles each. 
+ +Type: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Example** + +Query: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Result: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/in-functions.md b/docs/en/sql-reference/functions/in-functions.md index dd3c1900fdc..c8936e74954 100644 --- a/docs/en/sql-reference/functions/in-functions.md +++ b/docs/en/sql-reference/functions/in-functions.md @@ -9,4 +9,3 @@ toc_title: IN Operator See the section [IN operators](../../sql-reference/operators/in.md#select-in-operators). -[Original article](https://clickhouse.tech/docs/en/query_language/functions/in_functions/) diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index 1a0b9d83b5f..32408759b98 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -84,4 +84,3 @@ Another example is the `hostName` function, which returns the name of the server If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an ‘any’ aggregate function or add it to a key in `GROUP BY`. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/) diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 964265a461b..44685e3cb67 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -53,13 +53,13 @@ Type: [String](../../sql-reference/data-types/string.md). Enabling introspection functions: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Selecting the first string from the `trace_log` system table: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -79,7 +79,7 @@ The `trace` field contains the stack trace at the moment of sampling. Getting the source code filename and the line number for a single address: ``` sql -SELECT addressToLine(94784076370703) \G +SELECT addressToLine(94784076370703) \G; ``` ``` text @@ -139,13 +139,13 @@ Type: [String](../../sql-reference/data-types/string.md). Enabling introspection functions: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Selecting the first string from the `trace_log` system table: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -165,7 +165,7 @@ The `trace` field contains the stack trace at the moment of sampling. Getting a symbol for a single address: ``` sql -SELECT addressToSymbol(94138803686098) \G +SELECT addressToSymbol(94138803686098) \G; ``` ``` text @@ -236,13 +236,13 @@ Type: [String](../../sql-reference/data-types/string.md). 
Enabling introspection functions: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Selecting the first string from the `trace_log` system table: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -262,7 +262,7 @@ The `trace` field contains the stack trace at the moment of sampling. Getting a function name for a single address: ``` sql -SELECT demangle(addressToSymbol(94138803686098)) \G +SELECT demangle(addressToSymbol(94138803686098)) \G; ``` ``` text @@ -335,6 +335,7 @@ Result: │ 3878 │ └───────┘ ``` + ## logTrace {#logtrace} Emits trace log message to server log for each [Block](https://clickhouse.tech/docs/en/development/architecture/#block). @@ -369,4 +370,3 @@ Result: └──────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/introspection/) diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index eaea5e250fb..0b5dd7160b8 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -9,10 +9,14 @@ toc_title: IP Addresses Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). +Alias: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. +Alias: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. @@ -49,10 +53,14 @@ Since using ‘xxx’ is highly unusual, this may be changed in the future. We r ### IPv6NumToString(x) {#ipv6numtostringx} Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. -IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples: +IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. + +Alias: `INET6_NTOA`. + +Examples: ``` sql -SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr +SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr; ``` ``` text @@ -113,28 +121,59 @@ LIMIT 10 └────────────────────────────┴────────┘ ``` -## IPv6StringToNum(s) {#ipv6stringtonums} +## IPv6StringToNum {#ipv6stringtonums} -The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes. -If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned. +The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes. + +If the input string contains a valid IPv4 address, returns its IPv6 equivalent. HEX can be uppercase or lowercase. +Alias: `INET6_ATON`. + +**Syntax** + ``` sql -SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0); +IPv6StringToNum(string) ``` -``` text -┌─cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0)─┐ -│ ::ffff:127.0.0.1 │ -└─────────────────────────────────────────────┘ +**Argument** + +- `string` — IP address. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +- IPv6 address in binary format. + +Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md). 
+ +**Example** + +Query: + +``` sql +SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr; ``` +Result: + +``` text +┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐ +│ notaddress │ :: │ +│ 127.0.0.1 │ ::ffff:127.0.0.1 │ +│ 1111::ffff │ 1111::ffff │ +└────────────┴──────────────────────────────────────┘ +``` + +**See Also** + +- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4). + ## IPv4ToIPv6(x) {#ipv4toipv6x} Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples: ``` sql -SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr +SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr; ``` ``` text @@ -167,7 +206,7 @@ SELECT Accepts an IPv4 and an UInt8 value containing the [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing). Return a tuple with two IPv4 containing the lower range and the higher range of the subnet. ``` sql -SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) +SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16); ``` ``` text @@ -222,43 +261,56 @@ SELECT └───────────────────────────────────┴──────────────────────────┘ ``` -## toIPv6(string) {#toipv6string} +## toIPv6 {#toipv6string} -An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../sql-reference/data-types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`. -If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned. +Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. +Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. -``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string -SELECT - toTypeName(IPv6StringToNum(IPv6_string)), - toTypeName(toIPv6(IPv6_string)) +If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. + +**Syntax** + +```sql +toIPv6(string) ``` -``` text -┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ -│ FixedString(16) │ IPv6 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` +**Argument** + +- `string` — IP address. [String](../../sql-reference/data-types/string.md) + +**Returned value** + +- IP address. + +Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md). + +**Examples** + +Query: ``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string +WITH '2001:438:ffff::407d:1bc1' AS IPv6_string SELECT hex(IPv6StringToNum(IPv6_string)), - hex(toIPv6(IPv6_string)) + hex(toIPv6(IPv6_string)); ``` +Result: + ``` text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toIPv6('127.0.0.1') +SELECT toIPv6('127.0.0.1'); ``` +Result: + ``` text ┌─toIPv6('127.0.0.1')─┐ │ ::ffff:127.0.0.1 │ @@ -290,7 +342,7 @@ Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
Query: ```sql -SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr +SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr; ``` Result: @@ -328,7 +380,7 @@ Type: [UInt8](../../sql-reference/data-types/int-uint.md). Query: ``` sql -SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr +SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr; ``` Result: @@ -342,4 +394,55 @@ Result: └──────────────────┴────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) +## isIPAddressInRange {#isipaddressinrange} + +Determines if an IP address is contained in a network represented in the [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) notation. Returns `1` if true, or `0` otherwise. + +**Syntax** + +``` sql +isIPAddressInRange(address, prefix) +``` + +This function accepts both IPv4 and IPv6 addresses (and networks) represented as strings. It returns `0` if the IP version of the address and the CIDR don't match. + +**Arguments** + +- `address` — An IPv4 or IPv6 address. [String](../../sql-reference/data-types/string.md). +- `prefix` — An IPv4 or IPv6 network prefix in CIDR. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +- `1` or `0`. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +``` sql +SELECT isIPAddressInRange('127.0.0.1', '127.0.0.0/8') +``` + +Result: + +``` text +┌─isIPAddressInRange('127.0.0.1', '127.0.0.0/8')─┐ +│ 1 │ +└────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT isIPAddressInRange('127.0.0.1', 'ffff::/16') +``` + +Result: + +``` text +┌─isIPAddressInRange('127.0.0.1', 'ffff::/16')─┐ +│ 0 │ +└──────────────────────────────────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index edee048eb77..d545a0ae4e6 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -16,46 +16,60 @@ The following assumptions are made: ## visitParamHas(params, name) {#visitparamhasparams-name} -Checks whether there is a field with the ‘name’ name. +Checks whether there is a field with the `name` name. + +Alias: `simpleJSONHas`. ## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -Parses UInt64 from the value of the field named ‘name’. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn’t exist, or it exists but doesn’t contain a number, it returns 0. +Parses UInt64 from the value of the field named `name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn’t exist, or it exists but doesn’t contain a number, it returns 0. + +Alias: `simpleJSONExtractUInt`. ## visitParamExtractInt(params, name) {#visitparamextractintparams-name} The same as for Int64. +Alias: `simpleJSONExtractInt`. + ## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} The same as for Float64. +Alias: `simpleJSONExtractFloat`. + ## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} Parses a true/false value. The result is UInt8. +Alias: `simpleJSONExtractBool`. 
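+
+Example (a minimal illustration in the same style as the `visitParamExtractRaw` examples below; the expected values assume ClickHouse's default behavior):
+
+``` sql
+visitParamExtractBool('{"abc":true}', 'abc') = 1;
+visitParamExtractBool('{"abc":false}', 'abc') = 0;
+```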
+ ## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} Returns the value of a field, including separators. +Alias: `simpleJSONExtractRaw`. + Examples: ``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' +visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'; +visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'; ``` ## visitParamExtractString(params, name) {#visitparamextractstringparams-name} Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string. +Alias: `simpleJSONExtractString`. + Examples: ``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' +visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'; +visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'; +visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''; +visitParamExtractString('{"abc":"hello}', 'abc') = ''; ``` There is currently no support for code points in the format `\uXXXX\uYYYY` that are not from the basic multilingual plane (they are converted to CESU-8 instead of UTF-8). @@ -199,7 +213,7 @@ Parses key-value pairs from a JSON where the values are of the given ClickHouse Example: ``` sql -SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)] +SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; ``` ## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} @@ -211,7 +225,7 @@ If the part does not exist or has a wrong type, an empty string will be returned Example: ``` sql -SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' +SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` ## JSONExtractArrayRaw(json\[, indices_or_keys…\]) {#jsonextractarrayrawjson-indices-or-keys} @@ -223,7 +237,7 @@ If the part does not exist or isn’t array, an empty array will be returned. 
Example: ``` sql -SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' +SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']'; ``` ## JSONExtractKeysAndValuesRaw {#json-extract-keys-and-values-raw} @@ -253,7 +267,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-referen Query: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}'); ``` Result: @@ -267,7 +281,7 @@ Result: Query: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b'); ``` Result: @@ -281,7 +295,7 @@ Result: Query: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c'); ``` Result: @@ -292,4 +306,3 @@ Result: └───────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/json_functions/) diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index 13452f88a85..6cce0e4fff5 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -17,4 +17,3 @@ Zero as an argument is considered “false,” while any non-zero value is consi ## xor {#xor} -[Original article](https://clickhouse.tech/docs/en/query_language/functions/logical_functions/) diff --git a/docs/en/sql-reference/functions/machine-learning-functions.md b/docs/en/sql-reference/functions/machine-learning-functions.md index f103a4ea421..60dabd73781 100644 --- a/docs/en/sql-reference/functions/machine-learning-functions.md +++ b/docs/en/sql-reference/functions/machine-learning-functions.md @@ -9,7 +9,7 @@ toc_title: Machine Learning Prediction using fitted regression models uses `evalMLMethod` function. See link in `linearRegression`. -## stochasticLinearRegressionn {#stochastic-linear-regression} +## stochasticLinearRegression {#stochastic-linear-regression} The [stochasticLinearRegression](../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) aggregate function implements stochastic gradient descent method using linear model and MSE loss function. Uses `evalMLMethod` to predict on new data. @@ -36,14 +36,14 @@ bayesAB(distribution_name, higher_is_better, variant_names, x, y) - `higher_is_better` — Boolean flag. [Boolean](../../sql-reference/data-types/boolean.md). Possible values: - - `0` - lower values are considered to be better than higher - - `1` - higher values are considered to be better than lower + - `0` — lower values are considered to be better than higher + - `1` — higher values are considered to be better than lower -- `variant_names` - Variant names. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- `variant_names` — Variant names. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
-- `x` - Numbers of tests for the corresponding variants. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). +- `x` — Numbers of tests for the corresponding variants. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). -- `y` - Numbers of successful tests for the corresponding variants. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). +- `y` — Numbers of successful tests for the corresponding variants. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). !!! note "Note" All three arrays must have the same size. All `x` and `y` values must be non-negative constant numbers. `y` cannot be larger than `x`. @@ -51,8 +51,8 @@ bayesAB(distribution_name, higher_is_better, variant_names, x, y) **Returned values** For each variant the function calculates: -- `beats_control` - long-term probability to out-perform the first (control) variant -- `to_be_best` - long-term probability to out-perform all other variants +- `beats_control` — long-term probability to out-perform the first (control) variant +- `to_be_best` — long-term probability to out-perform all other variants Type: JSON. @@ -94,4 +94,3 @@ Result: } ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/machine-learning-functions/) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index f56a721c0c0..2b3c000bc19 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -54,7 +54,7 @@ If ‘x’ is non-negative, then `erf(x / σ√2)` is the probability that a ran Example (three sigma rule): ``` sql -SELECT erf(3 / sqrt(2)) +SELECT erf(3 / sqrt(2)); ``` ``` text @@ -415,7 +415,7 @@ Result: ## sign(x) {#signx} -The `sign` function can extract the sign of a real number. +Returns the sign of a real number. **Syntax** @@ -433,9 +433,9 @@ sign(x) - 0 for `x = 0` - 1 for `x > 0` -**Example** +**Examples** -Query: +Sign for the zero value: ``` sql SELECT sign(0); @@ -449,7 +449,7 @@ Result: └─────────┘ ``` -Query: +Sign for the positive value: ``` sql SELECT sign(1); @@ -463,7 +463,7 @@ Result: └─────────┘ ``` -Query: +Sign for the negative value: ``` sql SELECT sign(-1); @@ -477,4 +477,3 @@ Result: └──────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/math_functions/) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 04e921b5c55..9d7743e186f 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -728,7 +728,7 @@ The result of the function depends on the affected data blocks and the order of It can reach the neighbor rows only inside the currently processed data block. The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery. +To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. **Arguments** @@ -834,12 +834,12 @@ Calculates the difference between successive row values ​​in the data block. 
Returns 0 for the first row and the difference from the previous row for each subsequent row. !!! warning "Warning" - It can reach the previos row only inside the currently processed data block. + It can reach the previous row only inside the currently processed data block. The result of the function depends on the affected data blocks and the order of data in the block. The rows order used during the calculation of `runningDifference` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery. +To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. Example: @@ -907,66 +907,64 @@ WHERE diff != 1 ## runningDifferenceStartingWithFirstValue {#runningdifferencestartingwithfirstvalue} -Same as for [runningDifference](../../sql-reference/functions/other-functions.md#other_functions-runningdifference), the difference is the value of the first row, returned the value of the first row, and each subsequent row returns the difference from the previous row. +Same as for [runningDifference](./other-functions.md#other_functions-runningdifference), the difference is the value of the first row, returned the value of the first row, and each subsequent row returns the difference from the previous row. ## runningConcurrency {#runningconcurrency} -Given a series of beginning time and ending time of events, this function calculates concurrency of the events at each of the data point, that is, the beginning time. +Calculates the number of concurrent events. +Each event has a start time and an end time. The start time is included in the event, while the end time is excluded. Columns with a start time and an end time must be of the same data type. +The function calculates the total number of active (concurrent) events for each event start time. + !!! warning "Warning" - Events spanning multiple data blocks will not be processed correctly. The function resets its state for each new data block. - -The result of the function depends on the order of data in the block. It assumes the beginning time is sorted in ascending order. + Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. + Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. **Syntax** ``` sql -runningConcurrency(begin, end) +runningConcurrency(start, end) ``` **Arguments** -- `begin` — A column for the beginning time of events (inclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `end` — A column for the ending time of events (exclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). - -Note that two columns `begin` and `end` must have the same type. +- `start` — A column with the start time of events. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `end` — A column with the end time of events. 
[Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). **Returned values** -- The concurrency of events at the data point. +- The number of concurrent events at each event start time. Type: [UInt32](../../sql-reference/data-types/int-uint.md) **Example** -Input table: +Consider the table: ``` text -┌───────────────begin─┬─────────────────end─┐ -│ 2020-12-01 00:00:00 │ 2020-12-01 00:59:59 │ -│ 2020-12-01 00:30:00 │ 2020-12-01 00:59:59 │ -│ 2020-12-01 00:40:00 │ 2020-12-01 01:30:30 │ -│ 2020-12-01 01:10:00 │ 2020-12-01 01:30:30 │ -│ 2020-12-01 01:50:00 │ 2020-12-01 01:59:59 │ -└─────────────────────┴─────────────────────┘ +┌──────start─┬────────end─┐ +│ 2021-03-03 │ 2021-03-11 │ +│ 2021-03-06 │ 2021-03-12 │ +│ 2021-03-07 │ 2021-03-08 │ +│ 2021-03-11 │ 2021-03-12 │ +└────────────┴────────────┘ ``` Query: ``` sql -SELECT runningConcurrency(begin, end) FROM example +SELECT start, runningConcurrency(start, end) FROM example_table; ``` Result: ``` text -┌─runningConcurrency(begin, end)─┐ -│ 1 │ -│ 2 │ -│ 3 │ -│ 2 │ -│ 1 │ -└────────────────────────────────┘ +┌──────start─┬─runningConcurrency(start, end)─┐ +│ 2021-03-03 │ 1 │ +│ 2021-03-06 │ 2 │ +│ 2021-03-07 │ 3 │ +│ 2021-03-11 │ 2 │ +└────────────┴────────────────────────────────┘ ``` ## MACNumToString(num) {#macnumtostringnum} @@ -1194,6 +1192,109 @@ SELECT defaultValueOfTypeName('Nullable(Int8)') └──────────────────────────────────────────┘ ``` +## indexHint {#indexhint} +The function is intended for debugging and introspection purposes. The function ignores it's argument and always returns 1. Arguments are not even evaluated. + +But for the purpose of index analysis, the argument of this function is analyzed as if it was present directly without being wrapped inside `indexHint` function. This allows to select data in index ranges by the corresponding condition but without further filtering by this condition. The index in ClickHouse is sparse and using `indexHint` will yield more data than specifying the same condition directly. + +**Syntax** + +```sql +SELECT * FROM table WHERE indexHint() +``` + +**Returned value** + +1. Type: [Uint8](https://clickhouse.yandex/docs/en/data_types/int_uint/#diapazony-uint). + +**Example** + +Here is the example of test data from the table [ontime](../../getting-started/example-datasets/ontime.md). + +Input table: + +```sql +SELECT count() FROM ontime +``` + +```text +┌─count()─┐ +│ 4276457 │ +└─────────┘ +``` + +The table has indexes on the fields `(FlightDate, (Year, FlightDate))`. + +Create a query, where the index is not used. + +Query: + +```sql +SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k +``` + +ClickHouse processed the entire table (`Processed 4.28 million rows`). + +Result: + +```text +┌──────────k─┬─count()─┐ +│ 2017-01-01 │ 13970 │ +│ 2017-01-02 │ 15882 │ +........................ +│ 2017-09-28 │ 16411 │ +│ 2017-09-29 │ 16384 │ +│ 2017-09-30 │ 12520 │ +└────────────┴─────────┘ +``` + +To apply the index, select a specific date. + +Query: + +```sql +SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k +``` + +By using the index, ClickHouse processed a significantly smaller number of rows (`Processed 32.74 thousand rows`). + +Result: + +```text +┌──────────k─┬─count()─┐ +│ 2017-09-15 │ 16428 │ +└────────────┴─────────┘ +``` + +Now wrap the expression `k = '2017-09-15'` into `indexHint` function. 
+ +Query: + +```sql +SELECT + FlightDate AS k, + count() +FROM ontime +WHERE indexHint(k = '2017-09-15') +GROUP BY k +ORDER BY k ASC +``` + +ClickHouse used the index in the same way as the previous time (`Processed 32.74 thousand rows`). +The expression `k = '2017-09-15'` was not used when generating the result. +In examle the `indexHint` function allows to see adjacent dates. + +Result: + +```text +┌──────────k─┬─count()─┐ +│ 2017-09-14 │ 7071 │ +│ 2017-09-15 │ 16428 │ +│ 2017-09-16 │ 1077 │ +│ 2017-09-30 │ 8167 │ +└────────────┴─────────┘ +``` + ## replicate {#other-functions-replicate} Creates an array with a single value. @@ -1762,7 +1863,6 @@ Result: ``` - ## randomStringUTF8 {#randomstringutf8} Generates a random string of a specified length. Result string contains valid UTF-8 code points. The value of code points may be outside of the range of assigned Unicode. @@ -1971,4 +2071,3 @@ Result: - [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) -[Original article](https://clickhouse.tech/docs/en/query_language/functions/other_functions/) diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 2b9846344e4..aab9483de45 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -102,4 +102,3 @@ FROM numbers(3) │ aeca2A │ └───────────────────────────────────────┘ -[Original article](https://clickhouse.tech/docs/en/query_language/functions/random_functions/) diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 83db1975366..c0bd44a6467 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -35,7 +35,7 @@ The function returns the nearest number of the specified order. In case when giv round(expression [, decimal_places]) ``` -**Arguments:** +**Arguments** - `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../../sql-reference/data-types/index.md#data_types). - `decimal-places` — An integer value. @@ -185,4 +185,3 @@ Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rou Accepts a number and rounds it down to an element in the specified array. If the value is less than the lowest bound, the lowest bound is returned. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/rounding_functions/) diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index c70ee20f076..bd7e209549c 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -150,4 +150,3 @@ Result: └───────────────────────────────────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/splitting_merging_functions/) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 191bd100dda..85570cb408d 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -73,19 +73,19 @@ Returns 1, if the set of bytes is valid UTF-8 encoded, otherwise 0. 
Replaces invalid UTF-8 characters by the `�` (U+FFFD) character. All running in a row invalid characters are collapsed into the one replacement character. ``` sql -toValidUTF8( input_string ) +toValidUTF8(input_string) ``` **Arguments** -- input_string — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object. +- `input_string` — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object. Returned value: Valid UTF-8 string. **Example** ``` sql -SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') +SELECT toValidUTF8('\x61\xF0\x80\x80\x80b'); ``` ``` text @@ -98,6 +98,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Repeats a string as many times as specified and concatenates the replicated values as a single string. +Alias: `REPEAT`. + **Syntax** ``` sql @@ -120,7 +122,7 @@ Type: `String`. Query: ``` sql -SELECT repeat('abc', 10) +SELECT repeat('abc', 10); ``` Result: @@ -188,7 +190,7 @@ If any of argument values is `NULL`, `concat` returns `NULL`. Query: ``` sql -SELECT concat('Hello, ', 'World!') +SELECT concat('Hello, ', 'World!'); ``` Result: @@ -243,7 +245,7 @@ SELECT * from key_val; Query: ``` sql -SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY concatAssumeInjective(key1, key2) +SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY concatAssumeInjective(key1, key2); ``` Result: @@ -276,10 +278,14 @@ Returns the string ‘s’ that was converted from the encoding in ‘from’ to Encodes ‘s’ string into base64 +Alias: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception. +Alias: `FROM_BASE64`. + ## tryBase64Decode(s) {#trybase64decode} Similar to base64Decode, but in case of error an empty string would be returned. @@ -330,8 +336,8 @@ trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) **Arguments** -- `trim_character` — specified characters for trim. [String](../../sql-reference/data-types/string.md). -- `input_string` — string for trim. [String](../../sql-reference/data-types/string.md). +- `trim_character` — Specified characters for trim. [String](../../sql-reference/data-types/string.md). +- `input_string` — String for trim. [String](../../sql-reference/data-types/string.md). **Returned value** @@ -344,7 +350,7 @@ Type: `String`. Query: ``` sql -SELECT trim(BOTH ' ()' FROM '( Hello, world! )') +SELECT trim(BOTH ' ()' FROM '( Hello, world! )'); ``` Result: @@ -382,7 +388,7 @@ Type: `String`. Query: ``` sql -SELECT trimLeft(' Hello, world! ') +SELECT trimLeft(' Hello, world! '); ``` Result: @@ -420,7 +426,7 @@ Type: `String`. Query: ``` sql -SELECT trimRight(' Hello, world! ') +SELECT trimRight(' Hello, world! '); ``` Result: @@ -458,7 +464,7 @@ Type: `String`. Query: ``` sql -SELECT trimBoth(' Hello, world! ') +SELECT trimBoth(' Hello, world! '); ``` Result: @@ -491,7 +497,8 @@ The result type is UInt64. Replaces literals, sequences of literals and complex aliases with placeholders. -**Syntax** +**Syntax** + ``` sql normalizeQuery(x) ``` @@ -611,7 +618,7 @@ This function also replaces numeric character references with Unicode characters decodeXMLComponent(x) ``` -**Parameters** +**Arguments** - `x` — A sequence of characters. [String](../../sql-reference/data-types/string.md). 
@@ -642,4 +649,65 @@ Result: - [List of XML and HTML character entity references](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references) -[Original article](https://clickhouse.tech/docs/en/query_language/functions/string_functions/) + +## extractTextFromHTML {#extracttextfromhtml} + +A function to extract text from HTML or XHTML. +It does not necessarily 100% conform to any of the HTML, XML or XHTML standards, but the implementation is reasonably accurate and it is fast. The rules are the following: + +1. Comments are skipped. Example: ``. Comment must end with `-->`. Nested comments are not possible. +Note: constructions like `` and `` are not valid comments in HTML but they are skipped by other rules. +2. CDATA is pasted verbatim. Note: CDATA is XML/XHTML specific. But it is processed for "best-effort" approach. +3. `script` and `style` elements are removed with all their content. Note: it is assumed that closing tag cannot appear inside content. For example, in JS string literal has to be escaped like `"<\/script>"`. +Note: comments and CDATA are possible inside `script` or `style` - then closing tags are not searched inside CDATA. Example: `]]>`. But they are still searched inside comments. Sometimes it becomes complicated: ` var y = "-->"; alert(x + y);` +Note: `script` and `style` can be the names of XML namespaces - then they are not treated like usual `script` or `style` elements. Example: `Hello`. +Note: whitespaces are possible after closing tag name: `` but not before: `< / script>`. +4. Other tags or tag-like elements are skipped without inner content. Example: `.` +Note: it is expected that this HTML is illegal: `` +Note: it also skips something like tags: `<>`, ``, etc. +Note: tag without end is skipped to the end of input: `world`, `Helloworld` - there is no whitespace in HTML, but the function inserts it. Also consider: `Hello

world

`, `Hello
world`. This behavior is reasonable for data analysis, e.g. to convert HTML to a bag of words. +7. Also note that correct handling of whitespaces requires the support of `
` and CSS `display` and `white-space` properties.
+
+**Syntax**
+
+``` sql
+extractTextFromHTML(x)
+```
+
+**Arguments**
+
+-   `x` — input text. [String](../../sql-reference/data-types/string.md). 
+
+**Returned value**
+
+-   Extracted text.
+
+Type: [String](../../sql-reference/data-types/string.md).
+
+**Example**
+
+The first example contains several tags and a comment and also shows whitespace processing.
+The second example shows `CDATA` and `script` tag processing.
+In the third example text is extracted from the full HTML response received by the [url](../../sql-reference/table-functions/url.md) function.
+
+Query:
+
+``` sql
+SELECT extractTextFromHTML(' <p> A text <i>with</i><b>tags</b>. <!-- comments --> </p> 
'); +SELECT extractTextFromHTML('CDATA]]> '); +SELECT extractTextFromHTML(html) FROM url('http://www.donothingfor2minutes.com/', RawBLOB, 'html String'); +``` + +Result: + +``` text +A text with tags . +The content within CDATA +Do Nothing for 2 Minutes 2:00   +``` diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 8905500995c..144b4fbc1da 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -92,4 +92,3 @@ Predefined characters: `\0`, `\\`, `|`, `(`, `)`, `^`, `$`, `.`, `[`, `]`, `?`, This implementation slightly differs from re2::RE2::QuoteMeta. It escapes zero byte as `\0` instead of `\x00` and it escapes only required characters. For more information, see the link: [RE2](https://github.com/google/re2/blob/master/re2/re2.cc#L473) -[Original article](https://clickhouse.tech/docs/en/query_language/functions/string_replace_functions/) diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 83b0edea438..01b1dd2d004 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -12,7 +12,9 @@ The search is case-sensitive by default in all these functions. There are separa ## position(haystack, needle), locate(haystack, needle) {#position} -Returns the position (in bytes) of the found substring in the string, starting from 1. +Searches for the substring `needle` in the string `haystack`. + +Returns the position (in bytes) of the found substring in the string, starting from 1. For a case-insensitive search, use the function [positionCaseInsensitive](#positioncaseinsensitive). @@ -20,15 +22,22 @@ For a case-insensitive search, use the function [positionCaseInsensitive](#posit ``` sql position(haystack, needle[, start_pos]) -``` +``` + +``` sql +position(needle IN haystack) +``` Alias: `locate(haystack, needle[, start_pos])`. +!!! note "Note" + Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. + **Arguments** -- `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) +- `haystack` — String, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` – Position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md). Optional. 
**Returned values** @@ -44,7 +53,7 @@ The phrase “Hello, world!” contains a set of bytes representing a single-byt Query: ``` sql -SELECT position('Hello, world!', '!') +SELECT position('Hello, world!', '!'); ``` Result: @@ -72,7 +81,7 @@ The same phrase in Russian contains characters which can’t be represented usin Query: ``` sql -SELECT position('Привет, мир!', '!') +SELECT position('Привет, мир!', '!'); ``` Result: @@ -83,6 +92,36 @@ Result: └───────────────────────────────┘ ``` +**Examples for POSITION(needle IN haystack) syntax** + +Query: + +```sql +SELECT 3 = position('c' IN 'abc'); +``` + +Result: + +```text +┌─equals(3, position('abc', 'c'))─┐ +│ 1 │ +└─────────────────────────────────┘ +``` + +Query: + +```sql +SELECT 6 = position('/' IN s) FROM (SELECT 'Hello/World' AS s); +``` + +Result: + +```text +┌─equals(6, position(s, '/'))─┐ +│ 1 │ +└─────────────────────────────┘ +``` + ## positionCaseInsensitive {#positioncaseinsensitive} The same as [position](#position) returns the position (in bytes) of the found substring in the string, starting from 1. Use the function for a case-insensitive search. @@ -97,9 +136,9 @@ positionCaseInsensitive(haystack, needle[, start_pos]) **Arguments** -- `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) +- `haystack` — String, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` — Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md). **Returned values** @@ -113,7 +152,7 @@ Type: `Integer`. Query: ``` sql -SELECT positionCaseInsensitive('Hello, world!', 'hello') +SELECT positionCaseInsensitive('Hello, world!', 'hello'); ``` Result: @@ -140,9 +179,9 @@ positionUTF8(haystack, needle[, start_pos]) **Arguments** -- `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) +- `haystack` — String, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` — Optional parameter, position of the first character in the string to start search. 
[UInt](../../sql-reference/data-types/int-uint.md) **Returned values** @@ -158,7 +197,7 @@ The phrase “Hello, world!” in Russian contains a set of Unicode points repre Query: ``` sql -SELECT positionUTF8('Привет, мир!', '!') +SELECT positionUTF8('Привет, мир!', '!'); ``` Result: @@ -174,7 +213,7 @@ The phrase “Salut, étudiante!”, where character `é` can be represented usi Query for the letter `é`, which is represented one Unicode point `U+00E9`: ``` sql -SELECT positionUTF8('Salut, étudiante!', '!') +SELECT positionUTF8('Salut, étudiante!', '!'); ``` Result: @@ -188,7 +227,7 @@ Result: Query for the letter `é`, which is represented two Unicode points `U+0065U+0301`: ``` sql -SELECT positionUTF8('Salut, étudiante!', '!') +SELECT positionUTF8('Salut, étudiante!', '!'); ``` Result: @@ -213,9 +252,9 @@ positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) **Arguments** -- `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) +- `haystack` — String, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` — Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Returned value** @@ -229,7 +268,7 @@ Type: `Integer`. Query: ``` sql -SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') +SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир'); ``` Result: @@ -258,8 +297,8 @@ multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) **Arguments** -- `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `haystack` — String, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). **Returned values** @@ -270,7 +309,7 @@ multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) Query: ``` sql -SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) +SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']); ``` Result: @@ -387,7 +426,7 @@ If `haystack` doesn’t match the `pattern` regex, an array of empty arrays is r Query: ``` sql -SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') +SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); ``` Result: @@ -428,7 +467,7 @@ If `haystack` doesn’t match the `pattern` regex, an empty array is returned. 
Query: ``` sql -SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') +SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); ``` Result: @@ -506,7 +545,7 @@ Input table: Query: ``` sql -SELECT * FROM Months WHERE ilike(name, '%j%') +SELECT * FROM Months WHERE ilike(name, '%j%'); ``` Result: @@ -618,7 +657,7 @@ countSubstringsCaseInsensitive(haystack, needle[, start_pos]) - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md). +- `start_pos` — Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md). **Returned values** @@ -631,7 +670,7 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Query: ``` sql -select countSubstringsCaseInsensitive('aba', 'B'); +SELECT countSubstringsCaseInsensitive('aba', 'B'); ``` Result: @@ -684,7 +723,7 @@ SELECT countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md). +- `start_pos` — Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md). **Returned values** @@ -772,5 +811,3 @@ Result: │ 2 │ └───────────────────────────────┘ ``` - -[Original article](https://clickhouse.tech/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 1006b68b8ee..86442835425 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -47,7 +47,7 @@ You can use the `EXCEPT` expression to skip columns as a result of the query. **Arguments** -- `x` - A `tuple` function, column, or tuple of elements. [Tuple](../../sql-reference/data-types/tuple.md). +- `x` — A `tuple` function, column, or tuple of elements. [Tuple](../../sql-reference/data-types/tuple.md). **Returned value** @@ -111,4 +111,55 @@ Result: - [Tuple](../../sql-reference/data-types/tuple.md) -[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/) +## tupleHammingDistance {#tuplehammingdistance} + +Returns the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) between two tuples of the same size. + +**Syntax** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Arguments** + +- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). + +Tuples should have the same type of the elements. + +**Returned value** + +- The Hamming distance. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Examples** + +Query: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Result: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 2b3a9d9103f..8b0710c0182 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -66,7 +66,6 @@ Result: - [Map(key, value)](../../sql-reference/data-types/map.md) data type - ## mapAdd {#function-mapadd} Collect all the keys and sum corresponding values. @@ -174,4 +173,129 @@ Result: └──────────────────────────────┴───────────────────────────────────┘ ``` +## mapContains {#mapcontains} + +Determines whether the `map` contains the `key` parameter. + +**Syntax** + +``` sql +mapContains(map, key) +``` + +**Parameters** + +- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `key` — Key. Type matches the type of keys of `map` parameter. + +**Returned value** + +- `1` if `map` contains `key`, `0` if not. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapContains(a, 'name') FROM test; + +``` + +Result: + +```text +┌─mapContains(a, 'name')─┐ +│ 1 │ +│ 0 │ +└────────────────────────┘ +``` + +## mapKeys {#mapkeys} + +Returns all keys from the `map` parameter. + +**Syntax** + +```sql +mapKeys(map) +``` + +**Parameters** + +- `map` — Map. [Map](../../sql-reference/data-types/map.md). + +**Returned value** + +- Array containing all keys from the `map`. + +Type: [Array](../../sql-reference/data-types/array.md). + +**Example** + +Query: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapKeys(a) FROM test; +``` + +Result: + +```text +┌─mapKeys(a)────────────┐ +│ ['name','age'] │ +│ ['number','position'] │ +└───────────────────────┘ +``` + +## mapValues {#mapvalues} + +Returns all values from the `map` parameter. + +**Syntax** + +```sql +mapKeys(map) +``` + +**Parameters** + +- `map` — Map. [Map](../../sql-reference/data-types/map.md). + +**Returned value** + +- Array containing all the values from `map`. + +Type: [Array](../../sql-reference/data-types/array.md). 
+ +**Example** + +Query: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapValues(a) FROM test; +``` + +Result: + +```text +┌─mapValues(a)─────┐ +│ ['eleven','11'] │ +│ ['twelve','6.0'] │ +└──────────────────┘ +``` + [Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-map-functions/) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 0ea2bf0f1a6..d8d13d81d97 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** +Query: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Result: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** +Query: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Result: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -124,6 +140,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Alias: `DATE`. + ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} @@ -168,20 +186,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -213,20 +239,28 @@ A value in the `Nullable(Decimal(P,S))` data type. 
The value contains: **Example** +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +292,18 @@ Conversion between numeric types uses the same rules as assignments between diff Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg` In this case, the time is formatted according to the specified time zone. +**Example** + +Query: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Result: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,76 +321,159 @@ If the string has fewer bytes than N, it is padded with null bytes to the right. Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found. -Example: +**Example** + +Query: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Query: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ └────────────┴───────┘ ``` -## reinterpret(x, T) {#type_conversion_function-reinterpret} +## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264} -Performs byte reinterpretation of ‘x’ as ‘t’ data type. +## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264} -Following reinterpretations are allowed: -1. Any type that has fixed size and value of that type can be represented continuously into FixedString. -2. Any type that if value of that type can be represented continuously into String. Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long. -3. FixedString, String, types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID) into types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID) into FixedString, +## reinterpretAsFloat(32\|64) {#reinterpretasfloat3264} + +## reinterpretAsDate {#reinterpretasdate} + +## reinterpretAsDateTime {#reinterpretasdatetime} + +These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn’t long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. 
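+
+A minimal illustration (this particular example is not from the reference, but it only uses the functions described above): reinterpreting short strings as integers makes the little-endian byte order visible.
+
+``` sql
+-- 'A' is the single byte 0x41; 'AB' is the bytes 0x41 0x42 read in host order (little endian)
+SELECT
+    reinterpretAsUInt8('A') AS one_byte,
+    reinterpretAsUInt16('AB') AS two_bytes;
+```
+
+``` text
+┌─one_byte─┬─two_bytes─┐
+│       65 │     16961 │
+└──────────┴───────────┘
+```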
+ +## reinterpretAsString {#type_conversion_functions-reinterpretAsString} + +This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long. + +## reinterpretAsFixedString {#reinterpretasfixedstring} + +This function accepts a number or date or date with time, and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long. + +## reinterpretAsUUID {#reinterpretasuuid} + +This function accepts 16 bytes string, and returns UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. + +**Syntax** ``` sql +reinterpretAsUUID(fixed_string) +``` + +**Arguments** + +- `fixed_string` — Big-endian byte string. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). + +**Returned value** + +- The UUID type value. [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). + +**Examples** + +String to UUID. + +Query: + +``` sql +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); +``` + +Result: + +``` text +┌─reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')))─┐ +│ 08090a0b-0c0d-0e0f-0001-020304050607 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +Going back and forth from String to UUID. + +Query: + +``` sql +WITH + generateUUIDv4() AS uuid, + identity(lower(hex(reverse(reinterpretAsString(uuid))))) AS str, + reinterpretAsUUID(reverse(unhex(str))) AS uuid2 +SELECT uuid = uuid2; +``` + +Result: + +``` text +┌─equals(uuid, uuid2)─┐ +│ 1 │ +└─────────────────────┘ +``` + +## reinterpret(x, T) {#type_conversion_function-reinterpret} + +Use the same source in-memory bytes sequence for `x` value and reinterpret it to destination type + +Query: +```sql SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint, reinterpret(toInt8(1), 'Float32') as int_to_float, reinterpret('1', 'UInt32') as string_to_int; ``` -``` text +Result: + +``` ┌─int_to_uint─┬─int_to_float─┬─string_to_int─┐ │ 255 │ 1e-45 │ 49 │ └─────────────┴──────────────┴───────────────┘ ``` -## reinterpretAsUInt(8\|16\|32\|64\|256) {#reinterpretAsUInt8163264256} - -## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#reinterpretAsInt8163264128256} - -## reinterpretAsDecimal(32\|64\|128\|256) {#reinterpretAsDecimal3264128256} - -## reinterpretAsFloat(32\|64) {#type_conversion_function-reinterpretAsFloat} - -## reinterpretAsDate {#type_conversion_function-reinterpretAsDate} - -## reinterpretAsDateTime {#type_conversion_function-reinterpretAsDateTime} - -## reinterpretAsDateTime64 {#type_conversion_function-reinterpretAsDateTime64} - -## reinterpretAsString {#type_conversion_function-reinterpretAsString} - -## reinterpretAsFixedString {#type_conversion_function-reinterpretAsFixedString} - -## reinterpretAsUUID {#type_conversion_function-reinterpretAsUUID} - -These functions are aliases for `reinterpret` function. - ## CAST(x, T) {#type_conversion_function-cast} -Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. 
+Converts input value `x` to the `T` data type. Unlike to `reinterpret` function use external representation of `x` value. -Example: +The syntax `CAST(x AS t)` is also supported. + +Note, that if value `x` does not fit the bounds of type T, the function overflows. For example, CAST(-1, 'UInt8') returns 255. + +**Examples** + +Query: + +```sql +SELECT + cast(toInt8(-1), 'UInt8') AS cast_int_to_uint, + cast(toInt8(1), 'Float32') AS cast_int_to_float, + cast('1', 'UInt32') AS cast_string_to_int +``` + +Result: + +``` +┌─cast_int_to_uint─┬─cast_int_to_float─┬─cast_string_to_int─┐ +│ 255 │ 1 │ 1 │ +└──────────────────┴───────────────────┴────────────────────┘ +``` + +Query: ``` sql SELECT @@ -358,9 +481,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Result: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -369,12 +494,18 @@ SELECT Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. + +**Example** + +Query: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -382,10 +513,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -399,15 +534,19 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast -does not allow overflow of numeric types during cast if type value x does not fit -bounds of type T. +Converts `x` to the `T` data type. + +The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. + +**Example** + +Query: -Example ``` sql SELECT cast(-1, 'UInt8') as uint8; ``` +Result: ``` text ┌─uint8─┐ @@ -415,38 +554,46 @@ SELECT cast(-1, 'UInt8') as uint8; └───────┘ ``` +Query: + ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` +Result: + ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. - ``` ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts ‘x’ to the ‘t’ data type. Always returns nullable type and returns NULL -if the casted value is not representable in the target type. +Converts input value `x` to the specified data type `T`. Always returns [Nullable](../../sql-reference/data-types/nullable.md) type and returns [NULL](../../sql-reference/syntax.md#null-literal) if the casted value is not representable in the target type. 
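+
+One possible usage pattern (an illustrative sketch, not from the reference): because the result is `Nullable`, it can be combined with `ifNull` to substitute a default value when the conversion fails.
+
+``` sql
+SELECT ifNull(accurateCastOrNull('not a number', 'UInt8'), 0) AS value_or_default;
+```
+
+``` text
+┌─value_or_default─┐
+│                0 │
+└──────────────────┘
+```
+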
-Example: +**Syntax** + +```sql +accurateCastOrNull(x, T) +``` + +**Parameters** + +- `x` — Input value. +- `T` — The name of the returned data type. + +**Returned value** + +- The value, converted to the specified data type `T`. + +**Example** + +Query: ``` sql -SELECT - accurateCastOrNull(-1, 'UInt8') as uint8, - accurateCastOrNull(128, 'Int8') as int8, - accurateCastOrNull('Test', 'FixedString(2)') as fixed_string +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` -``` text -┌─uint8─┬─int8─┬─fixed_string─┐ -│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└───────┴──────┴──────────────┘┘ -``` - -``` sql -SELECT toTypeName(accurateCastOrNull(5, 'UInt8')) -``` +Result: ``` text ┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ @@ -454,6 +601,23 @@ SELECT toTypeName(accurateCastOrNull(5, 'UInt8')) └────────────────────────────────────────────┘ ``` +Query: + +``` sql +SELECT + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; +``` + +Result: + +``` text +┌─uint8─┬─int8─┬─fixed_string─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────┴──────┴──────────────┘ +``` + ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Converts a Number type argument to an [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type. @@ -481,6 +645,8 @@ toIntervalYear(number) **Example** +Query: + ``` sql WITH toDate('2019-01-01') AS date, @@ -488,9 +654,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Result: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -498,6 +666,7 @@ SELECT ``` ## parseDateTimeBestEffort {#parsedatetimebesteffort} +## parseDateTime32BestEffort {#parsedatetime32besteffort} Converts a date and time in the [String](../../sql-reference/data-types/string.md) representation to [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) data type. @@ -506,7 +675,7 @@ The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 112 **Syntax** ``` sql -parseDateTimeBestEffort(time_string [, time_zone]); +parseDateTimeBestEffort(time_string [, time_zone]) ``` **Arguments** @@ -549,7 +718,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -564,7 +733,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -579,7 +748,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -593,7 +762,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Result: @@ -613,12 +782,12 @@ Result: ## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS} -This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. +This function is similar to [parseDateTimeBestEffort](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. 
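+
+To make the difference concrete, here is an illustrative comparison (the literal value is made up; per the description above, the US variant reads `02/10/2021` as February 10, while `parseDateTimeBestEffort` reads it as October 2):
+
+``` sql
+SELECT
+    parseDateTimeBestEffort('02/10/2021') AS day_month_year,
+    parseDateTimeBestEffortUS('02/10/2021') AS month_day_year;
+```
+
+``` text
+┌──────day_month_year─┬──────month_day_year─┐
+│ 2021-10-02 00:00:00 │ 2021-02-10 00:00:00 │
+└─────────────────────┴─────────────────────┘
+```
+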
**Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Arguments** @@ -686,13 +855,238 @@ Result: ``` ## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} +## parseDateTime32BestEffortOrNull {#parsedatetime32besteffortornull} -Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns null when it encounters a date format that cannot be processed. +Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} +## parseDateTime32BestEffortOrZero {#parsedatetime32besteffortorzero} Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. + +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) +``` + +**Parameters** + +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned values** + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- `NULL` if the input string cannot be converted to the `DateTime` data type. 
+ +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. + +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) +``` + +**Parameters** + +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned values** + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- Zero date or zero date with time if the input string cannot be converted to the `DateTime` data type. 
+ +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 1970-01-01 00:00:00 │ +└─────────────────────────────────┘ +``` + +## parseDateTime64BestEffort {#parsedatetime64besteffort} + +Same as [parseDateTimeBestEffort](#parsedatetimebesteffort) function but also parse milliseconds and microseconds and return `DateTime64(3)` or `DateTime64(6)` data types. + +**Syntax** + +``` sql +parseDateTime64BestEffort(time_string [, precision [, time_zone]]) +``` + +**Parameters** + +- `time_string` — String containing a date or date with time to convert. [String](../../sql-reference/data-types/string.md). +- `precision` — `3` for milliseconds, `6` for microseconds. Default `3`. Optional [UInt8](../../sql-reference/data-types/int-uint.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). + +**Examples** + +Query: + +```sql +SELECT parseDateTime64BestEffort('2021-01-01') AS a, toTypeName(a) AS t +UNION ALL +SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346') AS a, toTypeName(a) AS t +UNION ALL +SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',6) AS a, toTypeName(a) AS t +UNION ALL +SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',3,'Europe/Moscow') AS a, toTypeName(a) AS t +FORMAT PrettyCompactMonoBlcok +``` + +Result: + +``` +┌──────────────────────────a─┬─t──────────────────────────────┐ +│ 2021-01-01 01:01:00.123000 │ DateTime64(3) │ +│ 2021-01-01 00:00:00.000000 │ DateTime64(3) │ +│ 2021-01-01 01:01:00.123460 │ DateTime64(6) │ +│ 2020-12-31 22:01:00.123000 │ DateTime64(3, 'Europe/Moscow') │ +└────────────────────────────┴────────────────────────────────┘ +``` + +## parseDateTime64BestEffortOrNull {#parsedatetime32besteffortornull} + +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that it returns `NULL` when it encounters a date format that cannot be processed. + +## parseDateTime64BestEffortOrZero {#parsedatetime64besteffortorzero} + +Same as for [parseDateTime64BestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. + + ## toLowCardinality {#tolowcardinality} Converts input parameter to the [LowCardianlity](../../sql-reference/data-types/lowcardinality.md) version of same data type. 
@@ -701,7 +1095,7 @@ To convert data from the `LowCardinality` data type use the [CAST](#type_convers **Syntax** -``` sql +```sql toLowCardinality(expr) ``` @@ -719,8 +1113,8 @@ Type: `LowCardinality(expr_result_type)` Query: -``` sql -SELECT toLowCardinality('1') +```sql +SELECT toLowCardinality('1'); ``` Result: @@ -737,7 +1131,8 @@ Result: ## toUnixTimestamp64Nano {#tounixtimestamp64nano} -Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Input value is scaled up or down appropriately depending on it precision. Please note that output value is a timestamp in UTC, not in timezone of `DateTime64`. +Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. +Input value is scaled up or down appropriately depending on it precision. Please note that output value is a timestamp in UTC, not in timezone of `DateTime64`. **Syntax** @@ -759,7 +1154,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` Result: @@ -770,9 +1165,11 @@ Result: └──────────────────────────────┘ ``` +Query: + ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` Result: @@ -806,13 +1203,17 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. -**Examples** +**Example** + +Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` +Result: + ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -844,7 +1245,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -885,7 +1286,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -898,4 +1299,3 @@ Result: └───────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/type_conversion_functions/) diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 9e79ef2d0cb..9feb7a3c711 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -55,7 +55,7 @@ Type: `String`. **Example** ``` sql -SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') +SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk'); ``` ``` text @@ -98,7 +98,7 @@ Type: `String`. **Example** ``` sql -SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') +SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk'); ``` ``` text @@ -420,4 +420,3 @@ Removes the query string and fragment identifier. The question mark and number s Removes the ‘name’ URL parameter, if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. 
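+
+For illustration (a sketch with a made-up URL), removing the parameter `b`:
+
+``` sql
+SELECT cutURLParameter('http://example.com/?a=1&b=2&c=3', 'b') AS url;
+```
+
+``` text
+┌─url─────────────────────────┐
+│ http://example.com/?a=1&c=3 │
+└─────────────────────────────┘
+```
+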
-[Original article](https://clickhouse.tech/docs/en/query_language/functions/url_functions/) diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 01a61c65b67..e7e55c699cd 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -165,4 +165,3 @@ SELECT - [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other) -[Original article](https://clickhouse.tech/docs/en/query_language/functions/uuid_function/) diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index 56530b5e83b..941f75ff006 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -112,7 +112,7 @@ Finds the highest continent in the hierarchy for the region. **Syntax** ``` sql -regionToTopContinent(id[, geobase]); +regionToTopContinent(id[, geobase]) ``` **Arguments** @@ -150,4 +150,3 @@ Accepts a UInt32 number – the region ID from the Yandex geobase. A string with `ua` and `uk` both mean Ukrainian. -[Original article](https://clickhouse.tech/docs/en/query_language/functions/ym_dict_functions/) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index bfa8b3d1003..0abeabc7f57 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. -Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), then use a subquery. +Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery. The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. +ClickHouse allows types to differ in the left and the right parts of `IN` subquery. In this case it converts the left side value to the type of the right side, as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. That means, that the data type becomes [Nullable](../../sql-reference/data-types/nullable.md), and if the conversion cannot be performed, it returns [NULL](../../sql-reference/syntax.md#null-literal). + +**Example** + +Query: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Result: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the ‘users’ temporary table, which should be filtered. 
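+
+A minimal sketch of this form (the table and its values are made up for illustration):
+
+``` sql
+CREATE TEMPORARY TABLE users (UserID UInt64);
+INSERT INTO users VALUES (123), (456);
+
+SELECT count() FROM numbers(1000) WHERE number IN users;
+```
+
+``` text
+┌─count()─┐
+│       2 │
+└─────────┘
+```
+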
If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. @@ -203,7 +221,7 @@ It also makes sense to specify a local table in the `GLOBAL IN` clause, in case When max_parallel_replicas is greater than 1, distributed queries are further transformed. For example, the following: ```sql -SEELECT CounterID, count() FROM distributed_table_1 WHERE UserID IN (SELECT UserID FROM local_table_2 WHERE CounterID < 100) +SELECT CounterID, count() FROM distributed_table_1 WHERE UserID IN (SELECT UserID FROM local_table_2 WHERE CounterID < 100) SETTINGS max_parallel_replicas=3 ``` diff --git a/docs/en/sql-reference/operators/index.md b/docs/en/sql-reference/operators/index.md index 274f7269bc8..e073d5f23f0 100644 --- a/docs/en/sql-reference/operators/index.md +++ b/docs/en/sql-reference/operators/index.md @@ -296,4 +296,3 @@ SELECT * FROM t_null WHERE y IS NOT NULL └───┴───┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/operators/) diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 16aa266ebf9..d661bd4cd59 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -74,6 +74,9 @@ Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. +!!! warning "Warning" + You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. + Example: ``` sql @@ -144,7 +147,7 @@ This query changes the `name` column properties: - TTL - For examples of columns TTL modifying, see [Column TTL](../../engines/table_engines/mergetree_family/mergetree.md#mergetree-column-ttl). +For examples of columns TTL modifying, see [Column TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. @@ -180,7 +183,7 @@ ALTER TABLE table_name MODIFY column_name REMOVE property; ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; ``` -## See Also +**See Also** - [REMOVE TTL](ttl.md). @@ -191,7 +194,7 @@ Renames an existing column. Syntax: ```sql -ALTER TABLE table_name RENAME COLUMN column_name TO new_column_name; +ALTER TABLE table_name RENAME COLUMN column_name TO new_column_name ``` **Example** diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 30603122096..71333e6fcce 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -47,4 +47,3 @@ For `ALTER ... ATTACH|DETACH|DROP` queries, you can use the `replication_alter_p For `ALTER TABLE ... UPDATE|DELETE` queries the synchronicity is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. 
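+
+For example (an illustrative sketch; the table name and condition are made up), the following mutation does not return until it has finished on the server where it was issued:
+
+``` sql
+ALTER TABLE test_table DELETE WHERE id = 0 SETTINGS mutations_sync = 1;
+```
+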
-[Original article](https://clickhouse.tech/docs/en/query_language/alter/) diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 42396223b86..948711e6d9e 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -16,7 +16,7 @@ The following operations with [partitions](../../../engines/table-engines/merget - [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) — Resets the value of a specified column in a partition. - [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) — Resets the specified secondary index in a partition. - [FREEZE PARTITION](#alter_freeze-partition) — Creates a backup of a partition. -- [FETCH PARTITION](#alter_fetch-partition) — Downloads a partition from another server. +- [FETCH PARTITION\|PART](#alter_fetch-partition) — Downloads a part or partition from another server. - [MOVE PARTITION\|PART](#alter_move-partition) — Move partition/data part to another disk or volume. @@ -40,7 +40,7 @@ Read about setting the partition expression in a section [How to specify the par After the query is executed, you can do whatever you want with the data in the `detached` directory — delete it from the file system, or just leave it. -This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replica. +This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replicas (as multiple leaders are allowed). ## DROP PARTITION\|PART {#alter_drop-partition} @@ -85,9 +85,15 @@ ALTER TABLE visits ATTACH PART 201901_2_2_0; Read more about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). -This query is replicated. The replica-initiator checks whether there is data in the `detached` directory. If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table. All other replicas download the data from the replica-initiator. +This query is replicated. The replica-initiator checks whether there is data in the `detached` directory. +If data exists, the query checks its integrity. If everything is correct, the query adds the data to the table. -So you can put data to the `detached` directory on one replica, and use the `ALTER ... ATTACH` query to add it to the table on all replicas. +If the non-initiator replica, receiving the attach command, finds the part with the correct checksums in its own +`detached` folder, it attaches the data without fetching it from other replicas. +If there is no part with the correct checksums, the data is downloaded from any replica having the part. + +You can put data to the `detached` directory on one replica and use the `ALTER ... 
ATTACH` query to add it to the +table on all replicas. ## ATTACH PARTITION FROM {#alter_attach-partition-from} @@ -95,7 +101,8 @@ So you can put data to the `detached` directory on one replica, and use the `ALT ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` adds data to exsisting in the `table2`. Note that data won’t be deleted from `table1`. +This query copies the data partition from the `table1` to `table2`. +Note that data won't be deleted neither from `table1` nor from `table2`. For the query to run successfully, the following conditions must be met: @@ -191,29 +198,35 @@ ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr The query works similar to `CLEAR COLUMN`, but it resets an index instead of a column data. -## FETCH PARTITION {#alter_fetch-partition} +## FETCH PARTITION|PART {#alter_fetch-partition} ``` sql -ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' +ALTER TABLE table_name FETCH PARTITION|PART partition_expr FROM 'path-in-zookeeper' ``` Downloads a partition from another server. This query only works for the replicated tables. The query does the following: -1. Downloads the partition from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. +1. Downloads the partition|part from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. 2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#alter_attach-partition) query to add the data to the table. For example: +1. FETCH PARTITION ``` sql ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; ALTER TABLE users ATTACH PARTITION 201902; ``` +2. FETCH PART +``` sql +ALTER TABLE users FETCH PART 201901_2_2_0 FROM '/clickhouse/tables/01-01/visits'; +ALTER TABLE users ATTACH PART 201901_2_2_0; +``` Note that: -- The `ALTER ... FETCH PARTITION` query isn’t replicated. It places the partition to the `detached` directory only on the local server. +- The `ALTER ... FETCH PARTITION|PART` query isn’t replicated. It places the part or partition to the `detached` directory only on the local server. - The `ALTER TABLE ... ATTACH` query is replicated. It adds the data to all replicas. The data is added to one of the replicas from the `detached` directory, and to the others - from neighboring replicas. Before downloading, the system checks if the partition exists and the table structure matches. The most appropriate replica is selected automatically from the healthy replicas. 
diff --git a/docs/en/sql-reference/statements/alter/quota.md b/docs/en/sql-reference/statements/alter/quota.md index a43b5255598..05130a569ab 100644 --- a/docs/en/sql-reference/statements/alter/quota.md +++ b/docs/en/sql-reference/statements/alter/quota.md @@ -36,4 +36,4 @@ For the default user limit the maximum execution time with half a second in 30 m ``` sql ALTER QUOTA IF EXISTS qB FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default; -``` +``` \ No newline at end of file diff --git a/docs/en/sql-reference/statements/alter/ttl.md b/docs/en/sql-reference/statements/alter/ttl.md index 5331afdb2f8..9cd63d3b8fe 100644 --- a/docs/en/sql-reference/statements/alter/ttl.md +++ b/docs/en/sql-reference/statements/alter/ttl.md @@ -18,7 +18,7 @@ ALTER TABLE table_name MODIFY TTL ttl_expression; TTL-property can be removed from table with the following query: ```sql -ALTER TABLE table_name REMOVE TTL +ALTER TABLE table_name REMOVE TTL ``` **Example** @@ -79,7 +79,7 @@ The `TTL` is no longer there, so the second row is not deleted: └───────────────────────┴─────────┴──────────────┘ ``` -### See Also +**See Also** -- More about the [TTL-expression](../../../sql-reference/statements/create/table#ttl-expression). -- Modify column [with TTL](../../../sql-reference/statements/alter/column#alter_modify-column). +- More about the [TTL-expression](../../../sql-reference/statements/create/table.md#ttl-expression). +- Modify column [with TTL](../../../sql-reference/statements/alter/column.md#alter_modify-column). diff --git a/docs/en/sql-reference/statements/alter/user.md b/docs/en/sql-reference/statements/alter/user.md index efad6561439..b590bf4887d 100644 --- a/docs/en/sql-reference/statements/alter/user.md +++ b/docs/en/sql-reference/statements/alter/user.md @@ -12,10 +12,10 @@ Syntax: ``` sql ALTER USER [IF EXISTS] name1 [ON CLUSTER cluster_name1] [RENAME TO new_name1] [, name2 [ON CLUSTER cluster_name2] [RENAME TO new_name2] ...] - [IDENTIFIED [WITH {PLAINTEXT_PASSWORD|SHA256_PASSWORD|DOUBLE_SHA1_PASSWORD}] BY {'password'|'hash'}] - [[ADD|DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] + [[ADD | DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...] | ALL | ALL EXCEPT role [,...] ] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] + [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` To use `ALTER USER` you must have the [ALTER USER](../../../sql-reference/statements/grant.md#grant-access-management) privilege. diff --git a/docs/en/sql-reference/statements/attach.md b/docs/en/sql-reference/statements/attach.md index 035441ef5f1..01783e9cb2f 100644 --- a/docs/en/sql-reference/statements/attach.md +++ b/docs/en/sql-reference/statements/attach.md @@ -5,16 +5,55 @@ toc_title: ATTACH # ATTACH Statement {#attach} -This query is exactly the same as [CREATE](../../sql-reference/statements/create/table.md), but +Attaches the table, for example, when moving a database to another server. 
-- Instead of the word `CREATE` it uses the word `ATTACH`. -- The query does not create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. - After executing an ATTACH query, the server will know about the existence of the table. +The query does not create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. After executing an `ATTACH` query, the server will know about the existence of the table. -If the table was previously detached ([DETACH](../../sql-reference/statements/detach.md)), meaning that its structure is known, you can use shorthand without defining the structure. +If the table was previously detached ([DETACH](../../sql-reference/statements/detach.md)) query, meaning that its structure is known, you can use shorthand without defining the structure. + +## Syntax Forms {#syntax-forms} +### Attach Existing Table {#attach-existing-table} ``` sql ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] ``` -This query is used when starting the server. The server stores table metadata as files with `ATTACH` queries, which it simply runs at launch (with the exception of system tables, which are explicitly created on the server). +This query is used when starting the server. The server stores table metadata as files with `ATTACH` queries, which it simply runs at launch (with the exception of some system tables, which are explicitly created on the server). + +If the table was detached permanently, it won't be reattached at the server start, so you need to use `ATTACH` query explicitly. + +### Сreate New Table And Attach Data {#create-new-table-and-attach-data} + +**With specify path to table data** + +```sql +ATTACH TABLE name FROM 'path/to/data/' (col1 Type1, ...) +``` + +It creates new table with provided structure and attaches table data from provided directory in `user_files`. + +**Example** + +Query: + +```sql +DROP TABLE IF EXISTS test; +INSERT INTO TABLE FUNCTION file('01188_attach/test/data.TSV', 'TSV', 's String, n UInt8') VALUES ('test', 42); +ATTACH TABLE test FROM '01188_attach/test' (s String, n UInt8) ENGINE = File(TSV); +SELECT * FROM test; +``` +Result: + +```sql +┌─s────┬──n─┐ +│ test │ 42 │ +└──────┴────┘ +``` + +**With specify table UUID** (Only for `Atomic` database) + +```sql +ATTACH TABLE name UUID '' (col1 Type1, ...) +``` + +It creates new table with provided structure and attaches data from table with the specified UUID. \ No newline at end of file diff --git a/docs/en/sql-reference/statements/check-table.md b/docs/en/sql-reference/statements/check-table.md index 450447acaf8..65e6238ebbc 100644 --- a/docs/en/sql-reference/statements/check-table.md +++ b/docs/en/sql-reference/statements/check-table.md @@ -30,9 +30,36 @@ Performed over the tables with another table engines causes an exception. Engines from the `*Log` family don’t provide automatic data recovery on failure. Use the `CHECK TABLE` query to track data loss in a timely manner. -For `MergeTree` family engines, the `CHECK TABLE` query shows a check status for every individual data part of a table on the local server. 
+## Checking the MergeTree Family Tables {#checking-mergetree-tables} -**If the data is corrupted** +For `MergeTree` family engines, if [check_query_single_value_result](../../operations/settings/settings.md#check_query_single_value_result) = 0, the `CHECK TABLE` query shows a check status for every individual data part of a table on the local server. + +```sql +SET check_query_single_value_result = 0; +CHECK TABLE test_table; +``` + +```text +┌─part_path─┬─is_passed─┬─message─┐ +│ all_1_4_1 │ 1 │ │ +│ all_1_4_2 │ 1 │ │ +└───────────┴───────────┴─────────┘ +``` + +If `check_query_single_value_result` = 0, the `CHECK TABLE` query shows the general table check status. + +```sql +SET check_query_single_value_result = 1; +CHECK TABLE test_table; +``` + +```text +┌─result─┐ +│ 1 │ +└────────┘ +``` + +## If the Data Is Corrupted {#if-data-is-corrupted} If the table is corrupted, you can copy the non-corrupted data to another table. To do this: diff --git a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index 71416abf588..0698d9bede5 100644 --- a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -18,7 +18,7 @@ CREATE QUOTA [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] ``` -Keys `user_name`, `ip_address`, `client_key`, `client_key, user_name` and `client_key, ip_address` correspond to the fields in the [system.quotas](../../../operations/system-tables/quotas.md) table. +Keys `user_name`, `ip_address`, `client_key`, `client_key, user_name` and `client_key, ip_address` correspond to the fields in the [system.quotas](../../../operations/system-tables/quotas.md) table. Parameters `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` correspond to the fields in the [system.quotas_usage](../../../operations/system-tables/quotas_usage.md) table. diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index cbe639c6fc5..5a1fa218fad 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -5,39 +5,81 @@ toc_title: ROW POLICY # CREATE ROW POLICY {#create-row-policy-statement} -Creates [filters for rows](../../../operations/access-rights.md#row-policy-management), which a user can read from a table. +Creates a [row policy](../../../operations/access-rights.md#row-policy-management), i.e. a filter used to determine which rows a user can read from a table. Syntax: ``` sql CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1 [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2 ...] + [FOR SELECT] USING condition [AS {PERMISSIVE | RESTRICTIVE}] - [FOR SELECT] - [USING condition] [TO {role1 [, role2 ...] | ALL | ALL EXCEPT role1 [, role2 ...]}] ``` -`ON CLUSTER` clause allows creating row policies on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). +## USING Clause {#create-row-policy-using} -## AS Clause {#create-row-policy-as} - -Using this section you can create permissive or restrictive policies. - -Permissive policy grants access to rows. Permissive policies which apply to the same table are combined together using the boolean `OR` operator. Policies are permissive by default. - -Restrictive policy restricts access to rows. 
Restrictive policies which apply to the same table are combined together using the boolean `AND` operator. - -Restrictive policies apply to rows that passed the permissive filters. If you set restrictive policies but no permissive policies, the user can’t get any row from the table. +Allows to specify a condition to filter rows. A user will see a row if the condition evaluates to non-zero for the row. ## TO Clause {#create-row-policy-to} -In the section `TO` you can provide a mixed list of roles and users, for example, `CREATE ROW POLICY ... TO accountant, john@localhost`. +In the section `TO` you can provide a list of users and roles this policy should work for. For example, `CREATE ROW POLICY ... TO accountant, john@localhost`. -Keyword `ALL` means all the ClickHouse users including current user. Keywords `ALL EXCEPT` allow to exclude some users from the all users list, for example, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` +Keyword `ALL` means all the ClickHouse users, including the current user. Keywords `ALL EXCEPT` allow to exclude some users from the all users list, for example, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` -## Examples {#examples} +!!! note "Note" + If there are no row policies defined for a table, then any user can `SELECT` all the rows from the table. Defining one or more row policies for the table makes access to the table dependent on those row policies, no matter whether they are defined for the current user or not. For example, the following policy + + `CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` -`CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO accountant, john@localhost` + forbids the users `mira` and `peter` from seeing the rows with `b != 1`, and any non-mentioned user (e.g., the user `paul`) will see no rows from `mydb.table1` at all. + + If that's not desirable, it can be fixed by adding one more row policy, like the following: -`CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO ALL EXCEPT mira` + `CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter` + +## AS Clause {#create-row-policy-as} + +It's allowed to have more than one policy enabled on the same table for the same user at the same time, so we need a way to combine the conditions from multiple policies. + +By default policies are combined using the boolean `OR` operator. For example, the following policies + +``` sql +CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter +CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 TO peter, antonio +``` + +enable the user `peter` to see rows with either `b=1` or `c=2`. + +The `AS` clause specifies how policies should be combined with other policies. Policies can be either permissive or restrictive. By default policies are permissive, which means they are combined using the boolean `OR` operator. + +A policy can be defined as restrictive as an alternative. Restrictive policies are combined using the boolean `AND` operator. + +Here is the general formula: + +``` +row_is_visible = (one or more of the permissive policies' conditions are non-zero) AND + (all of the restrictive policies' conditions are non-zero) +``` + +For example, the following policies + +``` sql +CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter +CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio +``` + +enable the user `peter` to see rows only if both `b=1` AND `c=2`.
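To check how the policies above end up being applied, the definitions can be inspected directly. This is a minimal sketch; it assumes the `pol1`/`pol2` policies on `mydb.table1` from the examples above and the documented columns of the `system.row_policies` table:

``` sql
-- Show the stored definition of a single policy.
SHOW CREATE ROW POLICY pol2 ON mydb.table1;

-- List all policies attached to the table, with their filter condition
-- and whether each one is permissive or restrictive.
SELECT name, select_filter, is_restrictive
FROM system.row_policies
WHERE database = 'mydb' AND table = 'table1';
```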
+ +## ON CLUSTER Clause {#create-row-policy-on-cluster} + +Allows creating row policies on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). + + +## Examples + +`CREATE ROW POLICY filter1 ON mydb.mytable USING a<1000 TO accountant, john@localhost` + +`CREATE ROW POLICY filter2 ON mydb.mytable USING a<1000 AND b=5 TO ALL EXCEPT mira` + +`CREATE ROW POLICY filter3 ON mydb.mytable USING 1 TO admin` diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 0090eec14b7..5f1f0151350 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -47,19 +47,38 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() Creates a table with the same result as that of the [table function](../../../sql-reference/table-functions/index.md#table-functions) specified. The created table will also work in the same way as the corresponding table function that was specified. +### From SELECT query {#from-select-query} + ``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... +CREATE TABLE [IF NOT EXISTS] [db.]table_name[(name1 [type1], name2 [type2], ...)] ENGINE = engine AS SELECT ... ``` -Creates a table with a structure like the result of the `SELECT` query, with the `engine` engine, and fills it with data from SELECT. +Creates a table with a structure like the result of the `SELECT` query, with the `engine` engine, and fills it with data from `SELECT`. Also you can explicitly specify columns description. -In all cases, if `IF NOT EXISTS` is specified, the query won’t return an error if the table already exists. In this case, the query won’t do anything. +If the table already exists and `IF NOT EXISTS` is specified, the query won’t do anything. There can be other clauses after the `ENGINE` clause in the query. See detailed documentation on how to create tables in the descriptions of [table engines](../../../engines/table-engines/index.md#table_engines). +**Example** + +Query: + +``` sql +CREATE TABLE t1 (x String) ENGINE = Memory AS SELECT 1; +SELECT x, toTypeName(x) FROM t1; +``` + +Result: + +```text +┌─x─┬─toTypeName(x)─┐ +│ 1 │ String │ +└───┴───────────────┘ +``` + ## NULL Or NOT NULL Modifiers {#null-modifiers} -`NULL` and `NOT NULL` modifiers after data type in column definition allow or do not allow it to be [Nullable](../../../sql-reference/data-types/nullable.md#data_type-nullable). +`NULL` and `NOT NULL` modifiers after data type in column definition allow or do not allow it to be [Nullable](../../../sql-reference/data-types/nullable.md#data_type-nullable). If the type is not `Nullable` and if `NULL` is specified, it will be treated as `Nullable`; if `NOT NULL` is specified, then no. For example, `INT NULL` is the same as `Nullable(INT)`. If the type is `Nullable` and `NULL` or `NOT NULL` modifiers are specified, the exception will be thrown. @@ -109,16 +128,16 @@ It is not possible to set default values for elements in nested data structures. ## Primary Key {#primary-key} -You can define a [primary key](../../../engines/table-engines/mergetree-family/mergetree.md#primary-keys-and-indexes-in-queries) when creating a table. Primary key can be specified in two ways: +You can define a [primary key](../../../engines/table-engines/mergetree-family/mergetree.md#primary-keys-and-indexes-in-queries) when creating a table. 
Primary key can be specified in two ways: - Inside the column list ``` sql -CREATE TABLE db.table_name -( - name1 type1, name2 type2, ..., +CREATE TABLE db.table_name +( + name1 type1, name2 type2, ..., PRIMARY KEY(expr1[, expr2,...])] -) +) ENGINE = engine; ``` @@ -126,9 +145,9 @@ ENGINE = engine; ``` sql CREATE TABLE db.table_name -( +( name1 type1, name2 type2, ... -) +) ENGINE = engine PRIMARY KEY(expr1[, expr2,...]); ``` @@ -285,7 +304,9 @@ REPLACE TABLE myOldTable SELECT * FROM myOldTable WHERE CounterID <12345; ### Syntax -{CREATE [OR REPLACE]|REPLACE} TABLE [db.]table_name +``` sql +{CREATE [OR REPLACE] | REPLACE} TABLE [db.]table_name +``` All syntax forms for `CREATE` query also work for this query. `REPLACE` for a non-existent table will cause an error. @@ -333,5 +354,3 @@ SELECT * FROM base.t1; │ 3 │ └───┘ ``` - - [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/table) diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index c1a52e3b864..49a4e3813a1 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -12,10 +12,10 @@ Syntax: ``` sql CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...] - [IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH|LDAP_SERVER}] BY {'password'|'hash'}] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...]] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] + [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` `ON CLUSTER` clause allows creating users on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). @@ -30,7 +30,8 @@ There are multiple ways of user identification: - `IDENTIFIED WITH sha256_hash BY 'hash'` - `IDENTIFIED WITH double_sha1_password BY 'qwerty'` - `IDENTIFIED WITH double_sha1_hash BY 'hash'` -- `IDENTIFIED WITH ldap_server BY 'server'` +- `IDENTIFIED WITH ldap SERVER 'server_name'` +- `IDENTIFIED WITH kerberos` or `IDENTIFIED WITH kerberos REALM 'realm'` ## User Host {#user-host} diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 8acd58f4338..633db355d4a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -62,13 +62,13 @@ Note that materialized view is influenced by [optimize_on_insert](../../../opera Views look the same as normal tables. For example, they are listed in the result of the `SHOW TABLES` query. -There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md). +To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop-view). Although `DROP TABLE` works for VIEWs as well. ## Live View (Experimental) {#live-view} !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. 
- Enable usage of live views and `WATCH` query using `set allow_experimental_live_view = 1`. + Enable usage of live views and `WATCH` query using [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. ```sql @@ -90,7 +90,9 @@ Live views work similarly to how a query in a distributed table works. But inste See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. -You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query +### Monitoring Changes {#live-view-monitoring} + +You can monitor changes in the `LIVE VIEW` query result using [WATCH](../../../sql-reference/statements/watch.md) query. ```sql WATCH [db.]live_view @@ -102,11 +104,10 @@ WATCH [db.]live_view CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt; ``` - Watch a live view while doing a parallel insert into the source table. ```sql -WATCH lv +WATCH lv; ``` ```bash @@ -128,16 +129,16 @@ INSERT INTO mt VALUES (2); INSERT INTO mt VALUES (3); ``` -or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. +Or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. ```sql -WATCH [db.]live_view EVENTS +WATCH [db.]live_view EVENTS; ``` **Example:** ```sql -WATCH lv EVENTS +WATCH lv EVENTS; ``` ```bash @@ -163,15 +164,15 @@ SELECT * FROM [db.]live_view WHERE ... You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. -### With Timeout {#live-view-with-timeout} +### WITH TIMEOUT Clause {#live-view-with-timeout} -When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. +When a live view is created with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... ``` -If the timeout value is not specified then the value specified by the `temporary_live_view_timeout` setting is used. +If the timeout value is not specified then the value specified by the [temporary_live_view_timeout](../../../operations/settings/settings.md#temporary-live-view-timeout) setting is used. **Example:** @@ -180,7 +181,7 @@ CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; CREATE LIVE VIEW lv WITH TIMEOUT 15 AS SELECT sum(x) FROM mt; ``` -### With Refresh {#live-view-with-refresh} +### WITH REFRESH Clause {#live-view-with-refresh} When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. @@ -188,7 +189,7 @@ When a live view is created with a `WITH REFRESH` clause then it will be automat CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... ``` -If the refresh value is not specified then the value specified by the `periodic_live_view_refresh` setting is used. 
+If the refresh value is not specified then the value specified by the [periodic_live_view_refresh](../../../operations/settings/settings.md#periodic-live-view-refresh) setting is used. **Example:** @@ -231,7 +232,7 @@ WATCH lv Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist.. ``` -### Usage +### Usage {#live-view-usage} Most common uses of live view tables include: @@ -240,15 +241,4 @@ Most common uses of live view tables include: - Watching for table changes and triggering a follow-up select queries. - Watching metrics from system tables using periodic refresh. -### Settings {#live-view-settings} - -You can use the following settings to control the behaviour of live views. - -- `allow_experimental_live_view` - enable live views. Default is `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default is `15` seconds. -- `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default is `64` inserts. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default is `5` seconds. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default is `60` seconds. - [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/view/) diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index 62a7c0cc1e0..e9c9ed3693c 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -5,12 +5,66 @@ toc_title: DETACH # DETACH Statement {#detach} -Deletes information about the ‘name’ table from the server. The server stops knowing about the table’s existence. +Makes the server "forget" about the existence of the table or materialized view. + +Syntax: ``` sql -DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +DETACH TABLE|VIEW [IF EXISTS] [db.]name [PERMANENTLY] [ON CLUSTER cluster] ``` -This does not delete the table’s data or metadata. On the next server launch, the server will read the metadata and find out about the table again. +Detaching does not delete the data or metadata for the table or materialized view. If the table or view was not detached `PERMANENTLY`, on the next server launch the server will read the metadata and recall the table/view again. If the table or view was detached `PERMANENTLY`, there will be no automatic recall. -Similarly, a “detached” table can be re-attached using the `ATTACH` query (with the exception of system tables, which do not have metadata stored for them). +Whether the table was detached permanently or not, in both cases you can reattach it using the [ATTACH](../../sql-reference/statements/attach.md). System log tables can be also attached back (e.g. `query_log`, `text_log`, etc). Other system tables can't be reattached. On the next server launch the server will recall those tables again. + +`ATTACH MATERIALIZED VIEW` doesn't work with short syntax (without `SELECT`), but you can attach it using the `ATTACH TABLE` query. + +Note that you can not detach permanently the table which is already detached (temporary). But you can attach it back and then detach permanently again. 
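As a minimal sketch of the permanent variant (using the same `test` table as in the example below): a permanently detached table survives a server restart only as detached and always requires an explicit `ATTACH`.

``` sql
DETACH TABLE test PERMANENTLY; -- the table stays invisible even after a server restart
ATTACH TABLE test;             -- the only way to bring it back
DETACH TABLE test PERMANENTLY; -- it can then be detached permanently again if needed
```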
+ +Also you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. + +**Example** + +Creating a table: + +Query: + +``` sql +CREATE TABLE test ENGINE = Log AS SELECT * FROM numbers(10); +SELECT * FROM test; +``` + +Result: + +``` text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ +``` + +Detaching the table: + +Query: + +``` sql +DETACH TABLE test; +SELECT * FROM test; +``` + +Result: + +``` text +Received exception from server (version 21.4.1): +Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.test doesn't exist. +``` + +[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/detach/) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index f3829de2fbb..89f35b5f701 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -91,7 +91,7 @@ Hierarchy of privileges: - `ALTER ADD CONSTRAINT` - `ALTER DROP CONSTRAINT` - `ALTER TTL` - - `ALTER MATERIALIZE TTL` + - `ALTER MATERIALIZE TTL` - `ALTER SETTINGS` - `ALTER MOVE PARTITION` - `ALTER FETCH PARTITION` @@ -102,9 +102,9 @@ Hierarchy of privileges: - [CREATE](#grant-create) - `CREATE DATABASE` - `CREATE TABLE` + - `CREATE TEMPORARY TABLE` - `CREATE VIEW` - `CREATE DICTIONARY` - - `CREATE TEMPORARY TABLE` - [DROP](#grant-drop) - `DROP DATABASE` - `DROP TABLE` @@ -150,7 +150,7 @@ Hierarchy of privileges: - `SYSTEM RELOAD` - `SYSTEM RELOAD CONFIG` - `SYSTEM RELOAD DICTIONARY` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES` + - `SYSTEM RELOAD EMBEDDED DICTIONARIES` - `SYSTEM MERGES` - `SYSTEM TTL MERGES` - `SYSTEM FETCHES` @@ -276,10 +276,10 @@ Allows executing [ALTER](../../sql-reference/statements/alter/index.md) queries - `ALTER ADD CONSTRAINT`. Level: `TABLE`. Aliases: `ADD CONSTRAINT` - `ALTER DROP CONSTRAINT`. Level: `TABLE`. Aliases: `DROP CONSTRAINT` - `ALTER TTL`. Level: `TABLE`. Aliases: `ALTER MODIFY TTL`, `MODIFY TTL` - - `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL` + - `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL` - `ALTER SETTINGS`. Level: `TABLE`. Aliases: `ALTER SETTING`, `ALTER MODIFY SETTING`, `MODIFY SETTING` - `ALTER MOVE PARTITION`. Level: `TABLE`. Aliases: `ALTER MOVE PART`, `MOVE PARTITION`, `MOVE PART` - - `ALTER FETCH PARTITION`. Level: `TABLE`. Aliases: `FETCH PARTITION` + - `ALTER FETCH PARTITION`. Level: `TABLE`. Aliases: `ALTER FETCH PART`, `FETCH PARTITION`, `FETCH PART` - `ALTER FREEZE PARTITION`. Level: `TABLE`. Aliases: `FREEZE PARTITION` - `ALTER VIEW` Level: `GROUP` - `ALTER VIEW REFRESH`. Level: `VIEW`. Aliases: `ALTER LIVE VIEW REFRESH`, `REFRESH VIEW` @@ -304,9 +304,9 @@ Allows executing [CREATE](../../sql-reference/statements/create/index.md) and [A - `CREATE`. Level: `GROUP` - `CREATE DATABASE`. Level: `DATABASE` - `CREATE TABLE`. Level: `TABLE` + - `CREATE TEMPORARY TABLE`. Level: `GLOBAL` - `CREATE VIEW`. Level: `VIEW` - `CREATE DICTIONARY`. Level: `DICTIONARY` - - `CREATE TEMPORARY TABLE`. Level: `GLOBAL` **Notes** @@ -401,7 +401,7 @@ Allows a user to execute [SYSTEM](../../sql-reference/statements/system.md) quer - `SYSTEM RELOAD`. Level: `GROUP` - `SYSTEM RELOAD CONFIG`. Level: `GLOBAL`. 
Aliases: `RELOAD CONFIG` - `SYSTEM RELOAD DICTIONARY`. Level: `GLOBAL`. Aliases: `SYSTEM RELOAD DICTIONARIES`, `RELOAD DICTIONARY`, `RELOAD DICTIONARIES` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: R`ELOAD EMBEDDED DICTIONARIES` + - `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: `RELOAD EMBEDDED DICTIONARIES` - `SYSTEM MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP MERGES`, `SYSTEM START MERGES`, `STOP MERGES`, `START MERGES` - `SYSTEM TTL MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP TTL MERGES`, `SYSTEM START TTL MERGES`, `STOP TTL MERGES`, `START TTL MERGES` - `SYSTEM FETCHES`. Level: `TABLE`. Aliases: `SYSTEM STOP FETCHES`, `SYSTEM START FETCHES`, `STOP FETCHES`, `START FETCHES` @@ -473,4 +473,3 @@ Doesn’t grant any privileges. The `ADMIN OPTION` privilege allows a user to grant their role to another user. -[Original article](https://clickhouse.tech/docs/en/query_language/grant/) diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index c517a515ab7..66effcccc3f 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -117,4 +117,3 @@ Performance will not decrease if: - Data is added in real time. - You upload data that is usually sorted by time. -[Original article](https://clickhouse.tech/docs/en/query_language/insert_into/) diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index a67f282e793..247252d3f4e 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -5,20 +5,89 @@ toc_title: OPTIMIZE # OPTIMIZE Statement {#misc_operations-optimize} +This query tries to initialize an unscheduled merge of data parts for tables. + +!!! warning "Warning" + `OPTIMIZE` can’t fix the `Too many parts` error. + +**Syntax** + ``` sql -OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] +OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] ``` -This query tries to initialize an unscheduled merge of data parts for tables with a table engine from the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family. - -The `OPTMIZE` query is also supported for the [MaterializedView](../../engines/table-engines/special/materializedview.md) and the [Buffer](../../engines/table-engines/special/buffer.md) engines. Other table engines aren’t supported. +The `OPTMIZE` query is supported for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family, the [MaterializedView](../../engines/table-engines/special/materializedview.md) and the [Buffer](../../engines/table-engines/special/buffer.md) engines. Other table engines aren’t supported. When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md) family of table engines, ClickHouse creates a task for merging and waits for execution on all nodes (if the `replication_alter_partitions_sync` setting is enabled). - If `OPTIMIZE` doesn’t perform a merge for any reason, it doesn’t notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. - If you specify a `PARTITION`, only the specified partition is optimized. 
[How to set partition expression](../../sql-reference/statements/alter/index.md#alter-how-to-specify-part-expr). -- If you specify `FINAL`, optimization is performed even when all the data is already in one part. -- If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine. +- If you specify `FINAL`, optimization is performed even when all the data is already in one part. Also, merge is forced even if concurrent merges are performed. +- If you specify `DEDUPLICATE`, then completely identical rows (unless a `BY` clause is specified) will be deduplicated (all columns are compared); it makes sense only for the MergeTree engine. -!!! warning "Warning" - `OPTIMIZE` can’t fix the “Too many parts” error. + +## BY expression {#by-expression} + +If you want to perform deduplication on a custom set of columns rather than on all of them, you can specify the list of columns explicitly or use any combination of [`*`](../../sql-reference/statements/select/index.md#asterisk), [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) or [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier) expressions. The explicitly written or implicitly expanded list of columns must include all columns specified in the row ordering expression (both primary and sorting keys) and the partitioning expression (partitioning key). + +!!! note "Note" + Notice that `*` behaves just like in `SELECT`: `MATERIALIZED` and `ALIAS` columns are not used for expansion. + Also, it is an error to specify an empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an `ALIAS` column. + +``` sql +OPTIMIZE TABLE table DEDUPLICATE; -- the old one +OPTIMIZE TABLE table DEDUPLICATE BY *; -- not the same as the old one, excludes MATERIALIZED columns (see the note above) +OPTIMIZE TABLE table DEDUPLICATE BY * EXCEPT colX; +OPTIMIZE TABLE table DEDUPLICATE BY * EXCEPT (colX, colY); +OPTIMIZE TABLE table DEDUPLICATE BY col1,col2,col3; +OPTIMIZE TABLE table DEDUPLICATE BY COLUMNS('column-matched-by-regex'); +OPTIMIZE TABLE table DEDUPLICATE BY COLUMNS('column-matched-by-regex') EXCEPT colX; +OPTIMIZE TABLE table DEDUPLICATE BY COLUMNS('column-matched-by-regex') EXCEPT (colX, colY); +``` + +**Examples** + +Create a table: + +``` sql +CREATE TABLE example ( + primary_key Int32, + secondary_key Int32, + value UInt32, + partition_key UInt32, + materialized_value UInt32 MATERIALIZED 12345, + aliased_value UInt32 ALIAS 2, + PRIMARY KEY primary_key +) ENGINE=MergeTree +PARTITION BY partition_key +ORDER BY (primary_key, secondary_key); +``` + +The 'old' deduplication: all columns are taken into account, i.e. a row is removed only if all values in all columns are equal to the corresponding values in the previous row. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE; +``` + +Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED`: `primary_key`, `secondary_key`, `value`, and `partition_key` columns. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY *; +``` + +Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED` and explicitly not `materialized_value`: `primary_key`, `secondary_key`, `value`, and `partition_key` columns. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY * EXCEPT materialized_value; +``` + +Deduplicate explicitly by `primary_key`, `secondary_key`, and `partition_key` columns.
+``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY primary_key, secondary_key, partition_key; +``` + +Deduplicate by any column matching a regex: `primary_key`, `secondary_key`, and `partition_key` columns. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY COLUMNS('.*_key'); +``` diff --git a/docs/en/sql-reference/statements/rename.md b/docs/en/sql-reference/statements/rename.md index 4f14ad016a3..a9dda6ed3b2 100644 --- a/docs/en/sql-reference/statements/rename.md +++ b/docs/en/sql-reference/statements/rename.md @@ -5,6 +5,14 @@ toc_title: RENAME # RENAME Statement {#misc_operations-rename} +## RENAME DATABASE {#misc_operations-rename_database} +Renames database, support only for Atomic database engine + +``` +RENAME DATABASE atomic_database1 TO atomic_database2 [ON CLUSTER cluster] +``` + +## RENAME TABLE {#misc_operations-rename_table} Renames one or more tables. ``` sql diff --git a/docs/en/sql-reference/statements/select/all.md b/docs/en/sql-reference/statements/select/all.md index 5e0de4c142b..891b82c4319 100644 --- a/docs/en/sql-reference/statements/select/all.md +++ b/docs/en/sql-reference/statements/select/all.md @@ -4,10 +4,8 @@ toc_title: ALL # ALL Clause {#select-all} -`SELECT ALL` is identical to `SELECT` without `DISTINCT`. +If there are multiple matching rows in the table, then `ALL` returns all of them. `SELECT ALL` is identical to `SELECT` without `DISTINCT`. If both `ALL` and `DISTINCT` specified, exception will be thrown. -- If `ALL` specified, ignore it. -- If both `ALL` and `DISTINCT` specified, exception will be thrown. `ALL` can also be specified inside aggregate function with the same effect(noop), for instance: @@ -19,3 +17,5 @@ equals to ```sql SELECT sum(number) FROM numbers(10); ``` + +[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/select/all) diff --git a/docs/en/sql-reference/statements/select/index.md b/docs/en/sql-reference/statements/select/index.md index e99ebef838c..ada4699c224 100644 --- a/docs/en/sql-reference/statements/select/index.md +++ b/docs/en/sql-reference/statements/select/index.md @@ -57,6 +57,9 @@ Specifics of each optional clause are covered in separate sections, which are li If you want to include all columns in the result, use the asterisk (`*`) symbol. For example, `SELECT * FROM ...`. + +### COLUMNS expression {#columns-expression} + To match some columns in the result with a [re2](https://en.wikipedia.org/wiki/RE2_(software)) regular expression, you can use the `COLUMNS` expression. ``` sql diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 35631f8c8d6..7b3f709b876 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -428,4 +428,69 @@ errors_count: 0 estimated_recovery_time: 0 ``` -[Original article](https://clickhouse.tech/docs/en/query_language/show/) +## SHOW SETTINGS {#show-settings} + +Returns a list of system settings and their values. Selects data from the [system.settings](../../operations/system-tables/settings.md) table. + +**Syntax** + +```sql +SHOW [CHANGED] SETTINGS LIKE|ILIKE +``` + +**Clauses** + +`LIKE|ILIKE` allow to specify a matching pattern for the setting name. It can contain globs such as `%` or `_`. `LIKE` clause is case-sensitive, `ILIKE` — case insensitive. + +When the `CHANGED` clause is used, the query returns only settings changed from their default values. 
+ +**Examples** + +Query with the `LIKE` clause: + +```sql +SHOW SETTINGS LIKE 'send_timeout'; +``` +Result: + +```text +┌─name─────────┬─type────┬─value─┐ +│ send_timeout │ Seconds │ 300 │ +└──────────────┴─────────┴───────┘ +``` + +Query with the `ILIKE` clause: + +```sql +SHOW SETTINGS ILIKE '%CONNECT_timeout%' +``` + +Result: + +```text +┌─name────────────────────────────────────┬─type─────────┬─value─┐ +│ connect_timeout │ Seconds │ 10 │ +│ connect_timeout_with_failover_ms │ Milliseconds │ 50 │ +│ connect_timeout_with_failover_secure_ms │ Milliseconds │ 100 │ +└─────────────────────────────────────────┴──────────────┴───────┘ +``` + +Query with the `CHANGED` clause: + +```sql +SHOW CHANGED SETTINGS ILIKE '%MEMORY%' +``` + +Result: + +```text +┌─name─────────────┬─type───┬─value───────┐ +│ max_memory_usage │ UInt64 │ 10000000000 │ +└──────────────────┴────────┴─────────────┘ +``` + +**See Also** + +- [system.settings](../../operations/system-tables/settings.md) table + +[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/show/) diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index bb279703cc2..919bd65d56b 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -169,7 +169,7 @@ SYSTEM START MERGES [ON VOLUME | [db.]merge_tree_family_table_name ### STOP TTL MERGES {#query_language-stop-ttl-merges} Provides possibility to stop background delete old data according to [TTL expression](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) for tables in the MergeTree family: -Return `Ok.` even table doesn’t exists or table have not MergeTree engine. Return error when database doesn’t exists: +Returns `Ok.` even if table doesn’t exist or table has not MergeTree engine. Returns error when database doesn’t exist: ``` sql SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] @@ -178,7 +178,7 @@ SYSTEM STOP TTL MERGES [[db.]merge_tree_family_table_name] ### START TTL MERGES {#query_language-start-ttl-merges} Provides possibility to start background delete old data according to [TTL expression](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) for tables in the MergeTree family: -Return `Ok.` even table doesn’t exists. Return error when database doesn’t exists: +Returns `Ok.` even if table doesn’t exist. Returns error when database doesn’t exist: ``` sql SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] @@ -187,7 +187,7 @@ SYSTEM START TTL MERGES [[db.]merge_tree_family_table_name] ### STOP MOVES {#query_language-stop-moves} Provides possibility to stop background move data according to [TTL table expression with TO VOLUME or TO DISK clause](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) for tables in the MergeTree family: -Return `Ok.` even table doesn’t exists. Return error when database doesn’t exists: +Returns `Ok.` even if table doesn’t exist. Returns error when database doesn’t exist: ``` sql SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] @@ -196,7 +196,7 @@ SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] ### START MOVES {#query_language-start-moves} Provides possibility to start background move data according to [TTL table expression with TO VOLUME and TO DISK clause](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) for tables in the MergeTree family: -Return `Ok.` even table doesn’t exists. 
Return error when database doesn’t exists: +Returns `Ok.` even if table doesn’t exist. Returns error when database doesn’t exist: ``` sql SYSTEM STOP MOVES [[db.]merge_tree_family_table_name] @@ -209,7 +209,7 @@ ClickHouse can manage background replication related processes in [ReplicatedMer ### STOP FETCHES {#query_language-system-stop-fetches} Provides possibility to stop background fetches for inserted parts for tables in the `ReplicatedMergeTree` family: -Always returns `Ok.` regardless of the table engine and even table or database doesn’t exists. +Always returns `Ok.` regardless of the table engine and even if table or database doesn’t exist. ``` sql SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] @@ -218,7 +218,7 @@ SYSTEM STOP FETCHES [[db.]replicated_merge_tree_family_table_name] ### START FETCHES {#query_language-system-start-fetches} Provides possibility to start background fetches for inserted parts for tables in the `ReplicatedMergeTree` family: -Always returns `Ok.` regardless of the table engine and even table or database doesn’t exists. +Always returns `Ok.` regardless of the table engine and even if table or database doesn’t exist. ``` sql SYSTEM START FETCHES [[db.]replicated_merge_tree_family_table_name] @@ -264,6 +264,10 @@ Wait until a `ReplicatedMergeTree` table will be synced with other replicas in a SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name ``` +After running this statement the `[db.]replicated_merge_tree_family_table_name` fetches commands from +the common replicated log into its own replication queue, and then the query waits till the replica processes all +of the fetched commands. + ### RESTART REPLICA {#query_language-system-restart-replica} Provides possibility to reinitialize Zookeeper sessions state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed @@ -276,5 +280,3 @@ SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name ### RESTART REPLICAS {#query_language-system-restart-replicas} Provides possibility to reinitialize Zookeeper sessions state for all `ReplicatedMergeTree` tables, will compare current state with Zookeeper as source of true and add tasks to Zookeeper queue if needed - -[Original article](https://clickhouse.tech/docs/en/query_language/system/) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 761bc8a041e..be793d30f3d 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,19 +17,21 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [live view](./create/view.md#live-view). +The `WATCH` query performs continuous data retrieval from a [LIVE VIEW](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [LIVE VIEW](./create/view.md#live-view). ```sql -WATCH [db.]live_view +WATCH [db.]live_view [EVENTS] [LIMIT n] [FORMAT format] ``` +## Virtual columns {#watch-virtual-columns} + The virtual `_version` column in the query result indicates the current result version. 
**Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv +WATCH lv; ``` ```bash @@ -47,6 +49,8 @@ WATCH lv By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. +**Example:** + ```sql INSERT INTO [db.]table WATCH [db.]live_view ... ``` @@ -56,14 +60,14 @@ INSERT INTO [db.]table WATCH [db.]live_view ... The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query result version. ```sql -WATCH [db.]live_view EVENTS +WATCH [db.]live_view EVENTS; ``` **Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv EVENTS +WATCH lv EVENTS; ``` ```bash @@ -78,17 +82,17 @@ WATCH lv EVENTS ## LIMIT Clause {#limit-clause} -The `LIMIT n` clause species the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. +The `LIMIT n` clause specifies the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query result is evaluated. ```sql -WATCH [db.]live_view LIMIT 1 +WATCH [db.]live_view LIMIT 1; ``` **Example:** ```sql CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); -WATCH lv EVENTS LIMIT 1 +WATCH lv EVENTS LIMIT 1; ``` ```bash @@ -102,5 +106,4 @@ WATCH lv EVENTS LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). !!! info "Note" - The [JSONEachRowWithProgress](../../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. - + The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index d1eb81e52c6..e1459b5e254 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -5,7 +5,7 @@ toc_title: file # file {#file} -Creates a table from a file. This table function is similar to [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md) ones. +Creates a table from a file. 
This table function is similar to [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md) ones. `file` function can be used in `SELECT` and `INSERT` queries on data in [File](../../engines/table-engines/special/file.md) tables. @@ -15,9 +15,9 @@ Creates a table from a file. This table function is similar to [url](../../sql-r file(path, format, structure) ``` -**Input parameters** +**Parameters** -- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. +- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. - `format` — The [format](../../interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. @@ -39,7 +39,7 @@ $ cat /var/lib/clickhouse/user_files/test.csv 78,43,45 ``` -Getting data from a table in `test.csv` and selecting first two rows from it: +Getting data from a table in `test.csv` and selecting the first two rows from it: ``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2; @@ -51,7 +51,8 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U │ 3 │ 2 │ 1 │ └─────────┴─────────┴─────────┘ ``` -Getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file: + +Getting the first 10 lines of a table that contains 3 columns of [UInt32](../../sql-reference/data-types/int-uint.md) type from a CSV file: ``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10; @@ -71,17 +72,16 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U └─────────┴─────────┴─────────┘ ``` - ## Globs in Path {#globs-in-path} -Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). +Multiple path components can have globs. For being processed file must exist and match to the whole path pattern (not only suffix or prefix). - `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. -Constructions with `{}` are similar to the [remote table function](../../sql-reference/table-functions/remote.md)). +Constructions with `{}` are similar to the [remote](remote.md) table function. 
**Example** @@ -94,13 +94,13 @@ Suppose we have several files with the following relative paths: - 'another_dir/some_file_2' - 'another_dir/some_file_3' -Query the amount of rows in these files: +Query the number of rows in these files: ``` sql SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32'); ``` -Query the amount of rows in all files of these two directories: +Query the number of rows in all files of these two directories: ``` sql SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); @@ -124,6 +124,6 @@ SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](../../engines/table-engines/index.md#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/file/) diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index be6ba2b8bc4..ae22e1a1b88 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -10,7 +10,7 @@ Allows to populate test tables with data. Supports all data types that can be stored in table except `LowCardinality` and `AggregateFunction`. ``` sql -generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); +generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]) ``` **Arguments** @@ -39,4 +39,3 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64( └──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/generate/) diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 512f47a2b46..a7c3baca299 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -97,6 +97,5 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](../../engines/table-engines/index.md#table_engines-virtual_columns) -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/hdfs/) diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index 691687dea25..d65a18ab985 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -21,17 +21,18 @@ You can use table functions in: !!! warning "Warning" You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. -| Function | Description | -|-----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| -| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. 
| -| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. | -| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | -| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. | -| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. | -| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. | -| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. | -| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. | -| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. | -| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. | +| Function | Description | +|------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| [file](../../sql-reference/table-functions/file.md) | Creates a [File](../../engines/table-engines/special/file.md)-engine table. | +| [merge](../../sql-reference/table-functions/merge.md) | Creates a [Merge](../../engines/table-engines/special/merge.md)-engine table. | +| [numbers](../../sql-reference/table-functions/numbers.md) | Creates a table with a single column filled with integer numbers. | +| [remote](../../sql-reference/table-functions/remote.md) | Allows you to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md)-engine table. | +| [url](../../sql-reference/table-functions/url.md) | Creates a [Url](../../engines/table-engines/special/url.md)-engine table. | +| [mysql](../../sql-reference/table-functions/mysql.md) | Creates a [MySQL](../../engines/table-engines/integrations/mysql.md)-engine table. | +| [postgresql](../../sql-reference/table-functions/postgresql.md) | Creates a [PostgreSQL](../../engines/table-engines/integrations/postgresql.md)-engine table. | +| [jdbc](../../sql-reference/table-functions/jdbc.md) | Creates a [JDBC](../../engines/table-engines/integrations/jdbc.md)-engine table. | +| [odbc](../../sql-reference/table-functions/odbc.md) | Creates a [ODBC](../../engines/table-engines/integrations/odbc.md)-engine table. | +| [hdfs](../../sql-reference/table-functions/hdfs.md) | Creates a [HDFS](../../engines/table-engines/integrations/hdfs.md)-engine table. | +| [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. 
| -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/) +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/) diff --git a/docs/en/sql-reference/table-functions/input.md b/docs/en/sql-reference/table-functions/input.md index 40f9f4f7f6f..17707b798d6 100644 --- a/docs/en/sql-reference/table-functions/input.md +++ b/docs/en/sql-reference/table-functions/input.md @@ -42,4 +42,3 @@ $ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/input/) diff --git a/docs/en/sql-reference/table-functions/jdbc.md b/docs/en/sql-reference/table-functions/jdbc.md index 6fd53b0e794..c6df022c342 100644 --- a/docs/en/sql-reference/table-functions/jdbc.md +++ b/docs/en/sql-reference/table-functions/jdbc.md @@ -24,4 +24,3 @@ SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/jdbc/) diff --git a/docs/en/sql-reference/table-functions/merge.md b/docs/en/sql-reference/table-functions/merge.md index 7b3d88f6266..a5c74b71069 100644 --- a/docs/en/sql-reference/table-functions/merge.md +++ b/docs/en/sql-reference/table-functions/merge.md @@ -9,4 +9,3 @@ toc_title: merge The table structure is taken from the first table encountered that matches the regular expression. -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/merge/) diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index 14cd4369285..7b4e2a301b3 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -44,7 +44,7 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C A table object with the same columns as the original MySQL table. !!! info "Note" - In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. + In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. **Examples** diff --git a/docs/en/sql-reference/table-functions/numbers.md b/docs/en/sql-reference/table-functions/numbers.md index 53e4e42a2f8..f9735056b05 100644 --- a/docs/en/sql-reference/table-functions/numbers.md +++ b/docs/en/sql-reference/table-functions/numbers.md @@ -25,4 +25,3 @@ Examples: select toDate('2010-01-01') + number as d FROM numbers(365); ``` -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/numbers/) diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index ea79cd44a93..a8481fbfd68 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -102,5 +102,3 @@ SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') - [ODBC external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table engine](../../engines/table-engines/integrations/odbc.md). 
- -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/jdbc/) diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md new file mode 100644 index 00000000000..3eab572ac12 --- /dev/null +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -0,0 +1,120 @@ +--- +toc_priority: 42 +toc_title: postgresql +--- + +# postgresql {#postgresql} + +Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a remote PostgreSQL server. + +**Syntax** + +``` sql +postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) +``` + +**Arguments** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `table` — Remote table name. +- `user` — PostgreSQL user. +- `password` — User password. +- `schema` — Non-default table schema. Optional. + +**Returned Value** + +A table object with the same columns as the original PostgreSQL table. + +!!! info "Note" + In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. + +## Implementation Details {#implementation-details} + +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. + +All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. + +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. + +PostgreSQL Array types converts into ClickHouse arrays. + +!!! info "Note" + Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. + +Supports replicas priority for PostgreSQL dictionary source. The bigger the number in map, the less the priority. The highest priority is `0`. 
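To make the pushdown rules above concrete, here is a sketch reusing the `test` table and credentials from the examples below: the simple comparison is executed by PostgreSQL, while sorting and `LIMIT` are applied in ClickHouse after the rows are received.

``` sql
SELECT *
FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password')
WHERE int_id > 1  -- sent to PostgreSQL as part of COPY (SELECT ...) TO STDOUT
ORDER BY int_id   -- evaluated in ClickHouse
LIMIT 10;         -- evaluated in ClickHouse
```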
+ +**Examples** + +Table in PostgreSQL: + +``` text +postgres=# CREATE TABLE "public"."test" ( +"int_id" SERIAL, +"int_nullable" INT NULL DEFAULT NULL, +"float" FLOAT NOT NULL, +"str" VARCHAR(100) NOT NULL DEFAULT '', +"float_nullable" FLOAT NULL DEFAULT NULL, +PRIMARY KEY (int_id)); + +CREATE TABLE + +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); +INSERT 0 1 + +postgresql> SELECT * FROM test; + int_id | int_nullable | float | str | float_nullable + --------+--------------+-------+------+---------------- + 1 | | 2 | test | +(1 row) +``` + +Selecting data from ClickHouse: + +```sql +SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password') WHERE str IN ('test'); +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─str──┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ test │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴──────┴────────────────┘ +``` + +Inserting: + +```sql +INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password') (int_id, float) VALUES (2, 3); +SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password'); +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─str──┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ test │ ᴺᵁᴸᴸ │ +│ 2 │ ᴺᵁᴸᴸ │ 3 │ │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴──────┴────────────────┘ +``` + +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgresql_user', 'password', 'nice.schema'); +``` + +**See Also** + +- [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) +- [Using PostgreSQL as a source of an external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/postgresql/) diff --git a/docs/en/sql-reference/table-functions/remote.md b/docs/en/sql-reference/table-functions/remote.md index 8af5b588412..e80e58a76aa 100644 --- a/docs/en/sql-reference/table-functions/remote.md +++ b/docs/en/sql-reference/table-functions/remote.md @@ -5,7 +5,7 @@ toc_title: remote # remote, remoteSecure {#remote-remotesecure} -Allows to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md) table. `remoteSecure` - same as `remote` but with secured connection. +Allows accessing remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md) table. `remoteSecure` is the same as `remote`, but with a secured connection. Both functions can be used in `SELECT` and `INSERT` queries. @@ -18,31 +18,31 @@ remoteSecure('addresses_expr', db, table[, 'user'[, 'password'], sharding_key]) remoteSecure('addresses_expr', db.table[, 'user'[, 'password'], sharding_key]) ``` -**Input parameters** +**Parameters** -- `addresses_expr` – An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`. +- `addresses_expr` — An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`.
The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. - The port is the TCP port on the remote server. If the port is omitted, it uses [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) from the server’s config file in `remote` (by default, 9000) and [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) in `remoteSecure` (by default, 9440). + The port is the TCP port on the remote server. If the port is omitted, it uses [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) from the server’s config file in `remote` (by default, 9000) and [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) in `remoteSecure` (by default, 9440). The port is required for an IPv6 address. Type: [String](../../sql-reference/data-types/string.md). -- `db` - Database name. Type: [String](../../sql-reference/data-types/string.md). -- `table` - Table name. Type: [String](../../sql-reference/data-types/string.md). -- `user` - User name. If the user is not specified, `default` is used. Type: [String](../../sql-reference/data-types/string.md). -- `password` - User password. If the password is not specified, an empty password is used. Type: [String](../../sql-reference/data-types/string.md). -- `sharding_key` - Sharding key to support distributing data across nodes. For example: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `db` — Database name. Type: [String](../../sql-reference/data-types/string.md). +- `table` — Table name. Type: [String](../../sql-reference/data-types/string.md). +- `user` — User name. If the user is not specified, `default` is used. Type: [String](../../sql-reference/data-types/string.md). +- `password` — User password. If the password is not specified, an empty password is used. Type: [String](../../sql-reference/data-types/string.md). +- `sharding_key` — Sharding key to support distributing data across nodes. For example: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Type: [UInt32](../../sql-reference/data-types/int-uint.md). **Returned value** -Dataset from remote servers. +The dataset from remote servers. **Usage** -Using the `remote` table function is less optimal than creating a `Distributed` table, because in this case the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don’t use the `remote` table function. +Using the `remote` table function is less optimal than creating a `Distributed` table because in this case the server connection is re-established for every request. Also, if hostnames are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don’t use the `remote` table function. The `remote` table function can be useful in the following cases: @@ -62,7 +62,7 @@ localhost [2a02:6b8:0:1111::11]:9000 ``` -Multiple addresses can be comma-separated. 
In this case, ClickHouse will use distributed processing, so it will send the query to all specified addresses (like to shards with different data). Example: +Multiple addresses can be comma-separated. In this case, ClickHouse will use distributed processing, so it will send the query to all specified addresses (like shards with different data). Example: ``` text example01-01-1,example01-02-1 @@ -82,7 +82,7 @@ example01-{01..02}-1 If you have multiple pairs of curly brackets, it generates the direct product of the corresponding sets. -Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (\|). In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load_balancing](../../operations/settings/settings.md) setting. This example specifies two shards that each have two replicas: +Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (\|). In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load_balancing](../../operations/settings/settings.md#settings-load_balancing) setting. This example specifies two shards that each have two replicas: ``` text example01-{01..02}-{1|2} diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 76a0e042ea4..285ec862aab 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -3,33 +3,35 @@ toc_priority: 45 toc_title: s3 --- -# s3 {#s3} +# S3 Table Function {#s3-table-function} -Provides table-like interface to select/insert files in S3. This table function is similar to [hdfs](../../sql-reference/table-functions/hdfs.md). +Provides table-like interface to select/insert files in [Amazon S3](https://aws.amazon.com/s3/). This table function is similar to [hdfs](../../sql-reference/table-functions/hdfs.md), but provides S3-specific features. + +**Syntax** ``` sql s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) ``` -**Input parameters** +**Arguments** -- `path` — Bucket url with path to file. Supports following wildcards in readonly mode: *, ?, {abc,def} and {N..M} where N, M — numbers, `’abc’, ‘def’ — strings. +- `path` — Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [here](../../engines/table-engines/integrations/s3.md#wildcards-in-path). - `format` — The [format](../../interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — Parameter is optional. Supported values: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. By default, it will autodetect compression by file extension. +- `compression` — Parameter is optional. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. **Returned value** A table with the specified structure for reading or writing data in the specified file. 
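The examples below omit the optional credential arguments. For a private bucket, the `aws_access_key_id` and `aws_secret_access_key` arguments from the syntax above would be passed before the format; a hypothetical sketch (the key values are placeholders, not real credentials):

``` sql
-- Hypothetical sketch: the two extra arguments before the format are the S3 credentials.
SELECT count(*)
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv', 'AKIAIOSFODNN7EXAMPLE', 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32');
```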
-**Example** +**Examples** -Table from S3 file `https://storage.yandexcloud.net/my-test-bucket-768/data.csv` and selection of the first two rows from it: +Selecting the first two rows from the table from S3 file `https://storage.yandexcloud.net/my-test-bucket-768/data.csv`: ``` sql SELECT * FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 +LIMIT 2; ``` ``` text @@ -44,7 +46,7 @@ The similar but from file with `gzip` compression: ``` sql SELECT * FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip') -LIMIT 2 +LIMIT 2; ``` ``` text @@ -54,33 +56,20 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` -**Globs in path** +## Usage {#usage-examples} -Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). +Suppose that we have several files with following URIs on S3: -- `*` — Substitutes any number of any characters except `/` including empty string. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. N and M can have leading zeroes e.g. `000..078`. +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_4.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_4.csv' -Constructions with `{}` are similar to the [remote table function](../../sql-reference/table-functions/remote.md)). - -**Example** - -1. Suppose that we have several files with following URIs on S3: - -- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_4.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv’ -- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_4.csv’ - -2. Query the amount of rows in files end with number from 1 to 3: - - +Count the amount of rows in files ending with numbers from 1 to 3: ``` sql SELECT count(*) @@ -93,9 +82,7 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi └─────────┘ ``` -3. 
Query the amount of rows in all files of these two directories: - - +Count the total amount of rows in all files in these two directories: ``` sql SELECT count(*) @@ -108,17 +95,14 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi └─────────┘ ``` - !!! warning "Warning" If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -**Example** - -Query the data from files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: ``` sql SELECT count(*) -FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32') +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32'); ``` ``` text @@ -127,43 +111,22 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000 └─────────┘ ``` -**Data insert** - -The S3 table function may be used for data insert as well. - -**Example** - -Insert a data into file `test-data.csv.gz`: +Insert data into file `test-data.csv.gz`: ``` sql INSERT INTO s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') -VALUES ('test-data', 1), ('test-data-2', 2) +VALUES ('test-data', 1), ('test-data-2', 2); ``` -Insert a data into file `test-data.csv.gz` from existing table: +Insert data into file `test-data.csv.gz` from existing table: ``` sql INSERT INTO s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') -SELECT name, value FROM existing_table +SELECT name, value FROM existing_table; ``` -## Virtual Columns {#virtual-columns} - -- `_path` — Path to the file. -- `_file` — Name of the file. - -## S3-related settings {#settings} - -The following settings can be set before query execution or placed into configuration file. - -- `s3_max_single_part_upload_size` — Default value is `64Mb`. The maximum size of object to upload using singlepart upload to S3. -- `s3_min_upload_part_size` — Default value is `512Mb`. The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). -- `s3_max_redirects` — Default value is `10`. Max number of S3 redirects hops allowed. - -Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. 
- **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [S3 engine](../../engines/table-engines/integrations/s3.md) -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/s3/) +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/s3/) diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index d70774b7588..2192b69d006 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -15,25 +15,25 @@ toc_title: url url(URL, format, structure) ``` -**Input parameters** +**Parameters** -- `URL` - HTTP or HTTPS server address, which can accept `GET` (for `SELECT`) or `POST` (for `INSERT`) requests. Type: [String](../../sql-reference/data-types/string.md). -- `format` - [Format](../../interfaces/formats.md#formats) of the data. Type: [String](../../sql-reference/data-types/string.md). -- `structure` - Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). +- `URL` — HTTP or HTTPS server address, which can accept `GET` or `POST` requests (for `SELECT` or `INSERT` queries correspondingly). Type: [String](../../sql-reference/data-types/string.md). +- `format` — [Format](../../interfaces/formats.md#formats) of the data. Type: [String](../../sql-reference/data-types/string.md). +- `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). **Returned value** -A table with the specified format and structure and with data from the defined URL. +A table with the specified format and structure and with data from the defined `URL`. **Examples** -Getting the first 3 lines of a table that contains columns of `String` and `UInt32` type from HTTP-server which answers in `CSV` format. +Getting the first 3 lines of a table that contains columns of `String` and [UInt32](../../sql-reference/data-types/int-uint.md) type from HTTP-server which answers in [CSV](../../interfaces/formats.md#csv) format. 
``` sql SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; ``` -Inserting data from a URL into a table: +Inserting data from a `URL` into a table: ``` sql CREATE TABLE test_table (column1 String, column2 UInt32) ENGINE=Memory; diff --git a/docs/en/sql-reference/table-functions/view.md b/docs/en/sql-reference/table-functions/view.md index 08096c2b019..e49a9f5218b 100644 --- a/docs/en/sql-reference/table-functions/view.md +++ b/docs/en/sql-reference/table-functions/view.md @@ -37,7 +37,7 @@ Input table: Query: ``` sql -SELECT * FROM view(SELECT name FROM months) +SELECT * FROM view(SELECT name FROM months); ``` Result: @@ -54,14 +54,15 @@ Result: You can use the `view` function as a parameter of the [remote](https://clickhouse.tech/docs/en/sql-reference/table-functions/remote/#remote-remotesecure) and [cluster](https://clickhouse.tech/docs/en/sql-reference/table-functions/cluster/#cluster-clusterallreplicas) table functions: ``` sql -SELECT * FROM remote(`127.0.0.1`, view(SELECT a, b, c FROM table_name)) +SELECT * FROM remote(`127.0.0.1`, view(SELECT a, b, c FROM table_name)); ``` ``` sql -SELECT * FROM cluster(`cluster_name`, view(SELECT a, b, c FROM table_name)) +SELECT * FROM cluster(`cluster_name`, view(SELECT a, b, c FROM table_name)); ``` **See Also** - [View Table Engine](https://clickhouse.tech/docs/en/engines/table-engines/special/view/) -[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/view/) \ No newline at end of file + +[Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/view/) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index cbf03a44d46..a646347ea60 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio | `GROUPS` frame | not supported | | Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported | | `rank()`, `dense_rank()`, `row_number()` | supported | -| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| `lag/lead(value, offset)` | Not supported. Workarounds: | +| | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | ## References diff --git a/docs/es/commercial/cloud.md b/docs/es/commercial/cloud.md deleted file mode 100644 index bc593a82ad7..00000000000 --- a/docs/es/commercial/cloud.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 1 -toc_title: Nube ---- - -# Proveedores de servicios en la nube de ClickHouse {#clickhouse-cloud-service-providers} - -!!! info "INFO" - Si ha lanzado una nube pública con el servicio ClickHouse administrado, no dude en [abrir una solicitud de extracción](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/cloud.md) añadiéndolo a la siguiente lista. 
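Returning to the `lagInFrame`/`leadInFrame` workaround noted in the window-functions table above, a minimal sketch of emulating `lag(value, 1)` could look like the following (a hypothetical example over `numbers()`; depending on the server version, `allow_experimental_window_functions = 1` may also need to be set):

``` sql
-- lagInFrame respects the window frame, so an unbounded frame makes it behave like lag(value, 1).
SELECT
    number,
    lagInFrame(number, 1) OVER (ORDER BY number ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS prev_number
FROM numbers(5);
```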
- -## Nube de Yandex {#yandex-cloud} - -[Servicio administrado de Yandex para ClickHouse](https://cloud.yandex.com/services/managed-clickhouse?utm_source=referrals&utm_medium=clickhouseofficialsite&utm_campaign=link3) proporciona las siguientes características clave: - -- Servicio ZooKeeper totalmente gestionado para [Replicación de ClickHouse](../engines/table-engines/mergetree-family/replication.md) -- Múltiples opciones de tipo de almacenamiento -- Réplicas en diferentes zonas de disponibilidad -- Cifrado y aislamiento -- Mantenimiento automatizado - -{## [Artículo Original](https://clickhouse.tech/docs/en/commercial/cloud/) ##} diff --git a/docs/es/commercial/index.md b/docs/es/commercial/index.md deleted file mode 100644 index b367631ae1c..00000000000 --- a/docs/es/commercial/index.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Comercial -toc_priority: 70 -toc_title: Comercial ---- - - diff --git a/docs/es/commercial/support.md b/docs/es/commercial/support.md deleted file mode 100644 index a817d90dcb5..00000000000 --- a/docs/es/commercial/support.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 3 -toc_title: Apoyo ---- - -# Proveedores de servicios de soporte comercial ClickHouse {#clickhouse-commercial-support-service-providers} - -!!! info "INFO" - Si ha lanzado un servicio de soporte comercial ClickHouse, no dude en [abrir una solicitud de extracción](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/support.md) añadiéndolo a la siguiente lista. - -## Altinidad {#altinity} - -Altinity ha ofrecido soporte y servicios empresariales ClickHouse desde 2017. Los clientes de Altinity van desde empresas Fortune 100 hasta startups. Visitar [Más información](https://www.altinity.com/) para más información. - -## Mafiree {#mafiree} - -[Descripción del servicio](http://mafiree.com/clickhouse-analytics-services.php) - -## MinervaDB {#minervadb} - -[Descripción del servicio](https://minervadb.com/index.php/clickhouse-consulting-and-support-by-minervadb/) diff --git a/docs/es/development/architecture.md b/docs/es/development/architecture.md deleted file mode 100644 index 1620a58a3a0..00000000000 --- a/docs/es/development/architecture.md +++ /dev/null @@ -1,203 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 62 -toc_title: "Descripci\xF3n general de la arquitectura ClickHouse" ---- - -# Descripción general de la arquitectura ClickHouse {#overview-of-clickhouse-architecture} - -ClickHouse es un verdadero DBMS orientado a columnas. Los datos se almacenan por columnas y durante la ejecución de matrices (vectores o fragmentos de columnas). Siempre que sea posible, las operaciones se envían en matrices, en lugar de en valores individuales. Se llama “vectorized query execution,” y ayuda a reducir el costo del procesamiento de datos real. - -> Esta idea no es nada nuevo. Se remonta a la `APL` lenguaje de programación y sus descendientes: `A +`, `J`, `K`, y `Q`. La programación de matrices se utiliza en el procesamiento de datos científicos. Tampoco es esta idea algo nuevo en las bases de datos relacionales: por ejemplo, se usa en el `Vectorwise` sistema. 
- -Existen dos enfoques diferentes para acelerar el procesamiento de consultas: la ejecución de consultas vectorizadas y la generación de código en tiempo de ejecución. Este último elimina toda la indirección y el despacho dinámico. Ninguno de estos enfoques es estrictamente mejor que el otro. La generación de código de tiempo de ejecución puede ser mejor cuando fusiona muchas operaciones, utilizando así las unidades de ejecución de la CPU y la canalización. La ejecución de consultas vectorizadas puede ser menos práctica porque implica vectores temporales que deben escribirse en la memoria caché y leerse. Si los datos temporales no caben en la memoria caché L2, esto se convierte en un problema. Pero la ejecución de consultas vectorizadas utiliza más fácilmente las capacidades SIMD de la CPU. Un [documento de investigación](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf) escrito por nuestros amigos muestra que es mejor combinar ambos enfoques. ClickHouse utiliza la ejecución de consultas vectorizadas y tiene un soporte inicial limitado para la generación de código en tiempo de ejecución. - -## Columna {#columns} - -`IColumn` interfaz se utiliza para representar columnas en la memoria (en realidad, fragmentos de columnas). Esta interfaz proporciona métodos auxiliares para la implementación de varios operadores relacionales. Casi todas las operaciones son inmutables: no modifican la columna original, sino que crean una nueva modificada. Por ejemplo, el `IColumn :: filter` método acepta una máscara de bytes de filtro. Se utiliza para el `WHERE` y `HAVING` operadores relacionales. Ejemplos adicionales: el `IColumn :: permute` para apoyar `ORDER BY`, el `IColumn :: cut` para apoyar `LIMIT`. - -Diversos `IColumn` aplicación (`ColumnUInt8`, `ColumnString`, y así sucesivamente) son responsables del diseño de memoria de las columnas. El diseño de memoria suele ser una matriz contigua. Para el tipo entero de columnas, es solo una matriz contigua, como `std :: vector`. Para `String` y `Array` columnas, son dos vectores: uno para todos los elementos de la matriz, colocados contiguamente, y un segundo para los desplazamientos al comienzo de cada matriz. También hay `ColumnConst` que almacena solo un valor en la memoria, pero parece una columna. - -## Campo {#field} - -Sin embargo, también es posible trabajar con valores individuales. Para representar un valor individual, el `Field` se utiliza. `Field` es sólo una unión discriminada de `UInt64`, `Int64`, `Float64`, `String` y `Array`. `IColumn` tiene el `operator[]` para obtener el valor n-ésimo como un `Field` y el `insert` método para agregar un `Field` al final de una columna. Estos métodos no son muy eficientes, ya que requieren tratar con temporal `Field` objetos que representan un valor individual. Hay métodos más eficientes, tales como `insertFrom`, `insertRangeFrom` y así sucesivamente. - -`Field` no tiene suficiente información sobre un tipo de datos específico para una tabla. Por ejemplo, `UInt8`, `UInt16`, `UInt32`, y `UInt64` todos están representados como `UInt64` en una `Field`. - -## Abstracciones con fugas {#leaky-abstractions} - -`IColumn` tiene métodos para transformaciones relacionales comunes de datos, pero no satisfacen todas las necesidades. Por ejemplo, `ColumnUInt64` no tiene un método para calcular la suma de dos columnas, y `ColumnString` no tiene un método para ejecutar una búsqueda de subcadena. Estas innumerables rutinas se implementan fuera de `IColumn`. 
- -Varias funciones en columnas se pueden implementar de una manera genérica, no eficiente utilizando `IColumn` para extraer `Field` valores, o de una manera especializada utilizando el conocimiento del diseño de la memoria interna de los datos en un `IColumn` aplicación. Se implementa mediante la conversión de funciones a un `IColumn` escriba y trate con la representación interna directamente. Por ejemplo, `ColumnUInt64` tiene el `getData` método que devuelve una referencia a una matriz interna, luego una rutina separada lee o llena esa matriz directamente. Tenemos “leaky abstractions” para permitir especializaciones eficientes de varias rutinas. - -## Tipos de datos {#data_types} - -`IDataType` es responsable de la serialización y deserialización: para leer y escribir fragmentos de columnas o valores individuales en formato binario o de texto. `IDataType` corresponde directamente a los tipos de datos en las tablas. Por ejemplo, hay `DataTypeUInt32`, `DataTypeDateTime`, `DataTypeString` y así sucesivamente. - -`IDataType` y `IColumn` están vagamente relacionados entre sí. Diferentes tipos de datos se pueden representar en la memoria por el mismo `IColumn` aplicación. Por ejemplo, `DataTypeUInt32` y `DataTypeDateTime` están representados por `ColumnUInt32` o `ColumnConstUInt32`. Además, el mismo tipo de datos se puede representar mediante `IColumn` aplicación. Por ejemplo, `DataTypeUInt8` puede ser representado por `ColumnUInt8` o `ColumnConstUInt8`. - -`IDataType` sólo almacena metadatos. Por ejemplo, `DataTypeUInt8` no almacena nada en absoluto (excepto vptr) y `DataTypeFixedString` tiendas solo `N` (el tamaño de las cadenas de tamaño fijo). - -`IDataType` tiene métodos auxiliares para varios formatos de datos. Los ejemplos son métodos para serializar un valor con posibles citas, para serializar un valor para JSON y para serializar un valor como parte del formato XML. No hay correspondencia directa con los formatos de datos. Por ejemplo, los diferentes formatos de datos `Pretty` y `TabSeparated` puede utilizar el mismo `serializeTextEscaped` método de ayuda de la `IDataType` interfaz. - -## Bloque {#block} - -A `Block` es un contenedor que representa un subconjunto (porción) de una tabla en la memoria. Es sólo un conjunto de triples: `(IColumn, IDataType, column name)`. Durante la ejecución de la consulta, los datos son procesados por `Block`s. Si tenemos un `Block`, tenemos datos (en el `IColumn` objeto), tenemos información sobre su tipo (en `IDataType`) que nos dice cómo lidiar con esa columna, y tenemos el nombre de la columna. Podría ser el nombre de columna original de la tabla o algún nombre artificial asignado para obtener resultados temporales de los cálculos. - -Cuando calculamos alguna función sobre columnas en un bloque, agregamos otra columna con su resultado al bloque, y no tocamos columnas para argumentos de la función porque las operaciones son inmutables. Más tarde, las columnas innecesarias se pueden eliminar del bloque, pero no se pueden modificar. Es conveniente para la eliminación de subexpresiones comunes. - -Se crean bloques para cada fragmento de datos procesado. Tenga en cuenta que para el mismo tipo de cálculo, los nombres y tipos de columna siguen siendo los mismos para diferentes bloques y solo cambian los datos de columna. Es mejor dividir los datos del bloque desde el encabezado del bloque porque los tamaños de bloque pequeños tienen una gran sobrecarga de cadenas temporales para copiar shared_ptrs y nombres de columna. 
- -## Bloquear flujos {#block-streams} - -Los flujos de bloques son para procesar datos. Usamos flujos de bloques para leer datos de algún lugar, realizar transformaciones de datos o escribir datos en algún lugar. `IBlockInputStream` tiene el `read` método para buscar el siguiente bloque mientras esté disponible. `IBlockOutputStream` tiene el `write` método para empujar el bloque en alguna parte. - -Los flujos son responsables de: - -1. Leer o escribir en una mesa. La tabla solo devuelve una secuencia para leer o escribir bloques. -2. Implementación de formatos de datos. Por ejemplo, si desea enviar datos a un terminal en `Pretty` formato, crea un flujo de salida de bloque donde presiona bloques y los formatea. -3. Realización de transformaciones de datos. Digamos que tienes `IBlockInputStream` y desea crear una secuencia filtrada. Usted crea `FilterBlockInputStream` e inicializarlo con su transmisión. Luego, cuando tiras de un bloque de `FilterBlockInputStream`, extrae un bloque de su flujo, lo filtra y le devuelve el bloque filtrado. Las canalizaciones de ejecución de consultas se representan de esta manera. - -Hay transformaciones más sofisticadas. Por ejemplo, cuando tiras de `AggregatingBlockInputStream`, lee todos los datos de su origen, los agrega y, a continuación, devuelve un flujo de datos agregados para usted. Otro ejemplo: `UnionBlockInputStream` acepta muchas fuentes de entrada en el constructor y también una serie de subprocesos. Lanza múltiples hilos y lee de múltiples fuentes en paralelo. - -> Las secuencias de bloques usan el “pull” enfoque para controlar el flujo: cuando extrae un bloque de la primera secuencia, en consecuencia extrae los bloques requeridos de las secuencias anidadas, y toda la tubería de ejecución funcionará. Ni “pull” ni “push” es la mejor solución, porque el flujo de control está implícito y eso limita la implementación de varias características, como la ejecución simultánea de múltiples consultas (fusionando muchas tuberías). Esta limitación podría superarse con coroutines o simplemente ejecutando hilos adicionales que se esperan el uno al otro. Podemos tener más posibilidades si hacemos explícito el flujo de control: si localizamos la lógica para pasar datos de una unidad de cálculo a otra fuera de esas unidades de cálculo. Lea esto [artículo](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) para más pensamientos. - -Debemos tener en cuenta que la canalización de ejecución de consultas crea datos temporales en cada paso. Tratamos de mantener el tamaño del bloque lo suficientemente pequeño para que los datos temporales se ajusten a la memoria caché de la CPU. Con esa suposición, escribir y leer datos temporales es casi gratis en comparación con otros cálculos. Podríamos considerar una alternativa, que es fusionar muchas operaciones en la tubería. Podría hacer que la tubería sea lo más corta posible y eliminar gran parte de los datos temporales, lo que podría ser una ventaja, pero también tiene inconvenientes. Por ejemplo, una canalización dividida facilita la implementación de almacenamiento en caché de datos intermedios, el robo de datos intermedios de consultas similares que se ejecutan al mismo tiempo y la fusión de canalizaciones para consultas similares. - -## Formato {#formats} - -Los formatos de datos se implementan con flujos de bloques. Hay “presentational” sólo es adecuado para la salida de datos al cliente, tales como `Pretty` formato, que proporciona sólo `IBlockOutputStream`. 
Y hay formatos de entrada / salida, como `TabSeparated` o `JSONEachRow`. - -También hay secuencias de filas: `IRowInputStream` y `IRowOutputStream`. Permiten pull/push datos por filas individuales, no por bloques. Y solo son necesarios para simplificar la implementación de formatos orientados a filas. Envoltura `BlockInputStreamFromRowInputStream` y `BlockOutputStreamFromRowOutputStream` le permite convertir flujos orientados a filas en flujos regulares orientados a bloques. - -## I/O {#io} - -Para la entrada / salida orientada a bytes, hay `ReadBuffer` y `WriteBuffer` clases abstractas. Se usan en lugar de C ++ `iostream`s. No se preocupe: cada proyecto maduro de C ++ está usando algo más que `iostream`s por buenas razones. - -`ReadBuffer` y `WriteBuffer` son solo un búfer contiguo y un cursor apuntando a la posición en ese búfer. Las implementaciones pueden poseer o no la memoria del búfer. Hay un método virtual para llenar el búfer con los siguientes datos (para `ReadBuffer`) o para vaciar el búfer en algún lugar (para `WriteBuffer`). Los métodos virtuales rara vez se llaman. - -Implementaciones de `ReadBuffer`/`WriteBuffer` se utilizan para trabajar con archivos y descriptores de archivos y sockets de red, para implementar la compresión (`CompressedWriteBuffer` is initialized with another WriteBuffer and performs compression before writing data to it), and for other purposes – the names `ConcatReadBuffer`, `LimitReadBuffer`, y `HashingWriteBuffer` hablar por sí mismos. - -Read / WriteBuffers solo se ocupan de bytes. Hay funciones de `ReadHelpers` y `WriteHelpers` archivos de encabezado para ayudar con el formato de entrada / salida. Por ejemplo, hay ayudantes para escribir un número en formato decimal. - -Veamos qué sucede cuando quieres escribir un conjunto de resultados en `JSON` formato a stdout. Tiene un conjunto de resultados listo para ser recuperado de `IBlockInputStream`. Usted crea `WriteBufferFromFileDescriptor(STDOUT_FILENO)` para escribir bytes en stdout. Usted crea `JSONRowOutputStream`, inicializado con eso `WriteBuffer` para escribir filas en `JSON` a stdout. Usted crea `BlockOutputStreamFromRowOutputStream` encima de él, para representarlo como `IBlockOutputStream`. Entonces usted llama `copyData` para transferir datos desde `IBlockInputStream` a `IBlockOutputStream` y todo funciona. Internamente, `JSONRowOutputStream` escribirá varios delimitadores JSON y llamará al `IDataType::serializeTextJSON` con una referencia a `IColumn` y el número de fila como argumentos. Consecuentemente, `IDataType::serializeTextJSON` llamará a un método de `WriteHelpers.h`: por ejemplo, `writeText` para tipos numéricos y `writeJSONString` para `DataTypeString`. - -## Tabla {#tables} - -El `IStorage` interfaz representa tablas. Las diferentes implementaciones de esa interfaz son diferentes motores de tabla. Los ejemplos son `StorageMergeTree`, `StorageMemory` y así sucesivamente. Las instancias de estas clases son solo tablas. - -Clave `IStorage` son `read` y `write`. También hay `alter`, `rename`, `drop` y así sucesivamente. El `read` método acepta los siguientes argumentos: el conjunto de columnas para leer de una tabla, el `AST` consulta a considerar, y el número deseado de flujos para devolver. Devuelve uno o varios `IBlockInputStream` objetos e información sobre la etapa de procesamiento de datos que se completó dentro de un motor de tablas durante la ejecución de la consulta. 
- -En la mayoría de los casos, el método de lectura solo es responsable de leer las columnas especificadas de una tabla, no de ningún procesamiento de datos adicional. Todo el procesamiento de datos adicional es realizado por el intérprete de consultas y está fuera de la responsabilidad de `IStorage`. - -Pero hay excepciones notables: - -- La consulta AST se pasa al `read` método, y el motor de tablas puede usarlo para derivar el uso del índice y leer menos datos de una tabla. -- A veces, el motor de tablas puede procesar los datos a una etapa específica. Por ejemplo, `StorageDistributed` puede enviar una consulta a servidores remotos, pedirles que procesen datos a una etapa donde se puedan fusionar datos de diferentes servidores remotos y devolver esos datos preprocesados. El intérprete de consultas termina de procesar los datos. - -Tabla `read` método puede devolver múltiples `IBlockInputStream` objetos para permitir el procesamiento de datos en paralelo. Estos flujos de entrada de bloques múltiples pueden leer de una tabla en paralelo. A continuación, puede ajustar estas secuencias con varias transformaciones (como la evaluación de expresiones o el filtrado) que se pueden calcular de forma independiente y crear un `UnionBlockInputStream` encima de ellos, para leer desde múltiples flujos en paralelo. - -También hay `TableFunction`s. Estas son funciones que devuelven un `IStorage` objeto a utilizar en el `FROM` cláusula de una consulta. - -Para tener una idea rápida de cómo implementar su motor de tabla, vea algo simple, como `StorageMemory` o `StorageTinyLog`. - -> Como resultado de la `read` método, `IStorage` devoluciones `QueryProcessingStage` – information about what parts of the query were already calculated inside storage. - -## Analizador {#parsers} - -Un analizador de descenso recursivo escrito a mano analiza una consulta. Por ejemplo, `ParserSelectQuery` simplemente llama recursivamente a los analizadores subyacentes para varias partes de la consulta. Los analizadores crean un `AST`. El `AST` está representado por nodos, que son instancias de `IAST`. - -> Los generadores de analizadores no se utilizan por razones históricas. - -## Interprete {#interpreters} - -Los intérpretes son responsables de crear la canalización de ejecución de consultas `AST`. Hay intérpretes simples, como `InterpreterExistsQuery` y `InterpreterDropQuery` o el más sofisticado `InterpreterSelectQuery`. La canalización de ejecución de consultas es una combinación de flujos de entrada o salida de bloques. Por ejemplo, el resultado de interpretar el `SELECT` la consulta es la `IBlockInputStream` para leer el conjunto de resultados; el resultado de la consulta INSERT es el `IBlockOutputStream` para escribir datos para su inserción, y el resultado de interpretar el `INSERT SELECT` la consulta es la `IBlockInputStream` que devuelve un conjunto de resultados vacío en la primera lectura, pero que copia datos de `SELECT` a `INSERT` al mismo tiempo. - -`InterpreterSelectQuery` utilizar `ExpressionAnalyzer` y `ExpressionActions` maquinaria para el análisis de consultas y transformaciones. Aquí es donde se realizan la mayoría de las optimizaciones de consultas basadas en reglas. `ExpressionAnalyzer` es bastante complicado y debe reescribirse: se deben extraer varias transformaciones de consultas y optimizaciones para separar clases para permitir transformaciones modulares o consultas. - -## Función {#functions} - -Hay funciones ordinarias y funciones agregadas. 
Para las funciones agregadas, consulte la siguiente sección. - -Ordinary functions don't change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`de datos para implementar la ejecución de consultas vectorizadas. - -Hay algunas funciones diversas, como [BlockSize](../sql-reference/functions/other-functions.md#function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#function-rownumberinblock), y [runningAccumulate](../sql-reference/functions/other-functions.md#function-runningaccumulate), que explotan el procesamiento de bloques y violan la independencia de las filas. - -ClickHouse tiene una tipificación fuerte, por lo que no hay conversión de tipo implícita. Si una función no admite una combinación específica de tipos, produce una excepción. Pero las funciones pueden funcionar (estar sobrecargadas) para muchas combinaciones diferentes de tipos. Por ejemplo, el `plus` función (para implementar el `+` operador) funciona para cualquier combinación de tipos numéricos: `UInt8` + `Float32`, `UInt16` + `Int8` y así sucesivamente. Además, algunas funciones variadas pueden aceptar cualquier número de argumentos, como el `concat` función. - -Implementar una función puede ser un poco inconveniente porque una función distribuye explícitamente tipos de datos compatibles y `IColumns`. Por ejemplo, el `plus` La función tiene código generado por la creación de instancias de una plantilla de C ++ para cada combinación de tipos numéricos y argumentos izquierdo y derecho constantes o no constantes. - -Es un excelente lugar para implementar la generación de código en tiempo de ejecución para evitar la hinchazón del código de plantilla. Además, permite agregar funciones fusionadas como multiplicar-agregar fusionado o hacer comparaciones múltiples en una iteración de bucle. - -Debido a la ejecución de consultas vectorizadas, las funciones no se cortocircuitan. Por ejemplo, si escribe `WHERE f(x) AND g(y)`, ambos lados se calculan, incluso para las filas, cuando `f(x)` es cero (excepto cuando `f(x)` es una expresión constante cero). Pero si la selectividad del `f(x)` la condición es alta, y el cálculo de `f(x)` es mucho más barato que `g(y)`, es mejor implementar el cálculo de paso múltiple. Primero calcularía `f(x)`, a continuación, filtrar columnas por el resultado, y luego calcular `g(y)` solo para trozos de datos más pequeños y filtrados. - -## Funciones agregadas {#aggregate-functions} - -Las funciones agregadas son funciones con estado. Acumulan valores pasados en algún estado y le permiten obtener resultados de ese estado. Se gestionan con el `IAggregateFunction` interfaz. Los estados pueden ser bastante simples (el estado para `AggregateFunctionCount` es sólo una sola `UInt64` valor) o bastante complejo (el estado de `AggregateFunctionUniqCombined` es una combinación de una matriz lineal, una tabla hash, y un `HyperLogLog` estructura de datos probabilística). - -Los Estados están asignados en `Arena` (un grupo de memoria) para tratar con múltiples estados mientras se ejecuta una alta cardinalidad `GROUP BY` consulta. Los estados pueden tener un constructor y destructor no trivial: por ejemplo, los estados de agregación complicados pueden asignar memoria adicional ellos mismos. Requiere cierta atención a la creación y destrucción de estados y a la adecuada aprobación de su orden de propiedad y destrucción. 
- -Los estados de agregación se pueden serializar y deserializar para pasar a través de la red durante la ejecución de consultas distribuidas o para escribirlos en el disco donde no hay suficiente RAM. Incluso se pueden almacenar en una tabla con el `DataTypeAggregateFunction` para permitir la agregación incremental de datos. - -> El formato de datos serializados para los estados de función agregados no tiene versiones en este momento. Está bien si los estados agregados solo se almacenan temporalmente. Pero tenemos el `AggregatingMergeTree` motor de tabla para la agregación incremental, y la gente ya lo está utilizando en producción. Es la razón por la que se requiere compatibilidad con versiones anteriores al cambiar el formato serializado para cualquier función agregada en el futuro. - -## Servidor {#server} - -El servidor implementa varias interfaces diferentes: - -- Una interfaz HTTP para cualquier cliente extranjero. -- Una interfaz TCP para el cliente nativo de ClickHouse y para la comunicación entre servidores durante la ejecución de consultas distribuidas. -- Una interfaz para transferir datos para la replicación. - -Internamente, es solo un servidor multiproceso primitivo sin corutinas ni fibras. Dado que el servidor no está diseñado para procesar una alta tasa de consultas simples, sino para procesar una tasa relativamente baja de consultas complejas, cada uno de ellos puede procesar una gran cantidad de datos para análisis. - -El servidor inicializa el `Context` clase con el entorno necesario para la ejecución de consultas: la lista de bases de datos disponibles, usuarios y derechos de acceso, configuración, clústeres, la lista de procesos, el registro de consultas, etc. Los intérpretes utilizan este entorno. - -Mantenemos una compatibilidad total con versiones anteriores y posteriores para el protocolo TCP del servidor: los clientes antiguos pueden hablar con servidores nuevos y los nuevos clientes pueden hablar con servidores antiguos. Pero no queremos mantenerlo eternamente, y estamos eliminando el soporte para versiones antiguas después de aproximadamente un año. - -!!! note "Nota" - Para la mayoría de las aplicaciones externas, recomendamos usar la interfaz HTTP porque es simple y fácil de usar. El protocolo TCP está más estrechamente vinculado a las estructuras de datos internas: utiliza un formato interno para pasar bloques de datos y utiliza marcos personalizados para datos comprimidos. No hemos lanzado una biblioteca C para ese protocolo porque requiere vincular la mayor parte de la base de código ClickHouse, lo cual no es práctico. - -## Ejecución de consultas distribuidas {#distributed-query-execution} - -Los servidores de una configuración de clúster son en su mayoría independientes. Puede crear un `Distributed` en uno o todos los servidores de un clúster. El `Distributed` table does not store data itself – it only provides a “view” a todas las tablas locales en varios nodos de un clúster. Cuando se SELECCIONA desde un `Distributed` tabla, reescribe esa consulta, elige nodos remotos de acuerdo con la configuración de equilibrio de carga y les envía la consulta. El `Distributed` table solicita a los servidores remotos que procesen una consulta hasta una etapa en la que se pueden fusionar resultados intermedios de diferentes servidores. Luego recibe los resultados intermedios y los fusiona. La tabla distribuida intenta distribuir tanto trabajo como sea posible a servidores remotos y no envía muchos datos intermedios a través de la red. 
- -Las cosas se vuelven más complicadas cuando tiene subconsultas en cláusulas IN o JOIN, y cada una de ellas usa un `Distributed` tabla. Tenemos diferentes estrategias para la ejecución de estas consultas. - -No existe un plan de consulta global para la ejecución de consultas distribuidas. Cada nodo tiene su plan de consulta local para su parte del trabajo. Solo tenemos una ejecución simple de consultas distribuidas de un solo paso: enviamos consultas para nodos remotos y luego fusionamos los resultados. Pero esto no es factible para consultas complicadas con alta cardinalidad GROUP BY o con una gran cantidad de datos temporales para JOIN. En tales casos, necesitamos “reshuffle” datos entre servidores, lo que requiere una coordinación adicional. ClickHouse no admite ese tipo de ejecución de consultas, y tenemos que trabajar en ello. - -## Árbol de fusión {#merge-tree} - -`MergeTree` es una familia de motores de almacenamiento que admite la indexación por clave principal. La clave principal puede ser una tupla arbitraria de columnas o expresiones. Datos en un `MergeTree` se almacena en “parts”. Cada parte almacena los datos en el orden de la clave principal, por lo que la tupla de la clave principal ordena los datos lexicográficamente. Todas las columnas de la tabla se almacenan en `column.bin` archivos en estas partes. Los archivos consisten en bloques comprimidos. Cada bloque suele ser de 64 KB a 1 MB de datos sin comprimir, dependiendo del tamaño del valor promedio. Los bloques constan de valores de columna colocados contiguamente uno tras otro. Los valores de columna están en el mismo orden para cada columna (la clave principal define el orden), por lo que cuando itera por muchas columnas, obtiene valores para las filas correspondientes. - -La clave principal en sí es “sparse”. No aborda cada fila, sino solo algunos rangos de datos. Separado `primary.idx` file tiene el valor de la clave principal para cada fila N-ésima, donde se llama N `index_granularity` (generalmente, N = 8192). Además, para cada columna, tenemos `column.mrk` archivos con “marks,” que son desplazamientos a cada fila N-ésima en el archivo de datos. Cada marca es un par: el desplazamiento en el archivo al comienzo del bloque comprimido y el desplazamiento en el bloque descomprimido al comienzo de los datos. Por lo general, los bloques comprimidos están alineados por marcas, y el desplazamiento en el bloque descomprimido es cero. Datos para `primary.idx` siempre reside en la memoria, y los datos para `column.mrk` archivos se almacena en caché. - -Cuando vamos a leer algo de una parte en `MergeTree` miramos `primary.idx` datos y localice rangos que podrían contener datos solicitados, luego mire `column.mrk` datos y calcular compensaciones para dónde comenzar a leer esos rangos. Debido a la escasez, el exceso de datos puede ser leído. ClickHouse no es adecuado para una gran carga de consultas de puntos simples, porque todo el rango con `index_granularity` se deben leer filas para cada clave, y todo el bloque comprimido debe descomprimirse para cada columna. Hicimos que el índice sea disperso porque debemos poder mantener billones de filas por único servidor sin un consumo de memoria notable para el índice. Además, debido a que la clave principal es escasa, no es única: no puede verificar la existencia de la clave en la tabla en el momento de INSERTAR. Podría tener muchas filas con la misma clave en una tabla. 
- -Cuando `INSERT` un montón de datos en `MergeTree`, ese grupo está ordenado por orden de clave primaria y forma una nueva parte. Hay subprocesos de fondo que seleccionan periódicamente algunas partes y las fusionan en una sola parte ordenada para mantener el número de partes relativamente bajo. Es por eso que se llama `MergeTree`. Por supuesto, la fusión conduce a “write amplification”. Todas las partes son inmutables: solo se crean y eliminan, pero no se modifican. Cuando se ejecuta SELECT, contiene una instantánea de la tabla (un conjunto de partes). Después de la fusión, también mantenemos las piezas viejas durante algún tiempo para facilitar la recuperación después de la falla, por lo que si vemos que alguna parte fusionada probablemente esté rota, podemos reemplazarla con sus partes de origen. - -`MergeTree` no es un árbol de LSM porque no contiene “memtable” y “log”: inserted data is written directly to the filesystem. This makes it suitable only to INSERT data in batches, not by individual row and not very frequently – about once per second is ok, but a thousand times a second is not. We did it this way for simplicity's sake, and because we are already inserting data in batches in our applications. - -> Las tablas MergeTree solo pueden tener un índice (primario): no hay índices secundarios. Sería bueno permitir múltiples representaciones físicas bajo una tabla lógica, por ejemplo, para almacenar datos en más de un orden físico o incluso para permitir representaciones con datos preagregados junto con datos originales. - -Hay motores MergeTree que están haciendo un trabajo adicional durante las fusiones en segundo plano. Los ejemplos son `CollapsingMergeTree` y `AggregatingMergeTree`. Esto podría tratarse como soporte especial para actualizaciones. Tenga en cuenta que estas no son actualizaciones reales porque los usuarios generalmente no tienen control sobre el tiempo en que se ejecutan las fusiones en segundo plano y los datos en un `MergeTree` casi siempre se almacena en más de una parte, no en forma completamente fusionada. - -## Replicación {#replication} - -La replicación en ClickHouse se puede configurar por tabla. Podría tener algunas tablas replicadas y otras no replicadas en el mismo servidor. También puede tener tablas replicadas de diferentes maneras, como una tabla con replicación de dos factores y otra con replicación de tres factores. - -La replicación se implementa en el `ReplicatedMergeTree` motor de almacenamiento. El camino en `ZooKeeper` se especifica como un parámetro para el motor de almacenamiento. Todas las tablas con la misma ruta en `ZooKeeper` se convierten en réplicas entre sí: sincronizan sus datos y mantienen la coherencia. Las réplicas se pueden agregar y eliminar dinámicamente simplemente creando o soltando una tabla. - -La replicación utiliza un esquema multi-maestro asíncrono. Puede insertar datos en cualquier réplica que tenga una sesión con `ZooKeeper`, y los datos se replican en todas las demás réplicas de forma asíncrona. Como ClickHouse no admite UPDATE, la replicación está libre de conflictos. Como no hay reconocimiento de quórum de inserciones, los datos recién insertados pueden perderse si un nodo falla. - -Los metadatos para la replicación se almacenan en ZooKeeper. Hay un registro de replicación que enumera las acciones que se deben realizar. Las acciones son: obtener parte; fusionar partes; soltar una partición, etc. Cada réplica copia el registro de replicación en su cola y, a continuación, ejecuta las acciones desde la cola. 
Replication uses an asynchronous multi-master scheme. You can insert data into any replica that has a session with `ZooKeeper`, and the data is replicated to all other replicas asynchronously. Because ClickHouse does not support UPDATE, replication is conflict-free. As there is no quorum acknowledgment of inserts, just-inserted data might be lost if one node fails.

Metadata for replication is stored in ZooKeeper. There is a replication log that lists which actions to perform. The actions are: get a part; merge parts; drop a partition, and so on. Each replica copies the replication log to its queue and then executes the actions from the queue. For example, on insertion, the "get the part" action is created in the log, and every replica downloads that part. Merges are coordinated between replicas to get byte-identical results. All parts are merged in the same way on all replicas. This is achieved by electing one replica as the leader, and that replica initiates merges and writes "merge parts" actions to the log.

Replication is physical: only compressed parts are transferred between nodes, not queries. Merges are processed on each replica independently in most cases, to lower network costs by avoiding network amplification. Large merged parts are sent over the network only in cases of significant replication lag.

In addition, each replica stores its state in ZooKeeper as the set of parts and their checksums. When the state on the local filesystem diverges from the reference state in ZooKeeper, the replica restores its consistency by downloading missing and broken parts from other replicas. When there is some unexpected or broken data on the local filesystem, ClickHouse does not remove it, but moves it to a separate directory and forgets about it.

!!! note "Note"
    A ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is **not elastic**, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load is supposed to be adjusted to be uneven. This implementation gives you more control, and it is fine for relatively small clusters, such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that spans the whole cluster with dynamically replicated regions that could be split and balanced between clusters automatically.

{## [Original article](https://clickhouse.tech/docs/en/development/architecture/) ##}

diff --git a/docs/es/development/browse-code.md b/docs/es/development/browse-code.md
deleted file mode 100644
index ca031ad03f3..00000000000
--- a/docs/es/development/browse-code.md
+++ /dev/null
@@ -1,14 +0,0 @@
---
machine_translated: true
machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd
toc_priority: 63
toc_title: "Buscar c\xF3digo fuente"
---

# Browse the ClickHouse source code {#browse-clickhouse-source-code}

You can use the **Woboq** online code browser available [here](https://clickhouse.tech/codebrowser/html_report/ClickHouse/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily.

Also, you can browse the sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual.

If you are interested in which IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use any favourite IDE. Vim and Emacs also count.
diff --git a/docs/es/development/build-cross-arm.md b/docs/es/development/build-cross-arm.md deleted file mode 100644 index 2758e9a0e94..00000000000 --- a/docs/es/development/build-cross-arm.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 67 -toc_title: "C\xF3mo construir ClickHouse en Linux para AARCH64 (ARM64)" ---- - -# Cómo construir ClickHouse en Linux para la arquitectura AARCH64 (ARM64) {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture} - -Esto es para el caso cuando tiene una máquina Linux y desea usarla para compilar `clickhouse` binario que se ejecutará en otra máquina Linux con arquitectura de CPU AARCH64. Esto está destinado a las comprobaciones de integración continua que se ejecutan en servidores Linux. - -La compilación cruzada para AARCH64 se basa en el [Instrucciones de construcción](build.md), seguirlos primero. - -# Instalar Clang-8 {#install-clang-8} - -Siga las instrucciones de https://apt.llvm.org/ para la configuración de Ubuntu o Debian. -Por ejemplo, en Ubuntu Bionic puede usar los siguientes comandos: - -``` bash -echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" | sudo tee /etc/apt/sources.list.d/llvm.list -sudo apt-get update -sudo apt-get install clang-8 -``` - -# Instalar conjunto de herramientas de compilación cruzada {#install-cross-compilation-toolset} - -``` bash -cd ClickHouse -mkdir -p build-aarch64/cmake/toolchain/linux-aarch64 -wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en' -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1 -``` - -# Construir ClickHouse {#build-clickhouse} - -``` bash -cd ClickHouse -mkdir build-arm64 -CC=clang-8 CXX=clang++-8 cmake . -Bbuild-arm64 -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake -ninja -C build-arm64 -``` - -El binario resultante se ejecutará solo en Linux con la arquitectura de CPU AARCH64. diff --git a/docs/es/development/build-cross-osx.md b/docs/es/development/build-cross-osx.md deleted file mode 100644 index d00e57c5d31..00000000000 --- a/docs/es/development/build-cross-osx.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 66 -toc_title: "C\xF3mo construir ClickHouse en Linux para Mac OS X" ---- - -# Cómo construir ClickHouse en Linux para Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x} - -Esto es para el caso cuando tiene una máquina Linux y desea usarla para compilar `clickhouse` Esto está destinado a las comprobaciones de integración continuas que se ejecutan en servidores Linux. Si desea crear ClickHouse directamente en Mac OS X, continúe con [otra instrucción](build-osx.md). - -La compilación cruzada para Mac OS X se basa en el [Instrucciones de construcción](build.md), seguirlos primero. - -# Instalar Clang-8 {#install-clang-8} - -Siga las instrucciones de https://apt.llvm.org/ para la configuración de Ubuntu o Debian. 
-Por ejemplo, los comandos para Bionic son como: - -``` bash -sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list -sudo apt-get install clang-8 -``` - -# Instalar conjunto de herramientas de compilación cruzada {#install-cross-compilation-toolset} - -Recordemos la ruta donde instalamos `cctools` como ${CCTOOLS} - -``` bash -mkdir ${CCTOOLS} - -git clone https://github.com/tpoechtrager/apple-libtapi.git -cd apple-libtapi -INSTALLPREFIX=${CCTOOLS} ./build.sh -./install.sh -cd .. - -git clone https://github.com/tpoechtrager/cctools-port.git -cd cctools-port/cctools -./configure --prefix=${CCTOOLS} --with-libtapi=${CCTOOLS} --target=x86_64-apple-darwin -make install -``` - -Además, necesitamos descargar macOS X SDK en el árbol de trabajo. - -``` bash -cd ClickHouse -wget 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.15.sdk.tar.xz' -mkdir -p build-darwin/cmake/toolchain/darwin-x86_64 -tar xJf MacOSX10.15.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1 -``` - -# Construir ClickHouse {#build-clickhouse} - -``` bash -cd ClickHouse -mkdir build-osx -CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake \ - -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ar \ - -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ranlib \ - -DLINKER_NAME=${CCTOOLS}/bin/x86_64-apple-darwin-ld -ninja -C build-osx -``` - -El binario resultante tendrá un formato ejecutable Mach-O y no se puede ejecutar en Linux. diff --git a/docs/es/development/build-osx.md b/docs/es/development/build-osx.md deleted file mode 100644 index 39eba389798..00000000000 --- a/docs/es/development/build-osx.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 65 -toc_title: "C\xF3mo crear ClickHouse en Mac OS X" ---- - -# Cómo crear ClickHouse en Mac OS X {#how-to-build-clickhouse-on-mac-os-x} - -Build debería funcionar en Mac OS X 10.15 (Catalina) - -## Instalar Homebrew {#install-homebrew} - -``` bash -$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -``` - -## Instalar compiladores, herramientas y bibliotecas necesarios {#install-required-compilers-tools-and-libraries} - -``` bash -$ brew install cmake ninja libtool gettext -``` - -## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources} - -``` bash -$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git -``` - -o - -``` bash -$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git - -$ cd ClickHouse -``` - -## Construir ClickHouse {#build-clickhouse} - -``` bash -$ mkdir build -$ cd build -$ cmake .. -DCMAKE_CXX_COMPILER=`which clang++` -DCMAKE_C_COMPILER=`which clang` -$ ninja -$ cd .. -``` - -## Advertencia {#caveats} - -Si tiene la intención de ejecutar clickhouse-server, asegúrese de aumentar la variable maxfiles del sistema. - -!!! info "Nota" - Tendrás que usar sudo. - -Para ello, cree el siguiente archivo: - -/Library/LaunchDaemons/limit.maxfiles.lista: - -``` xml - - - - - Label - limit.maxfiles - ProgramArguments - - launchctl - limit - maxfiles - 524288 - 524288 - - RunAtLoad - - ServiceIPC - - - -``` - -Ejecute el siguiente comando: - -``` bash -$ sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist -``` - -Reiniciar. - -Para verificar si está funcionando, puede usar `ulimit -n` comando. 
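As an alternative to rebooting, it may be possible to load the daemon and re-check the limit right away; this is a sketch I am adding here (it assumes the plist was created exactly as described above), not a step from the original instructions:

``` bash
# Load the newly created launchd daemon so the maxfiles limit takes effect.
sudo launchctl load -w /Library/LaunchDaemons/limit.maxfiles.plist
# Verify the limit; you may need a fresh terminal session to see the new value.
ulimit -n
```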
- -[Artículo Original](https://clickhouse.tech/docs/en/development/build_osx/) diff --git a/docs/es/development/build.md b/docs/es/development/build.md deleted file mode 100644 index 42cd9b5433f..00000000000 --- a/docs/es/development/build.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 64 -toc_title: "C\xF3mo crear ClickHouse en Linux" ---- - -# Cómo construir ClickHouse para el desarrollo {#how-to-build-clickhouse-for-development} - -El siguiente tutorial se basa en el sistema Ubuntu Linux. -Con los cambios apropiados, también debería funcionar en cualquier otra distribución de Linux. -Plataformas compatibles: x86_64 y AArch64. El soporte para Power9 es experimental. - -## Instalar Git, CMake, Python y Ninja {#install-git-cmake-python-and-ninja} - -``` bash -$ sudo apt-get install git cmake python ninja-build -``` - -O cmake3 en lugar de cmake en sistemas más antiguos. - -## Instalar GCC 10 {#install-gcc-10} - -Hay varias formas de hacer esto. - -### Instalar desde un paquete PPA {#install-from-a-ppa-package} - -``` bash -$ sudo apt-get install software-properties-common -$ sudo apt-add-repository ppa:ubuntu-toolchain-r/test -$ sudo apt-get update -$ sudo apt-get install gcc-10 g++-10 -``` - -### Instalar desde fuentes {#install-from-sources} - -Mira [Sistema abierto.](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) - -## Usar GCC 10 para compilaciones {#use-gcc-10-for-builds} - -``` bash -$ export CC=gcc-10 -$ export CXX=g++-10 -``` - -## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources} - -``` bash -$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git -``` - -o - -``` bash -$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git -``` - -## Construir ClickHouse {#build-clickhouse} - -``` bash -$ cd ClickHouse -$ mkdir build -$ cd build -$ cmake .. -$ ninja -$ cd .. -``` - -Para crear un ejecutable, ejecute `ninja clickhouse`. -Esto creará el `programs/clickhouse` ejecutable, que se puede usar con `client` o `server` argumento. - -# Cómo construir ClickHouse en cualquier Linux {#how-to-build-clickhouse-on-any-linux} - -La compilación requiere los siguientes componentes: - -- Git (se usa solo para verificar las fuentes, no es necesario para la compilación) -- CMake 3.10 o más reciente -- Ninja (recomendado) o Hacer -- Compilador de C ++: gcc 10 o clang 8 o más reciente -- Enlazador: lld u oro (el clásico GNU ld no funcionará) -- Python (solo se usa dentro de la compilación LLVM y es opcional) - -Si todos los componentes están instalados, puede compilar de la misma manera que los pasos anteriores. - -Ejemplo para Ubuntu Eoan: - - sudo apt update - sudo apt install git cmake ninja-build g++ python - git clone --recursive https://github.com/ClickHouse/ClickHouse.git - mkdir build && cd build - cmake ../ClickHouse - ninja - -Ejemplo de OpenSUSE Tumbleweed: - - sudo zypper install git cmake ninja gcc-c++ python lld - git clone --recursive https://github.com/ClickHouse/ClickHouse.git - mkdir build && cd build - cmake ../ClickHouse - ninja - -Ejemplo de Fedora Rawhide: - - sudo yum update - yum --nogpg install git cmake make gcc-c++ python3 - git clone --recursive https://github.com/ClickHouse/ClickHouse.git - mkdir build && cd build - cmake ../ClickHouse - make -j $(nproc) - -# No tienes que construir ClickHouse {#you-dont-have-to-build-clickhouse} - -ClickHouse está disponible en binarios y paquetes preconstruidos. 
Los binarios son portátiles y se pueden ejecutar en cualquier tipo de Linux. - -Están diseñados para lanzamientos estables, preestablecidos y de prueba, siempre que para cada compromiso con el maestro y para cada solicitud de extracción. - -Para encontrar la construcción más fresca de `master`, ir a [se compromete página](https://github.com/ClickHouse/ClickHouse/commits/master), haga clic en la primera marca de verificación verde o cruz roja cerca de confirmar, y haga clic en “Details” enlace justo después “ClickHouse Build Check”. - -# Cómo construir el paquete Debian ClickHouse {#how-to-build-clickhouse-debian-package} - -## Instalar Git y Pbuilder {#install-git-and-pbuilder} - -``` bash -$ sudo apt-get update -$ sudo apt-get install git python pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring -``` - -## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources-1} - -``` bash -$ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git -$ cd ClickHouse -``` - -## Ejecutar secuencia de comandos de lanzamiento {#run-release-script} - -``` bash -$ ./release -``` - -[Artículo Original](https://clickhouse.tech/docs/en/development/build/) diff --git a/docs/es/development/contrib.md b/docs/es/development/contrib.md deleted file mode 100644 index 3f3013570e5..00000000000 --- a/docs/es/development/contrib.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 70 -toc_title: Bibliotecas de terceros utilizadas ---- - -# Bibliotecas de terceros utilizadas {#third-party-libraries-used} - -| Biblioteca | Licencia | -|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| -| base64 | [Licencia BSD de 2 cláusulas](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | -| impulsar | [Licencia de software Boost 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | -| Bienvenido | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | -| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | -| Cctz | [Licencia Apache 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | -| doble conversión | [Licencia de 3 cláusulas BSD](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | -| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | -| Más información | [Licencia de 3 cláusulas BSD](https://github.com/google/googletest/blob/master/LICENSE) | -| H3 | [Licencia Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) | -| hyperscan | [Licencia de 3 cláusulas BSD](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | -| libdivide | [Licencia Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | -| libgsasl | [Información adicional](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | -| libhdfs3 | [Licencia Apache 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | -| libmetrohash | [Licencia Apache 
2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | -| libpcg-al azar | [Licencia Apache 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | -| Libressl | [Licencia OpenSSL](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | -| Librdkafka | [Licencia BSD de 2 cláusulas](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | -| libwidechar_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | -| llvm | [Licencia de 3 cláusulas BSD](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | -| lz4 | [Licencia BSD de 2 cláusulas](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | -| mariadb-conector-c | [Información adicional](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | -| murmurhash | [Dominio público](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | -| pdqsort | [Licencia Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | -| Poco | [Boost Software License - Versión 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | -| protobuf | [Licencia de 3 cláusulas BSD](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | -| Re2 | [Licencia de 3 cláusulas BSD](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | -| UnixODBC | [Información adicional](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | -| Sistema abierto. | [Licencia Zlib](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | -| zstd | [Licencia de 3 cláusulas BSD](https://github.com/facebook/zstd/blob/dev/LICENSE) | diff --git a/docs/es/development/developer-instruction.md b/docs/es/development/developer-instruction.md deleted file mode 100644 index 0ce5d0b457a..00000000000 --- a/docs/es/development/developer-instruction.md +++ /dev/null @@ -1,287 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 61 -toc_title: "La instrucci\xF3n para desarrolladores de ClickHouse para principiantes" ---- - -La construcción de ClickHouse es compatible con Linux, FreeBSD y Mac OS X. - -# Si utiliza Windows {#if-you-use-windows} - -Si usa Windows, necesita crear una máquina virtual con Ubuntu. Para comenzar a trabajar con una máquina virtual, instale VirtualBox. Puede descargar Ubuntu desde el sitio web: https://www.ubuntu.com/#download. Por favor, cree una máquina virtual a partir de la imagen descargada (debe reservar al menos 4 GB de RAM para ello). Para ejecutar un terminal de línea de comandos en Ubuntu, busque un programa que contenga la palabra “terminal” en su nombre (gnome-terminal, konsole etc.) o simplemente presione Ctrl + Alt + T. - -# Si utiliza un sistema de 32 bits {#if-you-use-a-32-bit-system} - -ClickHouse no puede funcionar ni construir en un sistema de 32 bits. Debe adquirir acceso a un sistema de 64 bits y puede continuar leyendo. - -# Creación de un repositorio en GitHub {#creating-a-repository-on-github} - -Para comenzar a trabajar con el repositorio de ClickHouse, necesitará una cuenta de GitHub. - -Probablemente ya tenga uno, pero si no lo hace, regístrese en https://github.com . 
En caso de que no tenga claves SSH, debe generarlas y luego cargarlas en GitHub. Es necesario para enviar a través de sus parches. También es posible usar las mismas claves SSH que usa con cualquier otro servidor SSH, probablemente ya las tenga. - -Cree una bifurcación del repositorio ClickHouse. Para hacerlo por favor haga clic en el “fork” botón en la esquina superior derecha en https://github.com/ClickHouse/ClickHouse . Se bifurcará su propia copia de ClickHouse/ClickHouse a su cuenta. - -El proceso de desarrollo consiste en comprometer primero los cambios previstos en su bifurcación de ClickHouse y luego crear un “pull request” para que estos cambios sean aceptados en el repositorio principal (ClickHouse / ClickHouse). - -Para trabajar con repositorios git, instale `git`. - -Para hacer eso en Ubuntu, ejecutaría en la terminal de línea de comandos: - - sudo apt update - sudo apt install git - -Puede encontrar un breve manual sobre el uso de Git aquí: https://education.github.com/git-cheat-sheet-education.pdf . -Para obtener un manual detallado sobre Git, consulte https://git-scm.com/book/en/v2 . - -# Clonación de un repositorio en su máquina de desarrollo {#cloning-a-repository-to-your-development-machine} - -A continuación, debe descargar los archivos fuente en su máquina de trabajo. Esto se llama “to clone a repository” porque crea una copia local del repositorio en su máquina de trabajo. - -En el terminal de línea de comandos, ejecute: - - git clone --recursive git@github.com:your_github_username/ClickHouse.git - cd ClickHouse - -Nota: por favor, sustituye *your_github_username* con lo que es apropiado! - -Este comando creará un directorio `ClickHouse` que contiene la copia de trabajo del proyecto. - -Es importante que la ruta al directorio de trabajo no contenga espacios en blanco, ya que puede ocasionar problemas con la ejecución del sistema de compilación. - -Tenga en cuenta que el repositorio ClickHouse utiliza `submodules`. That is what the references to additional repositories are called (i.e. external libraries on which the project depends). It means that when cloning the repository you need to specify the `--recursive` como en el ejemplo anterior. Si el repositorio se ha clonado sin submódulos, para descargarlos debe ejecutar lo siguiente: - - git submodule init - git submodule update - -Puede verificar el estado con el comando: `git submodule status`. - -Si recibe el siguiente mensaje de error: - - Permission denied (publickey). - fatal: Could not read from remote repository. - - Please make sure you have the correct access rights - and the repository exists. - -Por lo general, significa que faltan las claves SSH para conectarse a GitHub. Estas teclas se encuentran normalmente en `~/.ssh`. Para que las claves SSH sean aceptadas, debe cargarlas en la sección de configuración de la interfaz de usuario de GitHub. - -También puede clonar el repositorio a través del protocolo https: - - git clone https://github.com/ClickHouse/ClickHouse.git - -Sin embargo, esto no le permitirá enviar los cambios al servidor. Aún puede usarlo temporalmente y agregar las claves SSH más tarde reemplazando la dirección remota del repositorio con `git remote` comando. 
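For example (a sketch; substitute your own GitHub user name for the `your_github_username` placeholder), switching an HTTPS clone over to SSH later could look like this:

``` bash
# Replace the remote address of an existing HTTPS clone with the SSH one,
# so that pushing works once your SSH keys are registered on GitHub.
git remote set-url origin git@github.com:your_github_username/ClickHouse.git
# Verify the new remote address.
git remote -v
```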
- -También puede agregar la dirección original del repositorio de ClickHouse a su repositorio local para extraer actualizaciones desde allí: - - git remote add upstream git@github.com:ClickHouse/ClickHouse.git - -Después de ejecutar con éxito este comando, podrá extraer actualizaciones del repositorio principal de ClickHouse ejecutando `git pull upstream master`. - -## Trabajar con submódulos {#working-with-submodules} - -Trabajar con submódulos en git podría ser doloroso. Los siguientes comandos ayudarán a administrarlo: - - # ! each command accepts --recursive - # Update remote URLs for submodules. Barely rare case - git submodule sync - # Add new submodules - git submodule init - # Update existing submodules to the current state - git submodule update - # Two last commands could be merged together - git submodule update --init - -Los siguientes comandos le ayudarían a restablecer todos los submódulos al estado inicial (!¡ADVERTENCIA! - cualquier cambio en el interior será eliminado): - - # Synchronizes submodules' remote URL with .gitmodules - git submodule sync --recursive - # Update the registered submodules with initialize not yet initialized - git submodule update --init --recursive - # Reset all changes done after HEAD - git submodule foreach git reset --hard - # Clean files from .gitignore - git submodule foreach git clean -xfd - # Repeat last 4 commands for all submodule - git submodule foreach git submodule sync --recursive - git submodule foreach git submodule update --init --recursive - git submodule foreach git submodule foreach git reset --hard - git submodule foreach git submodule foreach git clean -xfd - -# Sistema de construcción {#build-system} - -ClickHouse utiliza CMake y Ninja para la construcción. - -CMake - un sistema de meta-construcción que puede generar archivos Ninja (tareas de construcción). -Ninja: un sistema de compilación más pequeño con un enfoque en la velocidad utilizada para ejecutar esas tareas generadas por cmake. - -Para instalar en Ubuntu, Debian o Mint run `sudo apt install cmake ninja-build`. - -En CentOS, RedHat se ejecuta `sudo yum install cmake ninja-build`. - -Si usa Arch o Gentoo, probablemente lo sepa usted mismo cómo instalar CMake. - -Para instalar CMake y Ninja en Mac OS X, primero instale Homebrew y luego instale todo lo demás a través de brew: - - /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" - brew install cmake ninja - -A continuación, verifique la versión de CMake: `cmake --version`. Si está por debajo de 3.3, debe instalar una versión más reciente desde el sitio web: https://cmake.org/download/. - -# Bibliotecas externas opcionales {#optional-external-libraries} - -ClickHouse utiliza varias bibliotecas externas para la construcción. Todos ellos no necesitan ser instalados por separado, ya que se construyen junto con ClickHouse a partir de las fuentes ubicadas en los submódulos. Puede consultar la lista en `contrib`. - -# Compilador de C ++ {#c-compiler} - -Los compiladores GCC a partir de la versión 10 y Clang versión 8 o superior son compatibles para construir ClickHouse. - -Las compilaciones oficiales de Yandex actualmente usan GCC porque genera código de máquina de un rendimiento ligeramente mejor (con una diferencia de hasta varios por ciento según nuestros puntos de referencia). Y Clang es más conveniente para el desarrollo generalmente. 
Sin embargo, nuestra plataforma de integración continua (CI) ejecuta verificaciones de aproximadamente una docena de combinaciones de compilación. - -Para instalar GCC en Ubuntu, ejecute: `sudo apt install gcc g++` - -Compruebe la versión de gcc: `gcc --version`. Si está por debajo de 9, siga las instrucciones aquí: https://clickhouse.tech/docs/es/development/build/#install-gcc-10. - -La compilación de Mac OS X solo es compatible con Clang. Sólo tiene que ejecutar `brew install llvm` - -Si decide utilizar Clang, también puede instalar `libc++` y `lld` si usted sabe lo que es. Utilizar `ccache` también se recomienda. - -# El proceso de construcción {#the-building-process} - -Ahora que está listo para construir ClickHouse, le recomendamos que cree un directorio separado `build` dentro `ClickHouse` que contendrá todos los de la generación de artefactos: - - mkdir build - cd build - -Puede tener varios directorios diferentes (build_release, build_debug, etc.) para diferentes tipos de construcción. - -Mientras que dentro de la `build` directorio, configure su compilación ejecutando CMake. Antes de la primera ejecución, debe definir variables de entorno que especifiquen el compilador (compilador gcc versión 10 en este ejemplo). - -Linux: - - export CC=gcc-10 CXX=g++-10 - cmake .. - -Mac OS X: - - export CC=clang CXX=clang++ - cmake .. - -El `CC` variable especifica el compilador para C (abreviatura de C Compiler), y `CXX` variable indica qué compilador de C ++ se usará para compilar. - -Para una construcción más rápida, puede recurrir al `debug` tipo de compilación: una compilación sin optimizaciones. Para ese suministro el siguiente parámetro `-D CMAKE_BUILD_TYPE=Debug`: - - cmake -D CMAKE_BUILD_TYPE=Debug .. - -Puede cambiar el tipo de compilación ejecutando este comando en el `build` directorio. - -Ejecutar ninja para construir: - - ninja clickhouse-server clickhouse-client - -Solo los binarios requeridos se van a construir en este ejemplo. - -Si necesita construir todos los binarios (utilidades y pruebas), debe ejecutar ninja sin parámetros: - - ninja - -La compilación completa requiere aproximadamente 30 GB de espacio libre en disco o 15 GB para construir los binarios principales. - -Cuando hay una gran cantidad de RAM disponible en la máquina de compilación, debe limitar el número de tareas de compilación que se ejecutan en paralelo con `-j` parámetro: - - ninja -j 1 clickhouse-server clickhouse-client - -En máquinas con 4GB de RAM, se recomienda especificar 1, para 8GB de RAM `-j 2` se recomienda. - -Si recibe el mensaje: `ninja: error: loading 'build.ninja': No such file or directory`, significa que la generación de una configuración de compilación ha fallado y necesita inspeccionar el mensaje anterior. - -Cuando se inicie correctamente el proceso de construcción, verá el progreso de la compilación: el número de tareas procesadas y el número total de tareas. - -Al crear mensajes sobre archivos protobuf en la biblioteca libhdfs2, como `libprotobuf WARNING` puede aparecer. Afectan a nada y son seguros para ser ignorado. 
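If you want to follow the `ccache` recommendation mentioned above, one possible way to enable it is sketched below. This is an assumption on my part rather than something this document prescribes, and ClickHouse's CMake setup may also pick up `ccache` automatically when it is installed:

``` bash
# A sketch: route compiler invocations through ccache to speed up rebuilds.
sudo apt install ccache
cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ..
ninja clickhouse-server clickhouse-client
```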
- -Tras la compilación exitosa, obtienes un archivo ejecutable `ClickHouse//programs/clickhouse`: - - ls -l programs/clickhouse - -# Ejecución del ejecutable construido de ClickHouse {#running-the-built-executable-of-clickhouse} - -Para ejecutar el servidor bajo el usuario actual, debe navegar hasta `ClickHouse/programs/server/` (situado fuera de `build`) y ejecutar: - - ../../build/programs/clickhouse server - -En este caso, ClickHouse usará archivos de configuración ubicados en el directorio actual. Puede ejecutar `clickhouse server` desde cualquier directorio que especifique la ruta a un archivo de configuración como un parámetro de línea de comandos `--config-file`. - -Para conectarse a ClickHouse con clickhouse-client en otro terminal, vaya a `ClickHouse/build/programs/` y ejecutar `./clickhouse client`. - -Si usted consigue `Connection refused` mensaje en Mac OS X o FreeBSD, intente especificar la dirección de host 127.0.0.1: - - clickhouse client --host 127.0.0.1 - -Puede reemplazar la versión de producción del binario ClickHouse instalado en su sistema con su binario ClickHouse personalizado. Para ello, instale ClickHouse en su máquina siguiendo las instrucciones del sitio web oficial. A continuación, ejecute lo siguiente: - - sudo service clickhouse-server stop - sudo cp ClickHouse/build/programs/clickhouse /usr/bin/ - sudo service clickhouse-server start - -Tenga en cuenta que `clickhouse-client`, `clickhouse-server` y otros son enlaces simbólicos a los comúnmente compartidos `clickhouse` binario. - -También puede ejecutar su binario ClickHouse personalizado con el archivo de configuración del paquete ClickHouse instalado en su sistema: - - sudo service clickhouse-server stop - sudo -u clickhouse ClickHouse/build/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml - -# IDE (entorno de desarrollo integrado) {#ide-integrated-development-environment} - -Si no sabe qué IDE usar, le recomendamos que use CLion. CLion es un software comercial, pero ofrece un período de prueba gratuito de 30 días. También es gratuito para los estudiantes. CLion se puede usar tanto en Linux como en Mac OS X. - -KDevelop y QTCreator son otras excelentes alternativas de un IDE para desarrollar ClickHouse. KDevelop viene como un IDE muy útil aunque inestable. Si KDevelop se bloquea después de un tiempo al abrir el proyecto, debe hacer clic “Stop All” botón tan pronto como se ha abierto la lista de archivos del proyecto. Después de hacerlo, KDevelop debería estar bien para trabajar. - -Como editores de código simples, puede usar Sublime Text o Visual Studio Code, o Kate (todos los cuales están disponibles en Linux). - -Por si acaso, vale la pena mencionar que CLion crea `build` por sí mismo, también por sí mismo selecciona `debug` para el tipo de compilación, para la configuración usa una versión de CMake que está definida en CLion y no la instalada por usted, y finalmente, CLion usará `make` para ejecutar tareas de compilación en lugar de `ninja`. Este es un comportamiento normal, solo tenlo en cuenta para evitar confusiones. 
- -# Código de escritura {#writing-code} - -La descripción de la arquitectura ClickHouse se puede encontrar aquí: https://clickhouse.tech/docs/es/desarrollo/arquitectura/ - -La Guía de estilo de código: https://clickhouse.tech/docs/en/development/style/ - -Pruebas de escritura: https://clickhouse.tech/docs/en/development/tests/ - -Lista de tareas: https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aopen+is%3Aissue+label%3A%22easy+task%22 - -# Datos de prueba {#test-data} - -El desarrollo de ClickHouse a menudo requiere cargar conjuntos de datos realistas. Es particularmente importante para las pruebas de rendimiento. Tenemos un conjunto especialmente preparado de datos anónimos de Yandex.Métrica. Se requiere, además, unos 3 GB de espacio libre en disco. Tenga en cuenta que estos datos no son necesarios para realizar la mayoría de las tareas de desarrollo. - - sudo apt install wget xz-utils - - wget https://datasets.clickhouse.tech/hits/tsv/hits_v1.tsv.xz - wget https://datasets.clickhouse.tech/visits/tsv/visits_v1.tsv.xz - - xz -v -d hits_v1.tsv.xz - xz -v -d visits_v1.tsv.xz - - clickhouse-client - - CREATE DATABASE IF NOT EXISTS test - - CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, 
UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); - - CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), 
`TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); - - clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv - clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv - -# Creación de solicitud de extracción {#creating-pull-request} - -Navega a tu repositorio de fork en la interfaz de usuario de GitHub. Si ha estado desarrollando en una sucursal, debe seleccionar esa sucursal. Habrá un “Pull request” botón situado en la pantalla. En esencia, esto significa “create a request for accepting my changes into the main repository”. - -Se puede crear una solicitud de extracción incluso si el trabajo aún no se ha completado. En este caso, por favor ponga la palabra “WIP” (trabajo en curso) al comienzo del título, se puede cambiar más tarde. Esto es útil para la revisión cooperativa y la discusión de los cambios, así como para ejecutar todas las pruebas disponibles. Es importante que proporcione una breve descripción de sus cambios, que más tarde se utilizará para generar registros de cambios de lanzamiento. - -Las pruebas comenzarán tan pronto como los empleados de Yandex etiqueten su PR con una etiqueta “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour. - -El sistema preparará compilaciones binarias ClickHouse para su solicitud de extracción individualmente. Para recuperar estas compilaciones, haga clic en “Details” junto al link “ClickHouse build check” en la lista de cheques. Allí encontrará enlaces directos a la construcción.deb paquetes de ClickHouse que puede implementar incluso en sus servidores de producción (si no tiene miedo). - -Lo más probable es que algunas de las compilaciones fallen las primeras veces. 
Esto se debe al hecho de que verificamos las compilaciones tanto con gcc como con clang, con casi todas las advertencias existentes (siempre con el `-Werror` bandera) habilitado para sonido. En esa misma página, puede encontrar todos los registros de compilación para que no tenga que compilar ClickHouse de todas las formas posibles. diff --git a/docs/es/development/index.md b/docs/es/development/index.md deleted file mode 100644 index 6f96f9b3f02..00000000000 --- a/docs/es/development/index.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Desarrollo -toc_hidden: true -toc_priority: 58 -toc_title: oculto ---- - -# Desarrollo de ClickHouse {#clickhouse-development} - -[Artículo Original](https://clickhouse.tech/docs/en/development/) diff --git a/docs/es/development/style.md b/docs/es/development/style.md deleted file mode 100644 index ec55516fe2c..00000000000 --- a/docs/es/development/style.md +++ /dev/null @@ -1,841 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 68 -toc_title: "C\xF3mo escribir c\xF3digo C ++" ---- - -# Cómo escribir código C ++ {#how-to-write-c-code} - -## Recomendaciones generales {#general-recommendations} - -**1.** Las siguientes son recomendaciones, no requisitos. - -**2.** Si está editando código, tiene sentido seguir el formato del código existente. - -**3.** El estilo de código es necesario para la coherencia. La consistencia facilita la lectura del código y también facilita la búsqueda del código. - -**4.** Muchas de las reglas no tienen razones lógicas; están dictadas por prácticas establecidas. - -## Formatear {#formatting} - -**1.** La mayor parte del formato se realizará automáticamente por `clang-format`. - -**2.** Las sangrías son 4 espacios. Configure el entorno de desarrollo para que una pestaña agregue cuatro espacios. - -**3.** Abrir y cerrar llaves deben estar en una línea separada. - -``` cpp -inline void readBoolText(bool & x, ReadBuffer & buf) -{ - char tmp = '0'; - readChar(tmp, buf); - x = tmp != '0'; -} -``` - -**4.** Si todo el cuerpo de la función es `statement`, se puede colocar en una sola línea. Coloque espacios alrededor de llaves (además del espacio al final de la línea). - -``` cpp -inline size_t mask() const { return buf_size() - 1; } -inline size_t place(HashValue x) const { return x & mask(); } -``` - -**5.** Para funciones. No coloque espacios alrededor de los corchetes. - -``` cpp -void reinsert(const Value & x) -``` - -``` cpp -memcpy(&buf[place_value], &x, sizeof(x)); -``` - -**6.** En `if`, `for`, `while` y otras expresiones, se inserta un espacio delante del corchete de apertura (a diferencia de las llamadas a funciones). - -``` cpp -for (size_t i = 0; i < rows; i += storage.index_granularity) -``` - -**7.** Agregar espacios alrededor de los operadores binarios (`+`, `-`, `*`, `/`, `%`, …) and the ternary operator `?:`. - -``` cpp -UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); -UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); -UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); -``` - -**8.** Si se introduce un avance de línea, coloque al operador en una nueva línea y aumente la sangría antes de ella. - -``` cpp -if (elapsed_ns) - message << " (" - << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " - << bytes_read_on_server * 1000.0 / elapsed_ns << " MB/s.) 
"; -``` - -**9.** Puede utilizar espacios para la alineación dentro de una línea, si lo desea. - -``` cpp -dst.ClickLogID = click.LogID; -dst.ClickEventID = click.EventID; -dst.ClickGoodEvent = click.GoodEvent; -``` - -**10.** No use espacios alrededor de los operadores `.`, `->`. - -Si es necesario, el operador se puede envolver a la siguiente línea. En este caso, el desplazamiento frente a él aumenta. - -**11.** No utilice un espacio para separar los operadores unarios (`--`, `++`, `*`, `&`, …) from the argument. - -**12.** Pon un espacio después de una coma, pero no antes. La misma regla se aplica a un punto y coma dentro de un `for` expresion. - -**13.** No utilice espacios para separar el `[]` operador. - -**14.** En un `template <...>` expresión, use un espacio entre `template` y `<`; sin espacios después de `<` o antes `>`. - -``` cpp -template -struct AggregatedStatElement -{} -``` - -**15.** En clases y estructuras, escribe `public`, `private`, y `protected` en el mismo nivel que `class/struct`, y sangrar el resto del código. - -``` cpp -template -class MultiVersion -{ -public: - /// Version of object for usage. shared_ptr manage lifetime of version. - using Version = std::shared_ptr; - ... -} -``` - -**16.** Si el mismo `namespace` se usa para todo el archivo, y no hay nada más significativo, no es necesario un desplazamiento dentro `namespace`. - -**17.** Si el bloque para un `if`, `for`, `while`, u otra expresión consiste en una sola `statement`, las llaves son opcionales. Coloque el `statement` en una línea separada, en su lugar. Esta regla también es válida para `if`, `for`, `while`, … - -Pero si el interior `statement` contiene llaves o `else`, el bloque externo debe escribirse entre llaves. - -``` cpp -/// Finish write. -for (auto & stream : streams) - stream.second->finalize(); -``` - -**18.** No debería haber espacios al final de las líneas. - -**19.** Los archivos de origen están codificados en UTF-8. - -**20.** Los caracteres no ASCII se pueden usar en literales de cadena. - -``` cpp -<< ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit."; -``` - -**21.** No escriba varias expresiones en una sola línea. - -**22.** Agrupe secciones de código dentro de las funciones y sepárelas con no más de una línea vacía. - -**23.** Separe funciones, clases, etc. con una o dos líneas vacías. - -**24.** `A const` (relacionado con un valor) debe escribirse antes del nombre del tipo. - -``` cpp -//correct -const char * pos -const std::string & s -//incorrect -char const * pos -``` - -**25.** Al declarar un puntero o referencia, el `*` y `&` Los símbolos deben estar separados por espacios en ambos lados. - -``` cpp -//correct -const char * pos -//incorrect -const char* pos -const char *pos -``` - -**26.** Cuando utilice tipos de plantilla, alias con el `using` palabra clave (excepto en los casos más simples). - -En otras palabras, los parámetros de la plantilla se especifican solo en `using` y no se repiten en el código. - -`using` se puede declarar localmente, como dentro de una función. - -``` cpp -//correct -using FileStreams = std::map>; -FileStreams streams; -//incorrect -std::map> streams; -``` - -**27.** No declare varias variables de diferentes tipos en una instrucción. - -``` cpp -//incorrect -int x, *y; -``` - -**28.** No utilice moldes de estilo C. 
- -``` cpp -//incorrect -std::cerr << (int)c <<; std::endl; -//correct -std::cerr << static_cast(c) << std::endl; -``` - -**29.** En clases y estructuras, los miembros del grupo y las funciones por separado dentro de cada ámbito de visibilidad. - -**30.** Para clases y estructuras pequeñas, no es necesario separar la declaración del método de la implementación. - -Lo mismo es cierto para los métodos pequeños en cualquier clase o estructura. - -Para clases y estructuras con plantillas, no separe las declaraciones de métodos de la implementación (porque de lo contrario deben definirse en la misma unidad de traducción). - -**31.** Puede ajustar líneas en 140 caracteres, en lugar de 80. - -**32.** Utilice siempre los operadores de incremento / decremento de prefijo si no se requiere postfix. - -``` cpp -for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) -``` - -## Comentario {#comments} - -**1.** Asegúrese de agregar comentarios para todas las partes no triviales del código. - -Esto es muy importante. Escribir el comentario puede ayudarte a darte cuenta de que el código no es necesario o que está diseñado incorrectamente. - -``` cpp -/** Part of piece of memory, that can be used. - * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, - * then working_buffer will have size of only 10 bytes - * (working_buffer.end() will point to position right after those 10 bytes available for read). - */ -``` - -**2.** Los comentarios pueden ser tan detallados como sea necesario. - -**3.** Coloque comentarios antes del código que describen. En casos raros, los comentarios pueden aparecer después del código, en la misma línea. - -``` cpp -/** Parses and executes the query. -*/ -void executeQuery( - ReadBuffer & istr, /// Where to read the query from (and data for INSERT, if applicable) - WriteBuffer & ostr, /// Where to write the result - Context & context, /// DB, tables, data types, engines, functions, aggregate functions... - BlockInputStreamPtr & query_plan, /// Here could be written the description on how query was executed - QueryProcessingStage::Enum stage = QueryProcessingStage::Complete /// Up to which stage process the SELECT query - ) -``` - -**4.** Los comentarios deben escribirse en inglés solamente. - -**5.** Si está escribiendo una biblioteca, incluya comentarios detallados que la expliquen en el archivo de encabezado principal. - -**6.** No agregue comentarios que no proporcionen información adicional. En particular, no deje comentarios vacíos como este: - -``` cpp -/* -* Procedure Name: -* Original procedure name: -* Author: -* Date of creation: -* Dates of modification: -* Modification authors: -* Original file name: -* Purpose: -* Intent: -* Designation: -* Classes used: -* Constants: -* Local variables: -* Parameters: -* Date of creation: -* Purpose: -*/ -``` - -El ejemplo se toma prestado del recurso http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. - -**7.** No escriba comentarios de basura (autor, fecha de creación ..) al principio de cada archivo. - -**8.** Los comentarios de una sola línea comienzan con tres barras: `///` y los comentarios de varias líneas comienzan con `/**`. Estos comentarios son considerados “documentation”. - -Nota: Puede usar Doxygen para generar documentación a partir de estos comentarios. Pero Doxygen no se usa generalmente porque es más conveniente navegar por el código en el IDE. 
- -**9.** Los comentarios de varias líneas no deben tener líneas vacías al principio y al final (excepto la línea que cierra un comentario de varias líneas). - -**10.** Para comentar el código, use comentarios básicos, no “documenting” comentario. - -**11.** Elimine las partes comentadas del código antes de confirmar. - -**12.** No use blasfemias en comentarios o código. - -**13.** No use letras mayúsculas. No use puntuación excesiva. - -``` cpp -/// WHAT THE FAIL??? -``` - -**14.** No use comentarios para hacer delímetros. - -``` cpp -///****************************************************** -``` - -**15.** No comiencen las discusiones en los comentarios. - -``` cpp -/// Why did you do this stuff? -``` - -**16.** No es necesario escribir un comentario al final de un bloque que describa de qué se trataba. - -``` cpp -/// for -``` - -## Nombre {#names} - -**1.** Use letras minúsculas con guiones bajos en los nombres de variables y miembros de clase. - -``` cpp -size_t max_block_size; -``` - -**2.** Para los nombres de las funciones (métodos), use camelCase comenzando con una letra minúscula. - -``` cpp -std::string getName() const override { return "Memory"; } -``` - -**3.** Para los nombres de las clases (estructuras), use CamelCase comenzando con una letra mayúscula. Los prefijos distintos de I no se usan para interfaces. - -``` cpp -class StorageMemory : public IStorage -``` - -**4.** `using` se nombran de la misma manera que las clases, o con `_t` al final. - -**5.** Nombres de argumentos de tipo de plantilla: en casos simples, use `T`; `T`, `U`; `T1`, `T2`. - -Para casos más complejos, siga las reglas para los nombres de clase o agregue el prefijo `T`. - -``` cpp -template -struct AggregatedStatElement -``` - -**6.** Nombres de argumentos constantes de plantilla: siga las reglas para los nombres de variables o use `N` en casos simples. - -``` cpp -template -struct ExtractDomain -``` - -**7.** Para clases abstractas (interfaces) puede agregar el `I` prefijo. - -``` cpp -class IBlockInputStream -``` - -**8.** Si usa una variable localmente, puede usar el nombre corto. - -En todos los demás casos, use un nombre que describa el significado. - -``` cpp -bool info_successfully_loaded = false; -``` - -**9.** Nombres de `define`s y las constantes globales usan ALL_CAPS con guiones bajos. - -``` cpp -#define MAX_SRC_TABLE_NAMES_TO_STORE 1000 -``` - -**10.** Los nombres de archivo deben usar el mismo estilo que su contenido. - -Si un archivo contiene una sola clase, nombre el archivo de la misma manera que la clase (CamelCase). - -Si el archivo contiene una sola función, nombre el archivo de la misma manera que la función (camelCase). - -**11.** Si el nombre contiene una abreviatura, : - -- Para los nombres de variables, la abreviatura debe usar letras minúsculas `mysql_connection` (ni `mySQL_connection`). -- Para los nombres de clases y funciones, mantenga las letras mayúsculas en la abreviatura`MySQLConnection` (ni `MySqlConnection`). - -**12.** Los argumentos del constructor que se usan solo para inicializar los miembros de la clase deben nombrarse de la misma manera que los miembros de la clase, pero con un guión bajo al final. - -``` cpp -FileQueueProcessor( - const std::string & path_, - const std::string & prefix_, - std::shared_ptr handler_) - : path(path_), - prefix(prefix_), - handler(handler_), - log(&Logger::get("FileQueueProcessor")) -{ -} -``` - -El sufijo de subrayado se puede omitir si el argumento no se usa en el cuerpo del constructor. 
- -**13.** No hay diferencia en los nombres de las variables locales y los miembros de la clase (no se requieren prefijos). - -``` cpp -timer (not m_timer) -``` - -**14.** Para las constantes en un `enum`, usar CamelCase con una letra mayúscula. ALL_CAPS también es aceptable. Si el `enum` no es local, utilice un `enum class`. - -``` cpp -enum class CompressionMethod -{ - QuickLZ = 0, - LZ4 = 1, -}; -``` - -**15.** Todos los nombres deben estar en inglés. La transliteración de palabras rusas no está permitida. - - not Stroka - -**16.** Las abreviaturas son aceptables si son bien conocidas (cuando puede encontrar fácilmente el significado de la abreviatura en Wikipedia o en un motor de búsqueda). - - `AST`, `SQL`. - - Not `NVDH` (some random letters) - -Las palabras incompletas son aceptables si la versión abreviada es de uso común. - -También puede usar una abreviatura si el nombre completo se incluye junto a él en los comentarios. - -**17.** Los nombres de archivo con código fuente de C++ deben tener `.cpp` ampliación. Los archivos de encabezado deben tener `.h` ampliación. - -## Cómo escribir código {#how-to-write-code} - -**1.** Gestión de la memoria. - -Desasignación de memoria manual (`delete`) solo se puede usar en el código de la biblioteca. - -En el código de la biblioteca, el `delete` operador sólo se puede utilizar en destructores. - -En el código de la aplicación, la memoria debe ser liberada por el objeto que la posee. - -Ejemplos: - -- La forma más fácil es colocar un objeto en la pila o convertirlo en miembro de otra clase. -- Para una gran cantidad de objetos pequeños, use contenedores. -- Para la desasignación automática de un pequeño número de objetos que residen en el montón, use `shared_ptr/unique_ptr`. - -**2.** Gestión de recursos. - -Utilizar `RAII` y ver arriba. - -**3.** Manejo de errores. - -Utilice excepciones. En la mayoría de los casos, solo necesita lanzar una excepción y no necesita atraparla (debido a `RAII`). - -En las aplicaciones de procesamiento de datos fuera de línea, a menudo es aceptable no detectar excepciones. - -En los servidores que manejan las solicitudes de los usuarios, generalmente es suficiente detectar excepciones en el nivel superior del controlador de conexión. - -En las funciones de subproceso, debe capturar y mantener todas las excepciones para volver a lanzarlas en el subproceso principal después `join`. - -``` cpp -/// If there weren't any calculations yet, calculate the first block synchronously -if (!started) -{ - calculate(); - started = true; -} -else /// If calculations are already in progress, wait for the result - pool.wait(); - -if (exception) - exception->rethrow(); -``` - -Nunca oculte excepciones sin manejo. Nunca simplemente ponga ciegamente todas las excepciones para iniciar sesión. - -``` cpp -//Not correct -catch (...) {} -``` - -Si necesita ignorar algunas excepciones, hágalo solo para las específicas y vuelva a lanzar el resto. - -``` cpp -catch (const DB::Exception & e) -{ - if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) - return nullptr; - else - throw; -} -``` - -Al usar funciones con códigos de respuesta o `errno`, siempre verifique el resultado y arroje una excepción en caso de error. - -``` cpp -if (0 != close(fd)) - throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); -``` - -`Do not use assert`. - -**4.** Tipos de excepción. - -No es necesario utilizar una jerarquía de excepciones compleja en el código de la aplicación. 
El texto de excepción debe ser comprensible para un administrador del sistema. - -**5.** Lanzar excepciones de destructores. - -Esto no es recomendable, pero está permitido. - -Utilice las siguientes opciones: - -- Crear una función (`done()` o `finalize()`) que hará todo el trabajo de antemano que podría conducir a una excepción. Si se llamó a esa función, no debería haber excepciones en el destructor más adelante. -- Las tareas que son demasiado complejas (como enviar mensajes a través de la red) se pueden poner en un método separado al que el usuario de la clase tendrá que llamar antes de la destrucción. -- Si hay una excepción en el destructor, es mejor registrarla que ocultarla (si el registrador está disponible). -- En aplicaciones simples, es aceptable confiar en `std::terminate` (para los casos de `noexcept` de forma predeterminada en C ++ 11) para manejar excepciones. - -**6.** Bloques de código anónimos. - -Puede crear un bloque de código separado dentro de una sola función para hacer que ciertas variables sean locales, de modo que se llame a los destructores al salir del bloque. - -``` cpp -Block block = data.in->read(); - -{ - std::lock_guard lock(mutex); - data.ready = true; - data.block = block; -} - -ready_any.set(); -``` - -**7.** Multithreading. - -En programas de procesamiento de datos fuera de línea: - -- Trate de obtener el mejor rendimiento posible en un solo núcleo de CPU. A continuación, puede paralelizar su código si es necesario. - -En aplicaciones de servidor: - -- Utilice el grupo de subprocesos para procesar solicitudes. En este punto, no hemos tenido ninguna tarea que requiera el cambio de contexto de espacio de usuario. - -La horquilla no se usa para la paralelización. - -**8.** Sincronización de hilos. - -A menudo es posible hacer que diferentes hilos usen diferentes celdas de memoria (incluso mejor: diferentes líneas de caché) y no usar ninguna sincronización de hilos (excepto `joinAll`). - -Si se requiere sincronización, en la mayoría de los casos, es suficiente usar mutex bajo `lock_guard`. - -En otros casos, use primitivas de sincronización del sistema. No utilice la espera ocupada. - -Las operaciones atómicas deben usarse solo en los casos más simples. - -No intente implementar estructuras de datos sin bloqueo a menos que sea su principal área de especialización. - -**9.** Punteros vs referencias. - -En la mayoría de los casos, prefiera referencias. - -**10.** Construir. - -Usar referencias constantes, punteros a constantes, `const_iterator`, y métodos const. - -Considerar `const` para ser predeterminado y usar no-`const` sólo cuando sea necesario. - -Al pasar variables por valor, usando `const` por lo general no tiene sentido. - -**11.** sin firmar. - -Utilizar `unsigned` si es necesario. - -**12.** Tipos numéricos. - -Utilice los tipos `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, y `Int64`, así como `size_t`, `ssize_t`, y `ptrdiff_t`. - -No use estos tipos para números: `signed/unsigned long`, `long long`, `short`, `signed/unsigned char`, `char`. - -**13.** Pasando argumentos. - -Pasar valores complejos por referencia (incluyendo `std::string`). - -Si una función captura la propiedad de un objeto creado en el montón, cree el tipo de argumento `shared_ptr` o `unique_ptr`. - -**14.** Valores devueltos. - -En la mayoría de los casos, sólo tiene que utilizar `return`. No escribir `return std::move(res)`. - -Si la función asigna un objeto en el montón y lo devuelve, use `shared_ptr` o `unique_ptr`. 
- -En casos excepcionales, es posible que deba devolver el valor a través de un argumento. En este caso, el argumento debe ser una referencia. - -``` cpp -using AggregateFunctionPtr = std::shared_ptr; - -/** Allows creating an aggregate function by its name. - */ -class AggregateFunctionFactory -{ -public: - AggregateFunctionFactory(); - AggregateFunctionPtr get(const String & name, const DataTypes & argument_types) const; -``` - -**15.** espacio de nombres. - -No hay necesidad de usar un `namespace` para el código de aplicación. - -Las bibliotecas pequeñas tampoco necesitan esto. - -Para bibliotecas medianas a grandes, coloque todo en un `namespace`. - -En la biblioteca `.h` archivo, se puede utilizar `namespace detail` para ocultar los detalles de implementación no necesarios para el código de la aplicación. - -En un `.cpp` archivo, puede usar un `static` o espacio de nombres anónimo para ocultar símbolos. - -Además, un `namespace` puede ser utilizado para un `enum` para evitar que los nombres correspondientes caigan en un `namespace` (pero es mejor usar un `enum class`). - -**16.** Inicialización diferida. - -Si se requieren argumentos para la inicialización, normalmente no debe escribir un constructor predeterminado. - -Si más adelante tendrá que retrasar la inicialización, puede agregar un constructor predeterminado que creará un objeto no válido. O, para un pequeño número de objetos, puede usar `shared_ptr/unique_ptr`. - -``` cpp -Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); - -/// For deferred initialization -Loader() {} -``` - -**17.** Funciones virtuales. - -Si la clase no está destinada para uso polimórfico, no necesita hacer que las funciones sean virtuales. Esto también se aplica al destructor. - -**18.** Codificación. - -Usa UTF-8 en todas partes. Utilizar `std::string`y`char *`. No use `std::wstring`y`wchar_t`. - -**19.** Tala. - -Vea los ejemplos en todas partes del código. - -Antes de confirmar, elimine todo el registro de depuración y sin sentido, y cualquier otro tipo de salida de depuración. - -Se debe evitar el registro en ciclos, incluso en el nivel Trace. - -Los registros deben ser legibles en cualquier nivel de registro. - -El registro solo debe usarse en el código de la aplicación, en su mayor parte. - -Los mensajes de registro deben estar escritos en inglés. - -El registro debe ser preferiblemente comprensible para el administrador del sistema. - -No use blasfemias en el registro. - -Utilice la codificación UTF-8 en el registro. En casos excepcionales, puede usar caracteres que no sean ASCII en el registro. - -**20.** Entrada-salida. - -No utilice `iostreams` en ciclos internos que son críticos para el rendimiento de la aplicación (y nunca usan `stringstream`). - -Utilice el `DB/IO` biblioteca en su lugar. - -**21.** Fecha y hora. - -Ver el `DateLUT` biblioteca. - -**22.** incluir. - -Utilice siempre `#pragma once` en lugar de incluir guardias. - -**23.** utilizar. - -`using namespace` no se utiliza. Usted puede utilizar `using` con algo específico. Pero hazlo local dentro de una clase o función. - -**24.** No use `trailing return type` para funciones a menos que sea necesario. - -``` cpp -auto f() -> void -``` - -**25.** Declaración e inicialización de variables. 
- -``` cpp -//right way -std::string s = "Hello"; -std::string s{"Hello"}; - -//wrong way -auto s = std::string{"Hello"}; -``` - -**26.** Para funciones virtuales, escriba `virtual` en la clase base, pero escribe `override` en lugar de `virtual` en las clases descendientes. - -## Características no utilizadas de C ++ {#unused-features-of-c} - -**1.** La herencia virtual no se utiliza. - -**2.** Los especificadores de excepción de C ++ 03 no se usan. - -## Plataforma {#platform} - -**1.** Escribimos código para una plataforma específica. - -Pero en igualdad de condiciones, se prefiere el código multiplataforma o portátil. - -**2.** Idioma: C++20. - -**3.** Compilación: `gcc`. En este momento (agosto 2020), el código se compila utilizando la versión 9.3. (También se puede compilar usando `clang 8`.) - -Se utiliza la biblioteca estándar (`libc++`). - -**4.**OS: Linux Ubuntu, no más viejo que Precise. - -**5.**El código está escrito para la arquitectura de CPU x86_64. - -El conjunto de instrucciones de CPU es el conjunto mínimo admitido entre nuestros servidores. Actualmente, es SSE 4.2. - -**6.** Utilizar `-Wall -Wextra -Werror` flags de compilación. - -**7.** Use enlaces estáticos con todas las bibliotecas, excepto aquellas a las que son difíciles de conectar estáticamente (consulte la salida de la `ldd` comando). - -**8.** El código se desarrolla y se depura con la configuración de la versión. - -## Herramienta {#tools} - -**1.** KDevelop es un buen IDE. - -**2.** Para la depuración, use `gdb`, `valgrind` (`memcheck`), `strace`, `-fsanitize=...`, o `tcmalloc_minimal_debug`. - -**3.** Para crear perfiles, use `Linux Perf`, `valgrind` (`callgrind`), o `strace -cf`. - -**4.** Las fuentes están en Git. - -**5.** Usos de ensamblaje `CMake`. - -**6.** Los programas se lanzan usando `deb` paquete. - -**7.** Los compromisos a dominar no deben romper la compilación. - -Aunque solo las revisiones seleccionadas se consideran viables. - -**8.** Realice confirmaciones tan a menudo como sea posible, incluso si el código está parcialmente listo. - -Use ramas para este propósito. - -Si su código en el `master` branch todavía no se puede construir, excluirlo de la compilación antes de la `push`. Tendrá que terminarlo o eliminarlo dentro de unos días. - -**9.** Para cambios no triviales, use ramas y publíquelas en el servidor. - -**10.** El código no utilizado se elimina del repositorio. - -## Biblioteca {#libraries} - -**1.** Se utiliza la biblioteca estándar de C++20 (se permiten extensiones experimentales), así como `boost` y `Poco` marco. - -**2.** Si es necesario, puede usar cualquier biblioteca conocida disponible en el paquete del sistema operativo. - -Si ya hay una buena solución disponible, úsela, incluso si eso significa que debe instalar otra biblioteca. - -(Pero prepárese para eliminar las bibliotecas incorrectas del código.) - -**3.** Puede instalar una biblioteca que no esté en los paquetes, si los paquetes no tienen lo que necesita o tienen una versión obsoleta o el tipo de compilación incorrecto. - -**4.** Si la biblioteca es pequeña y no tiene su propio sistema de compilación complejo, coloque los archivos `contrib` carpeta. - -**5.** Siempre se da preferencia a las bibliotecas que ya están en uso. - -## Recomendaciones generales {#general-recommendations-1} - -**1.** Escribe el menor código posible. - -**2.** Pruebe la solución más simple. - -**3.** No escriba código hasta que sepa cómo va a funcionar y cómo funcionará el bucle interno. 
- -**4.** En los casos más simples, use `using` en lugar de clases o estructuras. - -**5.** Si es posible, no escriba constructores de copia, operadores de asignación, destructores (que no sean virtuales, si la clase contiene al menos una función virtual), mueva constructores o mueva operadores de asignación. En otras palabras, las funciones generadas por el compilador deben funcionar correctamente. Usted puede utilizar `default`. - -**6.** Se fomenta la simplificación del código. Reduzca el tamaño de su código siempre que sea posible. - -## Recomendaciones adicionales {#additional-recommendations} - -**1.** Especificar explícitamente `std::` para tipos de `stddef.h` - -no se recomienda. En otras palabras, recomendamos escribir `size_t` en su lugar `std::size_t` porque es más corto. - -Es aceptable agregar `std::`. - -**2.** Especificar explícitamente `std::` para funciones de la biblioteca C estándar - -no se recomienda. En otras palabras, escribir `memcpy` en lugar de `std::memcpy`. - -La razón es que hay funciones no estándar similares, tales como `memmem`. Utilizamos estas funciones en ocasiones. Estas funciones no existen en `namespace std`. - -Si usted escribe `std::memcpy` en lugar de `memcpy` en todas partes, entonces `memmem` sin `std::` se verá extraño. - -Sin embargo, todavía puedes usar `std::` si lo prefieres. - -**3.** Usar funciones de C cuando las mismas están disponibles en la biblioteca estándar de C ++. - -Esto es aceptable si es más eficiente. - -Por ejemplo, use `memcpy` en lugar de `std::copy` para copiar grandes trozos de memoria. - -**4.** Argumentos de función multilínea. - -Se permite cualquiera de los siguientes estilos de ajuste: - -``` cpp -function( - T1 x1, - T2 x2) -``` - -``` cpp -function( - size_t left, size_t right, - const & RangesInDataParts ranges, - size_t limit) -``` - -``` cpp -function(size_t left, size_t right, - const & RangesInDataParts ranges, - size_t limit) -``` - -``` cpp -function(size_t left, size_t right, - const & RangesInDataParts ranges, - size_t limit) -``` - -``` cpp -function( - size_t left, - size_t right, - const & RangesInDataParts ranges, - size_t limit) -``` - -[Artículo Original](https://clickhouse.tech/docs/en/development/style/) diff --git a/docs/es/development/tests.md b/docs/es/development/tests.md deleted file mode 120000 index c03d36c3916..00000000000 --- a/docs/es/development/tests.md +++ /dev/null @@ -1 +0,0 @@ -../../en/development/tests.md \ No newline at end of file diff --git a/docs/es/engines/database-engines/atomic.md b/docs/es/engines/database-engines/atomic.md deleted file mode 100644 index f019b94a00b..00000000000 --- a/docs/es/engines/database-engines/atomic.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -toc_priority: 32 -toc_title: Atomic ---- - - -# Atomic {#atomic} - -It is supports non-blocking `DROP` and `RENAME TABLE` queries and atomic `EXCHANGE TABLES t1 AND t2` queries. Atomic database engine is used by default. 
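As a minimal sketch of the atomic swap mentioned above — the table names `t1` and `t2` are hypothetical and are assumed to already exist inside an `Atomic` database:

```sql
-- Swap the two tables in a single atomic step; readers never observe a missing table.
EXCHANGE TABLES t1 AND t2;
```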
- -## Creating a Database {#creating-a-database} - -```sql -CREATE DATABASE test ENGINE = Atomic; -``` - -[Original article](https://clickhouse.tech/docs/en/engines/database_engines/atomic/) diff --git a/docs/es/engines/database-engines/index.md b/docs/es/engines/database-engines/index.md deleted file mode 100644 index 8784b9bd02b..00000000000 --- a/docs/es/engines/database-engines/index.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Motores de base de datos -toc_priority: 27 -toc_title: "Implantaci\xF3n" ---- - -# Motores de base de datos {#database-engines} - -Los motores de bases de datos le permiten trabajar con tablas. - -De forma predeterminada, ClickHouse utiliza su motor de base de datos nativa, que proporciona [motores de mesa](../../engines/table-engines/index.md) y una [Dialecto SQL](../../sql-reference/syntax.md). - -También puede utilizar los siguientes motores de base de datos: - -- [MySQL](mysql.md) - -- [Perezoso](lazy.md) - -[Artículo Original](https://clickhouse.tech/docs/en/database_engines/) diff --git a/docs/es/engines/database-engines/lazy.md b/docs/es/engines/database-engines/lazy.md deleted file mode 100644 index 0988c4cb395..00000000000 --- a/docs/es/engines/database-engines/lazy.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 31 -toc_title: Perezoso ---- - -# Perezoso {#lazy} - -Mantiene las tablas en RAM solamente `expiration_time_in_seconds` segundos después del último acceso. Solo se puede usar con tablas \*Log. - -Está optimizado para almacenar muchas tablas pequeñas \* Log, para las cuales hay un largo intervalo de tiempo entre los accesos. - -## Creación de una base de datos {#creating-a-database} - - CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); - -[Artículo Original](https://clickhouse.tech/docs/en/database_engines/lazy/) diff --git a/docs/es/engines/database-engines/mysql.md b/docs/es/engines/database-engines/mysql.md deleted file mode 100644 index 5f1dec97f35..00000000000 --- a/docs/es/engines/database-engines/mysql.md +++ /dev/null @@ -1,135 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 30 -toc_title: MySQL ---- - -# MySQL {#mysql} - -Permite conectarse a bases de datos en un servidor MySQL remoto y realizar `INSERT` y `SELECT` consultas para intercambiar datos entre ClickHouse y MySQL. - -El `MySQL` motor de base de datos traducir consultas al servidor MySQL para que pueda realizar operaciones tales como `SHOW TABLES` o `SHOW CREATE TABLE`. - -No puede realizar las siguientes consultas: - -- `RENAME` -- `CREATE TABLE` -- `ALTER` - -## Creación de una base de datos {#creating-a-database} - -``` sql -CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] -ENGINE = MySQL('host:port', ['database' | database], 'user', 'password') -``` - -**Parámetros del motor** - -- `host:port` — MySQL server address. -- `database` — Remote database name. -- `user` — MySQL user. -- `password` — User password. 
- -## Soporte de tipos de datos {#data_types-support} - -| MySQL | Haga clic en Casa | -|----------------------------------|--------------------------------------------------------------| -| UNSIGNED TINYINT | [UInt8](../../sql-reference/data-types/int-uint.md) | -| TINYINT | [Int8](../../sql-reference/data-types/int-uint.md) | -| UNSIGNED SMALLINT | [UInt16](../../sql-reference/data-types/int-uint.md) | -| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | -| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../../sql-reference/data-types/int-uint.md) | -| INT, MEDIUMINT | [Int32](../../sql-reference/data-types/int-uint.md) | -| UNSIGNED BIGINT | [UInt64](../../sql-reference/data-types/int-uint.md) | -| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | -| FLOAT | [Float32](../../sql-reference/data-types/float.md) | -| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | -| DATE | [Fecha](../../sql-reference/data-types/date.md) | -| DATETIME, TIMESTAMP | [FechaHora](../../sql-reference/data-types/datetime.md) | -| BINARY | [Cadena fija](../../sql-reference/data-types/fixedstring.md) | - -Todos los demás tipos de datos MySQL se convierten en [Cadena](../../sql-reference/data-types/string.md). - -[NULL](../../sql-reference/data-types/nullable.md) se admite. - -## Ejemplos de uso {#examples-of-use} - -Tabla en MySQL: - -``` text -mysql> USE test; -Database changed - -mysql> CREATE TABLE `mysql_table` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `float` FLOAT NOT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from mysql_table; -+------+-----+ -| int_id | value | -+------+-----+ -| 1 | 2 | -+------+-----+ -1 row in set (0,00 sec) -``` - -Base de datos en ClickHouse, intercambiando datos con el servidor MySQL: - -``` sql -CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') -``` - -``` sql -SHOW DATABASES -``` - -``` text -┌─name─────┐ -│ default │ -│ mysql_db │ -│ system │ -└──────────┘ -``` - -``` sql -SHOW TABLES FROM mysql_db -``` - -``` text -┌─name─────────┐ -│ mysql_table │ -└──────────────┘ -``` - -``` sql -SELECT * FROM mysql_db.mysql_table -``` - -``` text -┌─int_id─┬─value─┐ -│ 1 │ 2 │ -└────────┴───────┘ -``` - -``` sql -INSERT INTO mysql_db.mysql_table VALUES (3,4) -``` - -``` sql -SELECT * FROM mysql_db.mysql_table -``` - -``` text -┌─int_id─┬─value─┐ -│ 1 │ 2 │ -│ 3 │ 4 │ -└────────┴───────┘ -``` - -[Artículo Original](https://clickhouse.tech/docs/en/database_engines/mysql/) diff --git a/docs/es/engines/index.md b/docs/es/engines/index.md deleted file mode 100644 index 03e4426dd8d..00000000000 --- a/docs/es/engines/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Motor -toc_priority: 25 ---- - - diff --git a/docs/es/engines/table-engines/index.md b/docs/es/engines/table-engines/index.md deleted file mode 100644 index 7be315e3ee3..00000000000 --- a/docs/es/engines/table-engines/index.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Motores de mesa -toc_priority: 26 -toc_title: "Implantaci\xF3n" ---- - -# Motores de mesa {#table_engines} - -El motor de tabla (tipo de tabla) determina: - -- Cómo y dónde se almacenan los datos, dónde escribirlos y dónde 
leerlos. -- Qué consultas son compatibles y cómo. -- Acceso a datos simultáneos. -- Uso de índices, si está presente. -- Si es posible la ejecución de solicitudes multiproceso. -- Parámetros de replicación de datos. - -## Familias de motores {#engine-families} - -### Método de codificación de datos: {#mergetree} - -Los motores de mesa más universales y funcionales para tareas de alta carga. La propiedad compartida por estos motores es la inserción rápida de datos con el posterior procesamiento de datos en segundo plano. `MergeTree` Los motores familiares admiten la replicación de datos (con [Replicado\*](mergetree-family/replication.md#table_engines-replication) versiones de motores), particionamiento y otras características no admitidas en otros motores. - -Motores en la familia: - -- [Método de codificación de datos:](mergetree-family/mergetree.md#mergetree) -- [ReplacingMergeTree](mergetree-family/replacingmergetree.md#replacingmergetree) -- [SummingMergeTree](mergetree-family/summingmergetree.md#summingmergetree) -- [AgregaciónMergeTree](mergetree-family/aggregatingmergetree.md#aggregatingmergetree) -- [ColapsarMergeTree](mergetree-family/collapsingmergetree.md#table_engine-collapsingmergetree) -- [VersionedCollapsingMergeTree](mergetree-family/versionedcollapsingmergetree.md#versionedcollapsingmergetree) -- [GraphiteMergeTree](mergetree-family/graphitemergetree.md#graphitemergetree) - -### Registro {#log} - -Ligero [motor](log-family/index.md) con funcionalidad mínima. Son los más efectivos cuando necesita escribir rápidamente muchas tablas pequeñas (hasta aproximadamente 1 millón de filas) y leerlas más tarde como un todo. - -Motores en la familia: - -- [TinyLog](log-family/tinylog.md#tinylog) -- [StripeLog](log-family/stripelog.md#stripelog) -- [Registro](log-family/log.md#log) - -### Motores de integración {#integration-engines} - -Motores para comunicarse con otros sistemas de almacenamiento y procesamiento de datos. - -Motores en la familia: - -- [Kafka](integrations/kafka.md#kafka) -- [MySQL](integrations/mysql.md#mysql) -- [ODBC](integrations/odbc.md#table-engine-odbc) -- [JDBC](integrations/jdbc.md#table-engine-jdbc) -- [HDFS](integrations/hdfs.md#hdfs) - -### Motores especiales {#special-engines} - -Motores en la familia: - -- [Distribuido](special/distributed.md#distributed) -- [Método de codificación de datos:](special/materializedview.md#materializedview) -- [Diccionario](special/dictionary.md#dictionary) -- \[Fusión\](special/merge.md#merge -- [File](special/file.md#file) -- [Nulo](special/null.md#null) -- [Establecer](special/set.md#set) -- [Unir](special/join.md#join) -- [URL](special/url.md#table_engines-url) -- [Vista](special/view.md#table_engines-view) -- [Memoria](special/memory.md#memory) -- [Búfer](special/buffer.md#buffer) - -## Virtual Columnas {#table_engines-virtual_columns} - -La columna virtual es un atributo de motor de tabla integral que se define en el código fuente del motor. - -No debe especificar columnas virtuales en el `CREATE TABLE` consulta y no puedes verlos en `SHOW CREATE TABLE` y `DESCRIBE TABLE` resultados de la consulta. Las columnas virtuales también son de solo lectura, por lo que no puede insertar datos en columnas virtuales. - -Para seleccionar datos de una columna virtual, debe especificar su nombre en el `SELECT` consulta. `SELECT *` no devuelve valores de columnas virtuales. - -Si crea una tabla con una columna que tiene el mismo nombre que una de las columnas virtuales de la tabla, la columna virtual se vuelve inaccesible. 
No recomendamos hacer esto. Para ayudar a evitar conflictos, los nombres de columna virtual suelen tener el prefijo de un guión bajo. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/) diff --git a/docs/es/engines/table-engines/integrations/hdfs.md b/docs/es/engines/table-engines/integrations/hdfs.md deleted file mode 100644 index 5e0211660f5..00000000000 --- a/docs/es/engines/table-engines/integrations/hdfs.md +++ /dev/null @@ -1,123 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: HDFS ---- - -# HDFS {#table_engines-hdfs} - -Este motor proporciona integración con [Acerca de nosotros](https://en.wikipedia.org/wiki/Apache_Hadoop) permitiendo gestionar datos sobre [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)a través de ClickHouse. Este motor es similar -a la [File](../special/file.md#table_engines-file) y [URL](../special/url.md#table_engines-url) motores, pero proporciona características específicas de Hadoop. - -## Uso {#usage} - -``` sql -ENGINE = HDFS(URI, format) -``` - -El `URI` El parámetro es el URI del archivo completo en HDFS. -El `format` parámetro especifica uno de los formatos de archivo disponibles. Realizar -`SELECT` consultas, el formato debe ser compatible para la entrada, y para realizar -`INSERT` queries – for output. The available formats are listed in the -[Formato](../../../interfaces/formats.md#formats) apartado. -La parte de la ruta de `URI` puede contener globs. En este caso, la tabla sería de solo lectura. - -**Ejemplo:** - -**1.** Configurar el `hdfs_engine_table` tabla: - -``` sql -CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') -``` - -**2.** Llenar archivo: - -``` sql -INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) -``` - -**3.** Consultar los datos: - -``` sql -SELECT * FROM hdfs_engine_table LIMIT 2 -``` - -``` text -┌─name─┬─value─┐ -│ one │ 1 │ -│ two │ 2 │ -└──────┴───────┘ -``` - -## Detalles de implementación {#implementation-details} - -- Las lecturas y escrituras pueden ser paralelas -- No soportado: - - `ALTER` y `SELECT...SAMPLE` operación. - - Índices. - - Replicación. - -**Globs en el camino** - -Múltiples componentes de ruta de acceso pueden tener globs. Para ser procesado, el archivo debe existir y coincidir con todo el patrón de ruta. Listado de archivos determina durante `SELECT` (no en `CREATE` momento). - -- `*` — Substitutes any number of any characters except `/` incluyendo cadena vacía. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. - -Construcciones con `{}` son similares a la [remoto](../../../sql-reference/table-functions/remote.md) función de la tabla. - -**Ejemplo** - -1. Supongamos que tenemos varios archivos en formato TSV con los siguientes URI en HDFS: - -- ‘hdfs://hdfs1:9000/some_dir/some_file_1’ -- ‘hdfs://hdfs1:9000/some_dir/some_file_2’ -- ‘hdfs://hdfs1:9000/some_dir/some_file_3’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_1’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_2’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_3’ - -1. 
Hay varias maneras de hacer una tabla que consta de los seis archivos: - - - -``` sql -CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') -``` - -Otra forma: - -``` sql -CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') -``` - -La tabla consta de todos los archivos en ambos directorios (todos los archivos deben satisfacer el formato y el esquema descritos en la consulta): - -``` sql -CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') -``` - -!!! warning "Advertencia" - Si la lista de archivos contiene rangos de números con ceros a la izquierda, use la construcción con llaves para cada dígito por separado o use `?`. - -**Ejemplo** - -Crear tabla con archivos llamados `file000`, `file001`, … , `file999`: - -``` sql -CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') -``` - -## Virtual Columnas {#virtual-columns} - -- `_path` — Path to the file. -- `_file` — Name of the file. - -**Ver también** - -- [Virtual columnas](../index.md#table_engines-virtual_columns) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/hdfs/) diff --git a/docs/es/engines/table-engines/integrations/index.md b/docs/es/engines/table-engines/integrations/index.md deleted file mode 100644 index e57aaf88744..00000000000 --- a/docs/es/engines/table-engines/integrations/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Integraci\xF3n" -toc_priority: 30 ---- - - diff --git a/docs/es/engines/table-engines/integrations/jdbc.md b/docs/es/engines/table-engines/integrations/jdbc.md deleted file mode 100644 index fd3450cef7c..00000000000 --- a/docs/es/engines/table-engines/integrations/jdbc.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 34 -toc_title: JDBC ---- - -# JDBC {#table-engine-jdbc} - -Permite que ClickHouse se conecte a bases de datos externas a través de [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity). - -Para implementar la conexión JDBC, ClickHouse utiliza el programa independiente [Sistema abierto.](https://github.com/alex-krash/clickhouse-jdbc-bridge) que debería ejecutarse como un demonio. - -Este motor soporta el [NULL](../../../sql-reference/data-types/nullable.md) tipo de datos. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name -( - columns list... -) -ENGINE = JDBC(dbms_uri, external_database, external_table) -``` - -**Parámetros del motor** - -- `dbms_uri` — URI of an external DBMS. - - Formato: `jdbc:://:/?user=&password=`. - Ejemplo para MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. - -- `external_database` — Database in an external DBMS. - -- `external_table` — Name of the table in `external_database`. 
- -## Ejemplo de uso {#usage-example} - -Crear una tabla en el servidor MySQL conectándose directamente con su cliente de consola: - -``` text -mysql> CREATE TABLE `test`.`test` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, - -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from test; -+------+----------+-----+----------+ -| int_id | int_nullable | float | float_nullable | -+------+----------+-----+----------+ -| 1 | NULL | 2 | NULL | -+------+----------+-----+----------+ -1 row in set (0,00 sec) -``` - -Creación de una tabla en el servidor ClickHouse y selección de datos de ella: - -``` sql -CREATE TABLE jdbc_table -( - `int_id` Int32, - `int_nullable` Nullable(Int32), - `float` Float32, - `float_nullable` Nullable(Float32) -) -ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') -``` - -``` sql -SELECT * -FROM jdbc_table -``` - -``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ -└────────┴──────────────┴───────┴────────────────┘ -``` - -## Ver también {#see-also} - -- [Función de la tabla de JDBC](../../../sql-reference/table-functions/jdbc.md). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/jdbc/) diff --git a/docs/es/engines/table-engines/integrations/kafka.md b/docs/es/engines/table-engines/integrations/kafka.md deleted file mode 100644 index 54250aae82a..00000000000 --- a/docs/es/engines/table-engines/integrations/kafka.md +++ /dev/null @@ -1,180 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 32 -toc_title: Kafka ---- - -# Kafka {#kafka} - -Este motor funciona con [Acerca de nosotros](http://kafka.apache.org/). - -Kafka te permite: - -- Publicar o suscribirse a flujos de datos. -- Organice el almacenamiento tolerante a fallos. -- Secuencias de proceso a medida que estén disponibles. - -## Creación de una tabla {#table_engine-kafka-creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = Kafka() -SETTINGS - kafka_broker_list = 'host:port', - kafka_topic_list = 'topic1,topic2,...', - kafka_group_name = 'group_name', - kafka_format = 'data_format'[,] - [kafka_row_delimiter = 'delimiter_symbol',] - [kafka_schema = '',] - [kafka_num_consumers = N,] - [kafka_max_block_size = 0,] - [kafka_skip_broken_messages = N,] - [kafka_commit_every_batch = 0] -``` - -Parámetros requeridos: - -- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`). -- `kafka_topic_list` – A list of Kafka topics. -- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere. -- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` función, tal como `JSONEachRow`. Para obtener más información, consulte [Formato](../../../interfaces/formats.md) apartado. - -Parámetros opcionales: - -- `kafka_row_delimiter` – Delimiter character, which ends the message. -- `kafka_schema` – Parameter that must be used if the format requires a schema definition. 
For example, [Cap'n Proto](https://capnproto.org/) requiere la ruta de acceso al archivo de esquema y el nombre de la raíz `schema.capnp:Message` objeto. -- `kafka_num_consumers` – The number of consumers per table. Default: `1`. Especifique más consumidores si el rendimiento de un consumidor es insuficiente. El número total de consumidores no debe exceder el número de particiones en el tema, ya que solo se puede asignar un consumidor por partición. -- `kafka_max_block_size` - El tamaño máximo de lote (en mensajes) para la encuesta (predeterminado: `max_block_size`). -- `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. Si `kafka_skip_broken_messages = N` entonces el motor salta *N* Mensajes de Kafka que no se pueden analizar (un mensaje es igual a una fila de datos). -- `kafka_commit_every_batch` - Confirmar cada lote consumido y manejado en lugar de una única confirmación después de escribir un bloque completo (predeterminado: `0`). - -Ejemplos: - -``` sql - CREATE TABLE queue ( - timestamp UInt64, - level String, - message String - ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow'); - - SELECT * FROM queue LIMIT 5; - - CREATE TABLE queue2 ( - timestamp UInt64, - level String, - message String - ) ENGINE = Kafka SETTINGS kafka_broker_list = 'localhost:9092', - kafka_topic_list = 'topic', - kafka_group_name = 'group1', - kafka_format = 'JSONEachRow', - kafka_num_consumers = 4; - - CREATE TABLE queue2 ( - timestamp UInt64, - level String, - message String - ) ENGINE = Kafka('localhost:9092', 'topic', 'group1') - SETTINGS kafka_format = 'JSONEachRow', - kafka_num_consumers = 4; -``` - -
- -Método obsoleto para crear una tabla - -!!! attention "Atención" - No utilice este método en nuevos proyectos. Si es posible, cambie los proyectos antiguos al método descrito anteriormente. - -``` sql -Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format - [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages]) -``` - -
- -## Descripci {#description} - -Los mensajes entregados se realizan un seguimiento automático, por lo que cada mensaje de un grupo solo se cuenta una vez. Si desea obtener los datos dos veces, cree una copia de la tabla con otro nombre de grupo. - -Los grupos son flexibles y se sincronizan en el clúster. Por ejemplo, si tiene 10 temas y 5 copias de una tabla en un clúster, cada copia obtiene 2 temas. Si el número de copias cambia, los temas se redistribuyen automáticamente entre las copias. Lea más sobre esto en http://kafka.apache.org/intro . - -`SELECT` no es particularmente útil para leer mensajes (excepto para la depuración), ya que cada mensaje se puede leer solo una vez. Es más práctico crear subprocesos en tiempo real utilizando vistas materializadas. Para hacer esto: - -1. Use el motor para crear un consumidor de Kafka y considérelo como un flujo de datos. -2. Crea una tabla con la estructura deseada. -3. Cree una vista materializada que convierta los datos del motor y los coloque en una tabla creada previamente. - -Cuando el `MATERIALIZED VIEW` se une al motor, comienza a recopilar datos en segundo plano. Esto le permite recibir continuamente mensajes de Kafka y convertirlos al formato requerido usando `SELECT`. -Una tabla kafka puede tener tantas vistas materializadas como desee, no leen datos de la tabla kafka directamente, sino que reciben nuevos registros (en bloques), de esta manera puede escribir en varias tablas con diferentes niveles de detalle (con agrupación - agregación y sin). - -Ejemplo: - -``` sql - CREATE TABLE queue ( - timestamp UInt64, - level String, - message String - ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow'); - - CREATE TABLE daily ( - day Date, - level String, - total UInt64 - ) ENGINE = SummingMergeTree(day, (day, level), 8192); - - CREATE MATERIALIZED VIEW consumer TO daily - AS SELECT toDate(toDateTime(timestamp)) AS day, level, count() as total - FROM queue GROUP BY day, level; - - SELECT level, sum(total) FROM daily GROUP BY level; -``` - -Para mejorar el rendimiento, los mensajes recibidos se agrupan en bloques del tamaño de [Max_insert_block_size](../../../operations/server-configuration-parameters/settings.md#settings-max_insert_block_size). Si el bloque no se formó dentro de [Nombre de la red inalámbrica (SSID):](../../../operations/server-configuration-parameters/settings.md) milisegundos, los datos se vaciarán a la tabla independientemente de la integridad del bloque. - -Para detener la recepción de datos de tema o cambiar la lógica de conversión, desconecte la vista materializada: - -``` sql - DETACH TABLE consumer; - ATTACH TABLE consumer; -``` - -Si desea cambiar la tabla de destino utilizando `ALTER`, recomendamos deshabilitar la vista de material para evitar discrepancias entre la tabla de destino y los datos de la vista. - -## Configuración {#configuration} - -Similar a GraphiteMergeTree, el motor Kafka admite una configuración extendida utilizando el archivo de configuración ClickHouse. Hay dos claves de configuración que puede usar: global (`kafka`) y a nivel de tema (`kafka_*`). La configuración global se aplica primero y, a continuación, se aplica la configuración de nivel de tema (si existe). - -``` xml - - - cgrp - smallest - - - - - 250 - 100000 - -``` - -Para obtener una lista de posibles opciones de configuración, consulte [referencia de configuración librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). 
Usa el guión bajo (`_`) en lugar de un punto en la configuración de ClickHouse. Por ejemplo, `check.crcs=true` será `true`. - -## Virtual Columnas {#virtual-columns} - -- `_topic` — Kafka topic. -- `_key` — Key of the message. -- `_offset` — Offset of the message. -- `_timestamp` — Timestamp of the message. -- `_partition` — Partition of Kafka topic. - -**Ver también** - -- [Virtual columnas](../index.md#table_engines-virtual_columns) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/kafka/) diff --git a/docs/es/engines/table-engines/integrations/mysql.md b/docs/es/engines/table-engines/integrations/mysql.md deleted file mode 100644 index 52799117255..00000000000 --- a/docs/es/engines/table-engines/integrations/mysql.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 33 -toc_title: MySQL ---- - -# Mysql {#mysql} - -El motor MySQL le permite realizar `SELECT` consultas sobre datos almacenados en un servidor MySQL remoto. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], - ... -) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); -``` - -Vea una descripción detallada del [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) consulta. - -La estructura de la tabla puede diferir de la estructura de la tabla MySQL original: - -- Los nombres de columna deben ser los mismos que en la tabla MySQL original, pero puede usar solo algunas de estas columnas y en cualquier orden. -- Los tipos de columna pueden diferir de los de la tabla MySQL original. ClickHouse intenta [elenco](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) valores a los tipos de datos ClickHouse. - -**Parámetros del motor** - -- `host:port` — MySQL server address. - -- `database` — Remote database name. - -- `table` — Remote table name. - -- `user` — MySQL user. - -- `password` — User password. - -- `replace_query` — Flag that converts `INSERT INTO` consultas a `REPLACE INTO`. Si `replace_query=1`, la consulta se sustituye. - -- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expresión que se añade a la `INSERT` consulta. - - Ejemplo: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, donde `on_duplicate_clause` ser `UPDATE c2 = c2 + 1`. Ver el [Documentación de MySQL](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html) para encontrar qué `on_duplicate_clause` se puede utilizar con el `ON DUPLICATE KEY` clausula. - - Especificar `on_duplicate_clause` tienes que pasar `0` a la `replace_query` parámetro. Si pasa simultáneamente `replace_query = 1` y `on_duplicate_clause`, ClickHouse genera una excepción. - -Simple `WHERE` cláusulas tales como `=, !=, >, >=, <, <=` se ejecutan en el servidor MySQL. - -El resto de las condiciones y el `LIMIT` La restricción de muestreo se ejecuta en ClickHouse solo después de que finalice la consulta a MySQL. 
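A hedged sketch of how this split plays out in practice — the table name `mysql_remote_table` is hypothetical and stands for any table created with `ENGINE = MySQL(...)`, as in the usage example below:

``` sql
SELECT name, value
FROM mysql_remote_table
WHERE value >= 100          -- simple comparison: executed on the MySQL server
  AND name LIKE '%error%'   -- evaluated in ClickHouse after the rows are fetched
LIMIT 10                    -- applied in ClickHouse once the MySQL query finishes
```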
- -## Ejemplo de uso {#usage-example} - -Tabla en MySQL: - -``` text -mysql> CREATE TABLE `test`.`test` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, - -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from test; -+------+----------+-----+----------+ -| int_id | int_nullable | float | float_nullable | -+------+----------+-----+----------+ -| 1 | NULL | 2 | NULL | -+------+----------+-----+----------+ -1 row in set (0,00 sec) -``` - -Tabla en ClickHouse, recuperando datos de la tabla MySQL creada anteriormente: - -``` sql -CREATE TABLE mysql_table -( - `float_nullable` Nullable(Float32), - `int_id` Int32 -) -ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') -``` - -``` sql -SELECT * FROM mysql_table -``` - -``` text -┌─float_nullable─┬─int_id─┐ -│ ᴺᵁᴸᴸ │ 1 │ -└────────────────┴────────┘ -``` - -## Ver también {#see-also} - -- [El ‘mysql’ función de la tabla](../../../sql-reference/table-functions/mysql.md) -- [Uso de MySQL como fuente de diccionario externo](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/mysql/) diff --git a/docs/es/engines/table-engines/integrations/odbc.md b/docs/es/engines/table-engines/integrations/odbc.md deleted file mode 100644 index 75c79484d61..00000000000 --- a/docs/es/engines/table-engines/integrations/odbc.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 35 -toc_title: ODBC ---- - -# ODBC {#table-engine-odbc} - -Permite que ClickHouse se conecte a bases de datos externas a través de [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). - -Para implementar con seguridad conexiones ODBC, ClickHouse usa un programa separado `clickhouse-odbc-bridge`. Si el controlador ODBC se carga directamente desde `clickhouse-server`, problemas de controlador pueden bloquear el servidor ClickHouse. ClickHouse se inicia automáticamente `clickhouse-odbc-bridge` cuando se requiere. El programa de puente ODBC se instala desde el mismo paquete que el `clickhouse-server`. - -Este motor soporta el [NULL](../../../sql-reference/data-types/nullable.md) tipo de datos. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1], - name2 [type2], - ... -) -ENGINE = ODBC(connection_settings, external_database, external_table) -``` - -Vea una descripción detallada del [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) consulta. - -La estructura de la tabla puede diferir de la estructura de la tabla de origen: - -- Los nombres de columna deben ser los mismos que en la tabla de origen, pero puede usar solo algunas de estas columnas y en cualquier orden. -- Los tipos de columna pueden diferir de los de la tabla de origen. ClickHouse intenta [elenco](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) valores a los tipos de datos ClickHouse. - -**Parámetros del motor** - -- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. 
-- `external_database` — Name of a database in an external DBMS. -- `external_table` — Name of a table in the `external_database`. - -## Ejemplo de uso {#usage-example} - -**Recuperación de datos de la instalación local de MySQL a través de ODBC** - -Este ejemplo se comprueba para Ubuntu Linux 18.04 y el servidor MySQL 5.7. - -Asegúrese de que unixODBC y MySQL Connector están instalados. - -De forma predeterminada (si se instala desde paquetes), ClickHouse comienza como usuario `clickhouse`. Por lo tanto, debe crear y configurar este usuario en el servidor MySQL. - -``` bash -$ sudo mysql -``` - -``` sql -mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; -mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; -``` - -A continuación, configure la conexión en `/etc/odbc.ini`. - -``` bash -$ cat /etc/odbc.ini -[mysqlconn] -DRIVER = /usr/local/lib/libmyodbc5w.so -SERVER = 127.0.0.1 -PORT = 3306 -DATABASE = test -USERNAME = clickhouse -PASSWORD = clickhouse -``` - -Puede verificar la conexión usando el `isql` utilidad desde la instalación de unixODBC. - -``` bash -$ isql -v mysqlconn -+-------------------------+ -| Connected! | -| | -... -``` - -Tabla en MySQL: - -``` text -mysql> CREATE TABLE `test`.`test` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, - -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from test; -+------+----------+-----+----------+ -| int_id | int_nullable | float | float_nullable | -+------+----------+-----+----------+ -| 1 | NULL | 2 | NULL | -+------+----------+-----+----------+ -1 row in set (0,00 sec) -``` - -Tabla en ClickHouse, recuperando datos de la tabla MySQL: - -``` sql -CREATE TABLE odbc_t -( - `int_id` Int32, - `float_nullable` Nullable(Float32) -) -ENGINE = ODBC('DSN=mysqlconn', 'test', 'test') -``` - -``` sql -SELECT * FROM odbc_t -``` - -``` text -┌─int_id─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ -└────────┴────────────────┘ -``` - -## Ver también {#see-also} - -- [Diccionarios externos ODBC](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) -- [Tabla ODBC función](../../../sql-reference/table-functions/odbc.md) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/odbc/) diff --git a/docs/es/engines/table-engines/log-family/index.md b/docs/es/engines/table-engines/log-family/index.md deleted file mode 100644 index a7a3016f967..00000000000 --- a/docs/es/engines/table-engines/log-family/index.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Familia de registro -toc_priority: 29 -toc_title: "Implantaci\xF3n" ---- - -# Familia del motor de registro {#log-engine-family} - -Estos motores fueron desarrollados para escenarios en los que necesita escribir rápidamente muchas tablas pequeñas (hasta aproximadamente 1 millón de filas) y leerlas más tarde en su conjunto. - -Motores de la familia: - -- [StripeLog](stripelog.md) -- [Registro](log.md) -- [TinyLog](tinylog.md) - -## Propiedades comunes {#common-properties} - -Motor: - -- Almacenar datos en un disco. - -- Agregue datos al final del archivo al escribir. - -- Bloqueos de soporte para el acceso a datos simultáneos. 
- - Durante `INSERT` consultas, la tabla está bloqueada y otras consultas para leer y escribir datos esperan a que la tabla se desbloquee. Si no hay consultas de escritura de datos, se puede realizar cualquier número de consultas de lectura de datos simultáneamente. - -- No apoyo [mutación](../../../sql-reference/statements/alter.md#alter-mutations) operación. - -- No admite índices. - - Esto significa que `SELECT` las consultas para rangos de datos no son eficientes. - -- No escriba datos atómicamente. - - Puede obtener una tabla con datos dañados si algo rompe la operación de escritura, por ejemplo, un cierre anormal del servidor. - -## Diferencia {#differences} - -El `TinyLog` es el más simple de la familia y proporciona la funcionalidad más pobre y la eficiencia más baja. El `TinyLog` el motor no admite la lectura de datos paralelos por varios hilos. Lee datos más lentamente que otros motores de la familia que admiten lectura paralela y utiliza casi tantos descriptores como los `Log` motor porque almacena cada columna en un archivo separado. Úselo en escenarios simples de baja carga. - -El `Log` y `StripeLog` Los motores admiten lectura de datos paralela. Al leer datos, ClickHouse usa múltiples hilos. Cada subproceso procesa un bloque de datos separado. El `Log` utiliza un archivo separado para cada columna de la tabla. `StripeLog` almacena todos los datos en un archivo. Como resultado, el `StripeLog` el motor utiliza menos descriptores en el sistema operativo, pero el `Log` proporciona una mayor eficiencia al leer datos. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/log_family/) diff --git a/docs/es/engines/table-engines/log-family/log.md b/docs/es/engines/table-engines/log-family/log.md deleted file mode 100644 index 1db374390e4..00000000000 --- a/docs/es/engines/table-engines/log-family/log.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 33 -toc_title: Registro ---- - -# Registro {#log} - -El motor pertenece a la familia de motores de registro. Consulte las propiedades comunes de los motores de registro y sus diferencias en [Familia del motor de registro](index.md) artículo. - -El registro difiere de [TinyLog](tinylog.md) en que un pequeño archivo de “marks” reside con los archivos de columna. Estas marcas se escriben en cada bloque de datos y contienen compensaciones que indican dónde comenzar a leer el archivo para omitir el número especificado de filas. Esto hace posible leer datos de tabla en múltiples hilos. -Para el acceso a datos simultáneos, las operaciones de lectura se pueden realizar simultáneamente, mientras que las operaciones de escritura bloquean las lecturas entre sí. -El motor de registro no admite índices. Del mismo modo, si la escritura en una tabla falla, la tabla se rompe y la lectura de ella devuelve un error. El motor de registro es adecuado para datos temporales, tablas de escritura única y para fines de prueba o demostración. 
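A minimal sketch of the write-once pattern described above; the table and column names are illustrative and not taken from this page:

``` sql
CREATE TABLE log_example
(
    event_time DateTime,
    message String
) ENGINE = Log;

-- Write the data once...
INSERT INTO log_example VALUES (now(), 'first message'), (now(), 'second message');

-- ...then read it back as many times as needed
-- (reads can run concurrently; a write blocks other reads and writes).
SELECT * FROM log_example ORDER BY event_time;
```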
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/log/) diff --git a/docs/es/engines/table-engines/log-family/stripelog.md b/docs/es/engines/table-engines/log-family/stripelog.md deleted file mode 100644 index 0965e9a987c..00000000000 --- a/docs/es/engines/table-engines/log-family/stripelog.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 32 -toc_title: StripeLog ---- - -# Lista de Stripelog {#stripelog} - -Este motor pertenece a la familia de motores de registro. Consulte las propiedades comunes de los motores de registro y sus diferencias en [Familia del motor de registro](index.md) artículo. - -Utilice este motor en escenarios en los que necesite escribir muchas tablas con una pequeña cantidad de datos (menos de 1 millón de filas). - -## Creación de una tabla {#table_engines-stripelog-creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - column2_name [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = StripeLog -``` - -Vea la descripción detallada del [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) consulta. - -## Escribir los datos {#table_engines-stripelog-writing-the-data} - -El `StripeLog` el motor almacena todas las columnas en un archivo. Para cada `INSERT` consulta, ClickHouse agrega el bloque de datos al final de un archivo de tabla, escribiendo columnas una por una. - -Para cada tabla, ClickHouse escribe los archivos: - -- `data.bin` — Data file. -- `index.mrk` — File with marks. Marks contain offsets for each column of each data block inserted. - -El `StripeLog` el motor no soporta el `ALTER UPDATE` y `ALTER DELETE` operación. - -## Lectura de los datos {#table_engines-stripelog-reading-the-data} - -El archivo con marcas permite ClickHouse paralelizar la lectura de datos. Esto significa que un `SELECT` query devuelve filas en un orden impredecible. Utilice el `ORDER BY` cláusula para ordenar filas. - -## Ejemplo de uso {#table_engines-stripelog-example-of-use} - -Creación de una tabla: - -``` sql -CREATE TABLE stripe_log_table -( - timestamp DateTime, - message_type String, - message String -) -ENGINE = StripeLog -``` - -Insertar datos: - -``` sql -INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') -INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') -``` - -Se utilizaron dos `INSERT` consultas para crear dos bloques de datos dentro del `data.bin` file. - -ClickHouse usa múltiples subprocesos al seleccionar datos. Cada subproceso lee un bloque de datos separado y devuelve las filas resultantes de forma independiente a medida que termina. Como resultado, el orden de los bloques de filas en la salida no coincide con el orden de los mismos bloques en la entrada en la mayoría de los casos. 
Por ejemplo: - -``` sql -SELECT * FROM stripe_log_table -``` - -``` text -┌───────────timestamp─┬─message_type─┬─message────────────────────┐ -│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ -│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ -└─────────────────────┴──────────────┴────────────────────────────┘ -┌───────────timestamp─┬─message_type─┬─message───────────────────┐ -│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ -└─────────────────────┴──────────────┴───────────────────────────┘ -``` - -Ordenación de los resultados (orden ascendente por defecto): - -``` sql -SELECT * FROM stripe_log_table ORDER BY timestamp -``` - -``` text -┌───────────timestamp─┬─message_type─┬─message────────────────────┐ -│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ -│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ -│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ -└─────────────────────┴──────────────┴────────────────────────────┘ -``` - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/stripelog/) diff --git a/docs/es/engines/table-engines/log-family/tinylog.md b/docs/es/engines/table-engines/log-family/tinylog.md deleted file mode 100644 index a2cbf7257b6..00000000000 --- a/docs/es/engines/table-engines/log-family/tinylog.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 34 -toc_title: TinyLog ---- - -# TinyLog {#tinylog} - -El motor pertenece a la familia de motores de registro. Ver [Familia del motor de registro](index.md) para las propiedades comunes de los motores de registro y sus diferencias. - -Este motor de tablas se usa normalmente con el método write-once: escribir datos una vez, luego leerlos tantas veces como sea necesario. Por ejemplo, puede usar `TinyLog`-type tablas para datos intermedios que se procesan en pequeños lotes. Tenga en cuenta que el almacenamiento de datos en un gran número de tablas pequeñas es ineficiente. - -Las consultas se ejecutan en una sola secuencia. En otras palabras, este motor está diseñado para tablas relativamente pequeñas (hasta aproximadamente 1,000,000 filas). Tiene sentido usar este motor de tablas si tiene muchas tablas pequeñas, ya que es más simple que el [Registro](log.md) motor (menos archivos necesitan ser abiertos). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/tinylog/) diff --git a/docs/es/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/es/engines/table-engines/mergetree-family/aggregatingmergetree.md deleted file mode 100644 index 2aedfbd2317..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 35 -toc_title: "Agregaci\xF3nMergeTree" ---- - -# Aggregatingmergetree {#aggregatingmergetree} - -El motor hereda de [Método de codificación de datos:](mergetree.md#table_engines-mergetree), alterando la lógica para la fusión de partes de datos. ClickHouse reemplaza todas las filas con la misma clave principal (o más exactamente, con la misma [clave de clasificación](mergetree.md)) con una sola fila (dentro de una parte de datos) que almacena una combinación de estados de funciones agregadas. 
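Para ilustrar qué significa «estado de una función agregada», un esbozo mínimo (la tabla `agg_example` y sus columnas son hipotéticas): el estado se escribe con el sufijo `-State` y se combina al leer con `-Merge`, tal como se detalla más adelante.

``` sql
CREATE TABLE agg_example
(
    key UInt32,
    uniq_users AggregateFunction(uniq, UInt64)
) ENGINE = AggregatingMergeTree()
ORDER BY key;

-- se insertan estados intermedios de la función uniq, no valores finales
INSERT INTO agg_example
SELECT number % 10 AS key, uniqState(number)
FROM numbers(1000)
GROUP BY key;

-- los estados almacenados se combinan en el momento de la lectura
SELECT key, uniqMerge(uniq_users) AS usuarios_unicos
FROM agg_example
GROUP BY key;
```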
- -Usted puede utilizar `AggregatingMergeTree` tablas para la agregación de datos incrementales, incluidas las vistas materializadas agregadas. - -El motor procesa todas las columnas con los siguientes tipos: - -- [AggregateFunction](../../../sql-reference/data-types/aggregatefunction.md) -- [SimpleAggregateFunction](../../../sql-reference/data-types/simpleaggregatefunction.md) - -Es apropiado usar `AggregatingMergeTree` si reduce el número de filas por pedidos. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = AggregatingMergeTree() -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[TTL expr] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros de solicitud, consulte [descripción de la solicitud](../../../sql-reference/statements/create.md). - -**Cláusulas de consulta** - -Al crear un `AggregatingMergeTree` mesa de la misma [clausula](mergetree.md) se requieren, como al crear un `MergeTree` tabla. - -
- -Método obsoleto para crear una tabla - -!!! attention "Atención" - No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente. - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE [=] AggregatingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity) -``` - -Todos los parámetros tienen el mismo significado que en `MergeTree`. -
- -## SELECCIONAR e INSERTAR {#select-and-insert} - -Para insertar datos, utilice [INSERT SELECT](../../../sql-reference/statements/insert-into.md) consulta con funciones agregadas -State-. -Al seleccionar datos de `AggregatingMergeTree` mesa, uso `GROUP BY` cláusula y las mismas funciones agregadas que al insertar datos, pero usando `-Merge` sufijo. - -En los resultados de `SELECT` consulta, los valores de `AggregateFunction` tipo tiene representación binaria específica de la implementación para todos los formatos de salida de ClickHouse. Si volcar datos en, por ejemplo, `TabSeparated` formato con `SELECT` consulta entonces este volcado se puede cargar de nuevo usando `INSERT` consulta. - -## Ejemplo de una vista materializada agregada {#example-of-an-aggregated-materialized-view} - -`AggregatingMergeTree` vista materializada que mira el `test.visits` tabla: - -``` sql -CREATE MATERIALIZED VIEW test.basic -ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) -AS SELECT - CounterID, - StartDate, - sumState(Sign) AS Visits, - uniqState(UserID) AS Users -FROM test.visits -GROUP BY CounterID, StartDate; -``` - -Insertar datos en el `test.visits` tabla. - -``` sql -INSERT INTO test.visits ... -``` - -Los datos se insertan tanto en la tabla como en la vista `test.basic` que realizará la agregación. - -Para obtener los datos agregados, necesitamos ejecutar una consulta como `SELECT ... GROUP BY ...` de la vista `test.basic`: - -``` sql -SELECT - StartDate, - sumMerge(Visits) AS Visits, - uniqMerge(Users) AS Users -FROM test.basic -GROUP BY StartDate -ORDER BY StartDate; -``` - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/aggregatingmergetree/) diff --git a/docs/es/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/es/engines/table-engines/mergetree-family/collapsingmergetree.md deleted file mode 100644 index 027d5c2adf7..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/collapsingmergetree.md +++ /dev/null @@ -1,306 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: ColapsarMergeTree ---- - -# ColapsarMergeTree {#table_engine-collapsingmergetree} - -El motor hereda de [Método de codificación de datos:](mergetree.md) y agrega la lógica de las filas que colapsan al algoritmo de fusión de partes de datos. - -`CollapsingMergeTree` elimina de forma asincrónica (colapsa) pares de filas si todos los campos de una clave de ordenación (`ORDER BY`) son equivalentes excepto el campo particular `Sign` que puede tener `1` y `-1` valor. Las filas sin un par se mantienen. Para más detalles, consulte el [Derrumbar](#table_engine-collapsingmergetree-collapsing) sección del documento. - -El motor puede reducir significativamente el volumen de almacenamiento y aumentar la eficiencia de `SELECT` consulta como consecuencia. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = CollapsingMergeTree(sign) -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros de consulta, consulte [descripción de la consulta](../../../sql-reference/statements/create.md). 
**CollapsingMergeTree Parámetros**

- `sign` — Nombre de la columna que indica el tipo de fila: `1` corresponde a una fila “state” (estado), `-1` a una fila “cancel” (cancelación).

    Tipo de datos de la columna — `Int8`.

**Cláusulas de consulta**

Al crear una tabla `CollapsingMergeTree` se requieren las mismas [cláusulas de consulta](mergetree.md#table_engine-mergetree-creating-a-table) que al crear una tabla `MergeTree`.
Método obsoleto para crear una tabla

!!! attention "Atención"
    No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE [=] CollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign)
```

Todos los parámetros excepto `sign` tienen el mismo significado que en `MergeTree`.

- `sign` — Nombre de la columna que indica el tipo de fila: `1` — fila “state”, `-1` — fila “cancel”.

    Tipo de datos de la columna — `Int8`.
- -## Derrumbar {#table_engine-collapsingmergetree-collapsing} - -### Datos {#data} - -Considere la situación en la que necesita guardar datos que cambian continuamente para algún objeto. Parece lógico tener una fila para un objeto y actualizarla en cualquier cambio, pero la operación de actualización es costosa y lenta para DBMS porque requiere la reescritura de los datos en el almacenamiento. Si necesita escribir datos rápidamente, la actualización no es aceptable, pero puede escribir los cambios de un objeto secuencialmente de la siguiente manera. - -Utilice la columna en particular `Sign`. Si `Sign = 1` significa que la fila es un estado de un objeto, llamémoslo “state” fila. Si `Sign = -1` significa la cancelación del estado de un objeto con los mismos atributos, llamémoslo “cancel” fila. - -Por ejemplo, queremos calcular cuántas páginas revisaron los usuarios en algún sitio y cuánto tiempo estuvieron allí. En algún momento escribimos la siguiente fila con el estado de la actividad del usuario: - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -En algún momento después registramos el cambio de actividad del usuario y lo escribimos con las siguientes dos filas. - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -La primera fila cancela el estado anterior del objeto (usuario). Debe copiar los campos clave de ordenación del estado cancelado exceptuando `Sign`. - -La segunda fila contiene el estado actual. - -Como solo necesitamos el último estado de actividad del usuario, las filas - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -se puede eliminar colapsando el estado no válido (antiguo) de un objeto. `CollapsingMergeTree` hace esto mientras se fusionan las partes de datos. - -Por qué necesitamos 2 filas para cada cambio leído en el [Algoritmo](#table_engine-collapsingmergetree-collapsing-algorithm) apartado. - -**Propiedades peculiares de tal enfoque** - -1. El programa que escribe los datos debe recordar el estado de un objeto para poder cancelarlo. “Cancel” debe contener copias de los campos de clave de ordenación “state” y lo opuesto `Sign`. Aumenta el tamaño inicial de almacenamiento, pero permite escribir los datos rápidamente. -2. Las matrices de largo crecimiento en columnas reducen la eficiencia del motor debido a la carga para escribir. Los datos más sencillos, mayor será la eficiencia. -3. El `SELECT` Los resultados dependen en gran medida de la consistencia del historial de cambios de objetos. Sea preciso al preparar los datos para insertarlos. Puede obtener resultados impredecibles en datos incoherentes, por ejemplo, valores negativos para métricas no negativas, como la profundidad de la sesión. - -### Algoritmo {#table_engine-collapsingmergetree-collapsing-algorithm} - -Cuando ClickHouse combina partes de datos, cada grupo de filas consecutivas tiene la misma clave de ordenación (`ORDER BY`) se reduce a no más de dos filas, una con `Sign = 1` (“state” fila) y otro con `Sign = -1` (“cancel” fila). En otras palabras, las entradas colapsan. - -Para cada parte de datos resultante, ClickHouse guarda: - -1. 
El primero “cancel” y el último “state” si el número de “state” y “cancel” y la última fila es una “state” fila. -2. El último “state” fila, si hay más “state” filas que “cancel” filas. -3. El primero “cancel” fila, si hay más “cancel” filas que “state” filas. -4. Ninguna de las filas, en todos los demás casos. - -También cuando hay al menos 2 más “state” filas que “cancel” filas, o al menos 2 más “cancel” filas entonces “state” fila, la fusión continúa, pero ClickHouse trata esta situación como un error lógico y la registra en el registro del servidor. Este error puede producirse si se insertan los mismos datos más de una vez. - -Por lo tanto, el colapso no debe cambiar los resultados del cálculo de las estadísticas. -Los cambios colapsaron gradualmente para que al final solo quedara el último estado de casi todos los objetos. - -El `Sign` se requiere porque el algoritmo de fusión no garantiza que todas las filas con la misma clave de clasificación estén en la misma parte de datos resultante e incluso en el mismo servidor físico. Proceso de ClickHouse `SELECT` consultas con múltiples hilos, y no puede predecir el orden de las filas en el resultado. La agregación es necesaria si hay una necesidad de obtener completamente “collapsed” datos de `CollapsingMergeTree` tabla. - -Para finalizar el colapso, escriba una consulta con `GROUP BY` cláusula y funciones agregadas que representan el signo. Por ejemplo, para calcular la cantidad, use `sum(Sign)` en lugar de `count()`. Para calcular la suma de algo, use `sum(Sign * x)` en lugar de `sum(x)` y así sucesivamente, y también añadir `HAVING sum(Sign) > 0`. - -Los agregados `count`, `sum` y `avg` podría calcularse de esta manera. El agregado `uniq` podría calcularse si un objeto tiene al menos un estado no colapsado. Los agregados `min` y `max` no se pudo calcular porque `CollapsingMergeTree` no guarda el historial de valores de los estados colapsados. - -Si necesita extraer datos sin agregación (por ejemplo, para comprobar si hay filas presentes cuyos valores más recientes coinciden con ciertas condiciones), puede utilizar el `FINAL` modificador para el `FROM` clausula. Este enfoque es significativamente menos eficiente. - -## Ejemplo de uso {#example-of-use} - -Datos de ejemplo: - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -Creación de la tabla: - -``` sql -CREATE TABLE UAct -( - UserID UInt64, - PageViews UInt8, - Duration UInt8, - Sign Int8 -) -ENGINE = CollapsingMergeTree(Sign) -ORDER BY UserID -``` - -Inserción de los datos: - -``` sql -INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1) -``` - -``` sql -INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1) -``` - -Usamos dos `INSERT` consultas para crear dos partes de datos diferentes. Si insertamos los datos con una consulta, ClickHouse crea una parte de datos y nunca realizará ninguna fusión. - -Obtener los datos: - -``` sql -SELECT * FROM UAct -``` - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -¿Qué vemos y dónde está colapsando? 
Con dos consultas `INSERT` hemos creado dos partes de datos. La consulta `SELECT` se ejecutó en dos hilos y obtuvimos un orden aleatorio de filas. No se produjo el colapso porque las partes de datos todavía no se habían fusionado. ClickHouse fusiona las partes de datos en un momento indeterminado que no podemos predecir.

Por lo tanto, necesitamos agregación:

``` sql
SELECT
    UserID,
    sum(PageViews * Sign) AS PageViews,
    sum(Duration * Sign) AS Duration
FROM UAct
GROUP BY UserID
HAVING sum(Sign) > 0
```

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┐
│ 4324182021466249494 │         6 │      185 │
└─────────────────────┴───────────┴──────────┘
```

Si no necesitamos agregación y queremos forzar el colapso, podemos usar el modificador `FINAL` en la cláusula `FROM`.

``` sql
SELECT * FROM UAct FINAL
```

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │         6 │      185 │    1 │
└─────────────────────┴───────────┴──────────┴──────┘
```

Esta forma de seleccionar los datos es muy ineficiente. No la use para tablas grandes.

## Ejemplo de otro enfoque {#example-of-another-approach}

Datos de ejemplo:

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │         5 │      146 │    1 │
│ 4324182021466249494 │        -5 │     -146 │   -1 │
│ 4324182021466249494 │         6 │      185 │    1 │
└─────────────────────┴───────────┴──────────┴──────┘
```

La idea es que las fusiones tengan en cuenta solo los campos clave. En la fila “cancel” podemos especificar valores negativos que, al sumarse, anulan la versión anterior de la fila sin usar la columna `Sign`. Para este enfoque es necesario cambiar el tipo de datos de `PageViews` y `Duration` de `UInt8` a `Int16` para poder almacenar valores negativos.

``` sql
CREATE TABLE UAct
(
    UserID UInt64,
    PageViews Int16,
    Duration Int16,
    Sign Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID
```

Vamos a probar el enfoque:

``` sql
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1);
INSERT INTO UAct VALUES (4324182021466249494, -5, -146, -1);
INSERT INTO UAct VALUES (4324182021466249494, 6, 185, 1);

SELECT * FROM UAct FINAL; -- evite usar FINAL en producción (solo para pruebas o tablas pequeñas)
```

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │         6 │      185 │    1 │
└─────────────────────┴───────────┴──────────┴──────┘
```

``` sql
SELECT
    UserID,
    sum(PageViews) AS PageViews,
    sum(Duration) AS Duration
FROM UAct
GROUP BY UserID
```

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┐
│ 4324182021466249494 │         6 │      185 │
└─────────────────────┴───────────┴──────────┘
```

``` sql
SELECT count() FROM UAct
```

``` text
┌─count()─┐
│       3 │
└─────────┘
```

``` sql
OPTIMIZE TABLE UAct FINAL;

SELECT * FROM UAct
```

``` text
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
│ 4324182021466249494 │         6 │      185 │    1 │
└─────────────────────┴───────────┴──────────┴──────┘
```

[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/collapsingmergetree/)

diff --git a/docs/es/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/es/engines/table-engines/mergetree-family/custom-partitioning-key.md
deleted file mode 100644
index 6cbc0a9192e..00000000000
--- a/docs/es/engines/table-engines/mergetree-family/custom-partitioning-key.md
+++ /dev/null
@@ -1,127 +0,0 @@
----
-machine_translated: true
-machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd
-toc_priority: 32
-toc_title: "Clave 
de partici\xF3n personalizada" ---- - -# Clave de partición personalizada {#custom-partitioning-key} - -La partición está disponible para el [Método de codificación de datos:](mergetree.md) mesas familiares (incluyendo [repetición](replication.md) tabla). [Vistas materializadas](../special/materializedview.md#materializedview) basado en tablas MergeTree soporte de particionamiento, también. - -Una partición es una combinación lógica de registros en una tabla por un criterio especificado. Puede establecer una partición por un criterio arbitrario, como por mes, por día o por tipo de evento. Cada partición se almacena por separado para simplificar las manipulaciones de estos datos. Al acceder a los datos, ClickHouse utiliza el subconjunto más pequeño de particiones posible. - -La partición se especifica en el `PARTITION BY expr` cláusula cuando [creando una tabla](mergetree.md#table_engine-mergetree-creating-a-table). La clave de partición puede ser cualquier expresión de las columnas de la tabla. Por ejemplo, para especificar la partición por mes, utilice la expresión `toYYYYMM(date_column)`: - -``` sql -CREATE TABLE visits -( - VisitDate Date, - Hour UInt8, - ClientID UUID -) -ENGINE = MergeTree() -PARTITION BY toYYYYMM(VisitDate) -ORDER BY Hour; -``` - -La clave de partición también puede ser una tupla de expresiones (similar a la [clave primaria](mergetree.md#primary-keys-and-indexes-in-queries)). Por ejemplo: - -``` sql -ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) -PARTITION BY (toMonday(StartDate), EventType) -ORDER BY (CounterID, StartDate, intHash32(UserID)); -``` - -En este ejemplo, establecemos la partición por los tipos de eventos que se produjeron durante la semana actual. - -Al insertar datos nuevos en una tabla, estos datos se almacenan como una parte separada (porción) ordenada por la clave principal. En 10-15 minutos después de insertar, las partes de la misma partición se fusionan en toda la parte. - -!!! info "INFO" - Una combinación solo funciona para partes de datos que tienen el mismo valor para la expresión de partición. Esto significa **no deberías hacer particiones demasiado granulares** (más de un millar de particiones). De lo contrario, el `SELECT` consulta funciona mal debido a un número excesivamente grande de archivos en el sistema de archivos y descriptores de archivos abiertos. - -Utilice el [sistema.parte](../../../operations/system-tables.md#system_tables-parts) tabla para ver las partes y particiones de la tabla. Por ejemplo, supongamos que tenemos un `visits` tabla con partición por mes. Vamos a realizar el `SELECT` consulta para el `system.parts` tabla: - -``` sql -SELECT - partition, - name, - active -FROM system.parts -WHERE table = 'visits' -``` - -``` text -┌─partition─┬─name───────────┬─active─┐ -│ 201901 │ 201901_1_3_1 │ 0 │ -│ 201901 │ 201901_1_9_2 │ 1 │ -│ 201901 │ 201901_8_8_0 │ 0 │ -│ 201901 │ 201901_9_9_0 │ 0 │ -│ 201902 │ 201902_4_6_1 │ 1 │ -│ 201902 │ 201902_10_10_0 │ 1 │ -│ 201902 │ 201902_11_11_0 │ 1 │ -└───────────┴────────────────┴────────┘ -``` - -El `partition` columna contiene los nombres de las particiones. Hay dos particiones en este ejemplo: `201901` y `201902`. Puede utilizar este valor de columna para especificar el nombre de partición en [ALTER … PARTITION](#alter_manipulations-with-partitions) consulta. - -El `name` columna contiene los nombres de las partes de datos de partición. 
Puede utilizar esta columna para especificar el nombre de la pieza [ALTER ATTACH PART](#alter_attach-partition) consulta. - -Vamos a desglosar el nombre de la primera parte: `201901_1_3_1`: - -- `201901` es el nombre de la partición. -- `1` es el número mínimo del bloque de datos. -- `3` es el número máximo del bloque de datos. -- `1` es el nivel de fragmento (la profundidad del árbol de fusión del que se forma). - -!!! info "INFO" - Las partes de las tablas de tipo antiguo tienen el nombre: `20190117_20190123_2_2_0` (fecha mínima - fecha máxima - número de bloque mínimo - número de bloque máximo - nivel). - -El `active` columna muestra el estado de la pieza. `1` está activo; `0` está inactivo. Las partes inactivas son, por ejemplo, las partes de origen que quedan después de fusionarse con una parte más grande. Las partes de datos dañadas también se indican como inactivas. - -Como puede ver en el ejemplo, hay varias partes separadas de la misma partición (por ejemplo, `201901_1_3_1` y `201901_1_9_2`). Esto significa que estas partes aún no están fusionadas. ClickHouse combina las partes insertadas de datos periódicamente, aproximadamente 15 minutos después de la inserción. Además, puede realizar una fusión no programada utilizando el [OPTIMIZE](../../../sql-reference/statements/misc.md#misc_operations-optimize) consulta. Ejemplo: - -``` sql -OPTIMIZE TABLE visits PARTITION 201902; -``` - -``` text -┌─partition─┬─name───────────┬─active─┐ -│ 201901 │ 201901_1_3_1 │ 0 │ -│ 201901 │ 201901_1_9_2 │ 1 │ -│ 201901 │ 201901_8_8_0 │ 0 │ -│ 201901 │ 201901_9_9_0 │ 0 │ -│ 201902 │ 201902_4_6_1 │ 0 │ -│ 201902 │ 201902_4_11_2 │ 1 │ -│ 201902 │ 201902_10_10_0 │ 0 │ -│ 201902 │ 201902_11_11_0 │ 0 │ -└───────────┴────────────────┴────────┘ -``` - -Las partes inactivas se eliminarán aproximadamente 10 minutos después de la fusión. - -Otra forma de ver un conjunto de partes y particiones es ir al directorio de la tabla: `/var/lib/clickhouse/data///`. Por ejemplo: - -``` bash -/var/lib/clickhouse/data/default/visits$ ls -l -total 40 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_8_8_0 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_9_9_0 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_10_10_0 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_11_11_0 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 -drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached -``` - -Carpeta ‘201901_1_1_0’, ‘201901_1_7_1’ y así sucesivamente son los directorios de las partes. Cada parte se relaciona con una partición correspondiente y contiene datos solo para un mes determinado (la tabla de este ejemplo tiene particiones por mes). - -El `detached` el directorio contiene partes que se separaron de la tabla utilizando el [DETACH](../../../sql-reference/statements/alter.md#alter_detach-partition) consulta. Las partes dañadas también se mueven a este directorio, en lugar de eliminarse. El servidor no utiliza las piezas del `detached` directory. You can add, delete, or modify the data in this directory at any time – the server will not know about this until you run the [ATTACH](../../../sql-reference/statements/alter.md#alter_attach-partition) consulta. 
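A título orientativo, así pasa una partición por el directorio `detached` y vuelve a adjuntarse, usando la tabla `visits` de los ejemplos anteriores:

``` sql
ALTER TABLE visits DETACH PARTITION 201901;
-- las piezas de la partición quedan en el directorio detached/ y el servidor las ignora

ALTER TABLE visits ATTACH PARTITION 201901;
-- el servidor comprueba las piezas y las vuelve a incorporar a la tabla
```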
- -Tenga en cuenta que en el servidor operativo, no puede cambiar manualmente el conjunto de piezas o sus datos en el sistema de archivos, ya que el servidor no lo sabrá. Para tablas no replicadas, puede hacer esto cuando se detiene el servidor, pero no se recomienda. Para tablas replicadas, el conjunto de piezas no se puede cambiar en ningún caso. - -ClickHouse le permite realizar operaciones con las particiones: eliminarlas, copiar de una tabla a otra o crear una copia de seguridad. Consulte la lista de todas las operaciones en la sección [Manipulaciones con particiones y piezas](../../../sql-reference/statements/alter.md#alter_manipulations-with-partitions). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/custom_partitioning_key/) diff --git a/docs/es/engines/table-engines/mergetree-family/graphitemergetree.md b/docs/es/engines/table-engines/mergetree-family/graphitemergetree.md deleted file mode 100644 index d33ddcebac2..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/graphitemergetree.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: GraphiteMergeTree ---- - -# GraphiteMergeTree {#graphitemergetree} - -Este motor está diseñado para el adelgazamiento y la agregación / promedio (rollup) [Grafito](http://graphite.readthedocs.io/en/latest/index.html) datos. Puede ser útil para los desarrolladores que desean usar ClickHouse como almacén de datos para Graphite. - -Puede usar cualquier motor de tabla ClickHouse para almacenar los datos de Graphite si no necesita un paquete acumulativo, pero si necesita un paquete acumulativo, use `GraphiteMergeTree`. El motor reduce el volumen de almacenamiento y aumenta la eficiencia de las consultas de Graphite. - -El motor hereda propiedades de [Método de codificación de datos:](mergetree.md). - -## Creación de una tabla {#creating-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - Path String, - Time DateTime, - Value , - Version - ... -) ENGINE = GraphiteMergeTree(config_section) -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` - -Vea una descripción detallada del [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) consulta. - -Una tabla para los datos de grafito debe tener las siguientes columnas para los siguientes datos: - -- Nombre métrico (sensor de grafito). Tipo de datos: `String`. - -- Tiempo de medición de la métrica. Tipo de datos: `DateTime`. - -- Valor de la métrica. Tipo de datos: cualquier numérico. - -- Versión de la métrica. Tipo de datos: cualquier numérico. - - ClickHouse guarda las filas con la versión más alta o la última escrita si las versiones son las mismas. Otras filas se eliminan durante la fusión de partes de datos. - -Los nombres de estas columnas deben establecerse en la configuración acumulativa. - -**GraphiteMergeTree parámetros** - -- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. - -**Cláusulas de consulta** - -Al crear un `GraphiteMergeTree` mesa, la misma [clausula](mergetree.md#table_engine-mergetree-creating-a-table) se requieren, como al crear un `MergeTree` tabla. - -
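A título ilustrativo, una definición concreta que sigue la plantilla anterior (la tabla `graphite_data` es hipotética; los nombres de columna deben coincidir con los configurados en la sección de rollup):

``` sql
CREATE TABLE graphite_data
(
    Path String,
    Time DateTime,
    Value Float64,
    Version UInt32
) ENGINE = GraphiteMergeTree('graphite_rollup')
PARTITION BY toYYYYMM(Time)
ORDER BY (Path, Time)
```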
Método obsoleto para crear una tabla

!!! attention "Atención"
    No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    EventDate Date,
    Path String,
    Time DateTime,
    Value <value_type>,
    Version <version_type>
    ...
) ENGINE [=] GraphiteMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, config_section)
```

Todos los parámetros excepto `config_section` tienen el mismo significado que en `MergeTree`.

- `config_section` — Nombre de la sección del archivo de configuración donde se definen las reglas de rollup.
- -## Configuración acumulativa {#rollup-configuration} - -La configuración del paquete acumulativo está definida por [graphite_rollup](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-graphite) parámetro en la configuración del servidor. El nombre del parámetro podría ser cualquiera. Puede crear varias configuraciones y usarlas para diferentes tablas. - -Estructura de configuración Rollup: - - required-columns - patterns - -### Columnas requeridas {#required-columns} - -- `path_column_name` — The name of the column storing the metric name (Graphite sensor). Default value: `Path`. -- `time_column_name` — The name of the column storing the time of measuring the metric. Default value: `Time`. -- `value_column_name` — The name of the column storing the value of the metric at the time set in `time_column_name`. Valor predeterminado: `Value`. -- `version_column_name` — The name of the column storing the version of the metric. Default value: `Timestamp`. - -### Patrón {#patterns} - -Estructura del `patterns` apartado: - -``` text -pattern - regexp - function -pattern - regexp - age + precision - ... -pattern - regexp - function - age + precision - ... -pattern - ... -default - function - age + precision - ... -``` - -!!! warning "Atención" - Los patrones deben ser estrictamente ordenados: - - 1. Patterns without `function` or `retention`. - 1. Patterns with both `function` and `retention`. - 1. Pattern `default`. - -Al procesar una fila, ClickHouse comprueba las reglas en el `pattern` apartado. Cada uno de `pattern` (incluir `default` secciones pueden contener `function` parámetro para la agregación, `retention` parámetros o ambos. Si el nombre de la métrica coincide con `regexp`, las reglas de la `pattern` sección (o secciones); de lo contrario, las reglas de la `default` sección se utilizan. - -Campos para `pattern` y `default` apartado: - -- `regexp`– A pattern for the metric name. -- `age` – The minimum age of the data in seconds. -- `precision`– How precisely to define the age of the data in seconds. Should be a divisor for 86400 (seconds in a day). -- `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`. 
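Como orientación, así se anidan los campos descritos arriba en el XML de la configuración del servidor (esbozo mínimo: el nombre de sección `graphite_rollup` y la métrica `click_cost` son ilustrativos; la sección se referencia mediante el parámetro `config_section` del motor):

``` xml
<graphite_rollup>
    <version_column_name>Version</version_column_name>
    <pattern>
        <regexp>click_cost</regexp>
        <function>any</function>
        <retention>
            <age>0</age>
            <precision>5</precision>
        </retention>
    </pattern>
    <default>
        <function>max</function>
        <retention>
            <age>0</age>
            <precision>60</precision>
        </retention>
        <retention>
            <age>86400</age>
            <precision>3600</precision>
        </retention>
    </default>
</graphite_rollup>
```

El ejemplo de configuración de la sección siguiente sigue esta misma estructura.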
- -### Ejemplo de configuración {#configuration-example} - -``` xml - - Version - - click_cost - any - - 0 - 5 - - - 86400 - 60 - - - - max - - 0 - 60 - - - 3600 - 300 - - - 86400 - 3600 - - - -``` - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/graphitemergetree/) diff --git a/docs/es/engines/table-engines/mergetree-family/index.md b/docs/es/engines/table-engines/mergetree-family/index.md deleted file mode 100644 index 359d58b2ff1..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Familia MergeTree -toc_priority: 28 ---- - - diff --git a/docs/es/engines/table-engines/mergetree-family/mergetree.md b/docs/es/engines/table-engines/mergetree-family/mergetree.md deleted file mode 100644 index a4bab840b52..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/mergetree.md +++ /dev/null @@ -1,654 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 30 -toc_title: "M\xE9todo de codificaci\xF3n de datos:" ---- - -# Método de codificación de datos: {#table_engines-mergetree} - -El `MergeTree` motor y otros motores de esta familia (`*MergeTree`) son los motores de mesa ClickHouse más robustos. - -Motores en el `MergeTree` familia están diseñados para insertar una gran cantidad de datos en una tabla. Los datos se escriben rápidamente en la tabla parte por parte, luego se aplican reglas para fusionar las partes en segundo plano. Este método es mucho más eficiente que reescribir continuamente los datos en almacenamiento durante la inserción. - -Principales características: - -- Almacena datos ordenados por clave principal. - - Esto le permite crear un pequeño índice disperso que ayuda a encontrar datos más rápido. - -- Las particiones se pueden utilizar si [clave de partición](custom-partitioning-key.md) se especifica. - - ClickHouse admite ciertas operaciones con particiones que son más efectivas que las operaciones generales en los mismos datos con el mismo resultado. ClickHouse también corta automáticamente los datos de partición donde se especifica la clave de partición en la consulta. Esto también mejora el rendimiento de las consultas. - -- Soporte de replicación de datos. - - La familia de `ReplicatedMergeTree` proporciona la replicación de datos. Para obtener más información, consulte [Replicación de datos](replication.md). - -- Soporte de muestreo de datos. - - Si es necesario, puede establecer el método de muestreo de datos en la tabla. - -!!! info "INFO" - El [Fusionar](../special/merge.md#merge) el motor no pertenece al `*MergeTree` familia. - -## Creación de una tabla {#table_engine-mergetree-creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], - ... - INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, - INDEX index_name2 expr2 TYPE type2(...) GRANULARITY value2 -) ENGINE = MergeTree() -[PARTITION BY expr] -[ORDER BY expr] -[PRIMARY KEY expr] -[SAMPLE BY expr] -[TTL expr [DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'], ...] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros, consulte [Descripción de la consulta CREATE](../../../sql-reference/statements/create.md). - -!!! 
note "Nota" - `INDEX` es una característica experimental, ver [Índices de saltos de datos](#table_engine-mergetree-data_skipping-indexes). - -### Cláusulas de consulta {#mergetree-query-clauses} - -- `ENGINE` — Name and parameters of the engine. `ENGINE = MergeTree()`. El `MergeTree` el motor no tiene parámetros. - -- `PARTITION BY` — The [clave de partición](custom-partitioning-key.md). - - Para particionar por mes, utilice el `toYYYYMM(date_column)` expresión, donde `date_column` es una columna con una fecha del tipo [Fecha](../../../sql-reference/data-types/date.md). Los nombres de partición aquí tienen el `"YYYYMM"` formato. - -- `ORDER BY` — The sorting key. - - Una tupla de columnas o expresiones arbitrarias. Ejemplo: `ORDER BY (CounterID, EventDate)`. - -- `PRIMARY KEY` — The primary key if it [difiere de la clave de clasificación](#choosing-a-primary-key-that-differs-from-the-sorting-key). - - De forma predeterminada, la clave principal es la misma que la clave de ordenación (que se especifica `ORDER BY` clausula). Por lo tanto, en la mayoría de los casos no es necesario especificar un `PRIMARY KEY` clausula. - -- `SAMPLE BY` — An expression for sampling. - - Si se utiliza una expresión de muestreo, la clave principal debe contenerla. Ejemplo: `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. - -- `TTL` — A list of rules specifying storage duration of rows and defining logic of automatic parts movement [entre discos y volúmenes](#table_engine-mergetree-multiple-volumes). - - La expresión debe tener una `Date` o `DateTime` columna como resultado. Ejemplo: - `TTL date + INTERVAL 1 DAY` - - Tipo de regla `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'` especifica una acción que debe realizarse con la pieza si la expresión está satisfecha (alcanza la hora actual): eliminación de filas caducadas, mover una pieza (si la expresión está satisfecha para todas las filas de una pieza) al disco especificado (`TO DISK 'xxx'`) o al volumen (`TO VOLUME 'xxx'`). El tipo predeterminado de la regla es la eliminación (`DELETE`). Se puede especificar una lista de varias reglas, pero no debe haber más de una `DELETE` regla. - - Para obtener más información, consulte [TTL para columnas y tablas](#table_engine-mergetree-ttl) - -- `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`: - - - `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Almacenamiento de datos](#mergetree-data-storage). - - `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Almacenamiento de datos](#mergetree-data-storage). - - `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` configuración. Antes de la versión 19.11, sólo existía el `index_granularity` ajuste para restringir el tamaño del gránulo. El `index_granularity_bytes` mejora el rendimiento de ClickHouse al seleccionar datos de tablas con filas grandes (decenas y cientos de megabytes). Si tiene tablas con filas grandes, puede habilitar esta configuración para que las tablas mejoren la eficiencia de `SELECT` consulta. - - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, entonces ZooKeeper almacena menos datos. 
Para obtener más información, consulte [descripción del ajuste](../../../operations/server-configuration-parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) en “Server configuration parameters”. - - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse lee y escribe los datos en el disco de almacenamiento utilizando la interfaz de E / S directa (`O_DIRECT` opcion). Si `min_merge_bytes_to_use_direct_io = 0`, entonces la E/S directa está deshabilitada. Valor predeterminado: `10 * 1024 * 1024 * 1024` byte. - - - `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day). - - `write_final_mark` — Enables or disables writing the final index mark at the end of data part (after the last byte). Default value: 1. Don't turn it off. - - `merge_max_block_size` — Maximum number of rows in block for merge operations. Default value: 8192. - - `storage_policy` — Storage policy. See [Uso de varios dispositivos de bloque para el almacenamiento de datos](#table_engine-mergetree-multiple-volumes). - -**Ejemplo de configuración de secciones** - -``` sql -ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 -``` - -En el ejemplo, configuramos la partición por mes. - -También establecemos una expresión para el muestreo como un hash por el ID de usuario. Esto le permite pseudoaleatorizar los datos en la tabla para cada `CounterID` y `EventDate`. Si define un [SAMPLE](../../../sql-reference/statements/select/sample.md#select-sample-clause) cláusula al seleccionar los datos, ClickHouse devolverá una muestra de datos pseudoaleatoria uniforme para un subconjunto de usuarios. - -El `index_granularity` se puede omitir porque 8192 es el valor predeterminado. - -
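Por ejemplo, sobre una tabla configurada de esa manera, una consulta orientativa con muestreo podría tener este aspecto (la tabla `hits` es hipotética):

``` sql
-- lee aproximadamente el 10 % de los datos y extrapola el recuento
SELECT CounterID, count() * 10 AS visitas_estimadas
FROM hits
SAMPLE 0.1
GROUP BY CounterID
ORDER BY visitas_estimadas DESC
LIMIT 10
```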
- -Método obsoleto para crear una tabla - -!!! attention "Atención" - No utilice este método en nuevos proyectos. Si es posible, cambie los proyectos antiguos al método descrito anteriormente. - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity) -``` - -**Parámetros MergeTree()** - -- `date-column` — The name of a column of the [Fecha](../../../sql-reference/data-types/date.md) tipo. ClickHouse crea automáticamente particiones por mes en función de esta columna. Los nombres de partición están en el `"YYYYMM"` formato. -- `sampling_expression` — An expression for sampling. -- `(primary, key)` — Primary key. Type: [Tupla()](../../../sql-reference/data-types/tuple.md) -- `index_granularity` — The granularity of an index. The number of data rows between the “marks” de un índice. El valor 8192 es apropiado para la mayoría de las tareas. - -**Ejemplo** - -``` sql -MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) -``` - -El `MergeTree` engine se configura de la misma manera que en el ejemplo anterior para el método de configuración del motor principal. -
- -## Almacenamiento de datos {#mergetree-data-storage} - -Una tabla consta de partes de datos ordenadas por clave principal. - -Cuando se insertan datos en una tabla, se crean partes de datos separadas y cada una de ellas se ordena lexicográficamente por clave principal. Por ejemplo, si la clave principal es `(CounterID, Date)`, los datos en la parte se ordenan por `CounterID`, y dentro de cada `CounterID` es ordenado por `Date`. - -Los datos que pertenecen a diferentes particiones se separan en diferentes partes. En el fondo, ClickHouse combina partes de datos para un almacenamiento más eficiente. Las piezas que pertenecen a particiones diferentes no se fusionan. El mecanismo de combinación no garantiza que todas las filas con la misma clave principal estén en la misma parte de datos. - -Cada parte de datos se divide lógicamente en gránulos. Un gránulo es el conjunto de datos indivisibles más pequeño que ClickHouse lee al seleccionar datos. ClickHouse no divide filas o valores, por lo que cada gránulo siempre contiene un número entero de filas. La primera fila de un gránulo está marcada con el valor de la clave principal de la fila. Para cada parte de datos, ClickHouse crea un archivo de índice que almacena las marcas. Para cada columna, ya sea en la clave principal o no, ClickHouse también almacena las mismas marcas. Estas marcas le permiten encontrar datos directamente en archivos de columnas. - -El tamaño del gránulo es restringido por `index_granularity` y `index_granularity_bytes` configuración del motor de tabla. El número de filas en un gránulo se encuentra en el `[1, index_granularity]` rango, dependiendo del tamaño de las filas. El tamaño de un gránulo puede exceder `index_granularity_bytes` si el tamaño de una sola fila es mayor que el valor de la configuración. En este caso, el tamaño del gránulo es igual al tamaño de la fila. - -## Claves e índices principales en consultas {#primary-keys-and-indexes-in-queries} - -Tome el `(CounterID, Date)` clave primaria como ejemplo. En este caso, la clasificación y el índice se pueden ilustrar de la siguiente manera: - - Whole data: [---------------------------------------------] - CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] - Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] - Marks: | | | | | | | | | | | - a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 - Marks numbers: 0 1 2 3 4 5 6 7 8 9 10 - -Si la consulta de datos especifica: - -- `CounterID in ('a', 'h')`, el servidor lee los datos en los rangos de marcas `[0, 3)` y `[6, 8)`. -- `CounterID IN ('a', 'h') AND Date = 3`, el servidor lee los datos en los rangos de marcas `[1, 3)` y `[7, 8)`. -- `Date = 3`, el servidor lee los datos en el rango de marcas `[1, 10]`. - -Los ejemplos anteriores muestran que siempre es más efectivo usar un índice que un análisis completo. - -Un índice disperso permite leer datos adicionales. Al leer un único rango de la clave primaria, hasta `index_granularity * 2` se pueden leer filas adicionales en cada bloque de datos. - -Los índices dispersos le permiten trabajar con una gran cantidad de filas de tabla, porque en la mayoría de los casos, dichos índices caben en la RAM de la computadora. - -ClickHouse no requiere una clave principal única. Puede insertar varias filas con la misma clave principal. - -### Selección de la clave principal {#selecting-the-primary-key} - -El número de columnas en la clave principal no está explícitamente limitado. 
Dependiendo de la estructura de datos, puede incluir más o menos columnas en la clave principal. Esto puede: - -- Mejorar el rendimiento de un índice. - - Si la clave principal es `(a, b)`, a continuación, añadir otra columna `c` mejorará el rendimiento si se cumplen las siguientes condiciones: - - - Hay consultas con una condición en la columna `c`. - - Rangos de datos largos (varias veces más `index_granularity`) con valores idénticos para `(a, b)` son comunes. En otras palabras, al agregar otra columna le permite omitir rangos de datos bastante largos. - -- Mejorar la compresión de datos. - - ClickHouse ordena los datos por clave principal, por lo que cuanto mayor sea la consistencia, mejor será la compresión. - -- Proporcione una lógica adicional al fusionar partes de datos en el [ColapsarMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) y [SummingMergeTree](summingmergetree.md) motor. - - En este caso tiene sentido especificar el *clave de clasificación* que es diferente de la clave principal. - -Una clave principal larga afectará negativamente al rendimiento de la inserción y al consumo de memoria, pero las columnas adicionales de la clave principal no afectarán al rendimiento de ClickHouse durante `SELECT` consulta. - -### Elegir una clave principal que difiere de la clave de ordenación {#choosing-a-primary-key-that-differs-from-the-sorting-key} - -Es posible especificar una clave principal (una expresión con valores que se escriben en el archivo de índice para cada marca) que es diferente de la clave de ordenación (una expresión para ordenar las filas en partes de datos). En este caso, la tupla de expresión de clave primaria debe ser un prefijo de la tupla de expresión de clave de ordenación. - -Esta característica es útil cuando se [SummingMergeTree](summingmergetree.md) y -[AgregaciónMergeTree](aggregatingmergetree.md) motores de mesa. En un caso común cuando se utilizan estos motores, la tabla tiene dos tipos de columnas: *cota* y *medida*. Las consultas típicas agregan valores de columnas de medida con `GROUP BY` y filtrado por dimensiones. Debido a que SummingMergeTree y AggregatingMergeTree agregan filas con el mismo valor de la clave de ordenación, es natural agregarle todas las dimensiones. Como resultado, la expresión de clave consta de una larga lista de columnas y esta lista debe actualizarse con frecuencia con las dimensiones recién agregadas. - -En este caso, tiene sentido dejar solo unas pocas columnas en la clave principal que proporcionarán análisis de rango eficientes y agregarán las columnas de dimensión restantes a la tupla de clave de clasificación. - -[ALTER](../../../sql-reference/statements/alter.md) de la clave de ordenación es una operación ligera porque cuando se agrega una nueva columna simultáneamente a la tabla y a la clave de ordenación, las partes de datos existentes no necesitan ser cambiadas. Dado que la clave de ordenación anterior es un prefijo de la nueva clave de ordenación y no hay datos en la columna recién agregada, los datos se ordenan tanto por las claves de ordenación antiguas como por las nuevas en el momento de la modificación de la tabla. - -### Uso de índices y particiones en consultas {#use-of-indexes-and-partitions-in-queries} - -Para `SELECT` consultas, ClickHouse analiza si se puede usar un índice. 
Se puede usar un índice si el `WHERE/PREWHERE` clause tiene una expresión (como uno de los elementos de conjunción, o enteramente) que representa una operación de comparación de igualdad o desigualdad, o si tiene `IN` o `LIKE` con un prefijo fijo en columnas o expresiones que están en la clave principal o clave de partición, o en ciertas funciones parcialmente repetitivas de estas columnas, o relaciones lógicas de estas expresiones. - -Por lo tanto, es posible ejecutar rápidamente consultas en uno o varios rangos de la clave principal. En este ejemplo, las consultas serán rápidas cuando se ejecuten para una etiqueta de seguimiento específica, para una etiqueta y un intervalo de fechas específicos, para una etiqueta y una fecha específicas, para varias etiquetas con un intervalo de fechas, etc. - -Veamos el motor configurado de la siguiente manera: - - ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 - -En este caso, en consultas: - -``` sql -SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34 -SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42) -SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01')) -``` - -ClickHouse utilizará el índice de clave principal para recortar datos incorrectos y la clave de partición mensual para recortar particiones que están en intervalos de fechas incorrectos. - -Las consultas anteriores muestran que el índice se usa incluso para expresiones complejas. La lectura de la tabla está organizada de modo que el uso del índice no puede ser más lento que un escaneo completo. - -En el siguiente ejemplo, el índice no se puede usar. - -``` sql -SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' -``` - -Para comprobar si ClickHouse puede usar el índice al ejecutar una consulta, use la configuración [Fecha de nacimiento](../../../operations/settings/settings.md#settings-force_index_by_date) y [force_primary_key](../../../operations/settings/settings.md). - -La clave para particionar por mes permite leer solo aquellos bloques de datos que contienen fechas del rango adecuado. En este caso, el bloque de datos puede contener datos para muchas fechas (hasta un mes). Dentro de un bloque, los datos se ordenan por clave principal, que puede no contener la fecha como la primera columna. Debido a esto, el uso de una consulta con solo una condición de fecha que no especifica el prefijo de clave principal hará que se lean más datos que para una sola fecha. - -### Uso del índice para claves primarias parcialmente monótonas {#use-of-index-for-partially-monotonic-primary-keys} - -Considere, por ejemplo, los días del mes. Ellos forman un [monótona secuencia](https://en.wikipedia.org/wiki/Monotonic_function) durante un mes, pero no monótono durante períodos más prolongados. Esta es una secuencia parcialmente monotónica. Si un usuario crea la tabla con clave primaria parcialmente monótona, ClickHouse crea un índice disperso como de costumbre. Cuando un usuario selecciona datos de este tipo de tabla, ClickHouse analiza las condiciones de consulta. 
Si el usuario desea obtener datos entre dos marcas del índice y ambas marcas caen dentro de un mes, ClickHouse puede usar el índice en este caso particular porque puede calcular la distancia entre los parámetros de una consulta y las marcas de índice. - -ClickHouse no puede usar un índice si los valores de la clave principal en el rango de parámetros de consulta no representan una secuencia monotónica. En este caso, ClickHouse utiliza el método de análisis completo. - -ClickHouse usa esta lógica no solo para secuencias de días del mes, sino para cualquier clave principal que represente una secuencia parcialmente monotónica. - -### Índices de saltos de datos (experimental) {#table_engine-mergetree-data_skipping-indexes} - -La declaración de índice se encuentra en la sección de columnas del `CREATE` consulta. - -``` sql -INDEX index_name expr TYPE type(...) GRANULARITY granularity_value -``` - -Para tablas de la `*MergeTree` familia, se pueden especificar índices de omisión de datos. - -Estos índices agregan cierta información sobre la expresión especificada en bloques, que consisten en `granularity_value` gránulos (el tamaño del gránulo se especifica utilizando el `index_granularity` ajuste en el motor de la tabla). Entonces estos agregados se usan en `SELECT` consultas para reducir la cantidad de datos a leer desde el disco omitiendo grandes bloques de datos donde el `where` consulta no puede ser satisfecha. - -**Ejemplo** - -``` sql -CREATE TABLE table_name -( - u64 UInt64, - i32 Int32, - s String, - ... - INDEX a (u64 * i32, s) TYPE minmax GRANULARITY 3, - INDEX b (u64 * length(s)) TYPE set(1000) GRANULARITY 4 -) ENGINE = MergeTree() -... -``` - -ClickHouse puede utilizar los índices del ejemplo para reducir la cantidad de datos que se leen desde el disco en las siguientes consultas: - -``` sql -SELECT count() FROM table WHERE s < 'z' -SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 -``` - -#### Tipos de índices disponibles {#available-types-of-indices} - -- `minmax` - - Almacena los extremos de la expresión especificada (si la expresión `tuple`, entonces almacena extremos para cada elemento de `tuple`), utiliza información almacenada para omitir bloques de datos como la clave principal. - -- `set(max_rows)` - - Almacena valores únicos de la expresión especificada (no más de `max_rows` filas, `max_rows=0` medio “no limits”). Utiliza los valores para comprobar si `WHERE` expresión no es satisfactorio en un bloque de datos. - -- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` - - Tiendas a [Filtro de floración](https://en.wikipedia.org/wiki/Bloom_filter) que contiene todos los ngrams de un bloque de datos. Funciona solo con cadenas. Puede ser utilizado para la optimización de `equals`, `like` y `in` expresiones. - - - `n` — ngram size, - - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well). - - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. - - `random_seed` — The seed for Bloom filter hash functions. - -- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` - - Lo mismo que `ngrambf_v1`, pero almacena tokens en lugar de ngrams. Los tokens son secuencias separadas por caracteres no alfanuméricos. - -- `bloom_filter([false_positive])` — Stores a [Filtro de floración](https://en.wikipedia.org/wiki/Bloom_filter) para las columnas especificadas. 
- - Opcional `false_positive` parámetro es la probabilidad de recibir una respuesta falsa positiva del filtro. Valores posibles: (0, 1). Valor predeterminado: 0.025. - - Tipos de datos admitidos: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`. - - Las siguientes funciones pueden usarlo: [igual](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [en](../../../sql-reference/functions/in-functions.md), [noEn](../../../sql-reference/functions/in-functions.md), [tener](../../../sql-reference/functions/array-functions.md). - - - -``` sql -INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 -INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 -INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4 -``` - -#### Funciones de apoyo {#functions-support} - -Condiciones en el `WHERE` cláusula contiene llamadas de las funciones que operan con columnas. Si la columna forma parte de un índice, ClickHouse intenta usar este índice al realizar las funciones. ClickHouse admite diferentes subconjuntos de funciones para usar índices. - -El `set` index se puede utilizar con todas las funciones. Subconjuntos de funciones para otros índices se muestran en la siguiente tabla. - -| Función (operador) / Índice | clave primaria | minmax | Descripción | Sistema abierto. | bloom_filter | -|----------------------------------------------------------------------------------------------------------|----------------|--------|-------------|------------------|---------------| -| [igual (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notEquals(!=, \<\>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [como](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ | -| [No como](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✗ | ✗ | -| [Comienza con](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | -| [Finaliza con](../../../sql-reference/functions/string-functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | -| [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | -| [en](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [noEn](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [menos (\<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [mayor (\>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [menosOrEquals (\<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [mayorOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [vaciar](../../../sql-reference/functions/array-functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | -| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | - -Las funciones con un argumento constante que es menor que el tamaño de ngram no pueden ser utilizadas por `ngrambf_v1` para la optimización de 
consultas. - -Los filtros Bloom pueden tener coincidencias falsas positivas, por lo que `ngrambf_v1`, `tokenbf_v1`, y `bloom_filter` los índices no se pueden usar para optimizar consultas donde se espera que el resultado de una función sea falso, por ejemplo: - -- Puede ser optimizado: - - `s LIKE '%test%'` - - `NOT s NOT LIKE '%test%'` - - `s = 1` - - `NOT s != 1` - - `startsWith(s, 'test')` -- No se puede optimizar: - - `NOT s LIKE '%test%'` - - `s NOT LIKE '%test%'` - - `NOT s = 1` - - `s != 1` - - `NOT startsWith(s, 'test')` - -## Acceso a datos simultáneos {#concurrent-data-access} - -Para el acceso simultáneo a tablas, usamos versiones múltiples. En otras palabras, cuando una tabla se lee y actualiza simultáneamente, los datos se leen de un conjunto de partes que está actualizado en el momento de la consulta. No hay cerraduras largas. Las inserciones no se interponen en el camino de las operaciones de lectura. - -La lectura de una tabla se paralela automáticamente. - -## TTL para columnas y tablas {#table_engine-mergetree-ttl} - -Determina la duración de los valores. - -El `TTL` se puede establecer para toda la tabla y para cada columna individual. TTL de nivel de tabla también puede especificar la lógica de movimiento automático de datos entre discos y volúmenes. - -Las expresiones deben evaluar [Fecha](../../../sql-reference/data-types/date.md) o [FechaHora](../../../sql-reference/data-types/datetime.md) tipo de datos. - -Ejemplo: - -``` sql -TTL time_column -TTL time_column + interval -``` - -Definir `interval`, utilizar [intervalo de tiempo](../../../sql-reference/operators/index.md#operators-datetime) operador. - -``` sql -TTL date_time + INTERVAL 1 MONTH -TTL date_time + INTERVAL 15 HOUR -``` - -### Columna TTL {#mergetree-column-ttl} - -Cuando los valores de la columna caducan, ClickHouse los reemplaza con los valores predeterminados para el tipo de datos de columna. Si todos los valores de columna en la parte de datos caducan, ClickHouse elimina esta columna de la parte de datos en un sistema de archivos. - -El `TTL` cláusula no se puede utilizar para columnas clave. - -Ejemplos: - -Creación de una tabla con TTL - -``` sql -CREATE TABLE example_table -( - d DateTime, - a Int TTL d + INTERVAL 1 MONTH, - b Int TTL d + INTERVAL 1 MONTH, - c String -) -ENGINE = MergeTree -PARTITION BY toYYYYMM(d) -ORDER BY d; -``` - -Adición de TTL a una columna de una tabla existente - -``` sql -ALTER TABLE example_table - MODIFY COLUMN - c String TTL d + INTERVAL 1 DAY; -``` - -Modificación de TTL de la columna - -``` sql -ALTER TABLE example_table - MODIFY COLUMN - c String TTL d + INTERVAL 1 MONTH; -``` - -### Tabla TTL {#mergetree-table-ttl} - -La tabla puede tener una expresión para la eliminación de filas caducadas y varias expresiones para el movimiento automático de partes entre [discos o volúmenes](#table_engine-mergetree-multiple-volumes). Cuando las filas de la tabla caducan, ClickHouse elimina todas las filas correspondientes. Para la entidad de movimiento de piezas, todas las filas de una pieza deben cumplir los criterios de expresión de movimiento. - -``` sql -TTL expr [DELETE|TO DISK 'aaa'|TO VOLUME 'bbb'], ... -``` - -El tipo de regla TTL puede seguir cada expresión TTL. Afecta a una acción que debe realizarse una vez que se satisface la expresión (alcanza la hora actual): - -- `DELETE` - Eliminar filas caducadas (acción predeterminada); -- `TO DISK 'aaa'` - mover parte al disco `aaa`; -- `TO VOLUME 'bbb'` - mover parte al disco `bbb`. 
- -Ejemplos: - -Creación de una tabla con TTL - -``` sql -CREATE TABLE example_table -( - d DateTime, - a Int -) -ENGINE = MergeTree -PARTITION BY toYYYYMM(d) -ORDER BY d -TTL d + INTERVAL 1 MONTH [DELETE], - d + INTERVAL 1 WEEK TO VOLUME 'aaa', - d + INTERVAL 2 WEEK TO DISK 'bbb'; -``` - -Modificación de TTL de la tabla - -``` sql -ALTER TABLE example_table - MODIFY TTL d + INTERVAL 1 DAY; -``` - -**Eliminación de datos** - -Los datos con un TTL caducado se eliminan cuando ClickHouse fusiona partes de datos. - -Cuando ClickHouse ve que los datos han caducado, realiza una combinación fuera de programación. Para controlar la frecuencia de tales fusiones, puede establecer `merge_with_ttl_timeout`. Si el valor es demasiado bajo, realizará muchas fusiones fuera de horario que pueden consumir muchos recursos. - -Si realiza el `SELECT` consulta entre fusiones, puede obtener datos caducados. Para evitarlo, use el [OPTIMIZE](../../../sql-reference/statements/misc.md#misc_operations-optimize) consulta antes `SELECT`. - -## Uso de varios dispositivos de bloque para el almacenamiento de datos {#table_engine-mergetree-multiple-volumes} - -### Implantación {#introduction} - -`MergeTree` Los motores de tablas familiares pueden almacenar datos en múltiples dispositivos de bloque. Por ejemplo, puede ser útil cuando los datos de una determinada tabla se dividen implícitamente en “hot” y “cold”. Los datos más recientes se solicitan regularmente, pero solo requieren una pequeña cantidad de espacio. Por el contrario, los datos históricos de cola gorda se solicitan raramente. Si hay varios discos disponibles, el “hot” los datos pueden estar ubicados en discos rápidos (por ejemplo, SSD NVMe o en memoria), mientras que “cold” datos - en los relativamente lentos (por ejemplo, HDD). - -La parte de datos es la unidad móvil mínima para `MergeTree`-mesas de motor. Los datos que pertenecen a una parte se almacenan en un disco. Las partes de datos se pueden mover entre discos en segundo plano (según la configuración del usuario) así como por medio de la [ALTER](../../../sql-reference/statements/alter.md#alter_move-partition) consulta. - -### Plazo {#terms} - -- Disk — Block device mounted to the filesystem. -- Default disk — Disk that stores the path specified in the [camino](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) configuración del servidor. -- Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)). -- Storage policy — Set of volumes and the rules for moving data between them. - -Los nombres dados a las entidades descritas se pueden encontrar en las tablas del sistema, [sistema.almacenamiento_policies](../../../operations/system-tables.md#system_tables-storage_policies) y [sistema.disco](../../../operations/system-tables.md#system_tables-disks). Para aplicar una de las directivas de almacenamiento configuradas para una tabla, `storage_policy` establecimiento de `MergeTree`-mesas de la familia del motor. - -### Configuración {#table_engine-mergetree-multiple-volumes_configure} - -Los discos, los volúmenes y las políticas de almacenamiento deben declararse `` etiqueta ya sea en el archivo principal `config.xml` o en un archivo distinto en el `config.d` directorio. - -Estructura de configuración: - -``` xml - - - - /mnt/fast_ssd/clickhouse/ - - - /mnt/hdd1/clickhouse/ - 10485760 - - - /mnt/hdd2/clickhouse/ - 10485760 - - - ... - - - ... - -``` - -Tags: - -- `` — Disk name. 
Names must be different for all disks. -- `path` — path under which a server will store data (`data` y `shadow` carpetas), debe terminarse con ‘/’. -- `keep_free_space_bytes` — the amount of free disk space to be reserved. - -El orden de la definición del disco no es importante. - -Marcado de configuración de directivas de almacenamiento: - -``` xml - - ... - - - - - disk_name_from_disks_configuration - 1073741824 - - - - - - - 0.2 - - - - - - - - ... - -``` - -Tags: - -- `policy_name_N` — Policy name. Policy names must be unique. -- `volume_name_N` — Volume name. Volume names must be unique. -- `disk` — a disk within a volume. -- `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume's disks. -- `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1). - -Cofiguration ejemplos: - -``` xml - - ... - - - - - disk1 - disk2 - - - - - - - - fast_ssd - 1073741824 - - - disk1 - - - 0.2 - - - ... - -``` - -En un ejemplo dado, el `hdd_in_order` la política implementa el [Ronda-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling) enfoque. Por lo tanto, esta política define solo un volumen (`single`), las partes de datos se almacenan en todos sus discos en orden circular. Dicha política puede ser bastante útil si hay varios discos similares montados en el sistema, pero RAID no está configurado. Tenga en cuenta que cada unidad de disco individual no es confiable y es posible que desee compensarlo con un factor de replicación de 3 o más. - -Si hay diferentes tipos de discos disponibles en el sistema, `moving_from_ssd_to_hdd` política se puede utilizar en su lugar. Volumen `hot` consta de un disco SSD (`fast_ssd`), y el tamaño máximo de una pieza que se puede almacenar en este volumen es de 1 GB. Todas las piezas con el tamaño más grande que 1GB serán almacenadas directamente en `cold` volumen, que contiene un disco duro `disk1`. -Además, una vez que el disco `fast_ssd` se llena en más del 80%, los datos se transferirán al `disk1` por un proceso en segundo plano. - -El orden de enumeración de volúmenes dentro de una directiva de almacenamiento es importante. Una vez que un volumen está sobrellenado, los datos se mueven al siguiente. El orden de la enumeración del disco también es importante porque los datos se almacenan en ellos por turnos. - -Al crear una tabla, se puede aplicarle una de las directivas de almacenamiento configuradas: - -``` sql -CREATE TABLE table_with_non_default_policy ( - EventDate Date, - OrderID UInt64, - BannerID UInt64, - SearchPhrase String -) ENGINE = MergeTree -ORDER BY (OrderID, BannerID) -PARTITION BY toYYYYMM(EventDate) -SETTINGS storage_policy = 'moving_from_ssd_to_hdd' -``` - -El `default` política de almacenamiento implica el uso de un solo volumen, que consiste en un solo disco dado en ``. Una vez que se crea una tabla, no se puede cambiar su política de almacenamiento. - -### Detalles {#details} - -En el caso de `MergeTree` tablas, los datos están llegando al disco de diferentes maneras: - -- Como resultado de un inserto (`INSERT` consulta). -- Durante las fusiones de fondo y [mutación](../../../sql-reference/statements/alter.md#alter-mutations). -- Al descargar desde otra réplica. -- Como resultado de la congelación de particiones [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition). 
In all these cases except mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy:

1.  The first volume (in the order of definition) that has enough disk space to store a part (`unreserved_space > current_part_size`) and allows storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen.
2.  Within this volume, the disk that follows the one used to store the previous chunk of data and that has more free space than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`) is chosen.

Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, so in such cases the resulting parts are stored on the same disks as the initial ones.

In the background, parts are moved between volumes based on the amount of free space (the `move_factor` parameter), following the order in which the volumes are declared in the configuration file. Data is never transferred from the last volume or to the first one. You can use the system tables [system.part_log](../../../operations/system-tables.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system-tables.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Detailed information can also be found in the server logs.

The user can force moving a part or a partition from one volume to another with the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter.md#alter_move-partition); all restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to complete. The user gets an error message if there is not enough free space available or if any of the required conditions are not met.

Moving data does not interfere with data replication, so different storage policies can be specified for the same table on different replicas.

After background merges and mutations complete, old parts are removed only after a certain amount of time (`old_parts_lifetime`). During this time they are not moved to other volumes or disks, so until the parts are finally removed they are still taken into account when evaluating the occupied disk space.
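As an illustration of the monitoring and manual moves described above, a minimal sketch (it assumes the `moving_from_ssd_to_hdd` policy and the `table_with_non_default_policy` table from the earlier example; `201403` is just a sample partition, and the exact column set of `system.parts` may differ between ClickHouse versions):

``` sql
-- Inspect which disk each active part of the table currently resides on
SELECT partition, name, disk_name, path
FROM system.parts
WHERE table = 'table_with_non_default_policy' AND active;

-- Force one partition onto the slow volume instead of waiting for the background move
ALTER TABLE table_with_non_default_policy MOVE PARTITION 201403 TO VOLUME 'cold';
```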
- -[Artículo Original](https://clickhouse.tech/docs/ru/operations/table_engines/mergetree/) diff --git a/docs/es/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/es/engines/table-engines/mergetree-family/replacingmergetree.md deleted file mode 100644 index a1e95c5b5f4..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/replacingmergetree.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 33 -toc_title: ReplacingMergeTree ---- - -# ReplacingMergeTree {#replacingmergetree} - -El motor difiere de [Método de codificación de datos:](mergetree.md#table_engines-mergetree) en que elimina las entradas duplicadas con el mismo valor de clave principal (o más exactamente, con el mismo [clave de clasificación](mergetree.md) valor). - -La desduplicación de datos solo se produce durante una fusión. La fusión ocurre en segundo plano en un momento desconocido, por lo que no puede planificarla. Algunos de los datos pueden permanecer sin procesar. Aunque puede ejecutar una fusión no programada utilizando el `OPTIMIZE` consulta, no cuente con usarlo, porque el `OPTIMIZE` consulta leerá y escribirá una gran cantidad de datos. - -Así, `ReplacingMergeTree` es adecuado para borrar datos duplicados en segundo plano para ahorrar espacio, pero no garantiza la ausencia de duplicados. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = ReplacingMergeTree([ver]) -[PARTITION BY expr] -[ORDER BY expr] -[PRIMARY KEY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros de solicitud, consulte [descripción de la solicitud](../../../sql-reference/statements/create.md). - -**ReplacingMergeTree Parámetros** - -- `ver` — column with version. Type `UInt*`, `Date` o `DateTime`. Parámetro opcional. - - Al fusionar, `ReplacingMergeTree` de todas las filas con la misma clave primaria deja solo una: - - - Último en la selección, si `ver` no establecido. - - Con la versión máxima, si `ver` indicado. - -**Cláusulas de consulta** - -Al crear un `ReplacingMergeTree` mesa de la misma [clausula](mergetree.md) se requieren, como al crear un `MergeTree` tabla. - -
Deprecated method for creating a table

!!! attention "Attention"
    Do not use this method in new projects and, if possible, switch old projects to the method described above.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE [=] ReplacingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [ver])
```

All of the parameters except `ver` have the same meaning as in `MergeTree`.

- `ver` — column with the version. Optional parameter. For a description, see the text above.
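Since deduplication only happens at merge time, a short usage sketch may help (the table and column names here are illustrative, not from the original article):

``` sql
CREATE TABLE page_views
(
    PageID  UInt64,
    Views   UInt32,
    Updated DateTime
) ENGINE = ReplacingMergeTree(Updated)
ORDER BY PageID;

INSERT INTO page_views VALUES (1, 10, '2020-01-01 00:00:00');
INSERT INTO page_views VALUES (1, 15, '2020-01-02 00:00:00');

-- Until a merge happens, both rows may still be present.
-- OPTIMIZE ... FINAL forces a merge (expensive on large tables);
-- SELECT ... FINAL collapses duplicates at read time instead.
OPTIMIZE TABLE page_views FINAL;
SELECT * FROM page_views;          -- one row per PageID, the one with the highest Updated
SELECT * FROM page_views FINAL;    -- read-time alternative without forcing a merge
```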
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/replacingmergetree/) diff --git a/docs/es/engines/table-engines/mergetree-family/replication.md b/docs/es/engines/table-engines/mergetree-family/replication.md deleted file mode 100644 index 505f5223800..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/replication.md +++ /dev/null @@ -1,218 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 31 -toc_title: "Replicaci\xF3n de datos" ---- - -# Replicación de datos {#table_engines-replication} - -La replicación solo se admite para tablas de la familia MergeTree: - -- ReplicatedMergeTree -- ReplicatedSummingMergeTree -- ReplicatedReplacingMergeTree -- ReplicatedAggregatingMergeTree -- ReplicatedCollapsingMergeTree -- ReplicatedVersionedCollapsingMergetree -- ReplicatedGraphiteMergeTree - -La replicación funciona a nivel de una tabla individual, no de todo el servidor. Un servidor puede almacenar tablas replicadas y no replicadas al mismo tiempo. - -La replicación no depende de la fragmentación. Cada fragmento tiene su propia replicación independiente. - -Datos comprimidos para `INSERT` y `ALTER` se replica (para obtener más información, consulte la documentación para [ALTER](../../../sql-reference/statements/alter.md#query_language_queries_alter)). - -`CREATE`, `DROP`, `ATTACH`, `DETACH` y `RENAME` las consultas se ejecutan en un único servidor y no se replican: - -- El `CREATE TABLE` query crea una nueva tabla replicable en el servidor donde se ejecuta la consulta. Si esta tabla ya existe en otros servidores, agrega una nueva réplica. -- El `DROP TABLE` query elimina la réplica ubicada en el servidor donde se ejecuta la consulta. -- El `RENAME` query cambia el nombre de la tabla en una de las réplicas. En otras palabras, las tablas replicadas pueden tener diferentes nombres en diferentes réplicas. - -Uso de ClickHouse [Apache ZooKeeper](https://zookeeper.apache.org) para almacenar metainformación de réplicas. Utilice ZooKeeper versión 3.4.5 o posterior. - -Para utilizar la replicación, establezca los parámetros [Zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) sección de configuración del servidor. - -!!! attention "Atención" - No descuides la configuración de seguridad. ClickHouse soporta el `digest` [Esquema de ACL](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) del subsistema de seguridad ZooKeeper. - -Ejemplo de configuración de las direcciones del clúster ZooKeeper: - -``` xml - - - example1 - 2181 - - - example2 - 2181 - - - example3 - 2181 - - -``` - -Puede especificar cualquier clúster ZooKeeper existente y el sistema utilizará un directorio en él para sus propios datos (el directorio se especifica al crear una tabla replicable). - -Si ZooKeeper no está establecido en el archivo de configuración, no puede crear tablas replicadas y las tablas replicadas existentes serán de solo lectura. - -ZooKeeper no se utiliza en `SELECT` consultas porque la replicación no afecta al rendimiento de `SELECT` y las consultas se ejecutan tan rápido como lo hacen para las tablas no replicadas. 
Al consultar tablas replicadas distribuidas, el comportamiento de ClickHouse se controla mediante la configuración [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) y [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). - -Para cada `INSERT` consulta, aproximadamente diez entradas se agregan a ZooKeeper a través de varias transacciones. (Para ser más precisos, esto es para cada bloque de datos insertado; una consulta INSERT contiene un bloque o un bloque por `max_insert_block_size = 1048576` filas.) Esto conduce a latencias ligeramente más largas para `INSERT` en comparación con las tablas no replicadas. Pero si sigue las recomendaciones para insertar datos en lotes de no más de uno `INSERT` por segundo, no crea ningún problema. Todo el clúster ClickHouse utilizado para coordinar un clúster ZooKeeper tiene un total de varios cientos `INSERTs` por segundo. El rendimiento en las inserciones de datos (el número de filas por segundo) es tan alto como para los datos no replicados. - -Para clústeres muy grandes, puede usar diferentes clústeres de ZooKeeper para diferentes fragmentos. Sin embargo, esto no ha demostrado ser necesario en el Yandex.Clúster Metrica (aproximadamente 300 servidores). - -La replicación es asíncrona y multi-master. `INSERT` consultas (así como `ALTER`) se puede enviar a cualquier servidor disponible. Los datos se insertan en el servidor donde se ejecuta la consulta y, a continuación, se copian a los demás servidores. Debido a que es asincrónico, los datos insertados recientemente aparecen en las otras réplicas con cierta latencia. Si parte de las réplicas no está disponible, los datos se escriben cuando estén disponibles. Si hay una réplica disponible, la latencia es la cantidad de tiempo que tarda en transferir el bloque de datos comprimidos a través de la red. - -De forma predeterminada, una consulta INSERT espera la confirmación de la escritura de los datos de una sola réplica. Si los datos fue correctamente escrito a sólo una réplica y el servidor con esta réplica deja de existir, los datos almacenados se perderán. Para habilitar la confirmación de las escrituras de datos de varias réplicas, utilice `insert_quorum` opcion. - -Cada bloque de datos se escribe atómicamente. La consulta INSERT se divide en bloques hasta `max_insert_block_size = 1048576` filas. En otras palabras, si el `INSERT` consulta tiene menos de 1048576 filas, se hace atómicamente. - -Los bloques de datos se deduplican. Para varias escrituras del mismo bloque de datos (bloques de datos del mismo tamaño que contienen las mismas filas en el mismo orden), el bloque solo se escribe una vez. La razón de esto es en caso de fallas de red cuando la aplicación cliente no sabe si los datos se escribieron en la base de datos, por lo que `INSERT` consulta simplemente se puede repetir. No importa a qué réplica se enviaron los INSERT con datos idénticos. `INSERTs` son idempotentes. Los parámetros de desduplicación son controlados por [merge_tree](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-merge_tree) configuración del servidor. - -Durante la replicación, sólo los datos de origen que se van a insertar se transfieren a través de la red. La transformación de datos adicional (fusión) se coordina y se realiza en todas las réplicas de la misma manera. 
Esto minimiza el uso de la red, lo que significa que la replicación funciona bien cuando las réplicas residen en centros de datos diferentes. (Tenga en cuenta que la duplicación de datos en diferentes centros de datos es el objetivo principal de la replicación.) - -Puede tener cualquier número de réplicas de los mismos datos. El Yandex.Metrica utiliza doble replicación en producción. Cada servidor utiliza RAID-5 o RAID-6, y RAID-10 en algunos casos. Esta es una solución relativamente confiable y conveniente. - -El sistema supervisa la sincronicidad de los datos en las réplicas y puede recuperarse después de un fallo. La conmutación por error es automática (para pequeñas diferencias en los datos) o semiautomática (cuando los datos difieren demasiado, lo que puede indicar un error de configuración). - -## Creación de tablas replicadas {#creating-replicated-tables} - -El `Replicated` prefijo se agrega al nombre del motor de tabla. Por ejemplo:`ReplicatedMergeTree`. - -**Replicated\*MergeTree parámetros** - -- `zoo_path` — The path to the table in ZooKeeper. -- `replica_name` — The replica name in ZooKeeper. - -Ejemplo: - -``` sql -CREATE TABLE table_name -( - EventDate DateTime, - CounterID UInt32, - UserID UInt32 -) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}') -PARTITION BY toYYYYMM(EventDate) -ORDER BY (CounterID, EventDate, intHash32(UserID)) -SAMPLE BY intHash32(UserID) -``` - -
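After running the `CREATE TABLE` query on a replica, one way to confirm that the table registered its replica in ZooKeeper is to look at `system.replicas` (a sketch; the column list of that table varies slightly between versions):

``` sql
SELECT database, table, replica_name, zookeeper_path, total_replicas, active_replicas
FROM system.replicas
WHERE table = 'table_name';
```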
Example in deprecated syntax

``` sql
CREATE TABLE table_name
(
    EventDate DateTime,
    CounterID UInt32,
    UserID UInt32
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192)
```
- -Como muestra el ejemplo, estos parámetros pueden contener sustituciones entre llaves. Los valores sustituidos se toman de la ‘macros’ sección del archivo de configuración. Ejemplo: - -``` xml - - 05 - 02 - example05-02-1.yandex.ru - -``` - -La ruta de acceso a la tabla en ZooKeeper debe ser única para cada tabla replicada. Las tablas en diferentes fragmentos deben tener rutas diferentes. -En este caso, la ruta consta de las siguientes partes: - -`/clickhouse/tables/` es el prefijo común. Recomendamos usar exactamente este. - -`{layer}-{shard}` es el identificador de fragmento. En este ejemplo consta de dos partes, ya que el Yandex.Metrica clúster utiliza sharding de dos niveles. Para la mayoría de las tareas, puede dejar solo la sustitución {shard}, que se expandirá al identificador de fragmento. - -`table_name` es el nombre del nodo de la tabla en ZooKeeper. Es una buena idea hacerlo igual que el nombre de la tabla. Se define explícitamente, porque a diferencia del nombre de la tabla, no cambia después de una consulta RENAME. -*HINT*: podría agregar un nombre de base de datos delante de `table_name` También. Nivel de Cifrado WEP `db_name.table_name` - -El nombre de réplica identifica diferentes réplicas de la misma tabla. Puede usar el nombre del servidor para esto, como en el ejemplo. El nombre solo tiene que ser único dentro de cada fragmento. - -Puede definir los parámetros explícitamente en lugar de utilizar sustituciones. Esto podría ser conveniente para probar y para configurar clústeres pequeños. Sin embargo, no puede usar consultas DDL distribuidas (`ON CLUSTER` en este caso. - -Cuando se trabaja con clústeres grandes, se recomienda utilizar sustituciones porque reducen la probabilidad de error. - -Ejecute el `CREATE TABLE` consulta en cada réplica. Esta consulta crea una nueva tabla replicada o agrega una nueva réplica a una existente. - -Si agrega una nueva réplica después de que la tabla ya contenga algunos datos en otras réplicas, los datos se copiarán de las otras réplicas a la nueva después de ejecutar la consulta. En otras palabras, la nueva réplica se sincroniza con las demás. - -Para eliminar una réplica, ejecute `DROP TABLE`. However, only one replica is deleted – the one that resides on the server where you run the query. - -## Recuperación después de fallos {#recovery-after-failures} - -Si ZooKeeper no está disponible cuando se inicia un servidor, las tablas replicadas cambian al modo de solo lectura. El sistema intenta conectarse periódicamente a ZooKeeper. - -Si ZooKeeper no está disponible durante un `INSERT`, o se produce un error al interactuar con ZooKeeper, se produce una excepción. - -Después de conectarse a ZooKeeper, el sistema comprueba si el conjunto de datos en el sistema de archivos local coincide con el conjunto de datos esperado (ZooKeeper almacena esta información). Si hay incoherencias menores, el sistema las resuelve sincronizando datos con las réplicas. - -Si el sistema detecta partes de datos rotas (con un tamaño incorrecto de archivos) o partes no reconocidas (partes escritas en el sistema de archivos pero no grabadas en ZooKeeper), las mueve al `detached` subdirectorio (no se eliminan). Las piezas que faltan se copian de las réplicas. - -Tenga en cuenta que ClickHouse no realiza ninguna acción destructiva, como eliminar automáticamente una gran cantidad de datos. - -Cuando el servidor se inicia (o establece una nueva sesión con ZooKeeper), solo verifica la cantidad y el tamaño de todos los archivos. 
Si los tamaños de los archivos coinciden pero los bytes se han cambiado en algún punto intermedio, esto no se detecta inmediatamente, sino solo cuando se intenta leer los datos `SELECT` consulta. La consulta produce una excepción sobre una suma de comprobación no coincidente o el tamaño de un bloque comprimido. En este caso, las partes de datos se agregan a la cola de verificación y se copian de las réplicas si es necesario. - -Si el conjunto local de datos difiere demasiado del esperado, se activa un mecanismo de seguridad. El servidor ingresa esto en el registro y se niega a iniciarse. La razón de esto es que este caso puede indicar un error de configuración, como si una réplica en un fragmento se configurara accidentalmente como una réplica en un fragmento diferente. Sin embargo, los umbrales para este mecanismo se establecen bastante bajos, y esta situación puede ocurrir durante la recuperación de falla normal. En este caso, los datos se restauran semiautomáticamente, mediante “pushing a button”. - -Para iniciar la recuperación, cree el nodo `/path_to_table/replica_name/flags/force_restore_data` en ZooKeeper con cualquier contenido, o ejecute el comando para restaurar todas las tablas replicadas: - -``` bash -sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data -``` - -A continuación, reinicie el servidor. Al iniciar, el servidor elimina estos indicadores e inicia la recuperación. - -## Recuperación después de la pérdida completa de datos {#recovery-after-complete-data-loss} - -Si todos los datos y metadatos desaparecieron de uno de los servidores, siga estos pasos para la recuperación: - -1. Instale ClickHouse en el servidor. Defina correctamente las sustituciones en el archivo de configuración que contiene el identificador de fragmento y las réplicas, si las usa. -2. Si tenía tablas no duplicadas que deben duplicarse manualmente en los servidores, copie sus datos desde una réplica (en el directorio `/var/lib/clickhouse/data/db_name/table_name/`). -3. Copiar definiciones de tablas ubicadas en `/var/lib/clickhouse/metadata/` de una réplica. Si un identificador de fragmento o réplica se define explícitamente en las definiciones de tabla, corríjalo para que corresponda a esta réplica. (Como alternativa, inicie el servidor y `ATTACH TABLE` consultas que deberían haber estado en el .sql archivos en `/var/lib/clickhouse/metadata/`.) -4. Para iniciar la recuperación, cree el nodo ZooKeeper `/path_to_table/replica_name/flags/force_restore_data` con cualquier contenido o ejecute el comando para restaurar todas las tablas replicadas: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` - -Luego inicie el servidor (reinicie, si ya se está ejecutando). Los datos se descargarán de las réplicas. - -Una opción de recuperación alternativa es eliminar información sobre la réplica perdida de ZooKeeper (`/path_to_table/replica_name`), luego vuelva a crear la réplica como se describe en “[Creación de tablas replicadas](#creating-replicated-tables)”. - -No hay restricción en el ancho de banda de la red durante la recuperación. Tenga esto en cuenta si está restaurando muchas réplicas a la vez. - -## La conversión de MergeTree a ReplicatedMergeTree {#converting-from-mergetree-to-replicatedmergetree} - -Usamos el término `MergeTree` para referirse a todos los motores de mesa en el `MergeTree family`, lo mismo que para `ReplicatedMergeTree`. - -Si usted tenía un `MergeTree` tabla replicada manualmente, puede convertirla en una tabla replicada. 
Es posible que tenga que hacer esto si ya ha recopilado una gran cantidad de datos `MergeTree` y ahora desea habilitar la replicación. - -Si los datos difieren en varias réplicas, primero sincronícelos o elimínelos en todas las réplicas, excepto en una. - -Cambie el nombre de la tabla MergeTree existente y, a continuación, cree un `ReplicatedMergeTree` mesa con el antiguo nombre. -Mueva los datos de la tabla antigua a la `detached` subdirectorio dentro del directorio con los nuevos datos de la tabla (`/var/lib/clickhouse/data/db_name/table_name/`). -Luego ejecuta `ALTER TABLE ATTACH PARTITION` en una de las réplicas para agregar estas partes de datos al conjunto de trabajo. - -## La conversión de ReplicatedMergeTree a MergeTree {#converting-from-replicatedmergetree-to-mergetree} - -Cree una tabla MergeTree con un nombre diferente. Mueva todos los datos del directorio con el `ReplicatedMergeTree` datos de la tabla al directorio de datos de la nueva tabla. A continuación, elimine el `ReplicatedMergeTree` y reinicie el servidor. - -Si desea deshacerse de un `ReplicatedMergeTree` sin iniciar el servidor: - -- Eliminar el correspondiente `.sql` archivo en el directorio de metadatos (`/var/lib/clickhouse/metadata/`). -- Eliminar la ruta correspondiente en ZooKeeper (`/path_to_table/replica_name`). - -Después de esto, puede iniciar el servidor, crear un `MergeTree` tabla, mueva los datos a su directorio y, a continuación, reinicie el servidor. - -## Recuperación cuando se pierden o se dañan los metadatos del clúster Zookeeper {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} - -Si los datos de ZooKeeper se perdieron o se dañaron, puede guardar los datos moviéndolos a una tabla no duplicada como se describió anteriormente. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/replication/) diff --git a/docs/es/engines/table-engines/mergetree-family/summingmergetree.md b/docs/es/engines/table-engines/mergetree-family/summingmergetree.md deleted file mode 100644 index 3ae9a1515c0..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/summingmergetree.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 34 -toc_title: SummingMergeTree ---- - -# SummingMergeTree {#summingmergetree} - -El motor hereda de [Método de codificación de datos:](mergetree.md#table_engines-mergetree). La diferencia es que al fusionar partes de datos para `SummingMergeTree` ClickHouse reemplaza todas las filas con la misma clave primaria (o más exactamente, con la misma [clave de clasificación](mergetree.md)) con una fila que contiene valores resumidos para las columnas con el tipo de datos numérico. Si la clave de ordenación está compuesta de manera que un solo valor de clave corresponde a un gran número de filas, esto reduce significativamente el volumen de almacenamiento y acelera la selección de datos. - -Recomendamos usar el motor junto con `MergeTree`. Almacenar datos completos en `MergeTree` mesa, y el uso `SummingMergeTree` para el almacenamiento de datos agregados, por ejemplo, al preparar informes. Tal enfoque evitará que pierda datos valiosos debido a una clave primaria compuesta incorrectamente. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... 
-) ENGINE = SummingMergeTree([columns]) -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros de solicitud, consulte [descripción de la solicitud](../../../sql-reference/statements/create.md). - -**Parámetros de SummingMergeTree** - -- `columns` - una tupla con los nombres de las columnas donde se resumirán los valores. Parámetro opcional. - Las columnas deben ser de tipo numérico y no deben estar en la clave principal. - - Si `columns` no especificado, ClickHouse resume los valores de todas las columnas con un tipo de datos numérico que no están en la clave principal. - -**Cláusulas de consulta** - -Al crear un `SummingMergeTree` mesa de la misma [clausula](mergetree.md) se requieren, como al crear un `MergeTree` tabla. - -
Deprecated method for creating a table

!!! attention "Attention"
    Do not use this method in new projects and, if possible, switch old projects to the method described above.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns])
```

All of the parameters except `columns` have the same meaning as in `MergeTree`.

- `columns` — tuple with the names of columns whose values will be summarized. Optional parameter. For a description, see the text above.
- -## Ejemplo de uso {#usage-example} - -Considere la siguiente tabla: - -``` sql -CREATE TABLE summtt -( - key UInt32, - value UInt32 -) -ENGINE = SummingMergeTree() -ORDER BY key -``` - -Insertar datos: - -``` sql -INSERT INTO summtt Values(1,1),(1,2),(2,1) -``` - -ClickHouse puede sumar todas las filas no completamente ([ver abajo](#data-processing)), entonces usamos una función agregada `sum` y `GROUP BY` cláusula en la consulta. - -``` sql -SELECT key, sum(value) FROM summtt GROUP BY key -``` - -``` text -┌─key─┬─sum(value)─┐ -│ 2 │ 1 │ -│ 1 │ 3 │ -└─────┴────────────┘ -``` - -## Procesamiento de datos {#data-processing} - -Cuando los datos se insertan en una tabla, se guardan tal cual. ClickHouse combina las partes insertadas de los datos periódicamente y esto es cuando las filas con la misma clave principal se suman y se reemplazan con una para cada parte resultante de los datos. - -ClickHouse can merge the data parts so that different resulting parts of data cat consist rows with the same primary key, i.e. the summation will be incomplete. Therefore (`SELECT`) una función agregada [resumir()](../../../sql-reference/aggregate-functions/reference.md#agg_function-sum) y `GROUP BY` cláusula se debe utilizar en una consulta como se describe en el ejemplo anterior. - -### Reglas comunes para la suma {#common-rules-for-summation} - -Se resumen los valores de las columnas con el tipo de datos numérico. El conjunto de columnas está definido por el parámetro `columns`. - -Si los valores eran 0 en todas las columnas para la suma, se elimina la fila. - -Si la columna no está en la clave principal y no se resume, se selecciona un valor arbitrario entre los existentes. - -Los valores no se resumen para las columnas de la clave principal. - -### La suma en las columnas de función agregada {#the-summation-in-the-aggregatefunction-columns} - -Para columnas de [Tipo AggregateFunction](../../../sql-reference/data-types/aggregatefunction.md) ClickHouse se comporta como [AgregaciónMergeTree](aggregatingmergetree.md) agregación del motor según la función. - -### Estructuras anidadas {#nested-structures} - -La tabla puede tener estructuras de datos anidadas que se procesan de una manera especial. - -Si el nombre de una tabla anidada termina con `Map` y contiene al menos dos columnas que cumplen los siguientes criterios: - -- la primera columna es numérica `(*Int*, Date, DateTime)` o una cadena `(String, FixedString)`, vamos a llamarlo `key`, -- las otras columnas son aritméticas `(*Int*, Float32/64)`, vamos a llamarlo `(values...)`, - -entonces esta tabla anidada se interpreta como una asignación de `key => (values...)`, y al fusionar sus filas, los elementos de dos conjuntos de datos se fusionan por `key` con una suma de los correspondientes `(values...)`. - -Ejemplos: - -``` text -[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] -[(1, 100)] + [(1, 150)] -> [(1, 250)] -[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] -[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] -``` - -Al solicitar datos, utilice el [sumMap(clave, valor)](../../../sql-reference/aggregate-functions/reference.md) función para la agregación de `Map`. - -Para la estructura de datos anidados, no necesita especificar sus columnas en la tupla de columnas para la suma. 
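To make the nested-structure summation concrete, a small sketch (the table and the `GoalsMap` nested structure are made up for illustration):

``` sql
CREATE TABLE goal_totals
(
    StartDate Date,
    CounterID UInt32,
    GoalsMap Nested(    -- name ends in "Map", so it is merged as key => (values...)
        GoalID UInt32,
        Visits UInt32
    )
) ENGINE = SummingMergeTree()
ORDER BY (StartDate, CounterID);

INSERT INTO goal_totals VALUES ('2020-01-01', 1, [10, 20], [1, 1]);
INSERT INTO goal_totals VALUES ('2020-01-01', 1, [10, 30], [2, 1]);

-- As with plain columns, aggregate at query time because merges may not have run yet
SELECT StartDate, CounterID, sumMap(GoalsMap.GoalID, GoalsMap.Visits)
FROM goal_totals
GROUP BY StartDate, CounterID;
```

Whether or not the background merge has already collapsed the two inserted rows, the query returns the same totals: 3 visits for goal 10 and 1 visit each for goals 20 and 30.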
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/summingmergetree/) diff --git a/docs/es/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/es/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md deleted file mode 100644 index d69bfe9440e..00000000000 --- a/docs/es/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ /dev/null @@ -1,238 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: VersionedCollapsingMergeTree ---- - -# VersionedCollapsingMergeTree {#versionedcollapsingmergetree} - -Este motor: - -- Permite la escritura rápida de estados de objetos que cambian continuamente. -- Elimina los estados de objetos antiguos en segundo plano. Esto reduce significativamente el volumen de almacenamiento. - -Vea la sección [Derrumbar](#table_engines_versionedcollapsingmergetree) para más detalles. - -El motor hereda de [Método de codificación de datos:](mergetree.md#table_engines-mergetree) y agrega la lógica para colapsar filas al algoritmo para fusionar partes de datos. `VersionedCollapsingMergeTree` tiene el mismo propósito que [ColapsarMergeTree](collapsingmergetree.md) pero usa un algoritmo de colapso diferente que permite insertar los datos en cualquier orden con múltiples hilos. En particular, el `Version` columna ayuda a contraer las filas correctamente, incluso si se insertan en el orden incorrecto. En contraste, `CollapsingMergeTree` sólo permite la inserción estrictamente consecutiva. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = VersionedCollapsingMergeTree(sign, version) -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` - -Para obtener una descripción de los parámetros de consulta, consulte [descripción de la consulta](../../../sql-reference/statements/create.md). - -**Parámetros del motor** - -``` sql -VersionedCollapsingMergeTree(sign, version) -``` - -- `sign` — Name of the column with the type of row: `1` es una “state” fila, `-1` es una “cancel” fila. - - El tipo de datos de columna debe ser `Int8`. - -- `version` — Name of the column with the version of the object state. - - El tipo de datos de columna debe ser `UInt*`. - -**Cláusulas de consulta** - -Al crear un `VersionedCollapsingMergeTree` mesa, la misma [clausula](mergetree.md) se requieren como al crear un `MergeTree` tabla. - -
Deprecated method for creating a table

!!! attention "Attention"
    Do not use this method in new projects. If possible, switch old projects to the method described above.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE [=] VersionedCollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign, version)
```

All of the parameters except `sign` and `version` have the same meaning as in `MergeTree`.

- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row.

    Column data type — `Int8`.

- `version` — Name of the column with the version of the object state.

    The column data type must be `UInt*`.
- -## Derrumbar {#table_engines_versionedcollapsingmergetree} - -### Datos {#data} - -Considere una situación en la que necesite guardar datos que cambien continuamente para algún objeto. Es razonable tener una fila para un objeto y actualizar la fila siempre que haya cambios. Sin embargo, la operación de actualización es costosa y lenta para un DBMS porque requiere volver a escribir los datos en el almacenamiento. La actualización no es aceptable si necesita escribir datos rápidamente, pero puede escribir los cambios en un objeto secuencialmente de la siguiente manera. - -Utilice el `Sign` columna al escribir la fila. Si `Sign = 1` significa que la fila es un estado de un objeto (llamémoslo el “state” fila). Si `Sign = -1` indica la cancelación del estado de un objeto con los mismos atributos (llamémoslo el “cancel” fila). También use el `Version` columna, que debe identificar cada estado de un objeto con un número separado. - -Por ejemplo, queremos calcular cuántas páginas visitaron los usuarios en algún sitio y cuánto tiempo estuvieron allí. En algún momento escribimos la siguiente fila con el estado de la actividad del usuario: - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -En algún momento después registramos el cambio de actividad del usuario y lo escribimos con las siguientes dos filas. - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | -│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -La primera fila cancela el estado anterior del objeto (usuario). Debe copiar todos los campos del estado cancelado excepto `Sign`. - -La segunda fila contiene el estado actual. - -Debido a que solo necesitamos el último estado de actividad del usuario, las filas - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | -│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -se puede eliminar, colapsando el estado no válido (antiguo) del objeto. `VersionedCollapsingMergeTree` hace esto mientras fusiona las partes de datos. - -Para averiguar por qué necesitamos dos filas para cada cambio, vea [Algoritmo](#table_engines-versionedcollapsingmergetree-algorithm). - -**Notas sobre el uso** - -1. El programa que escribe los datos debe recordar el estado de un objeto para cancelarlo. El “cancel” cadena debe ser una copia de la “state” con lo opuesto `Sign`. Esto aumenta el tamaño inicial de almacenamiento, pero permite escribir los datos rápidamente. -2. Las matrices de largo crecimiento en columnas reducen la eficiencia del motor debido a la carga para escribir. Cuanto más sencillos sean los datos, mejor será la eficiencia. -3. `SELECT` Los resultados dependen en gran medida de la coherencia del historial de cambios de objetos. Sea preciso al preparar los datos para insertarlos. Puede obtener resultados impredecibles con datos incoherentes, como valores negativos para métricas no negativas, como la profundidad de la sesión. - -### Algoritmo {#table_engines-versionedcollapsingmergetree-algorithm} - -Cuando ClickHouse combina partes de datos, elimina cada par de filas que tienen la misma clave principal y versión y diferentes `Sign`. El orden de las filas no importa. 
- -Cuando ClickHouse inserta datos, ordena filas por la clave principal. Si el `Version` la columna no está en la clave principal, ClickHouse la agrega a la clave principal implícitamente como el último campo y la usa para ordenar. - -## Selección de datos {#selecting-data} - -ClickHouse no garantiza que todas las filas con la misma clave principal estén en la misma parte de datos resultante o incluso en el mismo servidor físico. Esto es cierto tanto para escribir los datos como para la posterior fusión de las partes de datos. Además, ClickHouse procesa `SELECT` consultas con múltiples subprocesos, y no puede predecir el orden de las filas en el resultado. Esto significa que la agregación es necesaria si hay una necesidad de obtener completamente “collapsed” datos de un `VersionedCollapsingMergeTree` tabla. - -Para finalizar el colapso, escriba una consulta con un `GROUP BY` cláusula y funciones agregadas que representan el signo. Por ejemplo, para calcular la cantidad, use `sum(Sign)` en lugar de `count()`. Para calcular la suma de algo, use `sum(Sign * x)` en lugar de `sum(x)` y agregar `HAVING sum(Sign) > 0`. - -Los agregados `count`, `sum` y `avg` se puede calcular de esta manera. El agregado `uniq` se puede calcular si un objeto tiene al menos un estado no colapsado. Los agregados `min` y `max` no se puede calcular porque `VersionedCollapsingMergeTree` no guarda el historial de valores de estados colapsados. - -Si necesita extraer los datos con “collapsing” pero sin agregación (por ejemplo, para verificar si hay filas presentes cuyos valores más nuevos coinciden con ciertas condiciones), puede usar el `FINAL` modificador para el `FROM` clausula. Este enfoque es ineficiente y no debe usarse con tablas grandes. - -## Ejemplo de uso {#example-of-use} - -Datos de ejemplo: - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | -│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | -│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -Creación de la tabla: - -``` sql -CREATE TABLE UAct -( - UserID UInt64, - PageViews UInt8, - Duration UInt8, - Sign Int8, - Version UInt8 -) -ENGINE = VersionedCollapsingMergeTree(Sign, Version) -ORDER BY UserID -``` - -Insertar los datos: - -``` sql -INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1) -``` - -``` sql -INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2) -``` - -Usamos dos `INSERT` consultas para crear dos partes de datos diferentes. Si insertamos los datos con una sola consulta, ClickHouse crea una parte de datos y nunca realizará ninguna fusión. - -Obtener los datos: - -``` sql -SELECT * FROM UAct -``` - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -¿Qué vemos aquí y dónde están las partes colapsadas? -Creamos dos partes de datos usando dos `INSERT` consulta. El `SELECT` la consulta se realizó en dos subprocesos, y el resultado es un orden aleatorio de filas. -No se produjo el colapso porque las partes de datos aún no se han fusionado. 
ClickHouse fusiona partes de datos en un punto desconocido en el tiempo que no podemos predecir. - -Es por eso que necesitamos agregación: - -``` sql -SELECT - UserID, - sum(PageViews * Sign) AS PageViews, - sum(Duration * Sign) AS Duration, - Version -FROM UAct -GROUP BY UserID, Version -HAVING sum(Sign) > 0 -``` - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ -│ 4324182021466249494 │ 6 │ 185 │ 2 │ -└─────────────────────┴───────────┴──────────┴─────────┘ -``` - -Si no necesitamos agregación y queremos forzar el colapso, podemos usar el `FINAL` modificador para el `FROM` clausula. - -``` sql -SELECT * FROM UAct FINAL -``` - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ -└─────────────────────┴───────────┴──────────┴──────┴─────────┘ -``` - -Esta es una forma muy ineficiente de seleccionar datos. No lo use para mesas grandes. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/versionedcollapsingmergetree/) diff --git a/docs/es/engines/table-engines/special/buffer.md b/docs/es/engines/table-engines/special/buffer.md deleted file mode 100644 index b3a26ff356a..00000000000 --- a/docs/es/engines/table-engines/special/buffer.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: "B\xFAfer" ---- - -# Búfer {#buffer} - -Almacena los datos para escribir en la memoria RAM, enjuagándolos periódicamente a otra tabla. Durante la operación de lectura, los datos se leen desde el búfer y la otra tabla simultáneamente. - -``` sql -Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) -``` - -Parámetros del motor: - -- `database` – Database name. Instead of the database name, you can use a constant expression that returns a string. -- `table` – Table to flush data to. -- `num_layers` – Parallelism layer. Physically, the table will be represented as `num_layers` de búferes independientes. Valor recomendado: 16. -- `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, y `max_bytes` – Conditions for flushing data from the buffer. - -Los datos se vacían del búfer y se escriben en la tabla de destino si `min*` condiciones o al menos una `max*` condición se cumplen. - -- `min_time`, `max_time` – Condition for the time in seconds from the moment of the first write to the buffer. -- `min_rows`, `max_rows` – Condition for the number of rows in the buffer. -- `min_bytes`, `max_bytes` – Condition for the number of bytes in the buffer. - -Durante la operación de escritura, los datos se insertan en un `num_layers` número de búferes aleatorios. O bien, si la parte de datos para insertar es lo suficientemente grande (mayor que `max_rows` o `max_bytes`), se escribe directamente en la tabla de destino, omitiendo el búfer. - -Las condiciones para el lavado de los datos se calculan por separado para cada uno de los `num_layers` búfer. Por ejemplo, si `num_layers = 16` y `max_bytes = 100000000`, el consumo máximo de RAM es de 1.6 GB. - -Ejemplo: - -``` sql -CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) -``` - -Creación de un ‘merge.hits_buffer’ mesa con la misma estructura que ‘merge.hits’ y usando el motor Buffer. Al escribir en esta tabla, los datos se almacenan en la memoria RAM y ‘merge.hits’ tabla. Se crean 16 búferes. 
Los datos de cada uno de ellos se vacían si han pasado 100 segundos o se han escrito un millón de filas o se han escrito 100 MB de datos; o si simultáneamente han pasado 10 segundos y se han escrito 10.000 filas y 10 MB de datos. Por ejemplo, si solo se ha escrito una fila, después de 100 segundos se vaciará, pase lo que pase. Pero si se han escrito muchas filas, los datos se vaciarán antes. - -Cuando se detiene el servidor, con DROP TABLE o DETACH TABLE, los datos del búfer también se vacían a la tabla de destino. - -Puede establecer cadenas vacías entre comillas simples para la base de datos y el nombre de la tabla. Esto indica la ausencia de una tabla de destino. En este caso, cuando se alcanzan las condiciones de descarga de datos, el búfer simplemente se borra. Esto puede ser útil para mantener una ventana de datos en la memoria. - -Al leer desde una tabla de búfer, los datos se procesan tanto desde el búfer como desde la tabla de destino (si hay uno). -Tenga en cuenta que las tablas Buffer no admiten un índice. En otras palabras, los datos del búfer se analizan por completo, lo que puede ser lento para los búferes grandes. (Para los datos de una tabla subordinada, se utilizará el índice que admite.) - -Si el conjunto de columnas de la tabla Buffer no coincide con el conjunto de columnas de una tabla subordinada, se inserta un subconjunto de columnas que existen en ambas tablas. - -Si los tipos no coinciden con una de las columnas de la tabla Búfer y una tabla subordinada, se escribe un mensaje de error en el registro del servidor y se borra el búfer. -Lo mismo sucede si la tabla subordinada no existe cuando se vacía el búfer. - -Si necesita ejecutar ALTER para una tabla subordinada y la tabla de búfer, se recomienda eliminar primero la tabla de búfer, ejecutar ALTER para la tabla subordinada y, a continuación, crear la tabla de búfer de nuevo. - -Si el servidor se reinicia de forma anormal, se pierden los datos del búfer. - -FINAL y SAMPLE no funcionan correctamente para las tablas Buffer. Estas condiciones se pasan a la tabla de destino, pero no se utilizan para procesar datos en el búfer. Si se requieren estas características, recomendamos usar solo la tabla Buffer para escribir, mientras lee desde la tabla de destino. - -Al agregar datos a un búfer, uno de los búferes está bloqueado. Esto provoca retrasos si se realiza una operación de lectura simultáneamente desde la tabla. - -Los datos que se insertan en una tabla de búfer pueden terminar en la tabla subordinada en un orden diferente y en bloques diferentes. Debido a esto, una tabla Buffer es difícil de usar para escribir en un CollapsingMergeTree correctamente. Para evitar problemas, puede establecer ‘num_layers’ a 1. - -Si se replica la tabla de destino, se pierden algunas características esperadas de las tablas replicadas al escribir en una tabla de búfer. Los cambios aleatorios en el orden de las filas y los tamaños de las partes de datos hacen que la desduplicación de datos deje de funcionar, lo que significa que no es posible tener un ‘exactly once’ escribir en tablas replicadas. - -Debido a estas desventajas, solo podemos recomendar el uso de una tabla Buffer en casos raros. - -Una tabla de búfer se usa cuando se reciben demasiados INSERT de un gran número de servidores durante una unidad de tiempo y los datos no se pueden almacenar en búfer antes de la inserción, lo que significa que los INSERT no pueden ejecutarse lo suficientemente rápido. 
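The ALTER workflow recommended above can be sketched as follows. This is only an illustration, reusing the `merge.hits` / `merge.hits_buffer` pair from the earlier example; the added column `NewColumn` is hypothetical.

``` sql
-- 1. Drop the Buffer table first; on DROP TABLE its contents are flushed
--    to the destination table, and no writes can race with the ALTER.
DROP TABLE merge.hits_buffer;

-- 2. Run the ALTER on the subordinate (destination) table.
ALTER TABLE merge.hits ADD COLUMN NewColumn UInt32 DEFAULT 0;

-- 3. Recreate the Buffer table with the new structure.
CREATE TABLE merge.hits_buffer AS merge.hits
ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000);
```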
- -Tenga en cuenta que no tiene sentido insertar datos una fila a la vez, incluso para las tablas de búfer. Esto solo producirá una velocidad de unos pocos miles de filas por segundo, mientras que la inserción de bloques de datos más grandes puede producir más de un millón de filas por segundo (consulte la sección “Performance”). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/buffer/) diff --git a/docs/es/engines/table-engines/special/dictionary.md b/docs/es/engines/table-engines/special/dictionary.md deleted file mode 100644 index 6d9136a6a23..00000000000 --- a/docs/es/engines/table-engines/special/dictionary.md +++ /dev/null @@ -1,97 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 35 -toc_title: Diccionario ---- - -# Diccionario {#dictionary} - -El `Dictionary` el motor muestra el [diccionario](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) datos como una tabla ClickHouse. - -Como ejemplo, considere un diccionario de `products` con la siguiente configuración: - -``` xml - - - products - - -
products
- DSN=some-db-server - - - - 300 - 360 - - - - - - - product_id - - - title - String - - - - - -``` - -Consultar los datos del diccionario: - -``` sql -SELECT - name, - type, - key, - attribute.names, - attribute.types, - bytes_allocated, - element_count, - source -FROM system.dictionaries -WHERE name = 'products' -``` - -``` text -┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ -│ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ -└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ -``` - -Puede usar el [dictGet\*](../../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions) función para obtener los datos del diccionario en este formato. - -Esta vista no es útil cuando necesita obtener datos sin procesar o cuando `JOIN` operación. Para estos casos, puede usar el `Dictionary` motor, que muestra los datos del diccionario en una tabla. - -Sintaxis: - -``` sql -CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` -``` - -Ejemplo de uso: - -``` sql -create table products (product_id UInt64, title String) Engine = Dictionary(products); -``` - - Ok - -Echa un vistazo a lo que hay en la mesa. - -``` sql -select * from products limit 1; -``` - -``` text -┌────product_id─┬─title───────────┐ -│ 152689 │ Some item │ -└───────────────┴─────────────────┘ -``` - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/dictionary/) diff --git a/docs/es/engines/table-engines/special/distributed.md b/docs/es/engines/table-engines/special/distributed.md deleted file mode 100644 index bac407a651a..00000000000 --- a/docs/es/engines/table-engines/special/distributed.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 33 -toc_title: Distribuido ---- - -# Distribuido {#distributed} - -**Las tablas con motor distribuido no almacenan ningún dato por sí mismas**, pero permite el procesamiento de consultas distribuidas en varios servidores. -La lectura se paralela automáticamente. Durante una lectura, se utilizan los índices de tabla en servidores remotos, si los hay. - -El motor distribuido acepta parámetros: - -- el nombre del clúster en el archivo de configuración del servidor - -- el nombre de una base de datos remota - -- el nombre de una tabla remota - -- (opcionalmente) clave de fragmentación - -- nombre de política (opcionalmente), se usará para almacenar archivos temporales para el envío asíncrono - - Ver también: - - - `insert_distributed_sync` configuración - - [Método de codificación de datos:](../mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) para los ejemplos - -Ejemplo: - -``` sql -Distributed(logs, default, hits[, sharding_key[, policy_name]]) -``` - -Los datos se leerán desde todos los servidores ‘logs’ clúster, desde el valor predeterminado.tabla de éxitos ubicada en cada servidor del clúster. -Los datos no solo se leen sino que se procesan parcialmente en los servidores remotos (en la medida en que esto sea posible). -Por ejemplo, para una consulta con GROUP BY, los datos se agregarán en servidores remotos y los estados intermedios de las funciones agregadas se enviarán al servidor solicitante. Luego, los datos se agregarán más. - -En lugar del nombre de la base de datos, puede usar una expresión constante que devuelva una cadena. 
Por ejemplo: currentDatabase(). - -logs – The cluster name in the server's config file. - -Los clústeres se establecen así: - -``` xml - - - - - 1 - - false - - example01-01-1 - 9000 - - - example01-01-2 - 9000 - - - - 2 - false - - example01-02-1 - 9000 - - - example01-02-2 - 1 - 9440 - - - - -``` - -Aquí se define un clúster con el nombre ‘logs’ que consta de dos fragmentos, cada uno de los cuales contiene dos réplicas. -Los fragmentos se refieren a los servidores que contienen diferentes partes de los datos (para leer todos los datos, debe acceder a todos los fragmentos). -Las réplicas están duplicando servidores (para leer todos los datos, puede acceder a los datos en cualquiera de las réplicas). - -Los nombres de clúster no deben contener puntos. - -Los parámetros `host`, `port`, y opcionalmente `user`, `password`, `secure`, `compression` se especifican para cada servidor: -- `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server doesn't start. If you change the DNS record, restart the server. -- `port` – The TCP port for messenger activity (‘tcp_port’ en la configuración, generalmente establecido en 9000). No lo confundas con http_port. -- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Derechos de acceso](../../../operations/access-rights.md). -- `password` – The password for connecting to a remote server (not masked). Default value: empty string. -- `secure` - Use ssl para la conexión, por lo general también debe definir `port` = 9440. El servidor debe escuchar en `9440` y tener certificados correctos. -- `compression` - Utilice la compresión de datos. Valor predeterminado: true. - -When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) configuración. -Si no se establece la conexión con el servidor, habrá un intento de conectarse con un breve tiempo de espera. Si la conexión falla, se seleccionará la siguiente réplica, y así sucesivamente para todas las réplicas. Si el intento de conexión falló para todas las réplicas, el intento se repetirá de la misma manera, varias veces. -Esto funciona a favor de la resiliencia, pero no proporciona una tolerancia completa a errores: un servidor remoto podría aceptar la conexión, pero podría no funcionar o funcionar mal. - -Puede especificar solo uno de los fragmentos (en este caso, el procesamiento de consultas debe denominarse remoto, en lugar de distribuido) o hasta cualquier número de fragmentos. En cada fragmento, puede especificar entre una y cualquier número de réplicas. Puede especificar un número diferente de réplicas para cada fragmento. - -Puede especificar tantos clústeres como desee en la configuración. - -Para ver los clústeres, utilice el ‘system.clusters’ tabla. - -El motor distribuido permite trabajar con un clúster como un servidor local. 
Sin embargo, el clúster es inextensible: debe escribir su configuración en el archivo de configuración del servidor (mejor aún, para todos los servidores del clúster). - -The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you don't need to create a Distributed table – use the ‘remote’ función de tabla en su lugar. Vea la sección [Funciones de tabla](../../../sql-reference/table-functions/index.md). - -Hay dos métodos para escribir datos en un clúster: - -Primero, puede definir a qué servidores escribir en qué datos y realizar la escritura directamente en cada fragmento. En otras palabras, realice INSERT en las tablas que la tabla distribuida “looks at”. Esta es la solución más flexible, ya que puede usar cualquier esquema de fragmentación, que podría ser no trivial debido a los requisitos del área temática. Esta es también la solución más óptima ya que los datos se pueden escribir en diferentes fragmentos de forma completamente independiente. - -En segundo lugar, puede realizar INSERT en una tabla distribuida. En este caso, la tabla distribuirá los datos insertados a través de los propios servidores. Para escribir en una tabla distribuida, debe tener un conjunto de claves de fragmentación (el último parámetro). Además, si solo hay un fragmento, la operación de escritura funciona sin especificar la clave de fragmentación, ya que no significa nada en este caso. - -Cada fragmento puede tener un peso definido en el archivo de configuración. Por defecto, el peso es igual a uno. Los datos se distribuyen entre fragmentos en la cantidad proporcional al peso del fragmento. Por ejemplo, si hay dos fragmentos y el primero tiene un peso de 9 mientras que el segundo tiene un peso de 10, el primero se enviará 9 / 19 partes de las filas, y el segundo se enviará 10 / 19. - -Cada fragmento puede tener el ‘internal_replication’ parámetro definido en el archivo de configuración. - -Si este parámetro se establece en ‘true’, la operación de escritura selecciona la primera réplica en buen estado y escribe datos en ella. Utilice esta alternativa si la tabla Distribuida “looks at” tablas replicadas. En otras palabras, si la tabla donde se escribirán los datos los replicará por sí misma. - -Si se establece en ‘false’ (el valor predeterminado), los datos se escriben en todas las réplicas. En esencia, esto significa que la tabla distribuida replica los datos en sí. Esto es peor que usar tablas replicadas, porque no se verifica la consistencia de las réplicas y, con el tiempo, contendrán datos ligeramente diferentes. - -Para seleccionar el fragmento al que se envía una fila de datos, se analiza la expresión de fragmentación y su resto se toma de dividirlo por el peso total de los fragmentos. La fila se envía al fragmento que corresponde al medio intervalo de los restos de ‘prev_weight’ a ‘prev_weights + weight’, donde ‘prev_weights’ es el peso total de los fragmentos con el número más pequeño, y ‘weight’ es el peso de este fragmento. Por ejemplo, si hay dos fragmentos, y el primero tiene un peso de 9 mientras que el segundo tiene un peso de 10, la fila se enviará al primer fragmento para los restos del rango \[0, 9), y al segundo para los restos del rango \[9, 19). - -La expresión de fragmentación puede ser cualquier expresión de constantes y columnas de tabla que devuelva un entero. 
Por ejemplo, puede usar la expresión ‘rand()’ para la distribución aleatoria de datos, o ‘UserID’ para la distribución por el resto de dividir la ID del usuario (entonces los datos de un solo usuario residirán en un solo fragmento, lo que simplifica la ejecución de IN y JOIN por los usuarios). Si una de las columnas no se distribuye lo suficientemente uniformemente, puede envolverla en una función hash: intHash64(UserID) . - -Un simple recordatorio de la división es una solución limitada para sharding y no siempre es apropiado. Funciona para volúmenes medianos y grandes de datos (docenas de servidores), pero no para volúmenes muy grandes de datos (cientos de servidores o más). En este último caso, use el esquema de fragmentación requerido por el área asunto, en lugar de usar entradas en Tablas distribuidas. - -SELECT queries are sent to all the shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you don't have to transfer the old data to it. You can write new data with a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently. - -Debería preocuparse por el esquema de fragmentación en los siguientes casos: - -- Se utilizan consultas que requieren unir datos (IN o JOIN) mediante una clave específica. Si esta clave fragmenta datos, puede usar IN local o JOIN en lugar de GLOBAL IN o GLOBAL JOIN, que es mucho más eficiente. -- Se usa una gran cantidad de servidores (cientos o más) con una gran cantidad de consultas pequeñas (consultas de clientes individuales: sitios web, anunciantes o socios). Para que las pequeñas consultas no afecten a todo el clúster, tiene sentido ubicar datos para un solo cliente en un solo fragmento. Alternativamente, como lo hemos hecho en Yandex.Metrica, puede configurar sharding de dos niveles: divida todo el clúster en “layers”, donde una capa puede consistir en varios fragmentos. Los datos de un único cliente se encuentran en una sola capa, pero los fragmentos se pueden agregar a una capa según sea necesario y los datos se distribuyen aleatoriamente dentro de ellos. Las tablas distribuidas se crean para cada capa y se crea una única tabla distribuida compartida para consultas globales. - -Los datos se escriben de forma asíncrona. Cuando se inserta en la tabla, el bloque de datos se acaba de escribir en el sistema de archivos local. Los datos se envían a los servidores remotos en segundo plano tan pronto como sea posible. El período de envío de datos está gestionado por el [Distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) y [Distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) configuración. El `Distributed` el motor envía cada archivo con datos insertados por separado, pero puede habilitar el envío por lotes de archivos [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) configuración. Esta configuración mejora el rendimiento del clúster al utilizar mejor los recursos de red y servidor local. Debe comprobar si los datos se envían correctamente comprobando la lista de archivos (datos en espera de ser enviados) en el directorio de la tabla: `/var/lib/clickhouse/data/database/table/`. 
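As a sketch of the sharding-expression advice above, the following assumes the `logs` cluster from the earlier example and a hypothetical local table `hits_local`; only the shape of the statements matters, not the exact schema.

``` sql
-- Local table that physically stores data on each shard.
CREATE TABLE default.hits_local
(
    EventDate Date,
    UserID UInt64,
    URL String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(EventDate)
ORDER BY (UserID, EventDate);

-- Distributed table over the cluster. Each inserted row is routed to a shard
-- by the remainder of intHash64(UserID) divided by the total shard weight,
-- so all rows of one user land on the same shard, which makes local IN / JOIN
-- by user possible instead of GLOBAL IN / GLOBAL JOIN.
CREATE TABLE default.hits_all AS default.hits_local
ENGINE = Distributed(logs, default, hits_local, intHash64(UserID));
```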
- -Si el servidor dejó de existir o tuvo un reinicio aproximado (por ejemplo, después de un error de dispositivo) después de un INSERT en una tabla distribuida, es posible que se pierdan los datos insertados. Si se detecta un elemento de datos dañado en el directorio de la tabla, se transfiere al ‘broken’ subdirectorio y ya no se utiliza. - -Cuando la opción max_parallel_replicas está habilitada, el procesamiento de consultas se paralela en todas las réplicas dentro de un solo fragmento. Para obtener más información, consulte la sección [max_parallel_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas). - -## Virtual Columnas {#virtual-columns} - -- `_shard_num` — Contains the `shard_num` (de `system.clusters`). Tipo: [UInt32](../../../sql-reference/data-types/int-uint.md). - -!!! note "Nota" - Ya [`remote`](../../../sql-reference/table-functions/remote.md)/`cluster` funciones de tabla crean internamente instancia temporal del mismo motor distribuido, `_shard_num` está disponible allí también. - -**Ver también** - -- [Virtual columnas](index.md#table_engines-virtual_columns) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/distributed/) diff --git a/docs/es/engines/table-engines/special/external-data.md b/docs/es/engines/table-engines/special/external-data.md deleted file mode 100644 index f2ce4abbb0f..00000000000 --- a/docs/es/engines/table-engines/special/external-data.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 34 -toc_title: Datos externos ---- - -# Datos externos para el procesamiento de consultas {#external-data-for-query-processing} - -ClickHouse permite enviar a un servidor los datos necesarios para procesar una consulta, junto con una consulta SELECT. Estos datos se colocan en una tabla temporal (consulte la sección “Temporary tables”) y se puede utilizar en la consulta (por ejemplo, en operadores IN). - -Por ejemplo, si tiene un archivo de texto con identificadores de usuario importantes, puede cargarlo en el servidor junto con una consulta que utilice la filtración de esta lista. - -Si necesita ejecutar más de una consulta con un gran volumen de datos externos, no utilice esta función. Es mejor cargar los datos a la base de datos con anticipación. - -Los datos externos se pueden cargar mediante el cliente de línea de comandos (en modo no interactivo) o mediante la interfaz HTTP. - -En el cliente de línea de comandos, puede especificar una sección de parámetros en el formato - -``` bash ---external --file=... [--name=...] [--format=...] [--types=...|--structure=...] -``` - -Puede tener varias secciones como esta, para el número de tablas que se transmiten. - -**–external** – Marks the beginning of a clause. -**–file** – Path to the file with the table dump, or -, which refers to stdin. -Solo se puede recuperar una sola tabla de stdin. - -Los siguientes parámetros son opcionales: **–name**– Name of the table. If omitted, _data is used. -**–format** – Data format in the file. If omitted, TabSeparated is used. - -Se requiere uno de los siguientes parámetros:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, … -**–structure**– The table structure in the format`UserID UInt64`, `URL String`. Define los nombres y tipos de columna. 
- -Los archivos especificados en ‘file’ se analizará mediante el formato especificado en ‘format’ utilizando los tipos de datos especificados en ‘types’ o ‘structure’. La mesa será cargado en el servidor y accesibles, como una tabla temporal con el nombre de ‘name’. - -Ejemplos: - -``` bash -$ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 -849897 -$ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' -/bin/sh 20 -/bin/false 5 -/bin/bash 4 -/usr/sbin/nologin 1 -/bin/sync 1 -``` - -Cuando se utiliza la interfaz HTTP, los datos externos se pasan en el formato multipart/form-data. Cada tabla se transmite como un archivo separado. El nombre de la tabla se toma del nombre del archivo. El ‘query_string’ se pasa los parámetros ‘name_format’, ‘name_types’, y ‘name_structure’, donde ‘name’ es el nombre de la tabla a la que corresponden estos parámetros. El significado de los parámetros es el mismo que cuando se usa el cliente de línea de comandos. - -Ejemplo: - -``` bash -$ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv - -$ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' -/bin/sh 20 -/bin/false 5 -/bin/bash 4 -/usr/sbin/nologin 1 -/bin/sync 1 -``` - -Para el procesamiento de consultas distribuidas, las tablas temporales se envían a todos los servidores remotos. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/external_data/) diff --git a/docs/es/engines/table-engines/special/file.md b/docs/es/engines/table-engines/special/file.md deleted file mode 100644 index fb739506a22..00000000000 --- a/docs/es/engines/table-engines/special/file.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: File ---- - -# File {#table_engines-file} - -El motor de tabla de archivos mantiene los datos en un archivo en uno de los [file -formato](../../../interfaces/formats.md#formats) (TabSeparated, Native, etc.). - -Ejemplos de uso: - -- Exportación de datos de ClickHouse a archivo. -- Convertir datos de un formato a otro. -- Actualización de datos en ClickHouse mediante la edición de un archivo en un disco. - -## Uso en el servidor ClickHouse {#usage-in-clickhouse-server} - -``` sql -File(Format) -``` - -El `Format` parámetro especifica uno de los formatos de archivo disponibles. Realizar -`SELECT` consultas, el formato debe ser compatible para la entrada, y para realizar -`INSERT` queries – for output. The available formats are listed in the -[Formato](../../../interfaces/formats.md#formats) apartado. - -ClickHouse no permite especificar la ruta del sistema de archivos para`File`. Utilizará la carpeta definida por [camino](../../../operations/server-configuration-parameters/settings.md) configuración en la configuración del servidor. - -Al crear una tabla usando `File(Format)` crea un subdirectorio vacío en esa carpeta. Cuando los datos se escriben en esa tabla, se colocan en `data.Format` en ese subdirectorio. 
- -Puede crear manualmente esta subcarpeta y archivo en el sistema de archivos del servidor y luego [ATTACH](../../../sql-reference/statements/misc.md) para mostrar información con el nombre coincidente, para que pueda consultar datos desde ese archivo. - -!!! warning "Advertencia" - Tenga cuidado con esta funcionalidad, ya que ClickHouse no realiza un seguimiento de los cambios externos en dichos archivos. El resultado de las escrituras simultáneas a través de ClickHouse y fuera de ClickHouse no está definido. - -**Ejemplo:** - -**1.** Configurar el `file_engine_table` tabla: - -``` sql -CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) -``` - -Por defecto, ClickHouse creará una carpeta `/var/lib/clickhouse/data/default/file_engine_table`. - -**2.** Crear manualmente `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` contener: - -``` bash -$ cat data.TabSeparated -one 1 -two 2 -``` - -**3.** Consultar los datos: - -``` sql -SELECT * FROM file_engine_table -``` - -``` text -┌─name─┬─value─┐ -│ one │ 1 │ -│ two │ 2 │ -└──────┴───────┘ -``` - -## Uso en ClickHouse-local {#usage-in-clickhouse-local} - -En [Sistema abierto.](../../../operations/utilities/clickhouse-local.md#clickhouse-local) El motor de archivos acepta la ruta del archivo además de `Format`. Los flujos de entrada / salida predeterminados se pueden especificar utilizando nombres numéricos o legibles por humanos como `0` o `stdin`, `1` o `stdout`. -**Ejemplo:** - -``` bash -$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" -``` - -## Detalles de la implementación {#details-of-implementation} - -- Multiple `SELECT` las consultas se pueden realizar simultáneamente, pero `INSERT` las consultas se esperarán entre sí. -- Apoyado la creación de nuevos archivos por `INSERT` consulta. -- Si el archivo existe, `INSERT` añadiría nuevos valores en él. -- No soportado: - - `ALTER` - - `SELECT ... SAMPLE` - - Indice - - Replicación - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/file/) diff --git a/docs/es/engines/table-engines/special/generate.md b/docs/es/engines/table-engines/special/generate.md deleted file mode 100644 index 67e664284b4..00000000000 --- a/docs/es/engines/table-engines/special/generate.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 46 -toc_title: GenerateRandom ---- - -# Generaterandom {#table_engines-generate} - -El motor de tabla GenerateRandom produce datos aleatorios para el esquema de tabla determinado. - -Ejemplos de uso: - -- Se usa en la prueba para poblar una tabla grande reproducible. -- Generar entrada aleatoria para pruebas de fuzzing. - -## Uso en el servidor ClickHouse {#usage-in-clickhouse-server} - -``` sql -ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length) -``` - -El `max_array_length` y `max_string_length` parámetros especifican la longitud máxima de todos -columnas y cadenas de matriz correspondientemente en los datos generados. - -Generar motor de tabla sólo admite `SELECT` consulta. - -Es compatible con todos [Tipos de datos](../../../sql-reference/data-types/index.md) que se pueden almacenar en una tabla excepto `LowCardinality` y `AggregateFunction`. 
- -**Ejemplo:** - -**1.** Configurar el `generate_engine_table` tabla: - -``` sql -CREATE TABLE generate_engine_table (name String, value UInt32) ENGINE = GenerateRandom(1, 5, 3) -``` - -**2.** Consultar los datos: - -``` sql -SELECT * FROM generate_engine_table LIMIT 3 -``` - -``` text -┌─name─┬──────value─┐ -│ c4xJ │ 1412771199 │ -│ r │ 1791099446 │ -│ 7#$ │ 124312908 │ -└──────┴────────────┘ -``` - -## Detalles de la implementación {#details-of-implementation} - -- No soportado: - - `ALTER` - - `SELECT ... SAMPLE` - - `INSERT` - - Indice - - Replicación - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/generate/) diff --git a/docs/es/engines/table-engines/special/index.md b/docs/es/engines/table-engines/special/index.md deleted file mode 100644 index 9927a1f61d9..00000000000 --- a/docs/es/engines/table-engines/special/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Especial -toc_priority: 31 ---- - - diff --git a/docs/es/engines/table-engines/special/join.md b/docs/es/engines/table-engines/special/join.md deleted file mode 100644 index 83e21b7c8cc..00000000000 --- a/docs/es/engines/table-engines/special/join.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: Unir ---- - -# Unir {#join} - -Estructura de datos preparada para usar en [JOIN](../../../sql-reference/statements/select/join.md#select-join) operación. - -## Creación de una tabla {#creating-a-table} - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], -) ENGINE = Join(join_strictness, join_type, k1[, k2, ...]) -``` - -Vea la descripción detallada del [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) consulta. - -**Parámetros del motor** - -- `join_strictness` – [ÚNETE a la rigurosidad](../../../sql-reference/statements/select/join.md#select-join-types). -- `join_type` – [Tipo de unión](../../../sql-reference/statements/select/join.md#select-join-types). -- `k1[, k2, ...]` – Key columns from the `USING` cláusula que el `JOIN` operación se hace con. - -Entrar `join_strictness` y `join_type` parámetros sin comillas, por ejemplo, `Join(ANY, LEFT, col1)`. Deben coincidir con el `JOIN` operación para la que se utilizará la tabla. Si los parámetros no coinciden, ClickHouse no lanza una excepción y puede devolver datos incorrectos. 
- -## Uso de la tabla {#table-usage} - -### Ejemplo {#example} - -Creación de la tabla del lado izquierdo: - -``` sql -CREATE TABLE id_val(`id` UInt32, `val` UInt32) ENGINE = TinyLog -``` - -``` sql -INSERT INTO id_val VALUES (1,11)(2,12)(3,13) -``` - -Creando el lado derecho `Join` tabla: - -``` sql -CREATE TABLE id_val_join(`id` UInt32, `val` UInt8) ENGINE = Join(ANY, LEFT, id) -``` - -``` sql -INSERT INTO id_val_join VALUES (1,21)(1,22)(3,23) -``` - -Unirse a las tablas: - -``` sql -SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_nulls = 1 -``` - -``` text -┌─id─┬─val─┬─id_val_join.val─┐ -│ 1 │ 11 │ 21 │ -│ 2 │ 12 │ ᴺᵁᴸᴸ │ -│ 3 │ 13 │ 23 │ -└────┴─────┴─────────────────┘ -``` - -Como alternativa, puede recuperar datos del `Join` tabla, especificando el valor de la clave de unión: - -``` sql -SELECT joinGet('id_val_join', 'val', toUInt32(1)) -``` - -``` text -┌─joinGet('id_val_join', 'val', toUInt32(1))─┐ -│ 21 │ -└────────────────────────────────────────────┘ -``` - -### Selección e inserción de datos {#selecting-and-inserting-data} - -Usted puede utilizar `INSERT` consultas para agregar datos al `Join`-mesas de motor. Si la tabla se creó con el `ANY` estricta, se ignoran los datos de las claves duplicadas. Con el `ALL` estricta, se agregan todas las filas. - -No se puede realizar una `SELECT` consulta directamente desde la tabla. En su lugar, use uno de los siguientes métodos: - -- Coloque la mesa hacia el lado derecho en un `JOIN` clausula. -- Llame al [joinGet](../../../sql-reference/functions/other-functions.md#joinget) función, que le permite extraer datos de la tabla de la misma manera que de un diccionario. - -### Limitaciones y ajustes {#join-limitations-and-settings} - -Al crear una tabla, se aplican los siguientes valores: - -- [Sistema abierto.](../../../operations/settings/settings.md#join_use_nulls) -- [Método de codificación de datos:](../../../operations/settings/query-complexity.md#settings-max_rows_in_join) -- [Método de codificación de datos:](../../../operations/settings/query-complexity.md#settings-max_bytes_in_join) -- [join_overflow_mode](../../../operations/settings/query-complexity.md#settings-join_overflow_mode) -- [join_any_take_last_row](../../../operations/settings/settings.md#settings-join_any_take_last_row) - -El `Join`-las tablas del motor no se pueden usar en `GLOBAL JOIN` operación. - -El `Join`-motor permite el uso [Sistema abierto.](../../../operations/settings/settings.md#join_use_nulls) ajuste en el `CREATE TABLE` instrucción. Y [SELECT](../../../sql-reference/statements/select/index.md) consulta permite el uso `join_use_nulls` demasiado. Si tienes diferentes `join_use_nulls` configuración, puede obtener un error al unirse a la tabla. Depende del tipo de JOIN. Cuando se utiliza [joinGet](../../../sql-reference/functions/other-functions.md#joinget) función, usted tiene que utilizar el mismo `join_use_nulls` ajuste en `CRATE TABLE` y `SELECT` instrucción. - -## Almacenamiento de datos {#data-storage} - -`Join` datos de la tabla siempre se encuentra en la memoria RAM. Al insertar filas en una tabla, ClickHouse escribe bloques de datos en el directorio del disco para que puedan restaurarse cuando se reinicie el servidor. - -Si el servidor se reinicia incorrectamente, el bloque de datos en el disco puede perderse o dañarse. En este caso, es posible que deba eliminar manualmente el archivo con datos dañados. 
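To make the `join_use_nulls` requirement from the limitations above concrete, here is a sketch reusing the `id_val` table from the example; the table name `id_val_join_nulls` is hypothetical. The setting used when creating the `Join` table must match the one used in the queries that read it.

``` sql
-- Create the Join table with join_use_nulls enabled.
CREATE TABLE id_val_join_nulls (`id` UInt32, `val` UInt8)
ENGINE = Join(ANY, LEFT, id)
SETTINGS join_use_nulls = 1;

INSERT INTO id_val_join_nulls VALUES (1, 21), (3, 23);

-- Queries reading this table must use the same join_use_nulls value;
-- mismatched settings can lead to errors depending on the JOIN type.
SELECT *
FROM id_val
ANY LEFT JOIN id_val_join_nulls USING (id)
SETTINGS join_use_nulls = 1;
```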
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/join/) diff --git a/docs/es/engines/table-engines/special/materializedview.md b/docs/es/engines/table-engines/special/materializedview.md deleted file mode 100644 index 87e5218eb6a..00000000000 --- a/docs/es/engines/table-engines/special/materializedview.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 43 -toc_title: "M\xE9todo de codificaci\xF3n de datos:" ---- - -# Método de codificación de datos: {#materializedview} - -Se utiliza para implementar vistas materializadas (para obtener más información, consulte [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query)). Para almacenar datos, utiliza un motor diferente que se especificó al crear la vista. Al leer desde una tabla, solo usa este motor. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/materializedview/) diff --git a/docs/es/engines/table-engines/special/memory.md b/docs/es/engines/table-engines/special/memory.md deleted file mode 100644 index 3d4f8ddff54..00000000000 --- a/docs/es/engines/table-engines/special/memory.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: Memoria ---- - -# Memoria {#memory} - -El motor de memoria almacena datos en RAM, en forma sin comprimir. Los datos se almacenan exactamente en la misma forma en que se reciben cuando se leen. En otras palabras, la lectura de esta tabla es completamente gratuita. -El acceso a los datos simultáneos está sincronizado. Los bloqueos son cortos: las operaciones de lectura y escritura no se bloquean entre sí. -Los índices no son compatibles. La lectura está paralelizada. -La productividad máxima (más de 10 GB/s) se alcanza en consultas simples, porque no hay lectura del disco, descomprimir o deserializar datos. (Cabe señalar que, en muchos casos, la productividad del motor MergeTree es casi tan alta.) -Al reiniciar un servidor, los datos desaparecen de la tabla y la tabla queda vacía. -Normalmente, el uso de este motor de tabla no está justificado. Sin embargo, se puede usar para pruebas y para tareas donde se requiere la velocidad máxima en un número relativamente pequeño de filas (hasta aproximadamente 100,000,000). - -El sistema utiliza el motor de memoria para tablas temporales con datos de consulta externos (consulte la sección “External data for processing a query”), y para la implementación de GLOBAL IN (véase la sección “IN operators”). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/memory/) diff --git a/docs/es/engines/table-engines/special/merge.md b/docs/es/engines/table-engines/special/merge.md deleted file mode 100644 index 6ed2c272914..00000000000 --- a/docs/es/engines/table-engines/special/merge.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: Fusionar ---- - -# Fusionar {#merge} - -El `Merge` motor (no debe confundirse con `MergeTree`) no almacena datos en sí, pero permite leer de cualquier número de otras tablas simultáneamente. -La lectura se paralela automáticamente. No se admite la escritura en una tabla. Al leer, se usan los índices de las tablas que realmente se están leyendo, si existen. 
-El `Merge` engine acepta parámetros: el nombre de la base de datos y una expresión regular para las tablas. - -Ejemplo: - -``` sql -Merge(hits, '^WatchLog') -``` - -Los datos se leerán de las tablas en el `hits` base de datos que tienen nombres que coinciden con la expresión regular ‘`^WatchLog`’. - -En lugar del nombre de la base de datos, puede usar una expresión constante que devuelva una cadena. Por ejemplo, `currentDatabase()`. - -Regular expressions — [Re2](https://github.com/google/re2) (soporta un subconjunto de PCRE), sensible a mayúsculas y minúsculas. -Vea las notas sobre los símbolos de escape en expresiones regulares en el “match” apartado. - -Al seleccionar tablas para leer, el `Merge` no se seleccionará la tabla en sí, incluso si coincide con la expresión regular. Esto es para evitar bucles. -Es posible crear dos `Merge` tablas que intentarán interminablemente leer los datos de los demás, pero esta no es una buena idea. - -La forma típica de usar el `Merge` para trabajar con un gran número de `TinyLog` tablas como si con una sola tabla. - -Ejemplo 2: - -Digamos que tiene una tabla antigua (WatchLog_old) y decidió cambiar la partición sin mover datos a una nueva tabla (WatchLog_new) y necesita ver datos de ambas tablas. - -``` sql -CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) -ENGINE=MergeTree(date, (UserId, EventType), 8192); -INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); - -CREATE TABLE WatchLog_new(date Date, UserId Int64, EventType String, Cnt UInt64) -ENGINE=MergeTree PARTITION BY date ORDER BY (UserId, EventType) SETTINGS index_granularity=8192; -INSERT INTO WatchLog_new VALUES ('2018-01-02', 2, 'hit', 3); - -CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog'); - -SELECT * -FROM WatchLog -``` - -``` text -┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ -│ 2018-01-01 │ 1 │ hit │ 3 │ -└────────────┴────────┴───────────┴─────┘ -┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ -│ 2018-01-02 │ 2 │ hit │ 3 │ -└────────────┴────────┴───────────┴─────┘ -``` - -## Virtual Columnas {#virtual-columns} - -- `_table` — Contains the name of the table from which data was read. Type: [Cadena](../../../sql-reference/data-types/string.md). - - Puede establecer las condiciones constantes en `_table` en el `WHERE/PREWHERE` cláusula (por ejemplo, `WHERE _table='xyz'`). En este caso, la operación de lectura se realiza sólo para las tablas donde la condición en `_table` está satisfecho, por lo que el `_table` columna actúa como un índice. - -**Ver también** - -- [Virtual columnas](index.md#table_engines-virtual_columns) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/merge/) diff --git a/docs/es/engines/table-engines/special/null.md b/docs/es/engines/table-engines/special/null.md deleted file mode 100644 index cc05e7839c9..00000000000 --- a/docs/es/engines/table-engines/special/null.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: Nulo ---- - -# Nulo {#null} - -Al escribir en una tabla Null, los datos se ignoran. Al leer desde una tabla Null, la respuesta está vacía. - -Sin embargo, puede crear una vista materializada en una tabla Null. Entonces los datos escritos en la tabla terminarán en la vista. 
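A minimal sketch of the pattern described above, with hypothetical names: rows written to a Null table are discarded, but a materialized view attached to it still receives and stores them.

``` sql
CREATE TABLE events_null (ts DateTime, message String) ENGINE = Null;

-- The materialized view stores its data using its own engine.
CREATE MATERIALIZED VIEW events_mv
ENGINE = MergeTree ORDER BY ts
AS SELECT ts, message FROM events_null;

INSERT INTO events_null VALUES (now(), 'hello');

SELECT count() FROM events_null; -- 0: the Null table keeps nothing
SELECT count() FROM events_mv;   -- 1: the row reached the view
```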
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/null/) diff --git a/docs/es/engines/table-engines/special/set.md b/docs/es/engines/table-engines/special/set.md deleted file mode 100644 index 4ff23202443..00000000000 --- a/docs/es/engines/table-engines/special/set.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: Establecer ---- - -# Establecer {#set} - -Un conjunto de datos que siempre está en la memoria RAM. Está diseñado para su uso en el lado derecho del operador IN (consulte la sección “IN operators”). - -Puede usar INSERT para insertar datos en la tabla. Se agregarán nuevos elementos al conjunto de datos, mientras que los duplicados se ignorarán. -Pero no puede realizar SELECT desde la tabla. La única forma de recuperar datos es usándolos en la mitad derecha del operador IN. - -Los datos siempre se encuentran en la memoria RAM. Para INSERT, los bloques de datos insertados también se escriben en el directorio de tablas en el disco. Al iniciar el servidor, estos datos se cargan en la RAM. En otras palabras, después de reiniciar, los datos permanecen en su lugar. - -Para un reinicio aproximado del servidor, el bloque de datos en el disco puede perderse o dañarse. En este último caso, es posible que deba eliminar manualmente el archivo con datos dañados. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/set/) diff --git a/docs/es/engines/table-engines/special/url.md b/docs/es/engines/table-engines/special/url.md deleted file mode 100644 index 654b8e99a4e..00000000000 --- a/docs/es/engines/table-engines/special/url.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: URL ---- - -# URL(URL, Formato) {#table_engines-url} - -Administra datos en un servidor HTTP/HTTPS remoto. Este motor es similar -a la [File](file.md) motor. - -## Uso del motor en el servidor ClickHouse {#using-the-engine-in-the-clickhouse-server} - -El `format` debe ser uno que ClickHouse pueda usar en -`SELECT` consultas y, si es necesario, en `INSERTs`. Para obtener la lista completa de formatos admitidos, consulte -[Formato](../../../interfaces/formats.md#formats). - -El `URL` debe ajustarse a la estructura de un localizador uniforme de recursos. La dirección URL especificada debe apuntar a un servidor -que utiliza HTTP o HTTPS. Esto no requiere ningún -encabezados adicionales para obtener una respuesta del servidor. - -`INSERT` y `SELECT` las consultas se transforman en `POST` y `GET` peticiones, -respectivamente. Para el procesamiento `POST` solicitudes, el servidor remoto debe admitir -[Codificación de transferencia fragmentada](https://en.wikipedia.org/wiki/Chunked_transfer_encoding). - -Puede limitar el número máximo de saltos de redirección HTTP GET utilizando el [Nombre de la red inalámbrica (SSID):](../../../operations/settings/settings.md#setting-max_http_get_redirects) configuración. 
- -**Ejemplo:** - -**1.** Crear un `url_engine_table` tabla en el servidor : - -``` sql -CREATE TABLE url_engine_table (word String, value UInt64) -ENGINE=URL('http://127.0.0.1:12345/', CSV) -``` - -**2.** Cree un servidor HTTP básico utilizando las herramientas estándar de Python 3 y -comenzarlo: - -``` python3 -from http.server import BaseHTTPRequestHandler, HTTPServer - -class CSVHTTPServer(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(200) - self.send_header('Content-type', 'text/csv') - self.end_headers() - - self.wfile.write(bytes('Hello,1\nWorld,2\n', "utf-8")) - -if __name__ == "__main__": - server_address = ('127.0.0.1', 12345) - HTTPServer(server_address, CSVHTTPServer).serve_forever() -``` - -``` bash -$ python3 server.py -``` - -**3.** Solicitar datos: - -``` sql -SELECT * FROM url_engine_table -``` - -``` text -┌─word──┬─value─┐ -│ Hello │ 1 │ -│ World │ 2 │ -└───────┴───────┘ -``` - -## Detalles de la implementación {#details-of-implementation} - -- Las lecturas y escrituras pueden ser paralelas -- No soportado: - - `ALTER` y `SELECT...SAMPLE` operación. - - Índices. - - Replicación. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/url/) diff --git a/docs/es/engines/table-engines/special/view.md b/docs/es/engines/table-engines/special/view.md deleted file mode 100644 index dbb496bcca4..00000000000 --- a/docs/es/engines/table-engines/special/view.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 42 -toc_title: Vista ---- - -# Vista {#table_engines-view} - -Se utiliza para implementar vistas (para obtener más información, consulte `CREATE VIEW query`). No almacena datos, pero solo almacena los datos especificados `SELECT` consulta. Al leer desde una tabla, ejecuta esta consulta (y elimina todas las columnas innecesarias de la consulta). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/table_engines/view/) diff --git a/docs/es/faq/general.md b/docs/es/faq/general.md deleted file mode 100644 index f8446e99152..00000000000 --- a/docs/es/faq/general.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 78 -toc_title: Preguntas generales ---- - -# Preguntas generales {#general-questions} - -## ¿Por qué no usar algo como MapReduce? {#why-not-use-something-like-mapreduce} - -Podemos referirnos a sistemas como MapReduce como sistemas informáticos distribuidos en los que la operación de reducción se basa en la clasificación distribuida. La solución de código abierto más común en esta clase es [Acerca de nosotros](http://hadoop.apache.org). Yandex utiliza su solución interna, YT. - -Estos sistemas no son apropiados para consultas en línea debido a su alta latencia. En otras palabras, no se pueden usar como back-end para una interfaz web. Estos tipos de sistemas no son útiles para actualizaciones de datos en tiempo real. La clasificación distribuida no es la mejor manera de realizar operaciones de reducción si el resultado de la operación y todos los resultados intermedios (si los hay) se encuentran en la RAM de un único servidor, que generalmente es el caso de las consultas en línea. En tal caso, una tabla hash es una forma óptima de realizar operaciones de reducción. Un enfoque común para optimizar las tareas de reducción de mapas es la preagregación (reducción parcial) utilizando una tabla hash en RAM. 
El usuario realiza esta optimización manualmente. La clasificación distribuida es una de las principales causas de un rendimiento reducido cuando se ejecutan tareas simples de reducción de mapas. - -La mayoría de las implementaciones de MapReduce le permiten ejecutar código arbitrario en un clúster. Pero un lenguaje de consulta declarativo es más adecuado para OLAP para ejecutar experimentos rápidamente. Por ejemplo, Hadoop tiene Hive y Pig. También considere Cloudera Impala o Shark (obsoleto) para Spark, así como Spark SQL, Presto y Apache Drill. El rendimiento cuando se ejecutan tales tareas es muy subóptimo en comparación con los sistemas especializados, pero la latencia relativamente alta hace que sea poco realista utilizar estos sistemas como back-end para una interfaz web. - -## ¿Qué sucede si tengo un problema con las codificaciones al usar Oracle a través de ODBC? {#oracle-odbc-encodings} - -Si utiliza Oracle a través del controlador ODBC como fuente de diccionarios externos, debe establecer el valor `NLS_LANG` variable de entorno en `/etc/default/clickhouse`. Para obtener más información, consulte [Oracle NLS_LANG Preguntas frecuentes](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). - -**Ejemplo** - -``` sql -NLS_LANG=RUSSIAN_RUSSIA.UTF8 -``` - -## Cómo exporto datos de ClickHouse a un archivo? {#how-to-export-to-file} - -### Uso de la cláusula INTO OUTFILE {#using-into-outfile-clause} - -Añadir un [INTO OUTFILE](../sql-reference/statements/select/into-outfile.md#into-outfile-clause) cláusula a su consulta. - -Por ejemplo: - -``` sql -SELECT * FROM table INTO OUTFILE 'file' -``` - -De forma predeterminada, ClickHouse usa el [TabSeparated](../interfaces/formats.md#tabseparated) formato de datos de salida. Para seleccionar el [formato de datos](../interfaces/formats.md), utilizar el [Cláusula FORMAT](../sql-reference/statements/select/format.md#format-clause). - -Por ejemplo: - -``` sql -SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV -``` - -### Uso de una tabla de motor de archivo {#using-a-file-engine-table} - -Ver [File](../engines/table-engines/special/file.md). - -### Uso de la redirección de línea de comandos {#using-command-line-redirection} - -``` sql -$ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt -``` - -Ver [Casa de clics-cliente](../interfaces/cli.md). - -{## [Artículo Original](https://clickhouse.tech/docs/en/faq/general/) ##} diff --git a/docs/es/faq/index.md b/docs/es/faq/index.md deleted file mode 100644 index a44dbb31e89..00000000000 --- a/docs/es/faq/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: F.A.Q. -toc_priority: 76 ---- - - diff --git a/docs/es/getting-started/example-datasets/amplab-benchmark.md b/docs/es/getting-started/example-datasets/amplab-benchmark.md deleted file mode 100644 index 066bf036266..00000000000 --- a/docs/es/getting-started/example-datasets/amplab-benchmark.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 17 -toc_title: Referencia de Big Data de AMPLab ---- - -# Referencia de Big Data de AMPLab {#amplab-big-data-benchmark} - -Ver https://amplab.cs.berkeley.edu/benchmark/ - -Regístrese para obtener una cuenta gratuita en https://aws.amazon.com. Requiere una tarjeta de crédito, correo electrónico y número de teléfono. 
Obtenga una nueva clave de acceso en https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential - -Ejecute lo siguiente en la consola: - -``` bash -$ sudo apt-get install s3cmd -$ mkdir tiny; cd tiny; -$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . -$ cd .. -$ mkdir 1node; cd 1node; -$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . -$ cd .. -$ mkdir 5nodes; cd 5nodes; -$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . -$ cd .. -``` - -Ejecute las siguientes consultas de ClickHouse: - -``` sql -CREATE TABLE rankings_tiny -( - pageURL String, - pageRank UInt32, - avgDuration UInt32 -) ENGINE = Log; - -CREATE TABLE uservisits_tiny -( - sourceIP String, - destinationURL String, - visitDate Date, - adRevenue Float32, - UserAgent String, - cCode FixedString(3), - lCode FixedString(6), - searchWord String, - duration UInt32 -) ENGINE = MergeTree(visitDate, visitDate, 8192); - -CREATE TABLE rankings_1node -( - pageURL String, - pageRank UInt32, - avgDuration UInt32 -) ENGINE = Log; - -CREATE TABLE uservisits_1node -( - sourceIP String, - destinationURL String, - visitDate Date, - adRevenue Float32, - UserAgent String, - cCode FixedString(3), - lCode FixedString(6), - searchWord String, - duration UInt32 -) ENGINE = MergeTree(visitDate, visitDate, 8192); - -CREATE TABLE rankings_5nodes_on_single -( - pageURL String, - pageRank UInt32, - avgDuration UInt32 -) ENGINE = Log; - -CREATE TABLE uservisits_5nodes_on_single -( - sourceIP String, - destinationURL String, - visitDate Date, - adRevenue Float32, - UserAgent String, - cCode FixedString(3), - lCode FixedString(6), - searchWord String, - duration UInt32 -) ENGINE = MergeTree(visitDate, visitDate, 8192); -``` - -Volver a la consola: - -``` bash -$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done -$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done -$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done -$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done -$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done -$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done -``` - -Consultas para obtener muestras de datos: - -``` sql -SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000 - -SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8) - -SELECT - sourceIP, - sum(adRevenue) AS totalRevenue, - avg(pageRank) AS pageRank -FROM rankings_1node ALL INNER JOIN -( - SELECT - sourceIP, - destinationURL AS pageURL, - adRevenue - FROM uservisits_1node - WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01') -) USING pageURL -GROUP BY sourceIP -ORDER BY totalRevenue DESC -LIMIT 1 -``` - -[Artículo 
Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/es/getting-started/example-datasets/criteo.md b/docs/es/getting-started/example-datasets/criteo.md deleted file mode 100644 index 79203b0276d..00000000000 --- a/docs/es/getting-started/example-datasets/criteo.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 19 -toc_title: Registros de clics de Terabyte de Criteo ---- - -# Terabyte de registros de clics de Criteo {#terabyte-of-click-logs-from-criteo} - -Descargue los datos de http://labs.criteo.com/downloads/download-terabyte-click-logs/ - -Cree una tabla para importar el registro: - -``` sql -CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log -``` - -Descargar los datos: - -``` bash -$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done -``` - -Crear una tabla para los datos convertidos: - -``` sql -CREATE TABLE criteo -( - date Date, - clicked UInt8, - int1 Int32, - int2 Int32, - int3 Int32, - int4 Int32, - int5 Int32, - int6 Int32, - int7 Int32, - int8 Int32, - int9 Int32, - int10 Int32, - int11 Int32, - int12 Int32, - int13 Int32, - icat1 UInt32, - icat2 UInt32, - icat3 UInt32, - icat4 UInt32, - icat5 UInt32, - icat6 UInt32, - icat7 UInt32, - icat8 UInt32, - icat9 UInt32, - icat10 UInt32, - icat11 UInt32, - icat12 UInt32, - icat13 UInt32, - icat14 UInt32, - icat15 UInt32, - icat16 UInt32, - icat17 UInt32, - icat18 UInt32, - icat19 UInt32, - icat20 UInt32, - icat21 UInt32, - icat22 UInt32, - icat23 UInt32, - icat24 UInt32, - icat25 UInt32, - icat26 UInt32 -) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192) -``` - -Transforme los datos del registro sin procesar y colóquelos en la segunda tabla: - -``` sql -INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS 
icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; - -DROP TABLE criteo_log; -``` - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/criteo/) diff --git a/docs/es/getting-started/example-datasets/index.md b/docs/es/getting-started/example-datasets/index.md deleted file mode 100644 index 28e06987af1..00000000000 --- a/docs/es/getting-started/example-datasets/index.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Datos De Ejemplo -toc_priority: 12 -toc_title: "Implantaci\xF3n" ---- - -# Datos De Ejemplo {#example-datasets} - -En esta sección se describe cómo obtener conjuntos de datos de ejemplo e importarlos a ClickHouse. -Para algunos conjuntos de datos también están disponibles consultas de ejemplo. - -- [Yandex anonimizado.Conjunto de datos de Metrica](metrica.md) -- [Estrella Schema Benchmark](star-schema.md) -- [Nombre de la red inalámbrica (SSID):](wikistat.md) -- [Terabyte de registros de clics de Criteo](criteo.md) -- [Referencia de Big Data de AMPLab](amplab-benchmark.md) -- [Datos de taxis de Nueva York](nyc-taxi.md) -- [A tiempo](ontime.md) - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets) diff --git a/docs/es/getting-started/example-datasets/metrica.md b/docs/es/getting-started/example-datasets/metrica.md deleted file mode 100644 index 0b3bc8b6833..00000000000 --- a/docs/es/getting-started/example-datasets/metrica.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 14 -toc_title: El Yandex.Metrica Datos ---- - -# Yandex anonimizado.Metrica Datos {#anonymized-yandex-metrica-data} - -El conjunto de datos consta de dos tablas que contienen datos anónimos sobre los hits (`hits_v1`) y visitas (`visits_v1`) el Yandex.Métrica. Puedes leer más sobre Yandex.Metrica en [Historial de ClickHouse](../../introduction/history.md) apartado. - -El conjunto de datos consta de dos tablas, cualquiera de ellas se puede descargar como `tsv.xz` o como particiones preparadas. Además, una versión extendida de la `hits` La tabla que contiene 100 millones de filas está disponible como TSV en https://datasets.clickhouse.tech/hits/tsv/hits_100m_obfuscated_v1.tsv.xz y como particiones preparadas en https://datasets.clickhouse.tech/hits/partitions/hits_100m_obfuscated_v1.tar.xz. 
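Once either of the tables has been imported by one of the methods described below, a quick sanity check confirms the data is in place. This is a hypothetical example; the table and column names follow the `CREATE TABLE` statements shown later on this page:

``` sql
-- Hypothetical sanity checks after importing hits_v1 and visits_v1 by either
-- method below. Column names follow the CREATE TABLE statements on this page.
SELECT count() FROM datasets.hits_v1;
SELECT count() FROM datasets.visits_v1;

-- Hits per day, as a quick look at the date range covered by the dataset.
SELECT EventDate, count() AS hits
FROM datasets.hits_v1
GROUP BY EventDate
ORDER BY EventDate;
```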
- -## Obtención de tablas a partir de particiones preparadas {#obtaining-tables-from-prepared-partitions} - -Descargar e importar tabla de hits: - -``` bash -curl -O https://datasets.clickhouse.tech/hits/partitions/hits_v1.tar -tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory -# check permissions on unpacked data, fix if required -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" -``` - -Descargar e importar visitas: - -``` bash -curl -O https://datasets.clickhouse.tech/visits/partitions/visits_v1.tar -tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory -# check permissions on unpacked data, fix if required -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" -``` - -## Obtención de tablas a partir de un archivo TSV comprimido {#obtaining-tables-from-compressed-tsv-file} - -Descargar e importar hits desde un archivo TSV comprimido: - -``` bash -curl https://datasets.clickhouse.tech/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv -# now create table -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, 
OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" -# import data -cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 -# optionally you can optimize table -clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" -clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" -``` - -Descargue e importe visitas desde un archivo tsv comprimido: - -``` bash -curl https://datasets.clickhouse.tech/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv -# now create table -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, 
ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" -# import data -cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 -# optionally you can optimize table -clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" -clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" -``` - -## Consultas de ejemplo {#example-queries} - -[Tutorial de ClickHouse](../../getting-started/tutorial.md) se basa en Yandex.El conjunto de datos de Metrica y la forma recomendada de comenzar con este conjunto de datos es simplemente pasar por el tutorial. - -Se pueden encontrar ejemplos adicionales de consultas a estas tablas entre [pruebas estatales](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) de ClickHouse (se nombran `test.hists` y `test.visits` alli). diff --git a/docs/es/getting-started/example-datasets/nyc-taxi.md b/docs/es/getting-started/example-datasets/nyc-taxi.md deleted file mode 100644 index c6441311c96..00000000000 --- a/docs/es/getting-started/example-datasets/nyc-taxi.md +++ /dev/null @@ -1,390 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 16 -toc_title: Datos de taxis de Nueva York ---- - -# Datos de taxis de Nueva York {#new-york-taxi-data} - -Este conjunto de datos se puede obtener de dos maneras: - -- importación de datos sin procesar -- descarga de particiones preparadas - -## Cómo importar los datos sin procesar {#how-to-import-the-raw-data} - -Consulte https://github.com/toddwschneider/nyc-taxi-data y http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html para obtener la descripción de un conjunto de datos e instrucciones para descargar. 
- -La descarga dará como resultado aproximadamente 227 GB de datos sin comprimir en archivos CSV. La descarga tarda aproximadamente una hora en una conexión de 1 Gbit (la descarga paralela de s3.amazonaws.com recupera al menos la mitad de un canal de 1 Gbit). -Es posible que algunos de los archivos no se descarguen por completo. Verifique los tamaños de archivo y vuelva a descargar cualquiera que parezca dudoso. - -Algunos de los archivos pueden contener filas no válidas. Puede arreglarlos de la siguiente manera: - -``` bash -sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ -sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ -mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv -mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv -``` - -Entonces los datos deben ser preprocesados en PostgreSQL. Esto creará selecciones de puntos en los polígonos (para hacer coincidir los puntos en el mapa con los distritos de la ciudad de Nueva York) y combinará todos los datos en una única tabla plana desnormalizada mediante el uso de una unión. Para hacer esto, deberá instalar PostgreSQL con soporte PostGIS. - -Tenga cuidado al correr `initialize_database.sh` y volver a verificar manualmente que todas las tablas se crearon correctamente. - -Se tarda entre 20 y 30 minutos en procesar los datos de cada mes en PostgreSQL, por un total de aproximadamente 48 horas. - -Puede comprobar el número de filas descargadas de la siguiente manera: - -``` bash -$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" -## Count - 1298979494 -(1 row) - -real 7m9.164s -``` - -(Esto es un poco más de 1.1 mil millones de filas reportadas por Mark Litwintschik en una serie de publicaciones de blog.) - -Los datos en PostgreSQL utilizan 370 GB de espacio. 
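Before exporting, it can also be worth spot-checking the denormalized `trips` table in PostgreSQL, since the ClickHouse conversion further below assumes the join columns are populated. A hypothetical example, using column names that appear in the `COPY` statement below:

``` sql
-- Hypothetical PostgreSQL spot checks on the denormalized trips table.
-- Column names are taken from the COPY statement below.
SELECT count(*) AS trips_without_pickup_area
FROM trips
WHERE pickup_nyct2010_gid IS NULL;

SELECT min(pickup_datetime) AS first_trip,
       max(pickup_datetime) AS last_trip
FROM trips;
```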
- -Exportación de los datos de PostgreSQL: - -``` sql -COPY -( - SELECT trips.id, - trips.vendor_id, - trips.pickup_datetime, - trips.dropoff_datetime, - trips.store_and_fwd_flag, - trips.rate_code_id, - trips.pickup_longitude, - trips.pickup_latitude, - trips.dropoff_longitude, - trips.dropoff_latitude, - trips.passenger_count, - trips.trip_distance, - trips.fare_amount, - trips.extra, - trips.mta_tax, - trips.tip_amount, - trips.tolls_amount, - trips.ehail_fee, - trips.improvement_surcharge, - trips.total_amount, - trips.payment_type, - trips.trip_type, - trips.pickup, - trips.dropoff, - - cab_types.type cab_type, - - weather.precipitation_tenths_of_mm rain, - weather.snow_depth_mm, - weather.snowfall_mm, - weather.max_temperature_tenths_degrees_celsius max_temp, - weather.min_temperature_tenths_degrees_celsius min_temp, - weather.average_wind_speed_tenths_of_meters_per_second wind, - - pick_up.gid pickup_nyct2010_gid, - pick_up.ctlabel pickup_ctlabel, - pick_up.borocode pickup_borocode, - pick_up.boroname pickup_boroname, - pick_up.ct2010 pickup_ct2010, - pick_up.boroct2010 pickup_boroct2010, - pick_up.cdeligibil pickup_cdeligibil, - pick_up.ntacode pickup_ntacode, - pick_up.ntaname pickup_ntaname, - pick_up.puma pickup_puma, - - drop_off.gid dropoff_nyct2010_gid, - drop_off.ctlabel dropoff_ctlabel, - drop_off.borocode dropoff_borocode, - drop_off.boroname dropoff_boroname, - drop_off.ct2010 dropoff_ct2010, - drop_off.boroct2010 dropoff_boroct2010, - drop_off.cdeligibil dropoff_cdeligibil, - drop_off.ntacode dropoff_ntacode, - drop_off.ntaname dropoff_ntaname, - drop_off.puma dropoff_puma - FROM trips - LEFT JOIN cab_types - ON trips.cab_type_id = cab_types.id - LEFT JOIN central_park_weather_observations_raw weather - ON weather.date = trips.pickup_datetime::date - LEFT JOIN nyct2010 pick_up - ON pick_up.gid = trips.pickup_nyct2010_gid - LEFT JOIN nyct2010 drop_off - ON drop_off.gid = trips.dropoff_nyct2010_gid -) TO '/opt/milovidov/nyc-taxi-data/trips.tsv'; -``` - -La instantánea de datos se crea a una velocidad de aproximadamente 50 MB por segundo. Al crear la instantánea, PostgreSQL lee desde el disco a una velocidad de aproximadamente 28 MB por segundo. -Esto toma alrededor de 5 horas. El archivo TSV resultante es 590612904969 bytes. 
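As noted further down on this page, the intermediate TSV file can be skipped by streaming rows straight from PostgreSQL into `clickhouse-client` with `COPY ... TO PROGRAM`. A minimal sketch, assuming the ClickHouse `trips` table from the next step already exists, that `clickhouse-client` is installed on the PostgreSQL host, and that the session has the privileges `COPY ... TO PROGRAM` requires; the column list is abbreviated here and would in practice be the full `SELECT` from the `COPY` statement above:

``` sql
-- Hypothetical variant of the export above: stream directly from PostgreSQL
-- into ClickHouse instead of writing trips.tsv to disk.
-- The column list is abbreviated; use the full SELECT from the COPY above.
COPY (
    SELECT trips.id,
           trips.vendor_id,
           trips.pickup_datetime
           -- ..., remaining columns exactly as in the COPY statement above
    FROM trips
) TO PROGRAM 'clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated"';
```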
- -Crear una tabla temporal en ClickHouse: - -``` sql -CREATE TABLE trips -( -trip_id UInt32, -vendor_id String, -pickup_datetime DateTime, -dropoff_datetime Nullable(DateTime), -store_and_fwd_flag Nullable(FixedString(1)), -rate_code_id Nullable(UInt8), -pickup_longitude Nullable(Float64), -pickup_latitude Nullable(Float64), -dropoff_longitude Nullable(Float64), -dropoff_latitude Nullable(Float64), -passenger_count Nullable(UInt8), -trip_distance Nullable(Float64), -fare_amount Nullable(Float32), -extra Nullable(Float32), -mta_tax Nullable(Float32), -tip_amount Nullable(Float32), -tolls_amount Nullable(Float32), -ehail_fee Nullable(Float32), -improvement_surcharge Nullable(Float32), -total_amount Nullable(Float32), -payment_type Nullable(String), -trip_type Nullable(UInt8), -pickup Nullable(String), -dropoff Nullable(String), -cab_type Nullable(String), -precipitation Nullable(UInt8), -snow_depth Nullable(UInt8), -snowfall Nullable(UInt8), -max_temperature Nullable(UInt8), -min_temperature Nullable(UInt8), -average_wind_speed Nullable(UInt8), -pickup_nyct2010_gid Nullable(UInt8), -pickup_ctlabel Nullable(String), -pickup_borocode Nullable(UInt8), -pickup_boroname Nullable(String), -pickup_ct2010 Nullable(String), -pickup_boroct2010 Nullable(String), -pickup_cdeligibil Nullable(FixedString(1)), -pickup_ntacode Nullable(String), -pickup_ntaname Nullable(String), -pickup_puma Nullable(String), -dropoff_nyct2010_gid Nullable(UInt8), -dropoff_ctlabel Nullable(String), -dropoff_borocode Nullable(UInt8), -dropoff_boroname Nullable(String), -dropoff_ct2010 Nullable(String), -dropoff_boroct2010 Nullable(String), -dropoff_cdeligibil Nullable(String), -dropoff_ntacode Nullable(String), -dropoff_ntaname Nullable(String), -dropoff_puma Nullable(String) -) ENGINE = Log; -``` - -Es necesario para convertir campos a tipos de datos más correctos y, si es posible, para eliminar NULL. - -``` bash -$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv - -real 75m56.214s -``` - -Los datos se leen a una velocidad de 112-140 Mb / segundo. -La carga de datos en una tabla de tipos de registro en una secuencia tardó 76 minutos. -Los datos de esta tabla utilizan 142 GB. - -(Importar datos directamente desde Postgres también es posible usando `COPY ... TO PROGRAM`.) - -Unfortunately, all the fields associated with the weather (precipitation…average_wind_speed) were filled with NULL. Because of this, we will remove them from the final data set. - -Para empezar, crearemos una tabla en un único servidor. Posteriormente haremos la mesa distribuida. 
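Before building the summary table, the statement above that the weather columns came through as NULL can be verified directly against the temporary table. A hypothetical check:

``` sql
-- Hypothetical check that the weather columns are entirely NULL in the
-- temporary Log table, before they are dropped from the final dataset.
SELECT
    countIf(isNull(precipitation)) AS null_precipitation,
    countIf(isNull(average_wind_speed)) AS null_wind,
    count() AS total_rows
FROM trips;
```

If `null_precipitation` and `null_wind` equal `total_rows`, dropping these columns loses no information.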
- -Crear y rellenar una tabla de resumen: - -``` sql -CREATE TABLE trips_mergetree -ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) -AS SELECT - -trip_id, -CAST(vendor_id AS Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id, -toDate(pickup_datetime) AS pickup_date, -ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime, -toDate(dropoff_datetime) AS dropoff_date, -ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime, -assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag, -assumeNotNull(rate_code_id) AS rate_code_id, -assumeNotNull(pickup_longitude) AS pickup_longitude, -assumeNotNull(pickup_latitude) AS pickup_latitude, -assumeNotNull(dropoff_longitude) AS dropoff_longitude, -assumeNotNull(dropoff_latitude) AS dropoff_latitude, -assumeNotNull(passenger_count) AS passenger_count, -assumeNotNull(trip_distance) AS trip_distance, -assumeNotNull(fare_amount) AS fare_amount, -assumeNotNull(extra) AS extra, -assumeNotNull(mta_tax) AS mta_tax, -assumeNotNull(tip_amount) AS tip_amount, -assumeNotNull(tolls_amount) AS tolls_amount, -assumeNotNull(ehail_fee) AS ehail_fee, -assumeNotNull(improvement_surcharge) AS improvement_surcharge, -assumeNotNull(total_amount) AS total_amount, -CAST((assumeNotNull(payment_type) AS pt) IN ('CSH', 'CASH', 'Cash', 'CAS', 'Cas', '1') ? 'CSH' : (pt IN ('CRD', 'Credit', 'Cre', 'CRE', 'CREDIT', '2') ? 'CRE' : (pt IN ('NOC', 'No Charge', 'No', '3') ? 'NOC' : (pt IN ('DIS', 'Dispute', 'Dis', '4') ? 'DIS' : 'UNK'))) AS Enum8('CSH' = 1, 'CRE' = 2, 'UNK' = 0, 'NOC' = 3, 'DIS' = 4)) AS payment_type_, -assumeNotNull(trip_type) AS trip_type, -ifNull(toFixedString(unhex(pickup), 25), toFixedString('', 25)) AS pickup, -ifNull(toFixedString(unhex(dropoff), 25), toFixedString('', 25)) AS dropoff, -CAST(assumeNotNull(cab_type) AS Enum8('yellow' = 1, 'green' = 2, 'uber' = 3)) AS cab_type, - -assumeNotNull(pickup_nyct2010_gid) AS pickup_nyct2010_gid, -toFloat32(ifNull(pickup_ctlabel, '0')) AS pickup_ctlabel, -assumeNotNull(pickup_borocode) AS pickup_borocode, -CAST(assumeNotNull(pickup_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS pickup_boroname, -toFixedString(ifNull(pickup_ct2010, '000000'), 6) AS pickup_ct2010, -toFixedString(ifNull(pickup_boroct2010, '0000000'), 7) AS pickup_boroct2010, -CAST(assumeNotNull(ifNull(pickup_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS pickup_cdeligibil, -toFixedString(ifNull(pickup_ntacode, '0000'), 4) AS pickup_ntacode, - -CAST(assumeNotNull(pickup_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 
'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 
157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS pickup_ntaname, - -toUInt16(ifNull(pickup_puma, '0')) AS pickup_puma, - -assumeNotNull(dropoff_nyct2010_gid) AS dropoff_nyct2010_gid, -toFloat32(ifNull(dropoff_ctlabel, '0')) AS dropoff_ctlabel, -assumeNotNull(dropoff_borocode) AS dropoff_borocode, -CAST(assumeNotNull(dropoff_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS dropoff_boroname, -toFixedString(ifNull(dropoff_ct2010, '000000'), 6) AS dropoff_ct2010, -toFixedString(ifNull(dropoff_boroct2010, '0000000'), 7) AS dropoff_boroct2010, -CAST(assumeNotNull(ifNull(dropoff_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS dropoff_cdeligibil, -toFixedString(ifNull(dropoff_ntacode, '0000'), 4) AS dropoff_ntacode, - -CAST(assumeNotNull(dropoff_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 
'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS dropoff_ntaname, - -toUInt16(ifNull(dropoff_puma, '0')) AS dropoff_puma - -FROM trips -``` - -Esto toma 3030 segundos a una velocidad de aproximadamente 428,000 filas por segundo. -Para cargarlo más rápido, puede crear la tabla con el `Log` motor en lugar de `MergeTree`. En este caso, la descarga funciona más rápido que 200 segundos. - -La tabla utiliza 126 GB de espacio en disco. - -``` sql -SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active -``` - -``` text -┌─formatReadableSize(sum(bytes))─┐ -│ 126.18 GiB │ -└────────────────────────────────┘ -``` - -Entre otras cosas, puede ejecutar la consulta OPTIMIZE en MergeTree. Pero no es necesario ya que todo estará bien sin él. - -## Descarga de Prepared Partitions {#download-of-prepared-partitions} - -``` bash -$ curl -O https://datasets.clickhouse.tech/trips_mergetree/partitions/trips_mergetree.tar -$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory -$ # check permissions of unpacked data, fix if required -$ sudo service clickhouse-server restart -$ clickhouse-client --query "select count(*) from datasets.trips_mergetree" -``` - -!!! info "INFO" - Si va a ejecutar las consultas que se describen a continuación, debe usar el nombre completo de la tabla, `datasets.trips_mergetree`. - -## Resultados en un solo servidor {#results-on-single-server} - -Q1: - -``` sql -SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type -``` - -0.490 segundos. - -Q2: - -``` sql -SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count -``` - -1.224 segundos. - -Q3: - -``` sql -SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year -``` - -2.104 segundos. - -Q4: - -``` sql -SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) -FROM trips_mergetree -GROUP BY passenger_count, year, distance -ORDER BY year, count(*) DESC -``` - -3.593 segundos. - -Se utilizó el siguiente servidor: - -Dos CPU Intel (R) Xeon (R) E5-2650 v2 @ 2.60GHz, 16 núcleos físicos en total, 128 GiB RAM, 8x6 TB HD en hardware RAID-5 - -El tiempo de ejecución es el mejor de tres carreras. Pero a partir de la segunda ejecución, las consultas leen datos de la memoria caché del sistema de archivos. 
No se produce más almacenamiento en caché: los datos se leen y procesan en cada ejecución. - -Creación de una tabla en tres servidores: - -En cada servidor: - -``` sql -CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) -``` - -En el servidor de origen: - -``` sql -CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) -``` - -La siguiente consulta redistribuye los datos: - -``` sql -INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree -``` - -Esto tarda 2454 segundos. - -En tres servidores: - -Q1: 0.212 segundos. -Q2: 0.438 segundos. -Q3: 0.733 segundos. -Q4: 1.241 segundos. - -No hay sorpresas aquí, ya que las consultas se escalan linealmente. - -También tenemos los resultados de un clúster de 140 servidores: - -Q1: 0,028 seg. -Q2: 0,043 seg. -Q3: 0,051 seg. -Q4: 0,072 seg. - -En este caso, el tiempo de procesamiento de la consulta está determinado sobre todo por la latencia de la red. -Ejecutamos consultas utilizando un cliente ubicado en un centro de datos de Yandex en Finlandia en un clúster en Rusia, que agregó aproximadamente 20 ms de latencia. - -## Resumen {#summary} - -| servidor | Q1 | Q2 | Q3 | Q4 | -|----------|-------|-------|-------|-------| -| 1 | 0.490 | 1.224 | 2.104 | 3.593 | -| 3 | 0.212 | 0.438 | 0.733 | 1.241 | -| 140 | 0.028 | 0.043 | 0.051 | 0.072 | - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/es/getting-started/example-datasets/ontime.md b/docs/es/getting-started/example-datasets/ontime.md deleted file mode 100644 index f89d74048bd..00000000000 --- a/docs/es/getting-started/example-datasets/ontime.md +++ /dev/null @@ -1,412 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 15 -toc_title: A tiempo ---- - -# A tiempo {#ontime} - -Este conjunto de datos se puede obtener de dos maneras: - -- importación de datos sin procesar -- descarga de particiones preparadas - -## Importar desde datos sin procesar {#import-from-raw-data} - -Descarga de datos: - -``` bash -for s in `seq 1987 2018` -do -for m in `seq 1 12` -do -wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip -done -done -``` - -(a partir de https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) - -Creación de una tabla: - -``` sql -CREATE TABLE `ontime` ( - `Year` UInt16, - `Quarter` UInt8, - `Month` UInt8, - `DayofMonth` UInt8, - `DayOfWeek` UInt8, - `FlightDate` Date, - `UniqueCarrier` FixedString(7), - `AirlineID` Int32, - `Carrier` FixedString(2), - `TailNum` String, - `FlightNum` String, - `OriginAirportID` Int32, - `OriginAirportSeqID` Int32, - `OriginCityMarketID` Int32, - `Origin` FixedString(5), - `OriginCityName` String, - `OriginState` FixedString(2), - `OriginStateFips` String, - `OriginStateName` String, - `OriginWac` Int32, - `DestAirportID` Int32, - `DestAirportSeqID` Int32, - `DestCityMarketID` Int32, - `Dest` FixedString(5), - `DestCityName` String, - `DestState` FixedString(2), - `DestStateFips` String, - `DestStateName` String, - `DestWac` Int32, - `CRSDepTime` Int32, - `DepTime` 
Int32, - `DepDelay` Int32, - `DepDelayMinutes` Int32, - `DepDel15` Int32, - `DepartureDelayGroups` String, - `DepTimeBlk` String, - `TaxiOut` Int32, - `WheelsOff` Int32, - `WheelsOn` Int32, - `TaxiIn` Int32, - `CRSArrTime` Int32, - `ArrTime` Int32, - `ArrDelay` Int32, - `ArrDelayMinutes` Int32, - `ArrDel15` Int32, - `ArrivalDelayGroups` Int32, - `ArrTimeBlk` String, - `Cancelled` UInt8, - `CancellationCode` FixedString(1), - `Diverted` UInt8, - `CRSElapsedTime` Int32, - `ActualElapsedTime` Int32, - `AirTime` Int32, - `Flights` Int32, - `Distance` Int32, - `DistanceGroup` UInt8, - `CarrierDelay` Int32, - `WeatherDelay` Int32, - `NASDelay` Int32, - `SecurityDelay` Int32, - `LateAircraftDelay` Int32, - `FirstDepTime` String, - `TotalAddGTime` String, - `LongestAddGTime` String, - `DivAirportLandings` String, - `DivReachedDest` String, - `DivActualElapsedTime` String, - `DivArrDelay` String, - `DivDistance` String, - `Div1Airport` String, - `Div1AirportID` Int32, - `Div1AirportSeqID` Int32, - `Div1WheelsOn` String, - `Div1TotalGTime` String, - `Div1LongestGTime` String, - `Div1WheelsOff` String, - `Div1TailNum` String, - `Div2Airport` String, - `Div2AirportID` Int32, - `Div2AirportSeqID` Int32, - `Div2WheelsOn` String, - `Div2TotalGTime` String, - `Div2LongestGTime` String, - `Div2WheelsOff` String, - `Div2TailNum` String, - `Div3Airport` String, - `Div3AirportID` Int32, - `Div3AirportSeqID` Int32, - `Div3WheelsOn` String, - `Div3TotalGTime` String, - `Div3LongestGTime` String, - `Div3WheelsOff` String, - `Div3TailNum` String, - `Div4Airport` String, - `Div4AirportID` Int32, - `Div4AirportSeqID` Int32, - `Div4WheelsOn` String, - `Div4TotalGTime` String, - `Div4LongestGTime` String, - `Div4WheelsOff` String, - `Div4TailNum` String, - `Div5Airport` String, - `Div5AirportID` Int32, - `Div5AirportSeqID` Int32, - `Div5WheelsOn` String, - `Div5TotalGTime` String, - `Div5LongestGTime` String, - `Div5WheelsOff` String, - `Div5TailNum` String -) ENGINE = MergeTree -PARTITION BY Year -ORDER BY (Carrier, FlightDate) -SETTINGS index_granularity = 8192; -``` - -Carga de datos: - -``` bash -$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done -``` - -## Descarga de Prepared Partitions {#download-of-prepared-partitions} - -``` bash -$ curl -O https://datasets.clickhouse.tech/ontime/partitions/ontime.tar -$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory -$ # check permissions of unpacked data, fix if required -$ sudo service clickhouse-server restart -$ clickhouse-client --query "select count(*) from datasets.ontime" -``` - -!!! info "INFO" - Si va a ejecutar las consultas que se describen a continuación, debe usar el nombre completo de la tabla, `datasets.ontime`. - -## Consulta {#queries} - -Q0. - -``` sql -SELECT avg(c1) -FROM -( - SELECT Year, Month, count(*) AS c1 - FROM ontime - GROUP BY Year, Month -); -``` - -Q1. El número de vuelos por día desde el año 2000 hasta 2008 - -``` sql -SELECT DayOfWeek, count(*) AS c -FROM ontime -WHERE Year>=2000 AND Year<=2008 -GROUP BY DayOfWeek -ORDER BY c DESC; -``` - -Preguntas frecuentes El número de vuelos retrasados por más de 10 minutos, agrupados por el día de la semana, para 2000-2008 - -``` sql -SELECT DayOfWeek, count(*) AS c -FROM ontime -WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 -GROUP BY DayOfWeek -ORDER BY c DESC; -``` - -Q3. 
El número de retrasos por parte del aeropuerto para 2000-2008 - -``` sql -SELECT Origin, count(*) AS c -FROM ontime -WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 -GROUP BY Origin -ORDER BY c DESC -LIMIT 10; -``` - -Preguntas más frecuentes Número de retrasos por transportista para 2007 - -``` sql -SELECT Carrier, count(*) -FROM ontime -WHERE DepDelay>10 AND Year=2007 -GROUP BY Carrier -ORDER BY count(*) DESC; -``` - -Q5. El porcentaje de retrasos por transportista para 2007 - -``` sql -SELECT Carrier, c, c2, c*100/c2 as c3 -FROM -( - SELECT - Carrier, - count(*) AS c - FROM ontime - WHERE DepDelay>10 - AND Year=2007 - GROUP BY Carrier -) -JOIN -( - SELECT - Carrier, - count(*) AS c2 - FROM ontime - WHERE Year=2007 - GROUP BY Carrier -) USING Carrier -ORDER BY c3 DESC; -``` - -Mejor versión de la misma consulta: - -``` sql -SELECT Carrier, avg(DepDelay>10)*100 AS c3 -FROM ontime -WHERE Year=2007 -GROUP BY Carrier -ORDER BY c3 DESC -``` - -¿Por qué? La solicitud anterior de una gama más amplia de años, 2000-2008 - -``` sql -SELECT Carrier, c, c2, c*100/c2 as c3 -FROM -( - SELECT - Carrier, - count(*) AS c - FROM ontime - WHERE DepDelay>10 - AND Year>=2000 AND Year<=2008 - GROUP BY Carrier -) -JOIN -( - SELECT - Carrier, - count(*) AS c2 - FROM ontime - WHERE Year>=2000 AND Year<=2008 - GROUP BY Carrier -) USING Carrier -ORDER BY c3 DESC; -``` - -Mejor versión de la misma consulta: - -``` sql -SELECT Carrier, avg(DepDelay>10)*100 AS c3 -FROM ontime -WHERE Year>=2000 AND Year<=2008 -GROUP BY Carrier -ORDER BY c3 DESC; -``` - -Preguntas frecuentes Porcentaje de vuelos retrasados por más de 10 minutos, por año - -``` sql -SELECT Year, c1/c2 -FROM -( - select - Year, - count(*)*100 as c1 - from ontime - WHERE DepDelay>10 - GROUP BY Year -) -JOIN -( - select - Year, - count(*) as c2 - from ontime - GROUP BY Year -) USING (Year) -ORDER BY Year; -``` - -Mejor versión de la misma consulta: - -``` sql -SELECT Year, avg(DepDelay>10)*100 -FROM ontime -GROUP BY Year -ORDER BY Year; -``` - -¿Por qué? Los destinos más populares por el número de ciudades conectadas directamente para varios rangos de año - -``` sql -SELECT DestCityName, uniqExact(OriginCityName) AS u -FROM ontime -WHERE Year >= 2000 and Year <= 2010 -GROUP BY DestCityName -ORDER BY u DESC LIMIT 10; -``` - -Q9. - -``` sql -SELECT Year, count(*) AS c1 -FROM ontime -GROUP BY Year; -``` - -Q10. - -``` sql -SELECT - min(Year), max(Year), Carrier, count(*) AS cnt, - sum(ArrDelayMinutes>30) AS flights_delayed, - round(sum(ArrDelayMinutes>30)/count(*),2) AS rate -FROM ontime -WHERE - DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI') - AND DestState NOT IN ('AK', 'HI', 'PR', 'VI') - AND FlightDate < '2010-01-01' -GROUP by Carrier -HAVING cnt>100000 and max(Year)>1990 -ORDER by rate DESC -LIMIT 1000; -``` - -Bono: - -``` sql -SELECT avg(cnt) -FROM -( - SELECT Year,Month,count(*) AS cnt - FROM ontime - WHERE DepDel15=1 - GROUP BY Year,Month -); - -SELECT avg(c1) FROM -( - SELECT Year,Month,count(*) AS c1 - FROM ontime - GROUP BY Year,Month -); - -SELECT DestCityName, uniqExact(OriginCityName) AS u -FROM ontime -GROUP BY DestCityName -ORDER BY u DESC -LIMIT 10; - -SELECT OriginCityName, DestCityName, count() AS c -FROM ontime -GROUP BY OriginCityName, DestCityName -ORDER BY c DESC -LIMIT 10; - -SELECT OriginCityName, count() AS c -FROM ontime -GROUP BY OriginCityName -ORDER BY c DESC -LIMIT 10; -``` - -Esta prueba de rendimiento fue creada por Vadim Tkachenko. 
Ver: - -- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ -- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ -- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ -- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ -- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ -- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/ontime/) diff --git a/docs/es/getting-started/example-datasets/star-schema.md b/docs/es/getting-started/example-datasets/star-schema.md deleted file mode 100644 index 43f878eb205..00000000000 --- a/docs/es/getting-started/example-datasets/star-schema.md +++ /dev/null @@ -1,370 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 20 -toc_title: Estrella Schema Benchmark ---- - -# Estrella Schema Benchmark {#star-schema-benchmark} - -Compilación de dbgen: - -``` bash -$ git clone git@github.com:vadimtk/ssb-dbgen.git -$ cd ssb-dbgen -$ make -``` - -Generación de datos: - -!!! warning "Atención" - Con `-s 100` dbgen genera 600 millones de filas (67 GB), mientras que `-s 1000` genera 6 mil millones de filas (lo que lleva mucho tiempo) - -``` bash -$ ./dbgen -s 1000 -T c -$ ./dbgen -s 1000 -T l -$ ./dbgen -s 1000 -T p -$ ./dbgen -s 1000 -T s -$ ./dbgen -s 1000 -T d -``` - -Creación de tablas en ClickHouse: - -``` sql -CREATE TABLE customer -( - C_CUSTKEY UInt32, - C_NAME String, - C_ADDRESS String, - C_CITY LowCardinality(String), - C_NATION LowCardinality(String), - C_REGION LowCardinality(String), - C_PHONE String, - C_MKTSEGMENT LowCardinality(String) -) -ENGINE = MergeTree ORDER BY (C_CUSTKEY); - -CREATE TABLE lineorder -( - LO_ORDERKEY UInt32, - LO_LINENUMBER UInt8, - LO_CUSTKEY UInt32, - LO_PARTKEY UInt32, - LO_SUPPKEY UInt32, - LO_ORDERDATE Date, - LO_ORDERPRIORITY LowCardinality(String), - LO_SHIPPRIORITY UInt8, - LO_QUANTITY UInt8, - LO_EXTENDEDPRICE UInt32, - LO_ORDTOTALPRICE UInt32, - LO_DISCOUNT UInt8, - LO_REVENUE UInt32, - LO_SUPPLYCOST UInt32, - LO_TAX UInt8, - LO_COMMITDATE Date, - LO_SHIPMODE LowCardinality(String) -) -ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); - -CREATE TABLE part -( - P_PARTKEY UInt32, - P_NAME String, - P_MFGR LowCardinality(String), - P_CATEGORY LowCardinality(String), - P_BRAND LowCardinality(String), - P_COLOR LowCardinality(String), - P_TYPE LowCardinality(String), - P_SIZE UInt8, - P_CONTAINER LowCardinality(String) -) -ENGINE = MergeTree ORDER BY P_PARTKEY; - -CREATE TABLE supplier -( - S_SUPPKEY UInt32, - S_NAME String, - S_ADDRESS String, - S_CITY LowCardinality(String), - S_NATION LowCardinality(String), - S_REGION LowCardinality(String), - S_PHONE String -) -ENGINE = MergeTree ORDER BY S_SUPPKEY; -``` - -Insertar datos: - -``` bash -$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl -$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl -$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl -$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl -``` - -Conversión “star schema” a desnormalizado “flat schema”: - -``` sql -SET max_memory_usage = 20000000000; - -CREATE TABLE lineorder_flat -ENGINE = MergeTree 
-PARTITION BY toYear(LO_ORDERDATE) -ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS -SELECT - l.LO_ORDERKEY AS LO_ORDERKEY, - l.LO_LINENUMBER AS LO_LINENUMBER, - l.LO_CUSTKEY AS LO_CUSTKEY, - l.LO_PARTKEY AS LO_PARTKEY, - l.LO_SUPPKEY AS LO_SUPPKEY, - l.LO_ORDERDATE AS LO_ORDERDATE, - l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY, - l.LO_SHIPPRIORITY AS LO_SHIPPRIORITY, - l.LO_QUANTITY AS LO_QUANTITY, - l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE, - l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE, - l.LO_DISCOUNT AS LO_DISCOUNT, - l.LO_REVENUE AS LO_REVENUE, - l.LO_SUPPLYCOST AS LO_SUPPLYCOST, - l.LO_TAX AS LO_TAX, - l.LO_COMMITDATE AS LO_COMMITDATE, - l.LO_SHIPMODE AS LO_SHIPMODE, - c.C_NAME AS C_NAME, - c.C_ADDRESS AS C_ADDRESS, - c.C_CITY AS C_CITY, - c.C_NATION AS C_NATION, - c.C_REGION AS C_REGION, - c.C_PHONE AS C_PHONE, - c.C_MKTSEGMENT AS C_MKTSEGMENT, - s.S_NAME AS S_NAME, - s.S_ADDRESS AS S_ADDRESS, - s.S_CITY AS S_CITY, - s.S_NATION AS S_NATION, - s.S_REGION AS S_REGION, - s.S_PHONE AS S_PHONE, - p.P_NAME AS P_NAME, - p.P_MFGR AS P_MFGR, - p.P_CATEGORY AS P_CATEGORY, - p.P_BRAND AS P_BRAND, - p.P_COLOR AS P_COLOR, - p.P_TYPE AS P_TYPE, - p.P_SIZE AS P_SIZE, - p.P_CONTAINER AS P_CONTAINER -FROM lineorder AS l -INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY -INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY -INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; -``` - -Las consultas: - -Q1.1 - -``` sql -SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM lineorder_flat -WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; -``` - -Q1.2 - -``` sql -SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM lineorder_flat -WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; -``` - -Q1.3 - -``` sql -SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM lineorder_flat -WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 - AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; -``` - -Q2.1 - -``` sql -SELECT - sum(LO_REVENUE), - toYear(LO_ORDERDATE) AS year, - P_BRAND -FROM lineorder_flat -WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' -GROUP BY - year, - P_BRAND -ORDER BY - year, - P_BRAND; -``` - -Q2.2 - -``` sql -SELECT - sum(LO_REVENUE), - toYear(LO_ORDERDATE) AS year, - P_BRAND -FROM lineorder_flat -WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA' -GROUP BY - year, - P_BRAND -ORDER BY - year, - P_BRAND; -``` - -Q2.3 - -``` sql -SELECT - sum(LO_REVENUE), - toYear(LO_ORDERDATE) AS year, - P_BRAND -FROM lineorder_flat -WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' -GROUP BY - year, - P_BRAND -ORDER BY - year, - P_BRAND; -``` - -Q3.1 - -``` sql -SELECT - C_NATION, - S_NATION, - toYear(LO_ORDERDATE) AS year, - sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 -GROUP BY - C_NATION, - S_NATION, - year -ORDER BY - year ASC, - revenue DESC; -``` - -Q3.2 - -``` sql -SELECT - C_CITY, - S_CITY, - toYear(LO_ORDERDATE) AS year, - sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 -GROUP BY - C_CITY, - S_CITY, - year -ORDER BY - year ASC, - revenue DESC; -``` - -Q3.3 - -``` sql -SELECT - C_CITY, - S_CITY, - toYear(LO_ORDERDATE) AS year, - sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR 
S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 -GROUP BY - C_CITY, - S_CITY, - year -ORDER BY - year ASC, - revenue DESC; -``` - -Q3.4 - -``` sql -SELECT - C_CITY, - S_CITY, - toYear(LO_ORDERDATE) AS year, - sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = 199712 -GROUP BY - C_CITY, - S_CITY, - year -ORDER BY - year ASC, - revenue DESC; -``` - -Q4.1 - -``` sql -SELECT - toYear(LO_ORDERDATE) AS year, - C_NATION, - sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM lineorder_flat -WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY - year, - C_NATION -ORDER BY - year ASC, - C_NATION ASC; -``` - -Q4.2 - -``` sql -SELECT - toYear(LO_ORDERDATE) AS year, - S_NATION, - P_CATEGORY, - sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM lineorder_flat -WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY - year, - S_NATION, - P_CATEGORY -ORDER BY - year ASC, - S_NATION ASC, - P_CATEGORY ASC; -``` - -Q4.3 - -``` sql -SELECT - toYear(LO_ORDERDATE) AS year, - S_CITY, - P_BRAND, - sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM lineorder_flat -WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' -GROUP BY - year, - S_CITY, - P_BRAND -ORDER BY - year ASC, - S_CITY ASC, - P_BRAND ASC; -``` - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/star_schema/) diff --git a/docs/es/getting-started/example-datasets/wikistat.md b/docs/es/getting-started/example-datasets/wikistat.md deleted file mode 100644 index 49d7263cdec..00000000000 --- a/docs/es/getting-started/example-datasets/wikistat.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 18 -toc_title: "Nombre de la red inal\xE1mbrica (SSID):" ---- - -# Nombre de la red inalámbrica (SSID): {#wikistat} - -Ver: http://dumps.wikimedia.org/other/pagecounts-raw/ - -Creación de una tabla: - -``` sql -CREATE TABLE wikistat -( - date Date, - time DateTime, - project String, - subproject String, - path String, - hits UInt64, - size UInt64 -) ENGINE = MergeTree(date, (path, time), 8192); -``` - -Carga de datos: - -``` bash -$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt -$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done -$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done -``` - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/example_datasets/wikistat/) diff --git a/docs/es/getting-started/index.md b/docs/es/getting-started/index.md deleted file mode 100644 index 681c2017ac1..00000000000 --- a/docs/es/getting-started/index.md +++ /dev/null @@ -1,17 +0,0 @@ ---- 
-machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Primeros pasos -toc_hidden: true -toc_priority: 8 -toc_title: oculto ---- - -# Primeros pasos {#getting-started} - -Si eres nuevo en ClickHouse y quieres tener una sensación práctica de su rendimiento, antes que nada, debes pasar por el [proceso de instalación](install.md). Después de eso puedes: - -- [Ir a través de tutorial detallado](tutorial.md) -- [Experimente con conjuntos de datos de ejemplo](example-datasets/ontime.md) - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/) diff --git a/docs/es/getting-started/install.md b/docs/es/getting-started/install.md deleted file mode 100644 index 092ef47b2f7..00000000000 --- a/docs/es/getting-started/install.md +++ /dev/null @@ -1,182 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 11 -toc_title: "Instalaci\xF3n" ---- - -# Instalación {#installation} - -## Requisitos del sistema {#system-requirements} - -ClickHouse puede ejecutarse en cualquier Linux, FreeBSD o Mac OS X con arquitectura de CPU x86_64, AArch64 o PowerPC64LE. - -Los binarios oficiales preconstruidos generalmente se compilan para x86_64 y aprovechan el conjunto de instrucciones SSE 4.2, por lo que, a menos que se indique lo contrario, el uso de la CPU que lo admite se convierte en un requisito adicional del sistema. Aquí está el comando para verificar si la CPU actual tiene soporte para SSE 4.2: - -``` bash -$ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" -``` - -Para ejecutar ClickHouse en procesadores que no admiten SSE 4.2 o tienen arquitectura AArch64 o PowerPC64LE, debe [construir ClickHouse a partir de fuentes](#from-sources) con los ajustes de configuración adecuados. - -## Opciones de instalación disponibles {#available-installation-options} - -### De paquetes DEB {#install-from-deb-packages} - -Se recomienda utilizar pre-compilado oficial `deb` Paquetes para Debian o Ubuntu. Ejecute estos comandos para instalar paquetes: - -``` bash -{% include 'install/deb.sh' %} -``` - -Si desea utilizar la versión más reciente, reemplace `stable` con `testing` (esto se recomienda para sus entornos de prueba). - -También puede descargar e instalar paquetes manualmente desde [aqui](https://repo.clickhouse.tech/deb/stable/main/). - -#### Paquete {#packages} - -- `clickhouse-common-static` — Installs ClickHouse compiled binary files. -- `clickhouse-server` — Creates a symbolic link for `clickhouse-server` e instala la configuración predeterminada del servidor. -- `clickhouse-client` — Creates a symbolic link for `clickhouse-client` y otras herramientas relacionadas con el cliente. e instala los archivos de configuración del cliente. -- `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. - -### De paquetes RPM {#from-rpm-packages} - -Se recomienda utilizar pre-compilado oficial `rpm` También puede utilizar los paquetes para CentOS, RedHat y todas las demás distribuciones de Linux basadas en rpm. - -Primero, necesitas agregar el repositorio oficial: - -``` bash -sudo yum install yum-utils -sudo rpm --import https://repo.clickhouse.tech/CLICKHOUSE-KEY.GPG -sudo yum-config-manager --add-repo https://repo.clickhouse.tech/rpm/stable/x86_64 -``` - -Si desea utilizar la versión más reciente, reemplace `stable` con `testing` (esto se recomienda para sus entornos de prueba). 
El `prestable` etiqueta a veces está disponible también. - -A continuación, ejecute estos comandos para instalar paquetes: - -``` bash -sudo yum install clickhouse-server clickhouse-client -``` - -También puede descargar e instalar paquetes manualmente desde [aqui](https://repo.clickhouse.tech/rpm/stable/x86_64). - -### De archivos Tgz {#from-tgz-archives} - -Se recomienda utilizar pre-compilado oficial `tgz` para todas las distribuciones de Linux, donde la instalación de `deb` o `rpm` paquetes no es posible. - -La versión requerida se puede descargar con `curl` o `wget` desde el repositorio https://repo.clickhouse.tech/tgz/. -Después de eso, los archivos descargados deben desempaquetarse e instalarse con scripts de instalación. Ejemplo para la última versión: - -``` bash -export LATEST_VERSION=`curl https://api.github.com/repos/ClickHouse/ClickHouse/tags 2>/dev/null | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1` -curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-server-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-client-$LATEST_VERSION.tgz - -tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz -sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh - -tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz -sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh - -tar -xzvf clickhouse-server-$LATEST_VERSION.tgz -sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh -sudo /etc/init.d/clickhouse-server start - -tar -xzvf clickhouse-client-$LATEST_VERSION.tgz -sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh -``` - -Para los entornos de producción, se recomienda utilizar las últimas `stable`-versión. Puede encontrar su número en la página de GitHub https://github.com/ClickHouse/ClickHouse/tags con postfix `-stable`. - -### Desde Docker Image {#from-docker-image} - -Para ejecutar ClickHouse dentro de Docker, siga la guía en [Eje de acoplador](https://hub.docker.com/r/yandex/clickhouse-server/). Esas imágenes usan oficial `deb` paquetes dentro. - -### De fuentes {#from-sources} - -Para compilar manualmente ClickHouse, siga las instrucciones para [Linux](../development/build.md) o [Mac OS X](../development/build-osx.md). - -Puede compilar paquetes e instalarlos o usar programas sin instalar paquetes. Además, al construir manualmente, puede deshabilitar el requisito de SSE 4.2 o compilar para CPU AArch64. - - Client: programs/clickhouse-client - Server: programs/clickhouse-server - -Tendrá que crear carpetas de datos y metadatos y `chown` para el usuario deseado. Sus rutas se pueden cambiar en la configuración del servidor (src/programs/server/config.xml), por defecto son: - - /opt/clickhouse/data/default/ - /opt/clickhouse/metadata/default/ - -En Gentoo, puedes usar `emerge clickhouse` para instalar ClickHouse desde fuentes. - -## Lanzar {#launch} - -Para iniciar el servidor como demonio, ejecute: - -``` bash -$ sudo service clickhouse-server start -``` - -Si no tienes `service` comando ejecutar como - -``` bash -$ sudo /etc/init.d/clickhouse-server start -``` - -Vea los registros en el `/var/log/clickhouse-server/` directorio. - -Si el servidor no se inicia, compruebe las configuraciones en el archivo `/etc/clickhouse-server/config.xml`. 
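A quick way to confirm that the daemon actually came up after `service clickhouse-server start` is to query it and, if that fails, inspect the logs mentioned above. This is only a sketch: it assumes an init system whose wrapper understands the `status` subcommand and the default error-log file name under `/var/log/clickhouse-server/`, both of which can differ if the configuration was overridden.

``` bash
# Init-system view of the service (SysV/systemd wrappers accept "status"):
sudo service clickhouse-server status

# The server is usable once it answers a trivial query on the native port:
clickhouse-client --query "SELECT version()"

# If startup failed, the reason is normally recorded in the error log:
sudo tail -n 50 /var/log/clickhouse-server/clickhouse-server.err.log
```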
- -También puede iniciar manualmente el servidor desde la consola: - -``` bash -$ clickhouse-server --config-file=/etc/clickhouse-server/config.xml -``` - -En este caso, el registro se imprimirá en la consola, lo cual es conveniente durante el desarrollo. -Si el archivo de configuración está en el directorio actual, no es necesario `--config-file` parámetro. De forma predeterminada, utiliza `./config.xml`. - -ClickHouse admite la configuración de restricción de acceso. Están ubicados en el `users.xml` archivo (junto a `config.xml`). -De forma predeterminada, se permite el acceso desde cualquier lugar `default` usuario, sin una contraseña. Ver `user/default/networks`. -Para obtener más información, consulte la sección [“Configuration Files”](../operations/configuration-files.md). - -Después de iniciar el servidor, puede usar el cliente de línea de comandos para conectarse a él: - -``` bash -$ clickhouse-client -``` - -Por defecto, se conecta a `localhost:9000` en nombre del usuario `default` sin una contraseña. También se puede usar para conectarse a un servidor remoto usando `--host` argumento. - -El terminal debe usar codificación UTF-8. -Para obtener más información, consulte la sección [“Command-line client”](../interfaces/cli.md). - -Ejemplo: - -``` bash -$ ./clickhouse-client -ClickHouse client version 0.0.18749. -Connecting to localhost:9000. -Connected to ClickHouse server version 0.0.18749. - -:) SELECT 1 - -SELECT 1 - -┌─1─┐ -│ 1 │ -└───┘ - -1 rows in set. Elapsed: 0.003 sec. - -:) -``` - -**Felicidades, el sistema funciona!** - -Para continuar experimentando, puede descargar uno de los conjuntos de datos de prueba o pasar por [tutorial](https://clickhouse.tech/tutorial.html). - -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/install/) diff --git a/docs/es/getting-started/playground.md b/docs/es/getting-started/playground.md deleted file mode 100644 index 1ab7246e2d4..00000000000 --- a/docs/es/getting-started/playground.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 14 -toc_title: Infantil ---- - -# Zona de juegos ClickHouse {#clickhouse-playground} - -[Zona de juegos ClickHouse](https://play.clickhouse.tech?file=welcome) permite a las personas experimentar con ClickHouse ejecutando consultas al instante, sin configurar su servidor o clúster. -Varios conjuntos de datos de ejemplo están disponibles en Playground, así como consultas de ejemplo que muestran las características de ClickHouse. - -Las consultas se ejecutan como un usuario de sólo lectura. Implica algunas limitaciones: - -- No se permiten consultas DDL -- Las consultas INSERT no están permitidas - -También se aplican los siguientes valores: -- [`max_result_bytes=10485760`](../operations/settings/query_complexity/#max-result-bytes) -- [`max_result_rows=2000`](../operations/settings/query_complexity/#setting-max_result_rows) -- [`result_overflow_mode=break`](../operations/settings/query_complexity/#result-overflow-mode) -- [`max_execution_time=60000`](../operations/settings/query_complexity/#max-execution-time) - -ClickHouse Playground da la experiencia de m2.pequeño -[Servicio administrado para ClickHouse](https://cloud.yandex.com/services/managed-clickhouse) -instancia alojada en [El Yandex.Nube](https://cloud.yandex.com/). -Más información sobre [proveedores de la nube](../commercial/cloud.md). 
- -La interfaz web de ClickHouse Playground realiza solicitudes a través de ClickHouse [HTTP API](../interfaces/http.md). -El backend Playground es solo un clúster ClickHouse sin ninguna aplicación adicional del lado del servidor. -El punto final HTTPS de ClickHouse también está disponible como parte de Playground. - -Puede realizar consultas al patio de recreo utilizando cualquier cliente HTTP, por ejemplo [rizo](https://curl.haxx.se) o [wget](https://www.gnu.org/software/wget/), o configurar una conexión usando [JDBC](../interfaces/jdbc.md) o [ODBC](../interfaces/odbc.md) controlador. -Más información sobre los productos de software compatibles con ClickHouse está disponible [aqui](../interfaces/index.md). - -| Parámetro | Valor | -|:------------|:----------------------------------------------| -| Punto final | https://play-api.casa de clic.tecnología:8443 | -| Usuario | `playground` | -| Contraseña | `clickhouse` | - -Tenga en cuenta que este extremo requiere una conexión segura. - -Ejemplo: - -``` bash -curl "https://play-api.clickhouse.tech:8443/?query=SELECT+'Play+ClickHouse!';&user=playground&password=clickhouse&database=datasets" -``` diff --git a/docs/es/getting-started/tutorial.md b/docs/es/getting-started/tutorial.md deleted file mode 100644 index 2cc9339f954..00000000000 --- a/docs/es/getting-started/tutorial.md +++ /dev/null @@ -1,664 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 12 -toc_title: Tutorial ---- - -# Tutorial de ClickHouse {#clickhouse-tutorial} - -## Qué Esperar de Este Tutorial? {#what-to-expect-from-this-tutorial} - -Al pasar por este tutorial, aprenderá cómo configurar un clúster de ClickHouse simple. Será pequeño, pero tolerante a fallos y escalable. Luego usaremos uno de los conjuntos de datos de ejemplo para llenarlo con datos y ejecutar algunas consultas de demostración. - -## Configuración de nodo único {#single-node-setup} - -Para posponer las complejidades de un entorno distribuido, comenzaremos con la implementación de ClickHouse en un único servidor o máquina virtual. ClickHouse generalmente se instala desde [deb](install.md#install-from-deb-packages) o [RPM](install.md#from-rpm-packages) paquetes, pero hay [alternativa](install.md#from-docker-image) para los sistemas operativos que no los admiten. - -Por ejemplo, ha elegido `deb` paquetes y ejecutado: - -``` bash -{% include 'install/deb.sh' %} -``` - -¿Qué tenemos en los paquetes que tengo instalados: - -- `clickhouse-client` el paquete contiene [Casa de clics-cliente](../interfaces/cli.md) aplicación, cliente interactivo de la consola ClickHouse. -- `clickhouse-common` El paquete contiene un archivo ejecutable ClickHouse. -- `clickhouse-server` El paquete contiene archivos de configuración para ejecutar ClickHouse como servidor. - -Los archivos de configuración del servidor se encuentran en `/etc/clickhouse-server/`. Antes de ir más lejos, tenga en cuenta el `` elemento en `config.xml`. La ruta determina la ubicación para el almacenamiento de datos, por lo que debe ubicarse en un volumen con gran capacidad de disco; el valor predeterminado es `/var/lib/clickhouse/`. Si desea ajustar la configuración, no es útil editar directamente `config.xml` archivo, teniendo en cuenta que podría ser reescrito en futuras actualizaciones de paquetes. La forma recomendada de anular los elementos de configuración es crear [archivos en config.directorio d](../operations/configuration-files.md) que sirven como “patches” de configuración.XML. 
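As a sketch of such a configuration “patch”, the override below moves the data directory off the default `/var/lib/clickhouse/` without touching `config.xml`. Assumptions: the default `/etc/clickhouse-server/` layout, the `<yandex>` root element used by configurations of this generation, an arbitrary file name (`storage-path.xml`), and that the new path exists and is writable by the `clickhouse` user created by the packages.

``` bash
# Hypothetical config.d patch overriding the <path> element; any *.xml file
# placed in this directory is merged into the main configuration on startup.
sudo mkdir -p /etc/clickhouse-server/config.d
sudo tee /etc/clickhouse-server/config.d/storage-path.xml > /dev/null <<'EOF'
<yandex>
    <path>/data/clickhouse/</path>
</yandex>
EOF

# Prepare the new location; the patch takes effect on the next (re)start.
sudo mkdir -p /data/clickhouse
sudo chown clickhouse:clickhouse /data/clickhouse
```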
- -Como habrás notado, `clickhouse-server` no se inicia automáticamente después de la instalación del paquete. Tampoco se reiniciará automáticamente después de las actualizaciones. La forma en que inicia el servidor depende de su sistema de inicio, por lo general, es: - -``` bash -sudo service clickhouse-server start -``` - -o - -``` bash -sudo /etc/init.d/clickhouse-server start -``` - -La ubicación predeterminada para los registros del servidor es `/var/log/clickhouse-server/`. El servidor está listo para manejar las conexiones de cliente una vez que registra el `Ready for connections` mensaje. - -Una vez que el `clickhouse-server` está en funcionamiento, podemos usar `clickhouse-client` para conectarse al servidor y ejecutar algunas consultas de prueba como `SELECT "Hello, world!";`. - -
- -Consejos rápidos para clickhouse-cliente - -Modo interactivo: - -``` bash -clickhouse-client -clickhouse-client --host=... --port=... --user=... --password=... -``` - -Habilitar consultas multilínea: - -``` bash -clickhouse-client -m -clickhouse-client --multiline -``` - -Ejecutar consultas en modo por lotes: - -``` bash -clickhouse-client --query='SELECT 1' -echo 'SELECT 1' | clickhouse-client -clickhouse-client <<< 'SELECT 1' -``` - -Insertar datos de un archivo en el formato especificado: - -``` bash -clickhouse-client --query='INSERT INTO table VALUES' < data.txt -clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv -``` - -
- -## Importar conjunto de datos de muestra {#import-sample-dataset} - -Ahora es el momento de llenar nuestro servidor ClickHouse con algunos datos de muestra. En este tutorial, usaremos los datos anónimos de Yandex.Metrica, el primer servicio que ejecuta ClickHouse en forma de producción antes de que se convirtiera en código abierto (más sobre eso en [sección de historia](../introduction/history.md)). Hay [múltiples formas de importar Yandex.Conjunto de datos de Metrica](example-datasets/metrica.md), y por el bien del tutorial, iremos con el más realista. - -### Descargar y extraer datos de tabla {#download-and-extract-table-data} - -``` bash -curl https://datasets.clickhouse.tech/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv -curl https://datasets.clickhouse.tech/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv -``` - -Los archivos extraídos tienen un tamaño de aproximadamente 10 GB. - -### Crear tablas {#create-tables} - -Como en la mayoría de los sistemas de gestión de bases de datos, ClickHouse agrupa lógicamente las tablas en “databases”. Hay un `default` base de datos, pero crearemos una nueva llamada `tutorial`: - -``` bash -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" -``` - -La sintaxis para crear tablas es mucho más complicada en comparación con las bases de datos (ver [referencia](../sql-reference/statements/create.md). En general `CREATE TABLE` declaración tiene que especificar tres cosas clave: - -1. Nombre de la tabla que se va a crear. -2. Table schema, i.e. list of columns and their [tipos de datos](../sql-reference/data-types/index.md). -3. [Motor de tabla](../engines/table-engines/index.md) y su configuración, que determina todos los detalles sobre cómo se ejecutarán físicamente las consultas a esta tabla. - -El Yandex.Metrica es un servicio de análisis web, y el conjunto de datos de muestra no cubre toda su funcionalidad, por lo que solo hay dos tablas para crear: - -- `hits` es una tabla con cada acción realizada por todos los usuarios en todos los sitios web cubiertos por el servicio. -- `visits` es una tabla que contiene sesiones precompiladas en lugar de acciones individuales. 
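Since the real statements below run to well over a hundred columns, here is a minimal sketch of how those three parts fit together; the table `tutorial.example` and its columns are invented purely for illustration and are not part of the dataset.

``` bash
# 1) table name, 2) column list with data types, 3) table engine and its settings.
clickhouse-client --query "
    CREATE TABLE tutorial.example
    (
        EventDate Date,
        UserID UInt64,
        URL String
    )
    ENGINE = MergeTree()
    ORDER BY (EventDate, UserID)"
```

The two production tables follow exactly this pattern, additionally specifying `PARTITION BY` and `SAMPLE BY` clauses for the engine.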
- -Veamos y ejecutemos las consultas de tabla de creación real para estas tablas: - -``` sql -CREATE TABLE tutorial.hits_v1 -( - `WatchID` UInt64, - `JavaEnable` UInt8, - `Title` String, - `GoodEvent` Int16, - `EventTime` DateTime, - `EventDate` Date, - `CounterID` UInt32, - `ClientIP` UInt32, - `ClientIP6` FixedString(16), - `RegionID` UInt32, - `UserID` UInt64, - `CounterClass` Int8, - `OS` UInt8, - `UserAgent` UInt8, - `URL` String, - `Referer` String, - `URLDomain` String, - `RefererDomain` String, - `Refresh` UInt8, - `IsRobot` UInt8, - `RefererCategories` Array(UInt16), - `URLCategories` Array(UInt16), - `URLRegions` Array(UInt32), - `RefererRegions` Array(UInt32), - `ResolutionWidth` UInt16, - `ResolutionHeight` UInt16, - `ResolutionDepth` UInt8, - `FlashMajor` UInt8, - `FlashMinor` UInt8, - `FlashMinor2` String, - `NetMajor` UInt8, - `NetMinor` UInt8, - `UserAgentMajor` UInt16, - `UserAgentMinor` FixedString(2), - `CookieEnable` UInt8, - `JavascriptEnable` UInt8, - `IsMobile` UInt8, - `MobilePhone` UInt8, - `MobilePhoneModel` String, - `Params` String, - `IPNetworkID` UInt32, - `TraficSourceID` Int8, - `SearchEngineID` UInt16, - `SearchPhrase` String, - `AdvEngineID` UInt8, - `IsArtifical` UInt8, - `WindowClientWidth` UInt16, - `WindowClientHeight` UInt16, - `ClientTimeZone` Int16, - `ClientEventTime` DateTime, - `SilverlightVersion1` UInt8, - `SilverlightVersion2` UInt8, - `SilverlightVersion3` UInt32, - `SilverlightVersion4` UInt16, - `PageCharset` String, - `CodeVersion` UInt32, - `IsLink` UInt8, - `IsDownload` UInt8, - `IsNotBounce` UInt8, - `FUniqID` UInt64, - `HID` UInt32, - `IsOldCounter` UInt8, - `IsEvent` UInt8, - `IsParameter` UInt8, - `DontCountHits` UInt8, - `WithHash` UInt8, - `HitColor` FixedString(1), - `UTCEventTime` DateTime, - `Age` UInt8, - `Sex` UInt8, - `Income` UInt8, - `Interests` UInt16, - `Robotness` UInt8, - `GeneralInterests` Array(UInt16), - `RemoteIP` UInt32, - `RemoteIP6` FixedString(16), - `WindowName` Int32, - `OpenerName` Int32, - `HistoryLength` Int16, - `BrowserLanguage` FixedString(2), - `BrowserCountry` FixedString(2), - `SocialNetwork` String, - `SocialAction` String, - `HTTPError` UInt16, - `SendTiming` Int32, - `DNSTiming` Int32, - `ConnectTiming` Int32, - `ResponseStartTiming` Int32, - `ResponseEndTiming` Int32, - `FetchTiming` Int32, - `RedirectTiming` Int32, - `DOMInteractiveTiming` Int32, - `DOMContentLoadedTiming` Int32, - `DOMCompleteTiming` Int32, - `LoadEventStartTiming` Int32, - `LoadEventEndTiming` Int32, - `NSToDOMContentLoadedTiming` Int32, - `FirstPaintTiming` Int32, - `RedirectCount` Int8, - `SocialSourceNetworkID` UInt8, - `SocialSourcePage` String, - `ParamPrice` Int64, - `ParamOrderID` String, - `ParamCurrency` FixedString(3), - `ParamCurrencyID` UInt16, - `GoalsReached` Array(UInt32), - `OpenstatServiceName` String, - `OpenstatCampaignID` String, - `OpenstatAdID` String, - `OpenstatSourceID` String, - `UTMSource` String, - `UTMMedium` String, - `UTMCampaign` String, - `UTMContent` String, - `UTMTerm` String, - `FromTag` String, - `HasGCLID` UInt8, - `RefererHash` UInt64, - `URLHash` UInt64, - `CLID` UInt32, - `YCLID` UInt64, - `ShareService` String, - `ShareURL` String, - `ShareTitle` String, - `ParsedParams` Nested( - Key1 String, - Key2 String, - Key3 String, - Key4 String, - Key5 String, - ValueDouble Float64), - `IslandID` FixedString(16), - `RequestNum` UInt32, - `RequestTry` UInt8 -) -ENGINE = MergeTree() -PARTITION BY toYYYYMM(EventDate) -ORDER BY (CounterID, EventDate, intHash32(UserID)) -SAMPLE BY intHash32(UserID) 
-``` - -``` sql -CREATE TABLE tutorial.visits_v1 -( - `CounterID` UInt32, - `StartDate` Date, - `Sign` Int8, - `IsNew` UInt8, - `VisitID` UInt64, - `UserID` UInt64, - `StartTime` DateTime, - `Duration` UInt32, - `UTCStartTime` DateTime, - `PageViews` Int32, - `Hits` Int32, - `IsBounce` UInt8, - `Referer` String, - `StartURL` String, - `RefererDomain` String, - `StartURLDomain` String, - `EndURL` String, - `LinkURL` String, - `IsDownload` UInt8, - `TraficSourceID` Int8, - `SearchEngineID` UInt16, - `SearchPhrase` String, - `AdvEngineID` UInt8, - `PlaceID` Int32, - `RefererCategories` Array(UInt16), - `URLCategories` Array(UInt16), - `URLRegions` Array(UInt32), - `RefererRegions` Array(UInt32), - `IsYandex` UInt8, - `GoalReachesDepth` Int32, - `GoalReachesURL` Int32, - `GoalReachesAny` Int32, - `SocialSourceNetworkID` UInt8, - `SocialSourcePage` String, - `MobilePhoneModel` String, - `ClientEventTime` DateTime, - `RegionID` UInt32, - `ClientIP` UInt32, - `ClientIP6` FixedString(16), - `RemoteIP` UInt32, - `RemoteIP6` FixedString(16), - `IPNetworkID` UInt32, - `SilverlightVersion3` UInt32, - `CodeVersion` UInt32, - `ResolutionWidth` UInt16, - `ResolutionHeight` UInt16, - `UserAgentMajor` UInt16, - `UserAgentMinor` UInt16, - `WindowClientWidth` UInt16, - `WindowClientHeight` UInt16, - `SilverlightVersion2` UInt8, - `SilverlightVersion4` UInt16, - `FlashVersion3` UInt16, - `FlashVersion4` UInt16, - `ClientTimeZone` Int16, - `OS` UInt8, - `UserAgent` UInt8, - `ResolutionDepth` UInt8, - `FlashMajor` UInt8, - `FlashMinor` UInt8, - `NetMajor` UInt8, - `NetMinor` UInt8, - `MobilePhone` UInt8, - `SilverlightVersion1` UInt8, - `Age` UInt8, - `Sex` UInt8, - `Income` UInt8, - `JavaEnable` UInt8, - `CookieEnable` UInt8, - `JavascriptEnable` UInt8, - `IsMobile` UInt8, - `BrowserLanguage` UInt16, - `BrowserCountry` UInt16, - `Interests` UInt16, - `Robotness` UInt8, - `GeneralInterests` Array(UInt16), - `Params` Array(String), - `Goals` Nested( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32), - `WatchIDs` Array(UInt64), - `ParamSumPrice` Int64, - `ParamCurrency` FixedString(3), - `ParamCurrencyID` UInt16, - `ClickLogID` UInt64, - `ClickEventID` Int32, - `ClickGoodEvent` Int32, - `ClickEventTime` DateTime, - `ClickPriorityID` Int32, - `ClickPhraseID` Int32, - `ClickPageID` Int32, - `ClickPlaceID` Int32, - `ClickTypeID` Int32, - `ClickResourceID` Int32, - `ClickCost` UInt32, - `ClickClientIP` UInt32, - `ClickDomainID` UInt32, - `ClickURL` String, - `ClickAttempt` UInt8, - `ClickOrderID` UInt32, - `ClickBannerID` UInt32, - `ClickMarketCategoryID` UInt32, - `ClickMarketPP` UInt32, - `ClickMarketCategoryName` String, - `ClickMarketPPName` String, - `ClickAWAPSCampaignName` String, - `ClickPageName` String, - `ClickTargetType` UInt16, - `ClickTargetPhraseID` UInt64, - `ClickContextType` UInt8, - `ClickSelectType` Int8, - `ClickOptions` String, - `ClickGroupBannerID` Int32, - `OpenstatServiceName` String, - `OpenstatCampaignID` String, - `OpenstatAdID` String, - `OpenstatSourceID` String, - `UTMSource` String, - `UTMMedium` String, - `UTMCampaign` String, - `UTMContent` String, - `UTMTerm` String, - `FromTag` String, - `HasGCLID` UInt8, - `FirstVisit` DateTime, - `PredLastVisit` Date, - `LastVisit` Date, - `TotalVisits` UInt32, - `TraficSource` Nested( - ID Int8, - SearchEngineID UInt16, - AdvEngineID UInt8, - PlaceID UInt16, - SocialSourceNetworkID UInt8, - Domain String, - SearchPhrase String, - SocialSourcePage String), - `Attendance` FixedString(16), - 
`CLID` UInt32, - `YCLID` UInt64, - `NormalizedRefererHash` UInt64, - `SearchPhraseHash` UInt64, - `RefererDomainHash` UInt64, - `NormalizedStartURLHash` UInt64, - `StartURLDomainHash` UInt64, - `NormalizedEndURLHash` UInt64, - `TopLevelDomain` UInt64, - `URLScheme` UInt64, - `OpenstatServiceNameHash` UInt64, - `OpenstatCampaignIDHash` UInt64, - `OpenstatAdIDHash` UInt64, - `OpenstatSourceIDHash` UInt64, - `UTMSourceHash` UInt64, - `UTMMediumHash` UInt64, - `UTMCampaignHash` UInt64, - `UTMContentHash` UInt64, - `UTMTermHash` UInt64, - `FromHash` UInt64, - `WebVisorEnabled` UInt8, - `WebVisorActivity` UInt32, - `ParsedParams` Nested( - Key1 String, - Key2 String, - Key3 String, - Key4 String, - Key5 String, - ValueDouble Float64), - `Market` Nested( - Type UInt8, - GoalID UInt32, - OrderID String, - OrderPrice Int64, - PP UInt32, - DirectPlaceID UInt32, - DirectOrderID UInt32, - DirectBannerID UInt32, - GoodID String, - GoodName String, - GoodQuantity Int32, - GoodPrice Int64), - `IslandID` FixedString(16) -) -ENGINE = CollapsingMergeTree(Sign) -PARTITION BY toYYYYMM(StartDate) -ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) -SAMPLE BY intHash32(UserID) -``` - -Puede ejecutar esas consultas utilizando el modo interactivo de `clickhouse-client` (simplemente ejecútelo en un terminal sin especificar una consulta por adelantado) o pruebe algunos [interfaz alternativa](../interfaces/index.md) Si quieres. - -Como podemos ver, `hits_v1` utiliza el [motor básico MergeTree](../engines/table-engines/mergetree-family/mergetree.md), mientras que el `visits_v1` utiliza el [Derrumbar](../engines/table-engines/mergetree-family/collapsingmergetree.md) variante. - -### Importar datos {#import-data} - -La importación de datos a ClickHouse se realiza a través de [INSERT INTO](../sql-reference/statements/insert-into.md) consulta como en muchas otras bases de datos SQL. Sin embargo, los datos generalmente se proporcionan en uno de los [Formatos de serialización compatibles](../interfaces/formats.md) en lugar de `VALUES` cláusula (que también es compatible). - -Los archivos que descargamos anteriormente están en formato separado por tabuladores, así que aquí le mostramos cómo importarlos a través del cliente de la consola: - -``` bash -clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv -clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv -``` - -ClickHouse tiene un montón de [ajustes para sintonizar](../operations/settings/index.md) y una forma de especificarlos en el cliente de la consola es a través de argumentos, como podemos ver con `--max_insert_block_size`. La forma más fácil de averiguar qué configuraciones están disponibles, qué significan y cuáles son los valores predeterminados es consultar el `system.settings` tabla: - -``` sql -SELECT name, value, changed, description -FROM system.settings -WHERE name LIKE '%max_insert_b%' -FORMAT TSV - -max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." -``` - -Opcionalmente se puede [OPTIMIZE](../sql-reference/statements/misc.md#misc_operations-optimize) las tablas después de la importación. Las tablas que están configuradas con un motor de la familia MergeTree siempre fusionan partes de datos en segundo plano para optimizar el almacenamiento de datos (o al menos verificar si tiene sentido). 
Estas consultas obligan al motor de tablas a realizar la optimización del almacenamiento en este momento en lugar de algún tiempo después: - -``` bash -clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" -clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" -``` - -Estas consultas inician una operación intensiva de E / S y CPU, por lo que si la tabla recibe datos nuevos de manera consistente, es mejor dejarlos solos y dejar que las fusiones se ejecuten en segundo plano. - -Ahora podemos comprobar si la importación de la tabla fue exitosa: - -``` bash -clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1" -clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1" -``` - -## Consultas de ejemplo {#example-queries} - -``` sql -SELECT - StartURL AS URL, - AVG(Duration) AS AvgDuration -FROM tutorial.visits_v1 -WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30' -GROUP BY URL -ORDER BY AvgDuration DESC -LIMIT 10 -``` - -``` sql -SELECT - sum(Sign) AS visits, - sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits, - (100. * goal_visits) / visits AS goal_percent -FROM tutorial.visits_v1 -WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru') -``` - -## Implementación de clúster {#cluster-deployment} - -El clúster ClickHouse es un clúster homogéneo. Pasos para configurar: - -1. Instale el servidor ClickHouse en todas las máquinas del clúster -2. Configurar configuraciones de clúster en archivos de configuración -3. Crear tablas locales en cada instancia -4. Crear un [Tabla distribuida](../engines/table-engines/special/distributed.md) - -[Tabla distribuida](../engines/table-engines/special/distributed.md) es en realidad una especie de “view” a las tablas locales del clúster ClickHouse. La consulta SELECT de una tabla distribuida se ejecuta utilizando recursos de todos los fragmentos del clúster. Puede especificar configuraciones para varios clústeres y crear varias tablas distribuidas que proporcionen vistas a diferentes clústeres. - -Ejemplo de configuración para un clúster con tres fragmentos, una réplica cada uno: - -``` xml - - - - - example-perftest01j.yandex.ru - 9000 - - - - - example-perftest02j.yandex.ru - 9000 - - - - - example-perftest03j.yandex.ru - 9000 - - - - -``` - -Para más demostraciones, vamos a crear una nueva tabla local con la misma `CREATE TABLE` consulta que utilizamos para `hits_v1`, pero nombre de tabla diferente: - -``` sql -CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... -``` - -Creación de una tabla distribuida que proporcione una vista en las tablas locales del clúster: - -``` sql -CREATE TABLE tutorial.hits_all AS tutorial.hits_local -ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); -``` - -Una práctica común es crear tablas distribuidas similares en todas las máquinas del clúster. Permite ejecutar consultas distribuidas en cualquier máquina del clúster. También hay una opción alternativa para crear una tabla distribuida temporal para una consulta SELECT determinada usando [remoto](../sql-reference/table-functions/remote.md) función de la tabla. - -Vamos a correr [INSERT SELECT](../sql-reference/statements/insert-into.md) en la tabla Distributed para extender la tabla a varios servidores. - -``` sql -INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; -``` - -!!! warning "Aviso" - Este enfoque no es adecuado para la fragmentación de tablas grandes. 
Hay una herramienta separada [Método de codificación de datos:](../operations/utilities/clickhouse-copier.md) que puede volver a fragmentar tablas grandes arbitrarias. - -Como era de esperar, las consultas computacionalmente pesadas se ejecutan N veces más rápido si utilizan 3 servidores en lugar de uno. - -En este caso, hemos utilizado un clúster con 3 fragmentos, y cada uno contiene una sola réplica. - -Para proporcionar resiliencia en un entorno de producción, se recomienda que cada fragmento contenga 2-3 réplicas distribuidas entre varias zonas de disponibilidad o centros de datos (o al menos racks). Tenga en cuenta que ClickHouse admite un número ilimitado de réplicas. - -Ejemplo de configuración para un clúster de un fragmento que contiene tres réplicas: - -``` xml - - ... - - - - example-perftest01j.yandex.ru - 9000 - - - example-perftest02j.yandex.ru - 9000 - - - example-perftest03j.yandex.ru - 9000 - - - - -``` - -Para habilitar la replicación nativa [ZooKeeper](http://zookeeper.apache.org/) se requiere. ClickHouse se encarga de la coherencia de los datos en todas las réplicas y ejecuta el procedimiento de restauración después de la falla automáticamente. Se recomienda implementar el clúster ZooKeeper en servidores independientes (donde no se están ejecutando otros procesos, incluido ClickHouse). - -!!! note "Nota" - ZooKeeper no es un requisito estricto: en algunos casos simples, puede duplicar los datos escribiéndolos en todas las réplicas de su código de aplicación. Este enfoque es **ni** recomendado, en este caso, ClickHouse no podrá garantizar la coherencia de los datos en todas las réplicas. Por lo tanto, se convierte en responsabilidad de su aplicación. - -Las ubicaciones de ZooKeeper se especifican en el archivo de configuración: - -``` xml - - - zoo01.yandex.ru - 2181 - - - zoo02.yandex.ru - 2181 - - - zoo03.yandex.ru - 2181 - - -``` - -Además, necesitamos establecer macros para identificar cada fragmento y réplica que se utilizan en la creación de tablas: - -``` xml - - 01 - 01 - -``` - -Si no hay réplicas en este momento en la creación de la tabla replicada, se crea una instancia de una nueva primera réplica. Si ya hay réplicas activas, la nueva réplica clona los datos de las existentes. Tiene la opción de crear primero todas las tablas replicadas y, a continuación, insertar datos en ella. Otra opción es crear algunas réplicas y agregar las otras después o durante la inserción de datos. - -``` sql -CREATE TABLE tutorial.hits_replica (...) -ENGINE = ReplcatedMergeTree( - '/clickhouse_perftest/tables/{shard}/hits', - '{replica}' -) -... -``` - -Aquí usamos [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) motor de mesa. En los parámetros, especificamos la ruta ZooKeeper que contiene identificadores de fragmentos y réplicas. - -``` sql -INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; -``` - -La replicación funciona en modo multi-master. Los datos se pueden cargar en cualquier réplica y el sistema los sincroniza automáticamente con otras instancias. La replicación es asíncrona, por lo que en un momento dado, no todas las réplicas pueden contener datos insertados recientemente. Al menos una réplica debe estar disponible para permitir la ingestión de datos. Otros sincronizarán los datos y repararán la coherencia una vez que vuelvan a activarse. Tenga en cuenta que este enfoque permite la baja posibilidad de una pérdida de datos recientemente insertados. 
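A sketch for verifying that the cluster and replication described above are actually wired up, using the `system.clusters` and `system.replicas` system tables. The cluster and table names come from the examples in this section, and the exact column set of these system tables can differ slightly between ClickHouse versions.

``` bash
# The cluster defined in the example config should appear as rows here:
clickhouse-client --query "
    SELECT cluster, shard_num, replica_num, host_name
    FROM system.clusters
    WHERE cluster = 'perftest_3shards_1replicas'"

# Each ReplicatedMergeTree table reports its replication state here:
clickhouse-client --query "
    SELECT database, table, is_leader, queue_size, total_replicas, active_replicas
    FROM system.replicas
    WHERE table = 'hits_replica'"
```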
- -[Artículo Original](https://clickhouse.tech/docs/en/getting_started/tutorial/) diff --git a/docs/es/guides/apply-catboost-model.md b/docs/es/guides/apply-catboost-model.md deleted file mode 100644 index b1fe50f3276..00000000000 --- a/docs/es/guides/apply-catboost-model.md +++ /dev/null @@ -1,239 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: "Aplicaci\xF3n de modelos CatBoost" ---- - -# Aplicación de un modelo Catboost en ClickHouse {#applying-catboost-model-in-clickhouse} - -[CatBoost](https://catboost.ai) es una biblioteca de impulso de gradiente libre y de código abierto desarrollada en [Yandex](https://yandex.com/company/) para el aprendizaje automático. - -Con esta instrucción, aprenderá a aplicar modelos preentrenados en ClickHouse ejecutando la inferencia de modelos desde SQL. - -Para aplicar un modelo CatBoost en ClickHouse: - -1. [Crear una tabla](#create-table). -2. [Insertar los datos en la tabla](#insert-data-to-table). -3. [Integrar CatBoost en ClickHouse](#integrate-catboost-into-clickhouse) (Paso opcional). -4. [Ejecute la inferencia del modelo desde SQL](#run-model-inference). - -Para obtener más información sobre la formación de modelos CatBoost, consulte [Entrenamiento y aplicación de modelos](https://catboost.ai/docs/features/training.html#training). - -## Requisito {#prerequisites} - -Si no tienes el [Acoplador](https://docs.docker.com/install/) sin embargo, instalarlo. - -!!! note "Nota" - [Acoplador](https://www.docker.com) es una plataforma de software que le permite crear contenedores que aíslan una instalación de CatBoost y ClickHouse del resto del sistema. - -Antes de aplicar un modelo CatBoost: - -**1.** Tire de la [Imagen de acoplador](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) del registro: - -``` bash -$ docker pull yandex/tutorial-catboost-clickhouse -``` - -Esta imagen de Docker contiene todo lo que necesita para ejecutar CatBoost y ClickHouse: código, tiempo de ejecución, bibliotecas, variables de entorno y archivos de configuración. - -**2.** Asegúrese de que la imagen de Docker se haya extraído correctamente: - -``` bash -$ docker image ls -REPOSITORY TAG IMAGE ID CREATED SIZE -yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB -``` - -**3.** Inicie un contenedor Docker basado en esta imagen: - -``` bash -$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse -``` - -## 1. Crear una tabla {#create-table} - -Para crear una tabla ClickHouse para el ejemplo de capacitación: - -**1.** Inicie el cliente de consola ClickHouse en el modo interactivo: - -``` bash -$ clickhouse client -``` - -!!! note "Nota" - El servidor ClickHouse ya se está ejecutando dentro del contenedor Docker. - -**2.** Cree la tabla usando el comando: - -``` sql -:) CREATE TABLE amazon_train -( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, - ROLE_CODE UInt32 -) -ENGINE = MergeTree ORDER BY date -``` - -**3.** Salir del cliente de la consola ClickHouse: - -``` sql -:) exit -``` - -## 2. 
Insertar los datos en la tabla {#insert-data-to-table} - -Para insertar los datos: - -**1.** Ejecute el siguiente comando: - -``` bash -$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv -``` - -**2.** Inicie el cliente de consola ClickHouse en el modo interactivo: - -``` bash -$ clickhouse client -``` - -**3.** Asegúrese de que los datos se hayan cargado: - -``` sql -:) SELECT count() FROM amazon_train - -SELECT count() -FROM amazon_train - -+-count()-+ -| 65538 | -+-------+ -``` - -## 3. Integrar CatBoost en ClickHouse {#integrate-catboost-into-clickhouse} - -!!! note "Nota" - **Paso opcional.** La imagen de Docker contiene todo lo que necesita para ejecutar CatBoost y ClickHouse. - -Para integrar CatBoost en ClickHouse: - -**1.** Construir la biblioteca de evaluación. - -La forma más rápida de evaluar un modelo CatBoost es compilar `libcatboostmodel.` biblioteca. Para obtener más información acerca de cómo construir la biblioteca, vea [Documentación de CatBoost](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). - -**2.** Cree un nuevo directorio en cualquier lugar y con cualquier nombre, por ejemplo, `data` y poner la biblioteca creada en ella. La imagen de Docker ya contiene la biblioteca `data/libcatboostmodel.so`. - -**3.** Cree un nuevo directorio para el modelo de configuración en cualquier lugar y con cualquier nombre, por ejemplo, `models`. - -**4.** Cree un archivo de configuración de modelo con cualquier nombre, por ejemplo, `models/amazon_model.xml`. - -**5.** Describir la configuración del modelo: - -``` xml - - - - catboost - - amazon - - /home/catboost/tutorial/catboost_model.bin - - 0 - - -``` - -**6.** Agregue la ruta de acceso a CatBoost y la configuración del modelo a la configuración de ClickHouse: - -``` xml - -/home/catboost/data/libcatboostmodel.so -/home/catboost/models/*_model.xml -``` - -## 4. Ejecute la inferencia del modelo desde SQL {#run-model-inference} - -Para el modelo de prueba, ejecute el cliente ClickHouse `$ clickhouse client`. - -Asegurémonos de que el modelo esté funcionando: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) > 0 AS prediction, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! note "Nota" - Función [modelEvaluar](../sql-reference/functions/other-functions.md#function-modelevaluate) devuelve tupla con predicciones sin procesar por clase para modelos multiclase. - -Vamos a predecir la probabilidad: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1 + exp(-prediction)) AS probability, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! note "Nota" - Más información sobre [exp()](../sql-reference/functions/math-functions.md) función. - -Vamos a calcular LogLoss en la muestra: - -``` sql -:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM -( - SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, - ACTION AS tg - FROM amazon_train -) -``` - -!!! 
note "Nota" - Más información sobre [avg()](../sql-reference/aggregate-functions/reference.md#agg_function-avg) y [registro()](../sql-reference/functions/math-functions.md) función. - -[Artículo Original](https://clickhouse.tech/docs/en/guides/apply_catboost_model/) diff --git a/docs/es/guides/index.md b/docs/es/guides/index.md deleted file mode 100644 index c8332ac7846..00000000000 --- a/docs/es/guides/index.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Guiar -toc_priority: 38 -toc_title: "Descripci\xF3n" ---- - -# Guías de ClickHouse {#clickhouse-guides} - -Lista de instrucciones detalladas paso a paso que ayudan a resolver varias tareas usando ClickHouse: - -- [Tutorial sobre la configuración simple del clúster](../getting-started/tutorial.md) -- [Aplicación de un modelo CatBoost en ClickHouse](apply-catboost-model.md) - -[Artículo Original](https://clickhouse.tech/docs/en/guides/) diff --git a/docs/es/images/column-oriented.gif b/docs/es/images/column-oriented.gif deleted file mode 100644 index d5ac7c82848..00000000000 Binary files a/docs/es/images/column-oriented.gif and /dev/null differ diff --git a/docs/es/images/logo.svg b/docs/es/images/logo.svg deleted file mode 100644 index b5ab923ff65..00000000000 --- a/docs/es/images/logo.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/es/images/row-oriented.gif b/docs/es/images/row-oriented.gif deleted file mode 100644 index 41395b5693e..00000000000 Binary files a/docs/es/images/row-oriented.gif and /dev/null differ diff --git a/docs/es/index.md b/docs/es/index.md deleted file mode 100644 index c76fe32e33b..00000000000 --- a/docs/es/index.md +++ /dev/null @@ -1,97 +0,0 @@ ---- -machine_translated: false -machine_translated_rev: -toc_priority: 0 -toc_title: "Descripción" ---- - -# ¿Qué es ClickHouse? {#what-is-clickhouse} - -ClickHouse es un sistema de gestión de bases de datos (DBMS), orientado a columnas, para el procesamiento analítico de consultas en línea (OLAP). - -En un DBMS “normal”, orientado a filas, los datos se almacenan en este orden: - -| Fila | Argumento | JavaEnable | Titular | GoodEvent | EventTime | -|------|-------------|------------|---------------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Relaciones con inversores | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contáctenos | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mision | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | - -En otras palabras, todos los valores relacionados con una fila se almacenan físicamente uno junto al otro. - -Ejemplos de un DBMS orientado a filas son MySQL, Postgres y MS SQL Server. - -En un DBMS orientado a columnas, los datos se almacenan así: - -| Fila: | #0 | #1 | #2 | #N | -|-------------|---------------------------|---------------------|---------------------|-----| -| Argumento: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Titular: | Relaciones con inversores | Contáctenos | Mision | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | - -Estos ejemplos solo muestran el orden en el que se organizan los datos. Los valores de diferentes columnas se almacenan por separado y los datos de la misma columna se almacenan juntos. 
- -Ejemplos de un DBMS orientado a columnas: Vertica, Paraccel (Actian Matrix y Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise y Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid y kdb+. - -Los diferentes modos de ordenar los datos al guardarlos se adecúan mejor a diferentes escenarios. El escenario de acceso a los datos se refiere a qué consultas se hacen, con qué frecuencia y en qué proporción; cuántos datos se leen para cada tipo de consulta - filas, columnas y bytes; la relación entre lectura y actualización de datos; el tamaño de trabajo de los datos y qué tan localmente son usados; si se usan transacciones y qué tan aisladas están;requerimientos de replicación de los datos y de integridad lógica, requerimientos de latencia y caudal (throughput) para cada tipo de consulta, y cosas por el estilo. - -Cuanto mayor sea la carga en el sistema, más importante es personalizar el sistema configurado para que coincida con los requisitos del escenario de uso, y más fino será esta personalización. No existe un sistema que sea igualmente adecuado para escenarios significativamente diferentes. Si un sistema es adaptable a un amplio conjunto de escenarios, bajo una carga alta, el sistema manejará todos los escenarios igualmente mal, o funcionará bien para solo uno o algunos de los escenarios posibles. - -## Propiedades clave del escenario OLAP {#key-properties-of-olap-scenario} - -- La gran mayoría de las solicitudes son para acceso de lectura. -- Los datos se actualizan en lotes bastante grandes (\> 1000 filas), no por filas individuales; o no se actualiza en absoluto. -- Los datos se agregan a la base de datos pero no se modifican. -- Para las lecturas, se extrae un número bastante grande de filas de la base de datos, pero solo un pequeño subconjunto de columnas. -- Las tablas son “wide,” lo que significa que contienen un gran número de columnas. -- Las consultas son relativamente raras (generalmente cientos de consultas por servidor o menos por segundo). -- Para consultas simples, se permiten latencias de alrededor de 50 ms. -- Los valores de columna son bastante pequeños: números y cadenas cortas (por ejemplo, 60 bytes por URL). -- Requiere un alto rendimiento al procesar una sola consulta (hasta miles de millones de filas por segundo por servidor). -- Las transacciones no son necesarias. -- Bajos requisitos para la coherencia de los datos. -- Hay una tabla grande por consulta. Todas las mesas son pequeñas, excepto una. -- Un resultado de consulta es significativamente menor que los datos de origen. En otras palabras, los datos se filtran o se agregan, por lo que el resultado se ajusta a la RAM de un solo servidor. - -Es fácil ver que el escenario OLAP es muy diferente de otros escenarios populares (como el acceso OLTP o Key-Value). Por lo tanto, no tiene sentido intentar usar OLTP o una base de datos de valor clave para procesar consultas analíticas si desea obtener un rendimiento decente. Por ejemplo, si intenta usar MongoDB o Redis para análisis, obtendrá un rendimiento muy bajo en comparación con las bases de datos OLAP. - -## Por qué las bases de datos orientadas a columnas funcionan mejor en el escenario OLAP {#why-column-oriented-databases-work-better-in-the-olap-scenario} - -Las bases de datos orientadas a columnas son más adecuadas para los escenarios OLAP: son al menos 100 veces más rápidas en el procesamiento de la mayoría de las consultas. 
Las razones se explican en detalle a continuación, pero el hecho es más fácil de demostrar visualmente: - -**DBMS orientado a filas** - -![Row-oriented](images/row-oriented.gif#) - -**DBMS orientado a columnas** - -![Column-oriented](images/column-oriented.gif#) - -Ver la diferencia? - -### Entrada/salida {#inputoutput} - -1. Para una consulta analítica, solo es necesario leer un pequeño número de columnas de tabla. En una base de datos orientada a columnas, puede leer solo los datos que necesita. Por ejemplo, si necesita 5 columnas de 100, puede esperar una reducción de 20 veces en E/S. -2. Dado que los datos se leen en paquetes, es más fácil de comprimir. Los datos en columnas también son más fáciles de comprimir. Esto reduce aún más el volumen de E/S. -3. Debido a la reducción de E / S, más datos se ajustan a la memoria caché del sistema. - -Por ejemplo, la consulta “count the number of records for each advertising platform” requiere leer uno “advertising platform ID” columna, que ocupa 1 byte sin comprimir. Si la mayor parte del tráfico no proviene de plataformas publicitarias, puede esperar al menos una compresión de 10 veces de esta columna. Cuando se utiliza un algoritmo de compresión rápida, la descompresión de datos es posible a una velocidad de al menos varios gigabytes de datos sin comprimir por segundo. En otras palabras, esta consulta se puede procesar a una velocidad de aproximadamente varios miles de millones de filas por segundo en un único servidor. Esta velocidad se logra realmente en la práctica. - -### CPU {#cpu} - -Dado que la ejecución de una consulta requiere procesar un gran número de filas, ayuda enviar todas las operaciones para vectores completos en lugar de para filas separadas, o implementar el motor de consultas para que casi no haya costo de envío. Si no hace esto, con cualquier subsistema de disco medio decente, el intérprete de consultas inevitablemente detiene la CPU. Tiene sentido almacenar datos en columnas y procesarlos, cuando sea posible, por columnas. - -Hay dos formas de hacer esto: - -1. Un vector motor. Todas las operaciones se escriben para vectores, en lugar de para valores separados. Esto significa que no necesita llamar a las operaciones con mucha frecuencia, y los costos de envío son insignificantes. El código de operación contiene un ciclo interno optimizado. - -2. Generación de código. El código generado para la consulta tiene todas las llamadas indirectas. - -Esto no se hace en “normal” bases de datos, porque no tiene sentido cuando se ejecutan consultas simples. Sin embargo, hay excepciones. Por ejemplo, MemSQL utiliza la generación de código para reducir la latencia al procesar consultas SQL. (A modo de comparación, los DBMS analíticos requieren la optimización del rendimiento, no la latencia.) - -Tenga en cuenta que para la eficiencia de la CPU, el lenguaje de consulta debe ser declarativo (SQL o MDX), o al menos un vector (J, K). La consulta solo debe contener bucles implícitos, lo que permite la optimización. 
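
As an illustration of the point above, a typical analytical query against the hypothetical `hits_sample` table sketched earlier keeps the per-row loop implicit, so the engine is free to process whole column vectors at a time:

``` bash
# The loop over rows is implicit in the declarative query, leaving the engine
# free to run each operation (filter, date truncation, aggregation) over
# large vectors of column values rather than row by row.
clickhouse-client --query="
    SELECT toStartOfDay(EventTime) AS day, count() AS hits
    FROM hits_sample
    WHERE GoodEvent = 1
    GROUP BY day
    ORDER BY day"
```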
- -{## [Artículo Original](https://clickhouse.tech/docs/en/) ##} diff --git a/docs/es/interfaces/cli.md b/docs/es/interfaces/cli.md deleted file mode 100644 index 395f9831a4e..00000000000 --- a/docs/es/interfaces/cli.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 17 -toc_title: "Cliente de l\xEDnea de comandos" ---- - -# Cliente de línea de comandos {#command-line-client} - -ClickHouse proporciona un cliente de línea de comandos nativo: `clickhouse-client`. El cliente admite opciones de línea de comandos y archivos de configuración. Para obtener más información, consulte [Configuración](#interfaces_cli_configuration). - -[Instalar](../getting-started/index.md) desde el `clickhouse-client` paquete y ejecútelo con el comando `clickhouse-client`. - -``` bash -$ clickhouse-client -ClickHouse client version 19.17.1.1579 (official build). -Connecting to localhost:9000 as user default. -Connected to ClickHouse server version 19.17.1 revision 54428. - -:) -``` - -Las diferentes versiones de cliente y servidor son compatibles entre sí, pero es posible que algunas funciones no estén disponibles en clientes anteriores. Se recomienda utilizar la misma versión del cliente que la aplicación de servidor. Cuando intenta usar un cliente de la versión anterior, entonces el servidor, `clickhouse-client` muestra el mensaje: - - ClickHouse client version is older than ClickHouse server. It may lack support for new features. - -## Uso {#cli_usage} - -El cliente se puede utilizar en modo interactivo y no interactivo (por lotes). Para utilizar el modo por lotes, especifique el ‘query’ parámetro, o enviar datos a ‘stdin’ (verifica que ‘stdin’ no es un terminal), o ambos. Similar a la interfaz HTTP, cuando se utiliza el ‘query’ parámetro y el envío de datos a ‘stdin’ la solicitud es una concatenación de la ‘query’ parámetro, un avance de línea y los datos en ‘stdin’. Esto es conveniente para grandes consultas INSERT. - -Ejemplo de uso del cliente para insertar datos: - -``` bash -$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; - -$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -3, 'some text', '2016-08-14 00:00:00' -4, 'some more text', '2016-08-14 00:00:01' -_EOF - -$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -``` - -En el modo por lotes, el formato de datos predeterminado es TabSeparated. Puede establecer el formato en la cláusula FORMAT de la consulta. - -De forma predeterminada, solo puede procesar una única consulta en modo por lotes. Para realizar múltiples consultas desde un “script,” utilizar el `--multiquery` parámetro. Esto funciona para todas las consultas excepto INSERT . Los resultados de la consulta se generan consecutivamente sin separadores adicionales. Del mismo modo, para procesar un gran número de consultas, puede ejecutar ‘clickhouse-client’ para cada consulta. Tenga en cuenta que puede tomar decenas de milisegundos para iniciar el ‘clickhouse-client’ programa. - -En el modo interactivo, obtiene una línea de comandos donde puede ingresar consultas. - -Si ‘multiline’ no se especifica (el valor predeterminado): Para ejecutar la consulta, pulse Intro. El punto y coma no es necesario al final de la consulta. 
Para introducir una consulta de varias líneas, introduzca una barra invertida `\` antes de la alimentación de línea. Después de presionar Enter, se le pedirá que ingrese la siguiente línea de la consulta. - -Si se especifica multilínea: Para ejecutar una consulta, finalícela con un punto y coma y presione Intro. Si se omitió el punto y coma al final de la línea ingresada, se le pedirá que ingrese la siguiente línea de la consulta. - -Solo se ejecuta una sola consulta, por lo que se ignora todo después del punto y coma. - -Puede especificar `\G` en lugar o después del punto y coma. Esto indica el formato vertical. En este formato, cada valor se imprime en una línea separada, lo cual es conveniente para tablas anchas. Esta característica inusual se agregó por compatibilidad con la CLI de MySQL. - -La línea de comandos se basa en ‘replxx’ (similar a ‘readline’). En otras palabras, utiliza los atajos de teclado familiares y mantiene un historial. La historia está escrita para `~/.clickhouse-client-history`. - -De forma predeterminada, el formato utilizado es PrettyCompact. Puede cambiar el formato en la cláusula FORMAT de la consulta o especificando `\G` al final de la consulta, utilizando el `--format` o `--vertical` en la línea de comandos, o utilizando el archivo de configuración del cliente. - -Para salir del cliente, presione Ctrl+D o introduzca una de las siguientes opciones en lugar de una consulta: “exit”, “quit”, “logout”, “exit;”, “quit;”, “logout;”, “q”, “Q”, “:q” - -Al procesar una consulta, el cliente muestra: - -1. Progreso, que se actualiza no más de 10 veces por segundo (de forma predeterminada). Para consultas rápidas, es posible que el progreso no tenga tiempo para mostrarse. -2. La consulta con formato después del análisis, para la depuración. -3. El resultado en el formato especificado. -4. El número de líneas en el resultado, el tiempo transcurrido y la velocidad promedio de procesamiento de consultas. - -Puede cancelar una consulta larga presionando Ctrl + C. Sin embargo, aún tendrá que esperar un poco para que el servidor aborte la solicitud. No es posible cancelar una consulta en determinadas etapas. Si no espera y presiona Ctrl + C por segunda vez, el cliente saldrá. - -El cliente de línea de comandos permite pasar datos externos (tablas temporales externas) para consultar. Para obtener más información, consulte la sección “External data for query processing”. - -### Consultas con parámetros {#cli-queries-with-parameters} - -Puede crear una consulta con parámetros y pasarles valores desde la aplicación cliente. Esto permite evitar formatear consultas con valores dinámicos específicos en el lado del cliente. Por ejemplo: - -``` bash -$ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" -``` - -#### Sintaxis de consulta {#cli-queries-with-parameters-syntax} - -Formatee una consulta como de costumbre, luego coloque los valores que desea pasar de los parámetros de la aplicación a la consulta entre llaves en el siguiente formato: - -``` sql -{:} -``` - -- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_ = value`. -- `data type` — [Tipo de datos](../sql-reference/data-types/index.md) del valor del parámetro de la aplicación. Por ejemplo, una estructura de datos como `(integer, ('string', integer))` puede tener el `Tuple(UInt8, Tuple(String, UInt8))` tipo de datos (también puede usar otro [entero](../sql-reference/data-types/int-uint.md) tipo). 
- -#### Ejemplo {#example} - -``` bash -$ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" -``` - -## Configuración {#interfaces_cli_configuration} - -Puede pasar parámetros a `clickhouse-client` (todos los parámetros tienen un valor predeterminado) usando: - -- Desde la línea de comandos - - Las opciones de la línea de comandos anulan los valores y valores predeterminados de los archivos de configuración. - -- Archivos de configuración. - - Los valores de los archivos de configuración anulan los valores predeterminados. - -### Opciones de línea de comandos {#command-line-options} - -- `--host, -h` -– The server name, ‘localhost’ predeterminada. Puede utilizar el nombre o la dirección IPv4 o IPv6. -- `--port` – The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports. -- `--user, -u` – The username. Default value: default. -- `--password` – The password. Default value: empty string. -- `--query, -q` – The query to process when using non-interactive mode. -- `--database, -d` – Select the current default database. Default value: the current database from the server settings (‘default’ predeterminada). -- `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter). -- `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons. -- `--format, -f` – Use the specified default format to output the result. -- `--vertical, -E` – If specified, use the Vertical format by default to output the result. This is the same as ‘–format=Vertical’. En este formato, cada valor se imprime en una línea separada, lo que es útil cuando se muestran tablas anchas. -- `--time, -t` – If specified, print the query execution time to ‘stderr’ en modo no interactivo. -- `--stacktrace` – If specified, also print the stack trace if an exception occurs. -- `--config-file` – The name of the configuration file. -- `--secure` – If specified, will connect to server over secure connection. -- `--param_` — Value for a [consulta con parámetros](#cli-queries-with-parameters). - -### Archivos de configuración {#configuration_files} - -`clickhouse-client` utiliza el primer archivo existente de los siguientes: - -- Definido en el `--config-file` parámetro. -- `./clickhouse-client.xml` -- `~/.clickhouse-client/config.xml` -- `/etc/clickhouse-client/config.xml` - -Ejemplo de un archivo de configuración: - -``` xml - - username - password - False - -``` - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/cli/) diff --git a/docs/es/interfaces/cpp.md b/docs/es/interfaces/cpp.md deleted file mode 100644 index bc5dc3dbc24..00000000000 --- a/docs/es/interfaces/cpp.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 24 -toc_title: Biblioteca de clientes de C++ ---- - -# Biblioteca de clientes de C++ {#c-client-library} - -Ver README en [Bienvenidos](https://github.com/ClickHouse/clickhouse-cpp) repositorio. 
- -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/cpp/) diff --git a/docs/es/interfaces/formats.md b/docs/es/interfaces/formats.md deleted file mode 100644 index 03c1873d306..00000000000 --- a/docs/es/interfaces/formats.md +++ /dev/null @@ -1,1212 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 21 -toc_title: Formatos de entrada y salida ---- - -# Formatos para datos de entrada y salida {#formats} - -ClickHouse puede aceptar y devolver datos en varios formatos. Se puede utilizar un formato admitido para la entrada para analizar los datos proporcionados a `INSERT`s, para llevar a cabo `SELECT`s de una tabla respaldada por archivos como File, URL o HDFS, o para leer un diccionario externo. Se puede utilizar un formato compatible con la salida para organizar el -resultados de un `SELECT`, y realizar `INSERT`s en una tabla respaldada por archivos. - -Los formatos soportados son: - -| Formato | Entrada | Salida | -|-----------------------------------------------------------------|---------|--------| -| [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | -| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | -| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Plantilla](#format-template) | ✔ | ✔ | -| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -| [CSV](#csv) | ✔ | ✔ | -| [CSVWithNames](#csvwithnames) | ✔ | ✔ | -| [CustomSeparated](#format-customseparated) | ✔ | ✔ | -| [Valor](#data-format-values) | ✔ | ✔ | -| [Vertical](#vertical) | ✗ | ✔ | -| [VerticalRaw](#verticalraw) | ✗ | ✔ | -| [JSON](#json) | ✗ | ✔ | -| [JSONCompact](#jsoncompact) | ✗ | ✔ | -| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | -| [TSKV](#tskv) | ✔ | ✔ | -| [Bastante](#pretty) | ✗ | ✔ | -| [PrettyCompact](#prettycompact) | ✗ | ✔ | -| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | -| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | -| [Bienvenido a WordPress.](#prettyspace) | ✗ | ✔ | -| [Protobuf](#protobuf) | ✔ | ✔ | -| [Avro](#data-format-avro) | ✔ | ✔ | -| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | -| [Parquet](#data-format-parquet) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | -| [RowBinary](#rowbinary) | ✔ | ✔ | -| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [Nativo](#native) | ✔ | ✔ | -| [Nulo](#null) | ✗ | ✔ | -| [XML](#xml) | ✗ | ✔ | -| [CapnProto](#capnproto) | ✔ | ✗ | - -Puede controlar algunos parámetros de procesamiento de formato con la configuración de ClickHouse. Para obtener más información, lea el [Configuración](../operations/settings/settings.md) apartado. - -## TabSeparated {#tabseparated} - -En el formato TabSeparated, los datos se escriben por fila. Cada fila contiene valores separados por pestañas. Cada valor es seguido por una ficha, excepto el último valor de la fila, que es seguido por un avance de línea. Estrictamente las fuentes de línea Unix se asumen en todas partes. La última fila también debe contener un avance de línea al final. Los valores se escriben en formato de texto, sin incluir comillas y con caracteres especiales escapados. - -Este formato también está disponible bajo el nombre `TSV`. - -El `TabSeparated` es conveniente para procesar datos utilizando programas y scripts personalizados. Se usa de forma predeterminada en la interfaz HTTP y en el modo por lotes del cliente de línea de comandos. Este formato también permite transferir datos entre diferentes DBMS. 
Por ejemplo, puede obtener un volcado de MySQL y subirlo a ClickHouse, o viceversa. - -El `TabSeparated` el formato admite la salida de valores totales (cuando se usa WITH TOTALS) y valores extremos (cuando ‘extremes’ se establece en 1). En estos casos, los valores totales y los extremos se emiten después de los datos principales. El resultado principal, los valores totales y los extremos están separados entre sí por una línea vacía. Ejemplo: - -``` sql -SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` -``` - -``` text -2014-03-17 1406958 -2014-03-18 1383658 -2014-03-19 1405797 -2014-03-20 1353623 -2014-03-21 1245779 -2014-03-22 1031592 -2014-03-23 1046491 - -1970-01-01 8873898 - -2014-03-17 1031592 -2014-03-23 1406958 -``` - -### Formato de datos {#data-formatting} - -Los números enteros se escriben en forma decimal. Los números pueden contener un extra “+” carácter al principio (ignorado al analizar y no grabado al formatear). Los números no negativos no pueden contener el signo negativo. Al leer, se permite analizar una cadena vacía como cero, o (para tipos con signo) una cadena que consiste en solo un signo menos como cero. Los números que no encajan en el tipo de datos correspondiente se pueden analizar como un número diferente, sin un mensaje de error. - -Los números de punto flotante se escriben en forma decimal. El punto se usa como separador decimal. Las entradas exponenciales son compatibles, al igual que ‘inf’, ‘+inf’, ‘-inf’, y ‘nan’. Una entrada de números de coma flotante puede comenzar o terminar con un punto decimal. -Durante el formateo, la precisión puede perderse en los números de coma flotante. -Durante el análisis, no es estrictamente necesario leer el número representable de la máquina más cercano. - -Las fechas se escriben en formato AAAA-MM-DD y se analizan en el mismo formato, pero con los caracteres como separadores. -Las fechas con horas se escriben en el formato `YYYY-MM-DD hh:mm:ss` y analizado en el mismo formato, pero con cualquier carácter como separadores. -Todo esto ocurre en la zona horaria del sistema en el momento en que se inicia el cliente o servidor (dependiendo de cuál de ellos formatea los datos). Para fechas con horarios, no se especifica el horario de verano. Por lo tanto, si un volcado tiene tiempos durante el horario de verano, el volcado no coincide inequívocamente con los datos, y el análisis seleccionará una de las dos veces. -Durante una operación de lectura, las fechas incorrectas y las fechas con horas se pueden analizar con desbordamiento natural o como fechas y horas nulas, sin un mensaje de error. - -Como excepción, el análisis de fechas con horas también se admite en el formato de marca de tiempo Unix, si consta de exactamente 10 dígitos decimales. El resultado no depende de la zona horaria. Los formatos AAAA-MM-DD hh:mm:ss y NNNNNNNNNN se diferencian automáticamente. - -Las cadenas se generan con caracteres especiales de escape de barra invertida. Las siguientes secuencias de escape se utilizan para la salida: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. El análisis también admite las secuencias `\a`, `\v`, y `\xHH` (secuencias de escape hexagonales) y cualquier `\c` secuencias, donde `c` es cualquier carácter (estas secuencias se convierten en `c`). Por lo tanto, la lectura de datos admite formatos donde un avance de línea se puede escribir como `\n` o `\` o como un avance de línea. 
Por ejemplo, la cadena `Hello world` con un avance de línea entre las palabras en lugar de espacio se puede analizar en cualquiera de las siguientes variaciones: - -``` text -Hello\nworld - -Hello\ -world -``` - -La segunda variante es compatible porque MySQL la usa al escribir volcados separados por tabuladores. - -El conjunto mínimo de caracteres que debe escapar al pasar datos en formato TabSeparated: tabulación, salto de línea (LF) y barra invertida. - -Solo se escapa un pequeño conjunto de símbolos. Puede tropezar fácilmente con un valor de cadena que su terminal arruinará en la salida. - -Las matrices se escriben como una lista de valores separados por comas entre corchetes. Los elementos numéricos de la matriz tienen el formato normal. `Date` y `DateTime` están escritos entre comillas simples. Las cadenas se escriben entre comillas simples con las mismas reglas de escape que las anteriores. - -[NULL](../sql-reference/syntax.md) se formatea como `\N`. - -Cada elemento de [Anidar](../sql-reference/data-types/nested-data-structures/nested.md) estructuras se representa como una matriz. - -Por ejemplo: - -``` sql -CREATE TABLE nestedt -( - `id` UInt8, - `aux` Nested( - a UInt8, - b String - ) -) -ENGINE = TinyLog -``` - -``` sql -INSERT INTO nestedt Values ( 1, [1], ['a']) -``` - -``` sql -SELECT * FROM nestedt FORMAT TSV -``` - -``` text -1 [1] ['a'] -``` - -## TabSeparatedRaw {#tabseparatedraw} - -Difiere de `TabSeparated` formato en que las filas se escriben sin escapar. -Este formato solo es apropiado para generar un resultado de consulta, pero no para analizar (recuperar datos para insertar en una tabla). - -Este formato también está disponible bajo el nombre `TSVRaw`. - -## TabSeparatedWithNames {#tabseparatedwithnames} - -Difiere de la `TabSeparated` formato en que los nombres de columna se escriben en la primera fila. -Durante el análisis, la primera fila se ignora por completo. No puede usar nombres de columna para determinar su posición o para comprobar su corrección. -(Se puede agregar soporte para analizar la fila de encabezado en el futuro.) - -Este formato también está disponible bajo el nombre `TSVWithNames`. - -## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes} - -Difiere de la `TabSeparated` formato en que los nombres de columna se escriben en la primera fila, mientras que los tipos de columna están en la segunda fila. -Durante el análisis, la primera y la segunda filas se ignoran por completo. - -Este formato también está disponible bajo el nombre `TSVWithNamesAndTypes`. - -## Plantilla {#format-template} - -Este formato permite especificar una cadena de formato personalizado con marcadores de posición para los valores con una regla de escape especificada. - -Utiliza la configuración `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` cuando se utiliza `JSON` escapar, ver más) - -Configuración `format_template_row` especifica la ruta de acceso al archivo, que contiene una cadena de formato para las filas con la siguiente sintaxis: - -`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, - -donde `delimiter_i` es un delimitador entre valores (`$` símbolo se puede escapar como `$$`), -`column_i` es un nombre o índice de una columna cuyos valores se deben seleccionar o insertar (si está vacío, se omitirá la columna), -`serializeAs_i` es una regla de escape para los valores de columna. 
Se admiten las siguientes reglas de escape: - -- `CSV`, `JSON`, `XML` (similar a los formatos de los mismos nombres) -- `Escaped` (similar a `TSV`) -- `Quoted` (similar a `Values`) -- `Raw` (sin escapar, de manera similar a `TSVRaw`) -- `None` (sin regla de escape, ver más) - -Si se omite una regla de escape, entonces `None` se utilizará. `XML` y `Raw` son adecuados sólo para la salida. - -Entonces, para la siguiente cadena de formato: - - `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` - -los valores de `SearchPhrase`, `c` y `price` columnas, que se escapan como `Quoted`, `Escaped` y `JSON` se imprimirá (para seleccionar) o se esperará (para insertar) entre `Search phrase:`, `, count:`, `, ad price: $` y `;` delimitadores respectivamente. Por ejemplo: - -`Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - -El `format_template_rows_between_delimiter` setting especifica el delimitador entre filas, que se imprime (o se espera) después de cada fila, excepto la última (`\n` predeterminada) - -Configuración `format_template_resultset` especifica la ruta al archivo, que contiene una cadena de formato para el conjunto de resultados. La cadena de formato para el conjunto de resultados tiene la misma sintaxis que una cadena de formato para la fila y permite especificar un prefijo, un sufijo y una forma de imprimir información adicional. Contiene los siguientes marcadores de posición en lugar de nombres de columna: - -- `data` son las filas con datos en `format_template_row` formato, separados por `format_template_rows_between_delimiter`. Este marcador de posición debe ser el primer marcador de posición en la cadena de formato. -- `totals` es la fila con valores totales en `format_template_row` formato (cuando se usa WITH TOTALS) -- `min` es la fila con valores mínimos en `format_template_row` formato (cuando los extremos se establecen en 1) -- `max` es la fila con valores máximos en `format_template_row` formato (cuando los extremos se establecen en 1) -- `rows` es el número total de filas de salida -- `rows_before_limit` es el número mínimo de filas que habría habido sin LIMIT. Salida solo si la consulta contiene LIMIT. Si la consulta contiene GROUP BY, rows_before_limit_at_least es el número exacto de filas que habría habido sin un LIMIT . -- `time` es el tiempo de ejecución de la solicitud en segundos -- `rows_read` es el número de filas que se ha leído -- `bytes_read` es el número de bytes (sin comprimir) que se ha leído - -Marcador `data`, `totals`, `min` y `max` no debe tener una regla de escape especificada (o `None` debe especificarse explícitamente). Los marcadores de posición restantes pueden tener cualquier regla de escape especificada. -Si el `format_template_resultset` valor es una cadena vacía, `${data}` se utiliza como valor predeterminado. -Para el formato de consultas de inserción permite omitir algunas columnas o algunos campos si prefijo o sufijo (ver ejemplo). - -Seleccionar ejemplo: - -``` sql -SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS -format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n ' -``` - -`/some/path/resultset.format`: - -``` text - - Search phrases - - - - ${data} -
Search phrases
Search phrase Count
- - ${max} -
Max
- Processed ${rows_read:XML} rows in ${time:XML} sec - - -``` - -`/some/path/row.format`: - -``` text - ${0:XML} ${1:XML} -``` - -Resultado: - -``` html - - Search phrases - - - - - - - - -
Search phrases
Search phrase Count
8267016
bathroom interior design 2166
yandex 1655
spring 2014 fashion 1549
freeform photos 1480
- - -
Max
8873898
- Processed 3095973 rows in 0.1569913 sec - - -``` - -Insertar ejemplo: - -``` text -Some header -Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 -Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 -Total rows: 2 -``` - -``` sql -INSERT INTO UserActivity FORMAT Template SETTINGS -format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' -``` - -`/some/path/resultset.format`: - -``` text -Some header\n${data}\nTotal rows: ${:CSV}\n -``` - -`/some/path/row.format`: - -``` text -Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} -``` - -`PageViews`, `UserID`, `Duration` y `Sign` dentro de los marcadores de posición son nombres de columnas en la tabla. Valores después `Useless field` en filas y después `\nTotal rows:` en el sufijo será ignorado. -Todos los delimitadores de los datos de entrada deben ser estrictamente iguales a los delimitadores de las cadenas de formato especificadas. - -## TemplateIgnoreSpaces {#templateignorespaces} - -Este formato es adecuado sólo para la entrada. -Similar a `Template`, pero omite caracteres de espacio en blanco entre delimitadores y valores en la secuencia de entrada. Sin embargo, si las cadenas de formato contienen caracteres de espacio en blanco, se esperarán estos caracteres en la secuencia de entrada. También permite especificar marcadores de posición vacíos (`${}` o `${:None}`) para dividir algún delimitador en partes separadas para ignorar los espacios entre ellos. Dichos marcadores de posición se usan solo para omitir caracteres de espacio en blanco. -Es posible leer `JSON` usando este formato, si los valores de las columnas tienen el mismo orden en todas las filas. Por ejemplo, la siguiente solicitud se puede utilizar para insertar datos del ejemplo de salida de formato [JSON](#json): - -``` sql -INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS -format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = ',' -``` - -`/some/path/resultset.format`: - -``` text -{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}} -``` - -`/some/path/row.format`: - -``` text -{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}} -``` - -## TSKV {#tskv} - -Similar a TabSeparated , pero genera un valor en formato name=value . Los nombres se escapan de la misma manera que en el formato TabSeparated, y el símbolo = también se escapa. - -``` text -SearchPhrase= count()=8267016 -SearchPhrase=bathroom interior design count()=2166 -SearchPhrase=yandex count()=1655 -SearchPhrase=2014 spring fashion count()=1549 -SearchPhrase=freeform photos count()=1480 -SearchPhrase=angelina jolie count()=1245 -SearchPhrase=omsk count()=1112 -SearchPhrase=photos of dog breeds count()=1091 -SearchPhrase=curtain designs count()=1064 -SearchPhrase=baku count()=1000 -``` - -[NULL](../sql-reference/syntax.md) se formatea como `\N`. - -``` sql -SELECT * FROM t_null FORMAT TSKV -``` - -``` text -x=1 y=\N -``` - -Cuando hay una gran cantidad de columnas pequeñas, este formato no es efectivo y generalmente no hay razón para usarlo. Sin embargo, no es peor que JSONEachRow en términos de eficiencia. - -Both data output and parsing are supported in this format. 
For parsing, any order is supported for the values of different columns. It is acceptable for some values to be omitted – they are treated as equal to their default values. In this case, zeros and blank rows are used as default values. Complex values that could be specified in the table are not supported as defaults. - -El análisis permite la presencia del campo adicional `tskv` sin el signo igual o un valor. Este campo se ignora. - -## CSV {#csv} - -Formato de valores separados por comas ([RFC](https://tools.ietf.org/html/rfc4180)). - -Al formatear, las filas están encerradas en comillas dobles. Una comilla doble dentro de una cadena se genera como dos comillas dobles en una fila. No hay otras reglas para escapar de los personajes. Fecha y fecha-hora están encerrados en comillas dobles. Los números se emiten sin comillas. Los valores están separados por un carácter delimitador, que es `,` predeterminada. El carácter delimitador se define en la configuración [Formato_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Las filas se separan usando el avance de línea Unix (LF). Las matrices se serializan en CSV de la siguiente manera: primero, la matriz se serializa en una cadena como en el formato TabSeparated, y luego la cadena resultante se envía a CSV en comillas dobles. Las tuplas en formato CSV se serializan como columnas separadas (es decir, se pierde su anidamiento en la tupla). - -``` bash -$ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv -``` - -\*De forma predeterminada, el delimitador es `,`. Ver el [Formato_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) para obtener más información. - -Al analizar, todos los valores se pueden analizar con o sin comillas. Ambas comillas dobles y simples son compatibles. Las filas también se pueden organizar sin comillas. En este caso, se analizan hasta el carácter delimitador o el avance de línea (CR o LF). En violación del RFC, al analizar filas sin comillas, se ignoran los espacios y pestañas iniciales y finales. Para el avance de línea, se admiten los tipos Unix (LF), Windows (CR LF) y Mac OS Classic (CR LF). - -Los valores de entrada vacíos sin comillas se sustituyen por valores predeterminados para las columnas respectivas, si -[Entrada_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) -está habilitado. - -`NULL` se formatea como `\N` o `NULL` o una cadena vacía sin comillas (consulte la configuración [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) y [Entrada_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). - -El formato CSV admite la salida de totales y extremos de la misma manera que `TabSeparated`. - -## CSVWithNames {#csvwithnames} - -También imprime la fila del encabezado, similar a `TabSeparatedWithNames`. 
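
A short round trip with `CSVWithNames` might look like this (it reuses the `test.csv` table from the CSV example above; the file name is illustrative):

``` bash
# Export with a header row; --format_csv_delimiter changes the separator
# for both output and input.
clickhouse-client --format_csv_delimiter="|" \
    --query="SELECT * FROM test.csv FORMAT CSVWithNames" > data_with_header.csv

# Load the same file back; the first row is treated as the header.
clickhouse-client --format_csv_delimiter="|" \
    --query="INSERT INTO test.csv FORMAT CSVWithNames" < data_with_header.csv
```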
- -## CustomSeparated {#format-customseparated} - -Similar a [Plantilla](#format-template), pero imprime o lee todas las columnas y usa la regla de escape de la configuración `format_custom_escaping_rule` y delimitadores desde la configuración `format_custom_field_delimiter`, `format_custom_row_before_delimiter`, `format_custom_row_after_delimiter`, `format_custom_row_between_delimiter`, `format_custom_result_before_delimiter` y `format_custom_result_after_delimiter`, no de cadenas de formato. -También hay `CustomSeparatedIgnoreSpaces` formato, que es similar a `TemplateIgnoreSpaces`. - -## JSON {#json} - -Salida de datos en formato JSON. Además de las tablas de datos, también genera nombres y tipos de columnas, junto con información adicional: el número total de filas de salida y el número de filas que podrían haberse generado si no hubiera un LIMIT . Ejemplo: - -``` sql -SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON -``` - -``` json -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - { - "SearchPhrase": "", - "c": "8267016" - }, - { - "SearchPhrase": "bathroom interior design", - "c": "2166" - }, - { - "SearchPhrase": "yandex", - "c": "1655" - }, - { - "SearchPhrase": "spring 2014 fashion", - "c": "1549" - }, - { - "SearchPhrase": "freeform photos", - "c": "1480" - } - ], - - "totals": - { - "SearchPhrase": "", - "c": "8873898" - }, - - "extremes": - { - "min": - { - "SearchPhrase": "", - "c": "1480" - }, - "max": - { - "SearchPhrase": "", - "c": "8267016" - } - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -``` - -El JSON es compatible con JavaScript. Para garantizar esto, algunos caracteres se escapan adicionalmente: la barra inclinada `/` se escapa como `\/`; saltos de línea alternativos `U+2028` y `U+2029`, que rompen algunos navegadores, se escapan como `\uXXXX`. Los caracteres de control ASCII se escapan: retroceso, avance de formulario, avance de línea, retorno de carro y tabulación horizontal se reemplazan con `\b`, `\f`, `\n`, `\r`, `\t` , así como los bytes restantes en el rango 00-1F usando `\uXXXX` sequences. Invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. For compatibility with JavaScript, Int64 and UInt64 integers are enclosed in double-quotes by default. To remove the quotes, you can set the configuration parameter [output_format_json_quote_64bit_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) a 0. - -`rows` – The total number of output rows. - -`rows_before_limit_at_least` El número mínimo de filas habría sido sin LIMIT . Salida solo si la consulta contiene LIMIT. -Si la consulta contiene GROUP BY, rows_before_limit_at_least es el número exacto de filas que habría habido sin un LIMIT . - -`totals` – Total values (when using WITH TOTALS). - -`extremes` – Extreme values (when extremes are set to 1). - -Este formato solo es apropiado para generar un resultado de consulta, pero no para analizar (recuperar datos para insertar en una tabla). - -Soporta ClickHouse [NULL](../sql-reference/syntax.md), que se muestra como `null` en la salida JSON. - -Ver también el [JSONEachRow](#jsoneachrow) formato. - -## JSONCompact {#jsoncompact} - -Difiere de JSON solo en que las filas de datos se generan en matrices, no en objetos. 
- -Ejemplo: - -``` json -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - ["", "8267016"], - ["bathroom interior design", "2166"], - ["yandex", "1655"], - ["fashion trends spring 2014", "1549"], - ["freeform photo", "1480"] - ], - - "totals": ["","8873898"], - - "extremes": - { - "min": ["","1480"], - "max": ["","8267016"] - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -``` - -Este formato solo es apropiado para generar un resultado de consulta, pero no para analizar (recuperar datos para insertar en una tabla). -Ver también el `JSONEachRow` formato. - -## JSONEachRow {#jsoneachrow} - -Al usar este formato, ClickHouse genera filas como objetos JSON separados, delimitados por nuevas líneas, pero los datos en su conjunto no son JSON válidos. - -``` json -{"SearchPhrase":"curtain designs","count()":"1064"} -{"SearchPhrase":"baku","count()":"1000"} -{"SearchPhrase":"","count()":"8267016"} -``` - -Al insertar los datos, debe proporcionar un objeto JSON independiente para cada fila. - -### Insertar datos {#inserting-data} - -``` sql -INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} -``` - -ClickHouse permite: - -- Cualquier orden de pares clave-valor en el objeto. -- Omitiendo algunos valores. - -ClickHouse ignora los espacios entre los elementos y las comas después de los objetos. Puede pasar todos los objetos en una línea. No tiene que separarlos con saltos de línea. - -**Procesamiento de valores omitidos** - -ClickHouse sustituye los valores omitidos por los valores predeterminados para el [tipos de datos](../sql-reference/data-types/index.md). - -Si `DEFAULT expr` se especifica, ClickHouse utiliza diferentes reglas de sustitución dependiendo de la [Entrada_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) configuración. - -Considere la siguiente tabla: - -``` sql -CREATE TABLE IF NOT EXISTS example_table -( - x UInt32, - a DEFAULT x * 2 -) ENGINE = Memory; -``` - -- Si `input_format_defaults_for_omitted_fields = 0`, entonces el valor predeterminado para `x` y `a` igual `0` (como el valor predeterminado para el `UInt32` tipo de datos). -- Si `input_format_defaults_for_omitted_fields = 1`, entonces el valor predeterminado para `x` igual `0` pero el valor predeterminado de `a` igual `x * 2`. - -!!! note "Advertencia" - Al insertar datos con `insert_sample_with_metadata = 1`, ClickHouse consume más recursos computacionales, en comparación con la inserción con `insert_sample_with_metadata = 0`. - -### Selección de datos {#selecting-data} - -Considere el `UserActivity` tabla como un ejemplo: - -``` text -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` - -Consulta `SELECT * FROM UserActivity FORMAT JSONEachRow` devoluciones: - -``` text -{"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} -{"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} -``` - -A diferencia de la [JSON](#json) formato, no hay sustitución de secuencias UTF-8 no válidas. Los valores se escapan de la misma manera que para `JSON`. - -!!! note "Nota" - Cualquier conjunto de bytes se puede generar en las cadenas. 
Utilice el `JSONEachRow` si está seguro de que los datos de la tabla se pueden formatear como JSON sin perder ninguna información. - -### Uso de estructuras anidadas {#jsoneachrow-nested} - -Si tienes una mesa con [Anidar](../sql-reference/data-types/nested-data-structures/nested.md) columnas de tipo de datos, puede insertar datos JSON con la misma estructura. Habilite esta función con el [Entrada_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) configuración. - -Por ejemplo, considere la siguiente tabla: - -``` sql -CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory -``` - -Como se puede ver en el `Nested` descripción del tipo de datos, ClickHouse trata cada componente de la estructura anidada como una columna separada (`n.s` y `n.i` para nuestra mesa). Puede insertar datos de la siguiente manera: - -``` sql -INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} -``` - -Para insertar datos como un objeto JSON jerárquico, establezca [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). - -``` json -{ - "n": { - "s": ["abc", "def"], - "i": [1, 23] - } -} -``` - -Sin esta configuración, ClickHouse produce una excepción. - -``` sql -SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' -``` - -``` text -┌─name────────────────────────────┬─value─┐ -│ input_format_import_nested_json │ 0 │ -└─────────────────────────────────┴───────┘ -``` - -``` sql -INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} -``` - -``` text -Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) -``` - -``` sql -SET input_format_import_nested_json=1 -INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} -SELECT * FROM json_each_row_nested -``` - -``` text -┌─n.s───────────┬─n.i────┐ -│ ['abc','def'] │ [1,23] │ -└───────────────┴────────┘ -``` - -## Nativo {#native} - -El formato más eficiente. Los datos son escritos y leídos por bloques en formato binario. Para cada bloque, el número de filas, número de columnas, nombres y tipos de columnas y partes de columnas de este bloque se registran una tras otra. En otras palabras, este formato es “columnar” – it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. - -Puede utilizar este formato para generar rápidamente volcados que sólo pueden ser leídos por el DBMS de ClickHouse. No tiene sentido trabajar con este formato usted mismo. - -## Nulo {#null} - -Nada es salida. Sin embargo, la consulta se procesa y, cuando se utiliza el cliente de línea de comandos, los datos se transmiten al cliente. Esto se usa para pruebas, incluidas las pruebas de rendimiento. -Obviamente, este formato solo es apropiado para la salida, no para el análisis. - -## Bastante {#pretty} - -Salidas de datos como tablas de arte Unicode, también utilizando secuencias de escape ANSI para establecer colores en el terminal. -Se dibuja una cuadrícula completa de la tabla, y cada fila ocupa dos líneas en la terminal. -Cada bloque de resultados se muestra como una tabla separada. 
Esto es necesario para que los bloques se puedan generar sin resultados de almacenamiento en búfer (el almacenamiento en búfer sería necesario para calcular previamente el ancho visible de todos los valores). - -[NULL](../sql-reference/syntax.md) se emite como `ᴺᵁᴸᴸ`. - -Ejemplo (mostrado para el [PrettyCompact](#prettycompact) formato): - -``` sql -SELECT * FROM t_null -``` - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -└───┴──────┘ -``` - -Las filas no se escapan en formatos Pretty \*. Se muestra un ejemplo para el [PrettyCompact](#prettycompact) formato: - -``` sql -SELECT 'String with \'quotes\' and \t character' AS Escaping_test -``` - -``` text -┌─Escaping_test────────────────────────┐ -│ String with 'quotes' and character │ -└──────────────────────────────────────┘ -``` - -Para evitar volcar demasiados datos al terminal, solo se imprimen las primeras 10.000 filas. Si el número de filas es mayor o igual que 10.000, el mensaje “Showed first 10 000” se imprime. -Este formato solo es apropiado para generar un resultado de consulta, pero no para analizar (recuperar datos para insertar en una tabla). - -El formato Pretty admite la salida de valores totales (cuando se usa WITH TOTALS) y extremos (cuando ‘extremes’ se establece en 1). En estos casos, los valores totales y los valores extremos se generan después de los datos principales, en tablas separadas. Ejemplo (mostrado para el [PrettyCompact](#prettycompact) formato): - -``` sql -SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact -``` - -``` text -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1406958 │ -│ 2014-03-18 │ 1383658 │ -│ 2014-03-19 │ 1405797 │ -│ 2014-03-20 │ 1353623 │ -│ 2014-03-21 │ 1245779 │ -│ 2014-03-22 │ 1031592 │ -│ 2014-03-23 │ 1046491 │ -└────────────┴─────────┘ - -Totals: -┌──EventDate─┬───────c─┐ -│ 1970-01-01 │ 8873898 │ -└────────────┴─────────┘ - -Extremes: -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1031592 │ -│ 2014-03-23 │ 1406958 │ -└────────────┴─────────┘ -``` - -## PrettyCompact {#prettycompact} - -Difiere de [Bastante](#pretty) en que la cuadrícula se dibuja entre filas y el resultado es más compacto. -Este formato se usa de forma predeterminada en el cliente de línea de comandos en modo interactivo. - -## PrettyCompactMonoBlock {#prettycompactmonoblock} - -Difiere de [PrettyCompact](#prettycompact) en que hasta 10,000 filas se almacenan en búfer, luego se salen como una sola tabla, no por bloques. - -## PrettyNoEscapes {#prettynoescapes} - -Difiere de Pretty en que las secuencias de escape ANSI no se usan. Esto es necesario para mostrar este formato en un navegador, así como para usar el ‘watch’ utilidad de línea de comandos. - -Ejemplo: - -``` bash -$ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" -``` - -Puede usar la interfaz HTTP para mostrar en el navegador. - -### PrettyCompactNoEscapes {#prettycompactnoescapes} - -Lo mismo que el ajuste anterior. - -### PrettySpaceNoEscapes {#prettyspacenoescapes} - -Lo mismo que el ajuste anterior. - -## Bienvenido a WordPress {#prettyspace} - -Difiere de [PrettyCompact](#prettycompact) en ese espacio en blanco (caracteres de espacio) se usa en lugar de la cuadrícula. - -## RowBinary {#rowbinary} - -Formatea y analiza datos por fila en formato binario. Las filas y los valores se enumeran consecutivamente, sin separadores. -Este formato es menos eficiente que el formato nativo, ya que está basado en filas. 
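
For example, a dump-and-restore round trip through a file could look like this (table and file names are illustrative):

``` bash
# Dump a query result in the row-based binary encoding described below.
clickhouse-client --query="SELECT * FROM some_table FORMAT RowBinary" > dump.bin

# Load the same stream into a table with identical column types.
clickhouse-client --query="INSERT INTO some_table_copy FORMAT RowBinary" < dump.bin
```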
- -Los integradores usan una representación little-endian de longitud fija. Por ejemplo, UInt64 usa 8 bytes. -DateTime se representa como UInt32 que contiene la marca de tiempo Unix como el valor. -Date se representa como un objeto UInt16 que contiene el número de días desde 1970-01-01 como el valor. -La cadena se representa como una longitud varint (sin signo [LEB128](https://en.wikipedia.org/wiki/LEB128)), seguido de los bytes de la cadena. -FixedString se representa simplemente como una secuencia de bytes. - -La matriz se representa como una longitud varint (sin signo [LEB128](https://en.wikipedia.org/wiki/LEB128)), seguido de elementos sucesivos de la matriz. - -Para [NULL](../sql-reference/syntax.md#null-literal) soporte, se añade un byte adicional que contiene 1 o 0 antes de cada [NULL](../sql-reference/data-types/nullable.md) valor. Si 1, entonces el valor es `NULL` y este byte se interpreta como un valor separado. Si es 0, el valor después del byte no es `NULL`. - -## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes} - -Similar a [RowBinary](#rowbinary), pero con encabezado añadido: - -- [LEB128](https://en.wikipedia.org/wiki/LEB128)-número codificado de columnas (N) -- N `String`s especificando nombres de columna -- N `String`s especificando tipos de columna - -## Valor {#data-format-values} - -Imprime cada fila entre paréntesis. Las filas están separadas por comas. No hay coma después de la última fila. Los valores dentro de los corchetes también están separados por comas. Los números se emiten en formato decimal sin comillas. Las matrices se emiten entre corchetes. Las cadenas, fechas y fechas con horas se generan entre comillas. Las reglas de escape y el análisis son similares a las [TabSeparated](#tabseparated) formato. Durante el formateo, los espacios adicionales no se insertan, pero durante el análisis, se permiten y omiten (excepto los espacios dentro de los valores de la matriz, que no están permitidos). [NULL](../sql-reference/syntax.md) se representa como `NULL`. - -The minimum set of characters that you need to escape when passing data in Values ​​format: single quotes and backslashes. - -Este es el formato que se utiliza en `INSERT INTO t VALUES ...`, pero también puede usarlo para formatear los resultados de la consulta. - -Ver también: [input_format_values_interpret_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) y [input_format_values_deduce_templates_of_expressions](../operations/settings/settings.md#settings-input_format_values_deduce_templates_of_expressions) configuración. - -## Vertical {#vertical} - -Imprime cada valor en una línea independiente con el nombre de columna especificado. Este formato es conveniente para imprimir solo una o varias filas si cada fila consta de un gran número de columnas. - -[NULL](../sql-reference/syntax.md) se emite como `ᴺᵁᴸᴸ`. - -Ejemplo: - -``` sql -SELECT * FROM t_null FORMAT Vertical -``` - -``` text -Row 1: -────── -x: 1 -y: ᴺᵁᴸᴸ -``` - -Las filas no se escapan en formato vertical: - -``` sql -SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical -``` - -``` text -Row 1: -────── -test: string with 'quotes' and with some special - characters -``` - -Este formato solo es apropiado para generar un resultado de consulta, pero no para analizar (recuperar datos para insertar en una tabla). - -## VerticalRaw {#verticalraw} - -Similar a [Vertical](#vertical), pero con escapar deshabilitado. 
Este formato solo es adecuado para generar resultados de consultas, no para analizar (recibir datos e insertarlos en la tabla). - -## XML {#xml} - -El formato XML es adecuado solo para la salida, no para el análisis. Ejemplo: - -``` xml - - - - - - SearchPhrase - String - - - count() - UInt64 - - - - - - - 8267016 - - - bathroom interior design - 2166 - - - yandex - 1655 - - - 2014 spring fashion - 1549 - - - freeform photos - 1480 - - - angelina jolie - 1245 - - - omsk - 1112 - - - photos of dog breeds - 1091 - - - curtain designs - 1064 - - - baku - 1000 - - - 10 - 141137 - -``` - -Si el nombre de la columna no tiene un formato aceptable, simplemente ‘field’ se utiliza como el nombre del elemento. En general, la estructura XML sigue la estructura JSON. -Just as for JSON, invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. - -En los valores de cadena, los caracteres `<` y `&` se escaparon como `<` y `&`. - -Las matrices se emiten como `HelloWorld...`y tuplas como `HelloWorld...`. - -## CapnProto {#capnproto} - -Cap'n Proto es un formato de mensaje binario similar a Protocol Buffers y Thrift, pero no como JSON o MessagePack. - -Los mensajes de Cap'n Proto están estrictamente escritos y no autodescribidos, lo que significa que necesitan una descripción de esquema externo. El esquema se aplica sobre la marcha y se almacena en caché para cada consulta. - -``` bash -$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" -``` - -Donde `schema.capnp` se ve así: - -``` capnp -struct Message { - SearchPhrase @0 :Text; - c @1 :Uint64; -} -``` - -La deserialización es efectiva y generalmente no aumenta la carga del sistema. - -Ver también [Esquema de formato](#formatschema). - -## Protobuf {#protobuf} - -Protobuf - es un [Búferes de protocolo](https://developers.google.com/protocol-buffers/) formato. - -Este formato requiere un esquema de formato externo. El esquema se almacena en caché entre las consultas. -ClickHouse soporta ambos `proto2` y `proto3` sintaxis. Se admiten campos repetidos / opcionales / requeridos. - -Ejemplos de uso: - -``` sql -SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' -``` - -``` bash -cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" -``` - -donde el archivo `schemafile.proto` se ve así: - -``` capnp -syntax = "proto3"; - -message MessageType { - string name = 1; - string surname = 2; - uint32 birthDate = 3; - repeated string phoneNumbers = 4; -}; -``` - -Para encontrar la correspondencia entre las columnas de la tabla y los campos del tipo de mensaje de Protocol Buffers, ClickHouse compara sus nombres. -Esta comparación no distingue entre mayúsculas y minúsculas y los caracteres `_` (subrayado) y `.` (punto) se consideran iguales. -Si los tipos de una columna y un campo del mensaje de Protocol Buffers son diferentes, se aplica la conversión necesaria. - -Los mensajes anidados son compatibles. Por ejemplo, para el campo `z` en el siguiente tipo de mensaje - -``` capnp -message MessageType { - message XType { - message YType { - int32 z; - }; - repeated YType y; - }; - XType x; -}; -``` - -ClickHouse intenta encontrar una columna llamada `x.y.z` (o `x_y_z` o `X.y_Z` y así sucesivamente). 
-Los mensajes anidados son adecuados para [estructuras de datos anidados](../sql-reference/data-types/nested-data-structures/nested.md). - -Valores predeterminados definidos en un esquema protobuf como este - -``` capnp -syntax = "proto2"; - -message MessageType { - optional int32 result_per_page = 3 [default = 10]; -} -``` - -no se aplican; el [valores predeterminados de la tabla](../sql-reference/statements/create.md#create-default-values) se utilizan en lugar de ellos. - -ClickHouse entra y emite mensajes protobuf en el `length-delimited` formato. -Significa que antes de cada mensaje debe escribirse su longitud como un [varint](https://developers.google.com/protocol-buffers/docs/encoding#varints). -Ver también [cómo leer / escribir mensajes protobuf delimitados por longitud en idiomas populares](https://cwiki.apache.org/confluence/display/GEODE/Delimiting+Protobuf+Messages). - -## Avro {#data-format-avro} - -[Más información](http://avro.apache.org/) es un marco de serialización de datos orientado a filas desarrollado dentro del proyecto Hadoop de Apache. - -El formato ClickHouse Avro admite lectura y escritura [Archivos de datos Avro](http://avro.apache.org/docs/current/spec.html#Object+Container+Files). - -### Coincidencia de tipos de datos {#data_types-matching} - -La siguiente tabla muestra los tipos de datos admitidos y cómo coinciden con ClickHouse [tipos de datos](../sql-reference/data-types/index.md) en `INSERT` y `SELECT` consulta. - -| Tipo de datos Avro `INSERT` | Tipo de datos ClickHouse | Tipo de datos Avro `SELECT` | -|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------|------------------------------| -| `boolean`, `int`, `long`, `float`, `double` | [¿Cómo funciona?)](../sql-reference/data-types/int-uint.md), [UInt(8\|16\|32)](../sql-reference/data-types/int-uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](../sql-reference/data-types/int-uint.md), [UInt64](../sql-reference/data-types/int-uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](../sql-reference/data-types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](../sql-reference/data-types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [Cadena](../sql-reference/data-types/string.md) | `bytes` | -| `bytes`, `string`, `fixed` | [Cadena fija (N)](../sql-reference/data-types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum (8\|16)](../sql-reference/data-types/enum.md) | `enum` | -| `array(T)` | [Matriz (T)](../sql-reference/data-types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [Nivel de Cifrado WEP)](../sql-reference/data-types/date.md) | `union(null, T)` | -| `null` | [Nullable (nada)](../sql-reference/data-types/special-data-types/nothing.md) | `null` | -| `int (date)` \* | [Fecha](../sql-reference/data-types/date.md) | `int (date)` \* | -| `long (timestamp-millis)` \* | [¿Qué puedes encontrar en Neodigit)](../sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* | -| `long (timestamp-micros)` \* | [Cómo hacer esto?)](../sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* | - -\* [Tipos lógicos Avro](http://avro.apache.org/docs/current/spec.html#Logical+Types) - -Tipos de datos Avro no admitidos: `record` (no root), `map` - -Tipos de datos lógicos Avro no admitidos: `uuid`, `time-millis`, `time-micros`, `duration` - -### Insertar datos {#inserting-data-1} 
- -Para insertar datos de un archivo Avro en la tabla ClickHouse: - -``` bash -$ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" -``` - -El esquema raíz del archivo Avro de entrada debe ser de `record` tipo. - -Para encontrar la correspondencia entre las columnas de la tabla y los campos de Avro esquema ClickHouse compara sus nombres. Esta comparación distingue entre mayúsculas y minúsculas. -Los campos no utilizados se omiten. - -Los tipos de datos de las columnas de tabla ClickHouse pueden diferir de los campos correspondientes de los datos de Avro insertados. Al insertar datos, ClickHouse interpreta los tipos de datos de acuerdo con la tabla anterior y luego [elenco](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) los datos al tipo de columna correspondiente. - -### Selección de datos {#selecting-data-1} - -Para seleccionar datos de la tabla ClickHouse en un archivo Avro: - -``` bash -$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro -``` - -Los nombres de columna deben: - -- comenzar con `[A-Za-z_]` -- posteriormente contienen sólo `[A-Za-z0-9_]` - -La compresión de archivos Avro de salida y el intervalo de sincronización se pueden configurar con [Sistema abierto.](../operations/settings/settings.md#settings-output_format_avro_codec) y [Sistema abierto.](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectivamente. - -## AvroConfluent {#data-format-avro-confluent} - -AvroConfluent admite la decodificación de mensajes Avro de un solo objeto comúnmente utilizados con [Kafka](https://kafka.apache.org/) y [Registro de Esquemas Confluentes](https://docs.confluent.io/current/schema-registry/index.html). - -Cada mensaje de Avro incrusta un id de esquema que se puede resolver en el esquema real con la ayuda del Registro de esquemas. - -Los esquemas se almacenan en caché una vez resueltos. - -La URL del registro de esquemas se configura con [Todos los derechos reservados.](../operations/settings/settings.md#settings-format_avro_schema_registry_url) - -### Coincidencia de tipos de datos {#data_types-matching-1} - -Lo mismo que [Avro](#data-format-avro) - -### Uso {#usage} - -Para verificar rápidamente la resolución del esquema, puede usar [Método de codificación de datos:](https://github.com/edenhill/kafkacat) con [Sistema abierto.](../operations/utilities/clickhouse-local.md#clickhouse-local): - -``` bash -$ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' -1 a -2 b -3 c -``` - -Utilizar `AvroConfluent` con [Kafka](../engines/table-engines/integrations/kafka.md): - -``` sql -CREATE TABLE topic1_stream -( - field1 String, - field2 String -) -ENGINE = Kafka() -SETTINGS -kafka_broker_list = 'kafka-broker', -kafka_topic_list = 'topic1', -kafka_group_name = 'group1', -kafka_format = 'AvroConfluent'; - -SET format_avro_schema_registry_url = 'http://schema-registry'; - -SELECT * FROM topic1_stream; -``` - -!!! note "Advertencia" - Configuración `format_avro_schema_registry_url` necesita ser configurado en `users.xml` para mantener su valor después de un reinicio. - -## Parquet {#data-format-parquet} - -[Apache Parquet](http://parquet.apache.org/) es un formato de almacenamiento columnar generalizado en el ecosistema Hadoop. 
ClickHouse admite operaciones de lectura y escritura para este formato. - -### Coincidencia de tipos de datos {#data_types-matching-2} - -La siguiente tabla muestra los tipos de datos admitidos y cómo coinciden con ClickHouse [tipos de datos](../sql-reference/data-types/index.md) en `INSERT` y `SELECT` consulta. - -| Tipo de datos de parquet (`INSERT`) | Tipo de datos ClickHouse | Tipo de datos de parquet (`SELECT`) | -|-------------------------------------|-----------------------------------------------------------|-------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | -| `DATE32` | [Fecha](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [FechaHora](../sql-reference/data-types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [Cadena](../sql-reference/data-types/string.md) | `STRING` | -| — | [Cadena fija](../sql-reference/data-types/fixedstring.md) | `STRING` | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | - -ClickHouse admite una precisión configurable de `Decimal` tipo. El `INSERT` consulta trata el Parquet `DECIMAL` tipo como el ClickHouse `Decimal128` tipo. - -Tipos de datos de parquet no admitidos: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. - -Los tipos de datos de las columnas de tabla ClickHouse pueden diferir de los campos correspondientes de los datos de Parquet insertados. Al insertar datos, ClickHouse interpreta los tipos de datos de acuerdo con la tabla anterior y luego [elenco](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) los datos de ese tipo de datos que se establece para la columna de tabla ClickHouse. - -### Insertar y seleccionar datos {#inserting-and-selecting-data} - -Puede insertar datos de Parquet desde un archivo en la tabla ClickHouse mediante el siguiente comando: - -``` bash -$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" -``` - -Puede seleccionar datos de una tabla ClickHouse y guardarlos en algún archivo en el formato Parquet mediante el siguiente comando: - -``` bash -$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} -``` - -Para intercambiar datos con Hadoop, puede usar [Motor de mesa HDFS](../engines/table-engines/integrations/hdfs.md). - -## ORC {#data-format-orc} - -[Apache ORC](https://orc.apache.org/) es un formato de almacenamiento columnar generalizado en el ecosistema Hadoop. Solo puede insertar datos en este formato en ClickHouse. - -### Coincidencia de tipos de datos {#data_types-matching-3} - -La siguiente tabla muestra los tipos de datos admitidos y cómo coinciden con ClickHouse [tipos de datos](../sql-reference/data-types/index.md) en `INSERT` consulta. 
- -| Tipo de datos ORC (`INSERT`) | Tipo de datos ClickHouse | -|------------------------------|------------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Fecha](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [FechaHora](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [Cadena](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | - -ClickHouse soporta la precisión configurable de la `Decimal` tipo. El `INSERT` consulta trata el ORC `DECIMAL` tipo como el ClickHouse `Decimal128` tipo. - -Tipos de datos ORC no admitidos: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. - -Los tipos de datos de las columnas de tabla ClickHouse no tienen que coincidir con los campos de datos ORC correspondientes. Al insertar datos, ClickHouse interpreta los tipos de datos de acuerdo con la tabla anterior y luego [elenco](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) los datos al tipo de datos establecido para la columna de tabla ClickHouse. - -### Insertar datos {#inserting-data-2} - -Puede insertar datos ORC de un archivo en la tabla ClickHouse mediante el siguiente comando: - -``` bash -$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" -``` - -Para intercambiar datos con Hadoop, puede usar [Motor de mesa HDFS](../engines/table-engines/integrations/hdfs.md). - -## Esquema de formato {#formatschema} - -El valor establece el nombre de archivo que contiene el esquema de formato `format_schema`. -Es necesario establecer esta configuración cuando se utiliza uno de los formatos `Cap'n Proto` y `Protobuf`. -El esquema de formato es una combinación de un nombre de archivo y el nombre de un tipo de mensaje en este archivo, delimitado por dos puntos, -e.g. `schemafile.proto:MessageType`. -Si el archivo tiene la extensión estándar para el formato (por ejemplo, `.proto` para `Protobuf`), -se puede omitir y en este caso, el esquema de formato se ve así `schemafile:MessageType`. - -Si introduce o emite datos a través del [cliente](../interfaces/cli.md) en el [modo interactivo](../interfaces/cli.md#cli_usage), el nombre de archivo especificado en el esquema de formato -puede contener una ruta absoluta o una ruta relativa al directorio actual en el cliente. -Si utiliza el cliente en el [modo por lotes](../interfaces/cli.md#cli_usage), la ruta de acceso al esquema debe ser relativa por razones de seguridad. - -Si introduce o emite datos a través del [Interfaz HTTP](../interfaces/http.md) el nombre de archivo especificado en el esquema de formato -debe estar ubicado en el directorio especificado en [format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path) -en la configuración del servidor. 
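-
-For example, to insert length-delimited Protobuf messages from a file, the format schema can be passed as an ordinary setting on the command line. A minimal sketch, assuming a hypothetical table `some_table`, data file `data.bin`, and schema file `schemafile.proto` containing a message type `MessageType`:
-
-``` bash
-# --format_schema takes "<file>:<message type>"; the file path is resolved as described above
-$ cat data.bin | clickhouse-client --query="INSERT INTO some_table FORMAT Protobuf" --format_schema='schemafile.proto:MessageType'
-```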
- -## Salto de errores {#skippingerrors} - -Algunos formatos como `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` y `Protobuf` puede omitir la fila rota si se produjo un error de análisis y continuar el análisis desde el comienzo de la siguiente fila. Ver [Entrada_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) y -[Entrada_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) configuración. -Limitacion: -- En caso de error de análisis `JSONEachRow` omite todos los datos hasta la nueva línea (o EOF), por lo que las filas deben estar delimitadas por `\n` para contar los errores correctamente. -- `Template` y `CustomSeparated` use el delimitador después de la última columna y el delimitador entre filas para encontrar el comienzo de la siguiente fila, por lo que omitir errores solo funciona si al menos uno de ellos no está vacío. - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/formats/) diff --git a/docs/es/interfaces/http.md b/docs/es/interfaces/http.md deleted file mode 100644 index ab510a268e3..00000000000 --- a/docs/es/interfaces/http.md +++ /dev/null @@ -1,617 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 19 -toc_title: Interfaz HTTP ---- - -# Interfaz HTTP {#http-interface} - -La interfaz HTTP le permite usar ClickHouse en cualquier plataforma desde cualquier lenguaje de programación. Lo usamos para trabajar desde Java y Perl, así como scripts de shell. En otros departamentos, la interfaz HTTP se usa desde Perl, Python y Go. La interfaz HTTP es más limitada que la interfaz nativa, pero tiene una mejor compatibilidad. - -De forma predeterminada, clickhouse-server escucha HTTP en el puerto 8123 (esto se puede cambiar en la configuración). - -Si realiza una solicitud GET / sin parámetros, devuelve 200 códigos de respuesta y la cadena que definió en [http_server_default_response](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-http_server_default_response) valor predeterminado “Ok.” (con un avance de línea al final) - -``` bash -$ curl 'http://localhost:8123/' -Ok. -``` - -Use la solicitud GET / ping en los scripts de comprobación de estado. Este controlador siempre devuelve “Ok.” (con un avance de línea al final). Disponible a partir de la versión 18.12.13. - -``` bash -$ curl 'http://localhost:8123/ping' -Ok. -``` - -Enviar la solicitud como una URL ‘query’ parámetro, o como un POST. O envíe el comienzo de la consulta en el ‘query’ parámetro, y el resto en el POST (explicaremos más adelante por qué esto es necesario). El tamaño de la URL está limitado a 16 KB, así que tenga esto en cuenta al enviar consultas grandes. - -Si tiene éxito, recibirá el código de respuesta 200 y el resultado en el cuerpo de respuesta. -Si se produce un error, recibirá el código de respuesta 500 y un texto de descripción de error en el cuerpo de la respuesta. - -Al usar el método GET, ‘readonly’ se establece. En otras palabras, para consultas que modifican datos, solo puede usar el método POST. Puede enviar la consulta en sí misma en el cuerpo POST o en el parámetro URL. 
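-
-As a quick illustration of the read-only behaviour described above (a sketch; the table `t` is an assumed example): a query sent with GET can only read data, while data-modifying statements should be sent in a POST body:
-
-``` bash
-# Read-only query via GET:
-$ curl 'http://localhost:8123/?query=SELECT%20count()%20FROM%20t'
-
-# A data-modifying query is expected to be rejected over GET; send it as POST instead:
-$ echo 'TRUNCATE TABLE t' | curl 'http://localhost:8123/' --data-binary @-
-```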
- -Ejemplos: - -``` bash -$ curl 'http://localhost:8123/?query=SELECT%201' -1 - -$ wget -nv -O- 'http://localhost:8123/?query=SELECT 1' -1 - -$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123 -HTTP/1.0 200 OK -Date: Wed, 27 Nov 2019 10:30:18 GMT -Connection: Close -Content-Type: text/tab-separated-values; charset=UTF-8 -X-ClickHouse-Server-Display-Name: clickhouse.ru-central1.internal -X-ClickHouse-Query-Id: 5abe861c-239c-467f-b955-8a201abb8b7f -X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} - -1 -``` - -Como puede ver, curl es algo inconveniente ya que los espacios deben ser URL escapadas. -Aunque wget escapa de todo en sí, no recomendamos usarlo porque no funciona bien sobre HTTP 1.1 cuando se usa keep-alive y Transfer-Encoding: chunked . - -``` bash -$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- -1 - -$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @- -1 - -$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- -1 -``` - -Si se envía parte de la consulta en el parámetro y parte en el POST, se inserta un avance de línea entre estas dos partes de datos. -Ejemplo (esto no funcionará): - -``` bash -$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- -Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL -ECT 1 -, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception -``` - -De forma predeterminada, los datos se devuelven en formato TabSeparated (para obtener más información, “Formats” apartado). -Utilice la cláusula FORMAT de la consulta para solicitar cualquier otro formato. - -``` bash -$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- -┏━━━┓ -┃ 1 ┃ -┡━━━┩ -│ 1 │ -└───┘ -``` - -El método POST de transmitir datos es necesario para las consultas INSERT. En este caso, puede escribir el comienzo de la consulta en el parámetro URL y usar POST para pasar los datos a insertar. Los datos a insertar podrían ser, por ejemplo, un volcado separado por tabuladores de MySQL. De esta manera, la consulta INSERT reemplaza LOAD DATA LOCAL INFILE de MySQL. - -Ejemplos: Crear una tabla: - -``` bash -$ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- -``` - -Uso de la consulta INSERT familiar para la inserción de datos: - -``` bash -$ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- -``` - -Los datos se pueden enviar por separado de la consulta: - -``` bash -$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- -``` - -Puede especificar cualquier formato de datos. El ‘Values’ el formato es el mismo que el que se usa al escribir INSERT INTO t VALUES: - -``` bash -$ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- -``` - -Para insertar datos de un volcado separado por tabuladores, especifique el formato correspondiente: - -``` bash -$ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- -``` - -Lectura del contenido de la tabla. 
Los datos se emiten en orden aleatorio debido al procesamiento de consultas paralelas: - -``` bash -$ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' -7 -8 -9 -10 -11 -12 -1 -2 -3 -4 -5 -6 -``` - -Eliminando la mesa. - -``` bash -$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- -``` - -Para las solicitudes correctas que no devuelven una tabla de datos, se devuelve un cuerpo de respuesta vacío. - -Puede utilizar el formato interno de compresión ClickHouse al transmitir datos. Los datos comprimidos tienen un formato no estándar, y deberá usar el `clickhouse-compressor` programa para trabajar con él (se instala con el `clickhouse-client` paquete). Para aumentar la eficiencia de la inserción de datos, puede deshabilitar la verificación de suma de comprobación [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) configuración. - -Si ha especificado `compress=1` en la URL, el servidor comprime los datos que le envía. -Si ha especificado `decompress=1` en la dirección URL, el servidor descomprime los mismos datos que `POST` método. - -También puede optar por utilizar [Compresión HTTP](https://en.wikipedia.org/wiki/HTTP_compression). Para enviar un `POST` solicitud, agregue el encabezado de solicitud `Content-Encoding: compression_method`. Para que ClickHouse comprima la respuesta, debe agregar `Accept-Encoding: compression_method`. Soporta ClickHouse `gzip`, `br`, y `deflate` [métodos de compresión](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). Para habilitar la compresión HTTP, debe usar ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) configuración. Puede configurar el nivel de compresión de datos [http_zlib_compression_level](#settings-http_zlib_compression_level) para todos los métodos de compresión. - -Puede usar esto para reducir el tráfico de red al transmitir una gran cantidad de datos o para crear volcados que se comprimen inmediatamente. - -Ejemplos de envío de datos con compresión: - -``` bash -#Sending data to the server: -$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' - -#Sending data to the client: -$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' -``` - -!!! note "Nota" - Algunos clientes HTTP pueden descomprimir datos del servidor de forma predeterminada (con `gzip` y `deflate`) y puede obtener datos descomprimidos incluso si usa la configuración de compresión correctamente. - -Puede usar el ‘database’ Parámetro URL para especificar la base de datos predeterminada. - -``` bash -$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -``` - -De forma predeterminada, la base de datos que está registrada en la configuración del servidor se utiliza como base de datos predeterminada. De forma predeterminada, esta es la base de datos llamada ‘default’. Como alternativa, siempre puede especificar la base de datos utilizando un punto antes del nombre de la tabla. - -El nombre de usuario y la contraseña se pueden indicar de una de estas tres maneras: - -1. Uso de la autenticación básica HTTP. Ejemplo: - - - -``` bash -$ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- -``` - -1. 
En el ‘user’ y ‘password’ Parámetros de URL. Ejemplo: - - - -``` bash -$ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- -``` - -1. Utilizar ‘X-ClickHouse-User’ y ‘X-ClickHouse-Key’ cabecera. Ejemplo: - - - -``` bash -$ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: password' 'http://localhost:8123/' -d @- -``` - -Si no se especifica el nombre de usuario, `default` se utiliza el nombre. Si no se especifica la contraseña, se utiliza la contraseña vacía. -También puede utilizar los parámetros de URL para especificar cualquier configuración para procesar una sola consulta o perfiles completos de configuración. Ejemplo:http://localhost:8123/?perfil=web&max_rows_to_read=1000000000&consulta=SELECCIONA+1 - -Para obtener más información, consulte [Configuración](../operations/settings/index.md) apartado. - -``` bash -$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -``` - -Para obtener información sobre otros parámetros, consulte la sección “SET”. - -Del mismo modo, puede utilizar sesiones ClickHouse en el protocolo HTTP. Para hacer esto, debe agregar el `session_id` GET parámetro a la solicitud. Puede usar cualquier cadena como ID de sesión. De forma predeterminada, la sesión finaliza después de 60 segundos de inactividad. Para cambiar este tiempo de espera, modifique `default_session_timeout` configuración en la configuración del servidor, o `session_timeout` GET parámetro a la solicitud. Para comprobar el estado de la sesión, `session_check=1` parámetro. Solo se puede ejecutar una consulta a la vez en una sola sesión. - -Puede recibir información sobre el progreso de una consulta en `X-ClickHouse-Progress` encabezados de respuesta. Para hacer esto, habilite [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Ejemplo de la secuencia de encabezado: - -``` text -X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} -X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} -X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} -``` - -Posibles campos de encabezado: - -- `read_rows` — Number of rows read. -- `read_bytes` — Volume of data read in bytes. -- `total_rows_to_read` — Total number of rows to be read. -- `written_rows` — Number of rows written. -- `written_bytes` — Volume of data written in bytes. - -Las solicitudes en ejecución no se detienen automáticamente si se pierde la conexión HTTP. El análisis y el formato de datos se realizan en el lado del servidor, y el uso de la red puede ser ineficaz. -Opcional ‘query_id’ parámetro se puede pasar como el ID de consulta (cualquier cadena). Para obtener más información, consulte la sección “Settings, replace_running_query”. - -Opcional ‘quota_key’ parámetro se puede pasar como la clave de cuota (cualquier cadena). Para obtener más información, consulte la sección “Quotas”. - -La interfaz HTTP permite pasar datos externos (tablas temporales externas) para consultar. Para obtener más información, consulte la sección “External data for query processing”. - -## Almacenamiento en búfer de respuesta {#response-buffering} - -Puede habilitar el almacenamiento en búfer de respuestas en el lado del servidor. El `buffer_size` y `wait_end_of_query` Los parámetros URL se proporcionan para este propósito. 
- -`buffer_size` determina el número de bytes en el resultado para almacenar en búfer en la memoria del servidor. Si un cuerpo de resultado es mayor que este umbral, el búfer se escribe en el canal HTTP y los datos restantes se envían directamente al canal HTTP. - -Para asegurarse de que toda la respuesta se almacena en búfer, establezca `wait_end_of_query=1`. En este caso, los datos que no se almacenan en la memoria se almacenarán en un archivo de servidor temporal. - -Ejemplo: - -``` bash -$ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' -``` - -Utilice el almacenamiento en búfer para evitar situaciones en las que se produjo un error de procesamiento de consultas después de enviar al cliente el código de respuesta y los encabezados HTTP. En esta situación, se escribe un mensaje de error al final del cuerpo de la respuesta y, en el lado del cliente, el error solo se puede detectar en la etapa de análisis. - -### Consultas con parámetros {#cli-queries-with-parameters} - -Puede crear una consulta con parámetros y pasar valores para ellos desde los parámetros de solicitud HTTP correspondientes. Para obtener más información, consulte [Consultas con parámetros para CLI](cli.md#cli-queries-with-parameters). - -### Ejemplo {#example} - -``` bash -$ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" -``` - -## Interfaz HTTP predefinida {#predefined_http_interface} - -ClickHouse admite consultas específicas a través de la interfaz HTTP. Por ejemplo, puede escribir datos en una tabla de la siguiente manera: - -``` bash -$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- -``` - -ClickHouse también es compatible con la interfaz HTTP predefinida que puede ayudarle a una integración más fácil con herramientas de terceros como [Prometheus exportador](https://github.com/percona-lab/clickhouse_exporter). - -Ejemplo: - -- En primer lugar, agregue esta sección al archivo de configuración del servidor: - - - -``` xml - - - /predefined_query - POST,GET - - predefined_query_handler - SELECT * FROM system.metrics LIMIT 5 FORMAT Template SETTINGS format_template_resultset = 'prometheus_template_output_format_resultset', format_template_row = 'prometheus_template_output_format_row', format_template_rows_between_delimiter = '\n' - - - ... - ... - -``` - -- Ahora puede solicitar la url directamente para los datos en el formato Prometheus: - - - -``` bash -$ curl -v 'http://localhost:8123/predefined_query' -* Trying ::1... -* Connected to localhost (::1) port 8123 (#0) -> GET /predefined_query HTTP/1.1 -> Host: localhost:8123 -> User-Agent: curl/7.47.0 -> Accept: */* -> -< HTTP/1.1 200 OK -< Date: Tue, 28 Apr 2020 08:52:56 GMT -< Connection: Keep-Alive -< Content-Type: text/plain; charset=UTF-8 -< X-ClickHouse-Server-Display-Name: i-mloy5trc -< Transfer-Encoding: chunked -< X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a -< X-ClickHouse-Format: Template -< X-ClickHouse-Timezone: Asia/Shanghai -< Keep-Alive: timeout=3 -< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} -< -# HELP "Query" "Number of executing queries" -# TYPE "Query" counter -"Query" 1 - -# HELP "Merge" "Number of executing background merges" -# TYPE "Merge" counter -"Merge" 0 - -# HELP "PartMutation" "Number of mutations (ALTER DELETE/UPDATE)" -# TYPE "PartMutation" counter -"PartMutation" 0 - -# HELP "ReplicatedFetch" "Number of data parts being fetched from replica" -# TYPE "ReplicatedFetch" counter -"ReplicatedFetch" 0 - -# HELP "ReplicatedSend" "Number of data parts being sent to replicas" -# TYPE "ReplicatedSend" counter -"ReplicatedSend" 0 - -* Connection #0 to host localhost left intact - - -* Connection #0 to host localhost left intact -``` - -Como puede ver en el ejemplo, si `` está configurado en la configuración.archivo xml y `` puede contener muchos `s`. ClickHouse coincidirá con las solicitudes HTTP recibidas con el tipo predefinido en `` y el primer emparejado ejecuta el controlador. Luego, ClickHouse ejecutará la consulta predefinida correspondiente si la coincidencia es exitosa. - -> Ahora `` puede configurar ``, ``, ``,``: -> `` es responsable de hacer coincidir la parte del método de la solicitud HTTP. `` se ajusta plenamente a la definición de [método](https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods) en el protocolo HTTP. Es una configuración opcional. Si no está definido en el archivo de configuración, no coincide con la parte del método de la solicitud HTTP. -> -> `` es responsable de hacer coincidir la parte url de la solicitud HTTP. Es compatible con [RE2](https://github.com/google/re2)expresiones regulares. Es una configuración opcional. 
Si no está definido en el archivo de configuración, no coincide con la parte url de la solicitud HTTP. -> -> `` es responsable de hacer coincidir la parte del encabezado de la solicitud HTTP. Es compatible con las expresiones regulares de RE2. Es una configuración opcional. Si no está definido en el archivo de configuración, no coincide con la parte de encabezado de la solicitud HTTP. -> -> `` contiene la parte de procesamiento principal. Ahora `` puede configurar ``, ``, ``, ``, ``, ``. -> \> `` Actualmente soporta tres tipos: **Dirección de correo electrónico**, **Nombre de la red inalámbrica (SSID):**, **estática**. -> \> -> \> `` - utilizar con el tipo predefined_query_handler, ejecuta la consulta cuando se llama al controlador. -> \> -> \> `` - utilizar con el tipo dynamic_query_handler, extrae y ejecuta el valor correspondiente al `` valor en parámetros de solicitud HTTP. -> \> -> \> `` - uso con tipo estático, código de estado de respuesta. -> \> -> \> `` - uso con tipo estático, respuesta [tipo de contenido](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type). -> \> -> \> `` - uso con tipo estático, contenido de respuesta enviado al cliente, cuando se usa el prefijo ‘file://’ o ‘config://’, encontrar el contenido del archivo o configuración enviar al cliente. - -A continuación están los métodos de configuración para los diferentes ``. - -## Dirección de correo electrónico {#predefined_query_handler} - -`` admite la configuración de valores Settings y query_params. Puede configurar `` en el tipo de ``. - -`` valor es una consulta predefinida de ``, que es ejecutado por ClickHouse cuando se hace coincidir una solicitud HTTP y se devuelve el resultado de la consulta. Es una configuración imprescindible. - -En el ejemplo siguiente se definen los valores de `max_threads` y `max_alter_threads` configuración, a continuación, consulta la tabla del sistema para comprobar si estos ajustes se han establecido correctamente. - -Ejemplo: - -``` xml - - - [^/]+)(/(?P[^/]+))?]]> - GET - - TEST_HEADER_VALUE - [^/]+)(/(?P[^/]+))?]]> - - - predefined_query_handler - SELECT value FROM system.settings WHERE name = {name_1:String} - SELECT name, value FROM system.settings WHERE name = {name_2:String} - - - -``` - -``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' -1 -max_alter_threads 2 -``` - -!!! note "precaución" - En uno `` sólo es compatible con uno `` de un tipo de plaquita. - -## Nombre de la red inalámbrica (SSID): {#dynamic_query_handler} - -En ``, consulta se escribe en forma de param de la solicitud HTTP. La diferencia es que en ``, consulta se escribe en el archivo de configuración. Puede configurar `` en ``. - -ClickHouse extrae y ejecuta el valor correspondiente al `` valor en la url de la solicitud HTTP. El valor predeterminado de `` ser `/query` . Es una configuración opcional. Si no hay una definición en el archivo de configuración, el parámetro no se pasa. - -Para experimentar con esta funcionalidad, el ejemplo define los valores de max_threads y max_alter_threads y consulta si la configuración se estableció correctamente. 
- -Ejemplo: - -``` xml - - - - TEST_HEADER_VALUE_DYNAMIC - - dynamic_query_handler - query_param - - - -``` - -``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' -max_threads 1 -max_alter_threads 2 -``` - -## estática {#static} - -`` puede volver [Content_type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type), [estatus](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) y response_content. response_content puede devolver el contenido especificado - -Ejemplo: - -Devuelve un mensaje. - -``` xml - - - GET - xxx - /hi - - static - 402 - text/html; charset=UTF-8 - Say Hi! - - - -``` - -``` bash -$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi' -* Trying ::1... -* Connected to localhost (::1) port 8123 (#0) -> GET /hi HTTP/1.1 -> Host: localhost:8123 -> User-Agent: curl/7.47.0 -> Accept: */* -> XXX:xxx -> -< HTTP/1.1 402 Payment Required -< Date: Wed, 29 Apr 2020 03:51:26 GMT -< Connection: Keep-Alive -< Content-Type: text/html; charset=UTF-8 -< Transfer-Encoding: chunked -< Keep-Alive: timeout=3 -< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} -< -* Connection #0 to host localhost left intact -Say Hi!% -``` - -Busque el contenido de la configuración enviada al cliente. - -``` xml -
-<get_config_static_handler><![CDATA[...]]></get_config_static_handler>
- - - - GET - xxx - /get_config_static_handler - - static - config://get_config_static_handler - - - -``` - -``` bash -$ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler' -* Trying ::1... -* Connected to localhost (::1) port 8123 (#0) -> GET /get_config_static_handler HTTP/1.1 -> Host: localhost:8123 -> User-Agent: curl/7.47.0 -> Accept: */* -> XXX:xxx -> -< HTTP/1.1 200 OK -< Date: Wed, 29 Apr 2020 04:01:24 GMT -< Connection: Keep-Alive -< Content-Type: text/plain; charset=UTF-8 -< Transfer-Encoding: chunked -< Keep-Alive: timeout=3 -< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} -< -* Connection #0 to host localhost left intact -
% -``` - -Encuentra el contenido del archivo enviado al cliente. - -``` xml - - - GET - xxx - /get_absolute_path_static_handler - - static - text/html; charset=UTF-8 - file:///absolute_path_file.html - - - - GET - xxx - /get_relative_path_static_handler - - static - text/html; charset=UTF-8 - file://./relative_path_file.html - - - -``` - -``` bash -$ user_files_path='/var/lib/clickhouse/user_files' -$ sudo echo "Relative Path File" > $user_files_path/relative_path_file.html -$ sudo echo "Absolute Path File" > $user_files_path/absolute_path_file.html -$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler' -* Trying ::1... -* Connected to localhost (::1) port 8123 (#0) -> GET /get_absolute_path_static_handler HTTP/1.1 -> Host: localhost:8123 -> User-Agent: curl/7.47.0 -> Accept: */* -> XXX:xxx -> -< HTTP/1.1 200 OK -< Date: Wed, 29 Apr 2020 04:18:16 GMT -< Connection: Keep-Alive -< Content-Type: text/html; charset=UTF-8 -< Transfer-Encoding: chunked -< Keep-Alive: timeout=3 -< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} -< -Absolute Path File -* Connection #0 to host localhost left intact -$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' -* Trying ::1... -* Connected to localhost (::1) port 8123 (#0) -> GET /get_relative_path_static_handler HTTP/1.1 -> Host: localhost:8123 -> User-Agent: curl/7.47.0 -> Accept: */* -> XXX:xxx -> -< HTTP/1.1 200 OK -< Date: Wed, 29 Apr 2020 04:18:31 GMT -< Connection: Keep-Alive -< Content-Type: text/html; charset=UTF-8 -< Transfer-Encoding: chunked -< Keep-Alive: timeout=3 -< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} -< -Relative Path File -* Connection #0 to host localhost left intact -``` - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/http_interface/) diff --git a/docs/es/interfaces/index.md b/docs/es/interfaces/index.md deleted file mode 100644 index 3632c8a9e29..00000000000 --- a/docs/es/interfaces/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Interfaz -toc_priority: 14 -toc_title: "Implantaci\xF3n" ---- - -# Interfaz {#interfaces} - -ClickHouse proporciona dos interfaces de red (ambas se pueden ajustar opcionalmente en TLS para mayor seguridad): - -- [HTTP](http.md), que está documentado y fácil de usar directamente. -- [TCP nativo](tcp.md), que tiene menos sobrecarga. - -En la mayoría de los casos, se recomienda utilizar la herramienta o biblioteca apropiada en lugar de interactuar con ellos directamente. 
Oficialmente apoyados por Yandex son los siguientes: - -- [Cliente de línea de comandos](cli.md) -- [Controlador JDBC](jdbc.md) -- [Controlador ODBC](odbc.md) -- [Biblioteca cliente de C++](cpp.md) - -También hay una amplia gama de bibliotecas de terceros para trabajar con ClickHouse: - -- [Bibliotecas de clientes](third-party/client-libraries.md) -- [Integración](third-party/integrations.md) -- [Interfaces visuales](third-party/gui.md) - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/) diff --git a/docs/es/interfaces/jdbc.md b/docs/es/interfaces/jdbc.md deleted file mode 100644 index 7303dec8960..00000000000 --- a/docs/es/interfaces/jdbc.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 22 -toc_title: Controlador JDBC ---- - -# Controlador JDBC {#jdbc-driver} - -- **[Conductor oficial](https://github.com/ClickHouse/clickhouse-jdbc)** -- Controladores de terceros: - - [Sistema abierto.](https://github.com/housepower/ClickHouse-Native-JDBC) - - [Método de codificación de datos:](https://github.com/blynkkk/clickhouse4j) - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/jdbc/) diff --git a/docs/es/interfaces/mysql.md b/docs/es/interfaces/mysql.md deleted file mode 100644 index a5124c61dd5..00000000000 --- a/docs/es/interfaces/mysql.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 20 -toc_title: Interfaz MySQL ---- - -# Interfaz MySQL {#mysql-interface} - -ClickHouse soporta el protocolo de cable MySQL. Puede ser habilitado por [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) configuración en el archivo de configuración: - -``` xml -9004 -``` - -Ejemplo de conexión mediante la herramienta de línea de comandos `mysql`: - -``` bash -$ mysql --protocol tcp -u default -P 9004 -``` - -Salida si una conexión se realizó correctamente: - -``` text -Welcome to the MySQL monitor. Commands end with ; or \g. -Your MySQL connection id is 4 -Server version: 20.2.1.1-ClickHouse - -Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. - -Oracle is a registered trademark of Oracle Corporation and/or its -affiliates. Other names may be trademarks of their respective -owners. - -Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. - -mysql> -``` - -Para la compatibilidad con todos los clientes MySQL, se recomienda especificar la contraseña de usuario con [doble SHA1](../operations/settings/settings-users.md#password_double_sha1_hex) en el archivo de configuración. -Si la contraseña de usuario se especifica usando [SHA256](../operations/settings/settings-users.md#password_sha256_hex), algunos clientes no podrán autenticarse (mysqljs y versiones antiguas de la herramienta de línea de comandos mysql). 
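-
-A minimal sketch of producing such a hash (the password `my_password` is only an example; the resulting hex string goes into `password_double_sha1_hex` in `users.xml`):
-
-``` bash
-# SHA1 is applied twice: hex digest -> binary -> SHA1 again
-$ echo -n 'my_password' | sha1sum | tr -d ' -' | xxd -r -p | sha1sum | tr -d ' -'
-```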
- -Restricción: - -- las consultas preparadas no son compatibles - -- algunos tipos de datos se envían como cadenas - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/mysql/) diff --git a/docs/es/interfaces/odbc.md b/docs/es/interfaces/odbc.md deleted file mode 100644 index 6ccb979c7f7..00000000000 --- a/docs/es/interfaces/odbc.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 23 -toc_title: Conductor ODBC ---- - -# Conductor ODBC {#odbc-driver} - -- [Conductor oficial](https://github.com/ClickHouse/clickhouse-odbc). - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/odbc/) diff --git a/docs/es/interfaces/tcp.md b/docs/es/interfaces/tcp.md deleted file mode 100644 index 47df0d12829..00000000000 --- a/docs/es/interfaces/tcp.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 18 -toc_title: Interfaz nativa (TCP) ---- - -# Interfaz nativa (TCP) {#native-interface-tcp} - -El protocolo nativo se utiliza en el [cliente de línea de comandos](cli.md), para la comunicación entre servidores durante el procesamiento de consultas distribuidas, y también en otros programas de C, Desafortunadamente, el protocolo nativo de ClickHouse aún no tiene especificaciones formales, pero puede ser diseñado de manera inversa desde el código fuente de ClickHouse (comenzando [por aquí](https://github.com/ClickHouse/ClickHouse/tree/master/src/Client)) y/o mediante la interceptación y el análisis del tráfico TCP. - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/tcp/) diff --git a/docs/es/interfaces/third-party/client-libraries.md b/docs/es/interfaces/third-party/client-libraries.md deleted file mode 100644 index b61ab1a5d9c..00000000000 --- a/docs/es/interfaces/third-party/client-libraries.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -toc_priority: 26 -toc_title: Client Libraries ---- - -# Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} - -!!! warning "Disclaimer" - Yandex does **not** maintain the libraries listed below and haven’t done any extensive testing to ensure their quality. 
- -- Python - - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) - - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) - - [clickhouse-client](https://github.com/yurial/clickhouse-client) - - [aiochclient](https://github.com/maximdanilchenko/aiochclient) - - [asynch](https://github.com/long2ice/asynch) -- PHP - - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) - - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) - - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) - - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) - - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) - - [SeasClick C++ client](https://github.com/SeasX/SeasClick) -- Go - - [clickhouse](https://github.com/kshvakov/clickhouse/) - - [go-clickhouse](https://github.com/roistat/go-clickhouse) - - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) -- NodeJs - - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) - - [node-clickhouse](https://github.com/apla/node-clickhouse) -- Perl - - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) - - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) - - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) -- Ruby - - [ClickHouse (Ruby)](https://github.com/shlima/click_house) - - [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord) -- R - - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) - - [RClickHouse](https://github.com/IMSMWU/RClickHouse) -- Java - - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) - - [clickhouse-client](https://github.com/Ecwid/clickhouse-client) -- Scala - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- Kotlin - - [AORM](https://github.com/TanVD/AORM) -- C# - - [Octonica.ClickHouseClient](https://github.com/Octonica/ClickHouseClient) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) -- Elixir - - [clickhousex](https://github.com/appodeal/clickhousex/) - - [pillar](https://github.com/sofakingworld/pillar) -- Nim - - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) - -[Original article](https://clickhouse.tech/docs/en/interfaces/third-party/client_libraries/) diff --git a/docs/es/interfaces/third-party/gui.md b/docs/es/interfaces/third-party/gui.md deleted file mode 100644 index 754c0f68c69..00000000000 --- a/docs/es/interfaces/third-party/gui.md +++ /dev/null @@ -1,156 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 28 -toc_title: Interfaces Visuales ---- - -# Interfaces visuales de desarrolladores de terceros {#visual-interfaces-from-third-party-developers} - -## De código abierto {#open-source} - -### Tabix {#tabix} - -Interfaz web para ClickHouse en el [Tabix](https://github.com/tabixio/tabix) proyecto. - -Función: - -- Funciona con ClickHouse directamente desde el navegador, sin la necesidad de instalar software adicional. -- Editor de consultas con resaltado de sintaxis. -- Autocompletado de comandos. 
-- Herramientas para el análisis gráfico de la ejecución de consultas. -- Opciones de esquema de color. - -[Documentación de Tabix](https://tabix.io/doc/). - -### Sistema abierto {#houseops} - -[Sistema abierto.](https://github.com/HouseOps/HouseOps) Es una interfaz de usuario / IDE para OSX, Linux y Windows. - -Función: - -- Generador de consultas con resaltado de sintaxis. Ver la respuesta en una tabla o vista JSON. -- Exportar resultados de consultas como CSV o JSON. -- Lista de procesos con descripciones. Modo de escritura. Capacidad de parar (`KILL`) proceso. -- Gráfico de base de datos. Muestra todas las tablas y sus columnas con información adicional. -- Una vista rápida del tamaño de la columna. -- Configuración del servidor. - -Las siguientes características están planificadas para el desarrollo: - -- Gestión de bases de datos. -- Gestión de usuarios. -- Análisis de datos en tiempo real. -- Supervisión de clúster. -- Gestión de clústeres. -- Monitoreo de tablas replicadas y Kafka. - -### Faro {#lighthouse} - -[Faro](https://github.com/VKCOM/lighthouse) Es una interfaz web ligera para ClickHouse. - -Función: - -- Lista de tablas con filtrado y metadatos. -- Vista previa de la tabla con filtrado y clasificación. -- Ejecución de consultas de sólo lectura. - -### Redash {#redash} - -[Redash](https://github.com/getredash/redash) es una plataforma para la visualización de datos. - -Admite múltiples fuentes de datos, incluido ClickHouse, Redash puede unir los resultados de consultas de diferentes fuentes de datos en un conjunto de datos final. - -Función: - -- Potente editor de consultas. -- Explorador de base de datos. -- Herramientas de visualización, que le permiten representar datos en diferentes formas. - -### DBeaver {#dbeaver} - -[DBeaver](https://dbeaver.io/) - Cliente de base de datos de escritorio universal con soporte ClickHouse. - -Función: - -- Desarrollo de consultas con resaltado de sintaxis y autocompletado. -- Lista de tablas con filtros y búsqueda de metadatos. -- Vista previa de datos de tabla. -- Búsqueda de texto completo. - -### Sistema abierto {#clickhouse-cli} - -[Sistema abierto.](https://github.com/hatarist/clickhouse-cli) es un cliente de línea de comandos alternativo para ClickHouse, escrito en Python 3. - -Función: - -- Autocompletado. -- Resaltado de sintaxis para las consultas y la salida de datos. -- Soporte de buscapersonas para la salida de datos. -- Comandos similares a PostgreSQL personalizados. - -### Sistema abierto {#clickhouse-flamegraph} - -[Sistema abierto.](https://github.com/Slach/clickhouse-flamegraph) es una herramienta especializada para visualizar el `system.trace_log` como [Flamegraph](http://www.brendangregg.com/flamegraphs.html). - -### Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica {#clickhouse-plantuml} - -[Método de codificación de datos:](https://pypi.org/project/clickhouse-plantuml/) es un script para generar [PlantUML](https://plantuml.com/) diagrama de esquemas de tablas. - -## Comercial {#commercial} - -### DataGrip {#datagrip} - -[DataGrip](https://www.jetbrains.com/datagrip/) Es un IDE de base de datos de JetBrains con soporte dedicado para ClickHouse. También está integrado en otras herramientas basadas en IntelliJ: PyCharm, IntelliJ IDEA, GoLand, PhpStorm y otros. - -Función: - -- Finalización de código muy rápida. -- Resaltado de sintaxis de ClickHouse. -- Soporte para características específicas de ClickHouse, por ejemplo, columnas anidadas, motores de tablas. -- Editor de datos. 
-- Refactorizaciones. -- Búsqueda y navegación. - -### Yandex DataLens {#yandex-datalens} - -[Yandex DataLens](https://cloud.yandex.ru/services/datalens) es un servicio de visualización y análisis de datos. - -Función: - -- Amplia gama de visualizaciones disponibles, desde simples gráficos de barras hasta paneles complejos. -- Los paneles podrían ponerse a disposición del público. -- Soporte para múltiples fuentes de datos, incluyendo ClickHouse. -- Almacenamiento de datos materializados basados en ClickHouse. - -Nivel de Cifrado WEP [disponible de forma gratuita](https://cloud.yandex.com/docs/datalens/pricing) para proyectos de baja carga, incluso para uso comercial. - -- [Documentación de DataLens](https://cloud.yandex.com/docs/datalens/). -- [Tutorial](https://cloud.yandex.com/docs/solutions/datalens/data-from-ch-visualization) en la visualización de datos de una base de datos ClickHouse. - -### Software de Holística {#holistics-software} - -[Holística](https://www.holistics.io/) es una plataforma de datos de pila completa y una herramienta de inteligencia de negocios. - -Función: - -- Correo electrónico automatizado, Slack y horarios de informes de Google Sheet. -- Editor SQL con visualizaciones, control de versiones, autocompletado, componentes de consulta reutilizables y filtros dinámicos. -- Análisis integrado de informes y cuadros de mando a través de iframe. -- Preparación de datos y capacidades ETL. -- Soporte de modelado de datos SQL para mapeo relacional de datos. - -### Mirador {#looker} - -[Mirador](https://looker.com) Es una plataforma de datos y una herramienta de inteligencia de negocios con soporte para más de 50 dialectos de bases de datos, incluido ClickHouse. Bravo está disponible como una plataforma SaaS y auto-organizada. Los usuarios pueden utilizar Looker a través del navegador para explorar datos, crear visualizaciones y paneles, programar informes y compartir sus conocimientos con colegas. Looker proporciona un amplio conjunto de herramientas para incrustar estas características en otras aplicaciones y una API -para integrar datos con otras aplicaciones. - -Función: - -- Desarrollo fácil y ágil utilizando LookML, un lenguaje que soporta curado - [Modelado de datos](https://looker.com/platform/data-modeling) para apoyar a los redactores de informes y a los usuarios finales. -- Potente integración de flujo de trabajo a través de Looker's [Acciones de datos](https://looker.com/platform/actions). - -[Cómo configurar ClickHouse en Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/third-party/gui/) diff --git a/docs/es/interfaces/third-party/index.md b/docs/es/interfaces/third-party/index.md deleted file mode 100644 index adf50b05cdf..00000000000 --- a/docs/es/interfaces/third-party/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: tercero -toc_priority: 24 ---- - - diff --git a/docs/es/interfaces/third-party/integrations.md b/docs/es/interfaces/third-party/integrations.md deleted file mode 100644 index 7588bef0230..00000000000 --- a/docs/es/interfaces/third-party/integrations.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -toc_priority: 27 -toc_title: Integrations ---- - -# Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers} - -!!! 
warning "Disclaimer" - Yandex does **not** maintain the tools and libraries listed below and haven’t done any extensive testing to ensure their quality. - -## Infrastructure Products {#infrastructure-products} - -- Relational database management systems - - [MySQL](https://www.mysql.com) - - [mysql2ch](https://github.com/long2ice/mysql2ch) - - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) - - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) - - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) - - [PostgreSQL](https://www.postgresql.org) - - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) - - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pg2ch](https://github.com/mkabilov/pg2ch) - - [clickhouse_fdw](https://github.com/adjust/clickhouse_fdw) - - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) - - [ClickHouseMigrator](https://github.com/zlzforever/ClickHouseMigrator) -- Message queues - - [Kafka](https://kafka.apache.org) - - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/ClickHouse/clickhouse-go/)) - - [stream-loader-clickhouse](https://github.com/adform/stream-loader) -- Stream processing - - [Flink](https://flink.apache.org) - - [flink-clickhouse-sink](https://github.com/ivi-ru/flink-clickhouse-sink) -- Object storages - - [S3](https://en.wikipedia.org/wiki/Amazon_S3) - - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) -- Container orchestration - - [Kubernetes](https://kubernetes.io) - - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) -- Configuration management - - [puppet](https://puppet.com) - - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) - - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) -- Monitoring - - [Graphite](https://graphiteapp.org) - - [graphouse](https://github.com/yandex/graphouse) - - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) - - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../engines/table-engines/mergetree-family/graphitemergetree.md#rollup-configuration) could be applied - - [Grafana](https://grafana.com/) - - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) - - [Prometheus](https://prometheus.io/) - - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) - - [PromHouse](https://github.com/Percona-Lab/PromHouse) - - [clickhouse_exporter](https://github.com/hot-wifi/clickhouse_exporter) (uses [Go client](https://github.com/kshvakov/clickhouse/)) - - [Nagios](https://www.nagios.org/) - - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) - - [check_clickhouse.py](https://github.com/innogames/igmonplugins/blob/master/src/check_clickhouse.py) - - [Zabbix](https://www.zabbix.com) - - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) - - [Sematext](https://sematext.com/) - - [clickhouse integration](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) -- Logging - - 
[rsyslog](https://www.rsyslog.com/) - - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) - - [fluentd](https://www.fluentd.org) - - [loghouse](https://github.com/flant/loghouse) (for [Kubernetes](https://kubernetes.io)) - - [logagent](https://www.sematext.com/logagent) - - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) -- Geo - - [MaxMind](https://dev.maxmind.com/geoip/) - - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) - -## Programming Language Ecosystems {#programming-language-ecosystems} - -- Python - - [SQLAlchemy](https://www.sqlalchemy.org) - - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pandas](https://pandas.pydata.org) - - [pandahouse](https://github.com/kszucs/pandahouse) -- PHP - - [Doctrine](https://www.doctrine-project.org/) - - [dbal-clickhouse](https://packagist.org/packages/friendsofdoctrine/dbal-clickhouse) -- R - - [dplyr](https://db.rstudio.com/dplyr/) - - [RClickHouse](https://github.com/IMSMWU/RClickHouse) (uses [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) -- Java - - [Hadoop](http://hadoop.apache.org) - - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (uses [JDBC](../../sql-reference/table-functions/jdbc.md)) -- Scala - - [Akka](https://akka.io) - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- C# - - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) -- Elixir - - [Ecto](https://github.com/elixir-ecto/ecto) - - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) -- Ruby - - [Ruby on Rails](https://rubyonrails.org/) - - [activecube](https://github.com/bitquery/activecube) - - [ActiveRecord](https://github.com/PNixx/clickhouse-activerecord) - - [GraphQL](https://github.com/graphql) - - [activecube-graphql](https://github.com/bitquery/activecube-graphql) - -[Original article](https://clickhouse.tech/docs/en/interfaces/third-party/integrations/) diff --git a/docs/es/interfaces/third-party/proxy.md b/docs/es/interfaces/third-party/proxy.md deleted file mode 100644 index e1aabf8fce4..00000000000 --- a/docs/es/interfaces/third-party/proxy.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 29 -toc_title: Proxy ---- - -# Servidores proxy de desarrolladores de terceros {#proxy-servers-from-third-party-developers} - -## chproxy {#chproxy} - -[chproxy](https://github.com/Vertamedia/chproxy), es un proxy HTTP y equilibrador de carga para la base de datos ClickHouse. - -Función: - -- Enrutamiento por usuario y almacenamiento en caché de respuestas. -- Flexible límites. -- Renovación automática del certificado SSL. - -Implementado en Go. 
- -## KittenHouse {#kittenhouse} - -[KittenHouse](https://github.com/VKCOM/kittenhouse) está diseñado para ser un proxy local entre ClickHouse y el servidor de aplicaciones en caso de que sea imposible o inconveniente almacenar los datos INSERT en el lado de su aplicación. - -Función: - -- Almacenamiento en búfer de datos en memoria y en disco. -- Enrutamiento por tabla. -- Equilibrio de carga y comprobación de estado. - -Implementado en Go. - -## ClickHouse-Bulk {#clickhouse-bulk} - -[ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) es un simple recolector de inserciones para ClickHouse. - -Función: - -- Agrupe las solicitudes y envíe por umbral o intervalo. -- Múltiples servidores remotos. -- Autenticación básica. - -Implementado en Go. - -[Artículo Original](https://clickhouse.tech/docs/en/interfaces/third-party/proxy/) diff --git a/docs/es/introduction/adopters.md b/docs/es/introduction/adopters.md deleted file mode 100644 index 4c0aa78d57b..00000000000 --- a/docs/es/introduction/adopters.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 8 -toc_title: Adoptante ---- - -# Adoptadores de ClickHouse {#clickhouse-adopters} - -!!! warning "Descargo" - La siguiente lista de empresas que utilizan ClickHouse y sus historias de éxito se recopila a partir de fuentes públicas, por lo que podría diferir de la realidad actual. Le agradeceríamos que compartiera la historia de adoptar ClickHouse en su empresa y [agregarlo a la lista](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), pero por favor asegúrese de que usted no tendrá ningún problema de NDA al hacerlo. Proporcionar actualizaciones con publicaciones de otras compañías también es útil.
- -| Empresa | Industria | Usecase | Tamaño de clúster | (Un)Tamaño de datos comprimidos\* | Referencia | -|-------------------------------------------------------------------------------------------------|------------------------------------|-----------------------------|------------------------------------------------------------------|-------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 2gis | Asignar | Monitoreo | — | — | [Charla en ruso, julio 2019](https://youtu.be/58sPkXfq6nw) | -| Aloha Browser | Aplicación móvil | Backend del navegador | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/yandex/clickhouse-presentations/blob/master/meetup22/aloha.pdf) | -| Amadeus | Viaje | Analítica | — | — | [Comunicado de prensa, abril de 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | -| Appsflyer | Análisis móvil | Producto principal | — | — | [Charla en ruso, julio 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | -| ArenaData | Plataforma de datos | Producto principal | — | — | [Diapositivas en ruso, diciembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | -| Badoo | Citas | Serie de tiempo | — | — | [Diapositivas en ruso, diciembre 2019](https://presentations.clickhouse.tech/meetup38/forecast.pdf) | -| Benocs | Telemetría y análisis de red | Producto principal | — | — | [Diapositivas en español, octubre de 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | -| Bloomberg | Finanzas, Medios | Monitoreo | 102 servidores | — | [Diapositivas, Mayo 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | -| Bloxy | Blockchain | Analítica | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | -| Dataliance para China Telecom | Telecomunicaciones | Analítica | — | — | [Diapositivas en chino, enero 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | -| CARTO | Inteligencia de negocios | Análisis geográfico | — | — | [Procesamiento geoespacial con ClickHouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | -| CERN | Investigación | Experimento | — | — | [Comunicado de prensa, abril de 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | -| Cisco | Red | Análisis de tráfico | — | — | [Charla relámpago, octubre 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | -| Citadel Securities | Financiación | — | — | — | [Contribución, marzo 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | -| Más información | Taxi | Analítica | — | — | [Blog Post en ruso, marzo 2020](https://habr.com/en/company/citymobil/blog/490660/) | -| ContentSquare | Análisis web | Producto principal | — | — | [Publicación de blog en francés, noviembre 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | -| Cloudflare | CDN | Análisis de tráfico | 36 servidores | — | [Mensaje del blog, Mayo 
2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Mensaje del blog, marzo 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | -| Corunet | Analítica | Producto principal | — | — | [Diapositivas en español, Abril 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | -| CraiditX 氪信 | Finanzas AI | Análisis | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | -| Criteo | Menor | Producto principal | — | — | [Diapositivas en español, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | -| Deutsche Bank | Financiación | BI Analytics | — | — | [Diapositivas en español, octubre 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | -| Diva-e | Consultoría digital | Producto principal | — | — | [Diapositivas en español, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | -| Exness | Comercio | Métricas, Registro | — | — | [Charla en ruso, mayo 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | -| Sistema abierto. | Red Ad | Producto principal | — | — | [Publicación de blog en japonés, julio 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | -| HUYA | Video Streaming | Analítica | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | -| Idealista | Inmobiliario | Analítica | — | — | [Blog Post en Inglés, Abril 2019](https://clickhouse.tech/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | -| Infovista | Red | Analítica | — | — | [Diapositivas en español, octubre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | -| InnoGames | Juego | Métricas, Registro | — | — | [Diapositivas en ruso, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | -| Integros | Plataforma para servicios de video | Analítica | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| Datos de Kodiak | Nube | Producto principal | — | — | [Diapositivas en Engish, Abril 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | -| Kontur | Desarrollo de software | Métricas | — | — | [Charla en ruso, noviembre 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | -| Sistema abierto. 
| Red Ad | Producto principal | 75 servidores (3 réplicas) | 5.27 PiB | [Publicación de blog en ruso, febrero 2017](https://habr.com/en/post/322620/) | -| Soluciones en la nube de Mail.ru | Servicios en la nube | Producto principal | — | — | [Artículo en ruso](https://mcs.mail.ru/help/db-create/clickhouse#) | -| Mensaje de pájaro | Telecomunicaciones | Estadísticas | — | — | [Diapositivas en español, noviembre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | -| MGID | Red Ad | Analítica Web | — | — | [Publicación de blog en ruso, abril 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | -| UnoAPM | Supervisión y análisis de datos | Producto principal | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | -| Pragma Innovation | Telemetría y Análisis de Big Data | Producto principal | — | — | [Diapositivas en español, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | -| QINGCLOUD | Servicios en la nube | Producto principal | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | -| Qrator | Protección DDoS | Producto principal | — | — | [Blog Post, marzo 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | -| Percent 百分点 | Analítica | Producto principal | — | — | [Diapositivas en chino, junio 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | -| Rambler | Servicios de Internet | Analítica | — | — | [Charla en ruso, abril 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | -| Tencent | Mensajería | Tala | — | — | [Charla en chino, noviembre 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | -| Traffic Stars | Red AD | — | — | — | [Diapositivas en ruso, mayo 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | -| S7 Airlines | Aérea | Métricas, Registro | — | — | [Charla en ruso, marzo 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | -| SEMrush | Marketing | Producto principal | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | -| scireum GmbH | Comercio electrónico | Producto principal | — | — | [Charla en alemán, febrero de 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | -| Centinela | Desarrollador de software | Backend para el producto | — | — | [Publicación de blog en inglés, mayo 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | -| SGK | Gobierno Seguridad Social | Analítica | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | -| el seo.¿ | Analítica | Producto principal | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | -| Sina | Noticia | — | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | -| SMI2 | Noticia | Analítica | — | — | [Blog Post en ruso, 
noviembre 2017](https://habr.com/ru/company/smi2/blog/314558/) | -| Salto | Análisis de negocios | Producto principal | — | — | [Diapositivas en español, enero 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | -| Spotify | Sica | Experimentación | — | — | [Diapositivas, julio 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | -| Tencent | Grandes Datos | Procesamiento de datos | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | -| Más información | Taxi | Tala | — | — | [Diapositivas, febrero de 2020](https://presentations.clickhouse.tech/meetup40/uber.pdf) | -| VKontakte | Red social | Estadísticas, Registro | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | -| Método de codificación de datos: | Soluciones de TI | Analítica | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| Xiaoxin Tech | Educación | Propósito común | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | -| Ximalaya | Compartir audio | OLAP | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | -| Yandex Cloud | Nube pública | Producto principal | — | — | [Charla en ruso, diciembre 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | -| Yandex DataLens | Inteligencia de negocios | Producto principal | — | — | [Diapositivas en ruso, diciembre 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | -| Yandex Market | Comercio electrónico | Métricas, Registro | — | — | [Charla en ruso, enero 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| Yandex Metrica | Análisis web | Producto principal | 360 servidores en un clúster, 1862 servidores en un departamento | 66.41 PiB / 5.68 PiB | [Diapositivas, febrero de 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | -| ЦВТ | Desarrollo de software | Métricas, Registro | — | — | [Blog Post, marzo 2019, en ruso](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | -| МКБ | Banco | Supervisión del sistema web | — | — | [Diapositivas en ruso, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | -| Jinshuju 金数据 | BI Analytics | Producto principal | — | — | [Diapositivas en chino, octubre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | -| Instana | Plataforma APM | Producto principal | — | — | [Publicación de Twitter](https://twitter.com/mieldonkers/status/1248884119158882304) | -| Wargaming | Juego | | — | — | [Entrevista](https://habr.com/en/post/496954/) | -| Crazypanda | Juego | | — | — | Sesión en vivo en ClickHouse meetup | -| FunCorp | Juego | | — | — | [Artículo](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | - -[Artículo Original](https://clickhouse.tech/docs/en/introduction/adopters/) diff --git a/docs/es/introduction/distinctive-features.md b/docs/es/introduction/distinctive-features.md deleted file mode 100644 index 154b12a65e9..00000000000 --- a/docs/es/introduction/distinctive-features.md +++ /dev/null @@ -1,77 
+0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 4 -toc_title: "Caracter\xEDsticas distintivas" ---- - -# Características distintivas de ClickHouse {#distinctive-features-of-clickhouse} - -## DBMS orientado a columnas verdaderas {#true-column-oriented-dbms} - -En un verdadero DBMS orientado a columnas, no se almacenan datos adicionales con los valores. Entre otras cosas, esto significa que los valores de longitud constante deben ser compatibles, para evitar almacenar su longitud “number” al lado de los valores. Como ejemplo, mil millones de valores de tipo UInt8 deberían consumir alrededor de 1 GB sin comprimir, o esto afecta fuertemente el uso de la CPU. Es esencial almacenar los datos de forma compacta (sin “garbage”) incluso sin comprimir, ya que la velocidad de descompresión (uso de CPU) depende principalmente del volumen de datos sin comprimir. - -Vale la pena señalar porque hay sistemas que pueden almacenar valores de diferentes columnas por separado, pero que no pueden procesar efectivamente las consultas analíticas debido a su optimización para otros escenarios. Los ejemplos son HBase, BigTable, Cassandra e HyperTable. En estos sistemas, obtendría un rendimiento de alrededor de cien mil filas por segundo, pero no cientos de millones de filas por segundo. - -También vale la pena señalar que ClickHouse es un sistema de administración de bases de datos, no una sola base de datos. ClickHouse permite crear tablas y bases de datos en tiempo de ejecución, cargar datos y ejecutar consultas sin volver a configurar y reiniciar el servidor. - -## Compresión de datos {#data-compression} - -Algunos DBMS orientados a columnas (InfiniDB CE y MonetDB) no utilizan la compresión de datos. Sin embargo, la compresión de datos juega un papel clave para lograr un rendimiento excelente. - -## Almacenamiento en disco de datos {#disk-storage-of-data} - -Mantener los datos físicamente ordenados por clave principal permite extraer datos para sus valores específicos o rangos de valores con baja latencia, menos de unas pocas docenas de milisegundos. Algunos DBMS orientados a columnas (como SAP HANA y Google PowerDrill) solo pueden funcionar en RAM. Este enfoque fomenta la asignación de un presupuesto de hardware más grande que el necesario para el análisis en tiempo real. ClickHouse está diseñado para funcionar en discos duros normales, lo que significa que el costo por GB de almacenamiento de datos es bajo, pero SSD y RAM adicional también se utilizan completamente si están disponibles. - -## Procesamiento paralelo en varios núcleos {#parallel-processing-on-multiple-cores} - -Las consultas grandes se paralelizan naturalmente, tomando todos los recursos necesarios disponibles en el servidor actual. - -## Procesamiento distribuido en varios servidores {#distributed-processing-on-multiple-servers} - -Casi ninguno de los DBMS columnar mencionados anteriormente tiene soporte para el procesamiento de consultas distribuidas. -En ClickHouse, los datos pueden residir en diferentes fragmentos. Cada fragmento puede ser un grupo de réplicas utilizadas para la tolerancia a errores. Todos los fragmentos se utilizan para ejecutar una consulta en paralelo, de forma transparente para el usuario. - -## Soporte SQL {#sql-support} - -ClickHouse admite un lenguaje de consulta declarativo basado en SQL que es idéntico al estándar SQL en muchos casos. 
-Las consultas admitidas incluyen GROUP BY, ORDER BY, subconsultas en cláusulas FROM, IN y JOIN y subconsultas escalares. -No se admiten subconsultas y funciones de ventana dependientes. - -## Motor del vector {#vector-engine} - -Los datos no solo se almacenan mediante columnas, sino que se procesan mediante vectores (partes de columnas), lo que permite lograr una alta eficiencia de CPU. - -## Actualizaciones de datos en tiempo real {#real-time-data-updates} - -ClickHouse admite tablas con una clave principal. Para realizar consultas rápidamente en el rango de la clave principal, los datos se ordenan de forma incremental utilizando el árbol de combinación. Debido a esto, los datos se pueden agregar continuamente a la tabla. No se toman bloqueos cuando se ingieren nuevos datos. - -## Indice {#index} - -Tener un dato ordenado físicamente por clave principal permite extraer datos para sus valores específicos o rangos de valores con baja latencia, menos de unas pocas docenas de milisegundos. - -## Adecuado para consultas en línea {#suitable-for-online-queries} - -La baja latencia significa que las consultas se pueden procesar sin demora y sin intentar preparar una respuesta por adelantado, justo en el mismo momento mientras se carga la página de la interfaz de usuario. En otras palabras, en línea. - -## Soporte para cálculos aproximados {#support-for-approximated-calculations} - -ClickHouse proporciona varias formas de intercambiar precisión por rendimiento: - -1. Funciones agregadas para el cálculo aproximado del número de valores distintos, medianas y cuantiles. -2. Ejecutar una consulta basada en una parte (muestra) de datos y obtener un resultado aproximado. En este caso, se recuperan proporcionalmente menos datos del disco. -3. Ejecutar una agregación para un número limitado de claves aleatorias, en lugar de para todas las claves. Bajo ciertas condiciones para la distribución de claves en los datos, esto proporciona un resultado razonablemente preciso mientras se utilizan menos recursos. - -## Replicación de datos e integridad de datos {#data-replication-and-data-integrity-support} - -ClickHouse utiliza la replicación multi-maestro asincrónica. Después de escribir en cualquier réplica disponible, todas las réplicas restantes recuperan su copia en segundo plano. El sistema mantiene datos idénticos en diferentes réplicas. La recuperación después de la mayoría de las fallas se realiza automáticamente, o semiautomáticamente en casos complejos. - -Para obtener más información, consulte la sección [Replicación de datos](../engines/table-engines/mergetree-family/replication.md). - -## Características que pueden considerarse desventajas {#clickhouse-features-that-can-be-considered-disadvantages} - -1. No hay transacciones completas. -2. Falta de capacidad para modificar o eliminar datos ya insertados con alta tasa y baja latencia. Hay eliminaciones y actualizaciones por lotes disponibles para limpiar o modificar datos, por ejemplo, para cumplir con [GDPR](https://gdpr-info.eu). -3. El índice disperso hace que ClickHouse no sea tan adecuado para consultas de puntos que recuperan filas individuales por sus claves. 
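The batch deletes and updates mentioned in point 2 above are issued as asynchronous mutations. A minimal sketch, assuming a hypothetical `analytics.events` table with `user_id` and `email` columns:

``` sql
-- Mutations run asynchronously in the background; they are intended for
-- occasional clean-up (e.g. GDPR-style erasure), not for frequent point updates.
ALTER TABLE analytics.events DELETE WHERE user_id = 42;
ALTER TABLE analytics.events UPDATE email = '' WHERE user_id = 42;
```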
- -[Artículo Original](https://clickhouse.tech/docs/en/introduction/distinctive_features/) diff --git a/docs/es/introduction/history.md b/docs/es/introduction/history.md deleted file mode 100644 index 7311fa01959..00000000000 --- a/docs/es/introduction/history.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 7 -toc_title: Historia ---- - -# Historial de ClickHouse {#clickhouse-history} - -ClickHouse se ha desarrollado inicialmente para alimentar [El Yandex.Métrica](https://metrica.yandex.com/), [la segunda plataforma de análisis web más grande del mundo](http://w3techs.com/technologies/overview/traffic_analysis/all), y sigue siendo el componente central de este sistema. Con más de 13 billones de registros en la base de datos y más de 20 mil millones de eventos diarios, ClickHouse permite generar informes personalizados sobre la marcha directamente a partir de datos no agregados. Este artículo cubre brevemente los objetivos de ClickHouse en las primeras etapas de su desarrollo. - -El Yandex.Metrica construye informes personalizados sobre la marcha basados en hits y sesiones, con segmentos arbitrarios definidos por el usuario. Hacerlo a menudo requiere construir agregados complejos, como el número de usuarios únicos. Los nuevos datos para crear un informe llegan en tiempo real. - -A partir de abril de 2014, Yandex.Metrica estaba rastreando alrededor de 12 mil millones de eventos (vistas de páginas y clics) diariamente. Todos estos eventos deben almacenarse para crear informes personalizados. Una sola consulta puede requerir escanear millones de filas en unos pocos cientos de milisegundos, o cientos de millones de filas en solo unos segundos. - -## Uso en Yandex.Metrica y otros servicios de Yandex {#usage-in-yandex-metrica-and-other-yandex-services} - -ClickHouse sirve para múltiples propósitos en Yandex.Métrica. -Su tarea principal es crear informes en modo en línea utilizando datos no agregados. Utiliza un clúster de 374 servidores, que almacenan más de 20,3 billones de filas en la base de datos. El volumen de datos comprimidos es de aproximadamente 2 PB, sin tener en cuenta duplicados y réplicas. El volumen de datos sin comprimir (en formato TSV) sería de aproximadamente 17 PB. - -ClickHouse también juega un papel clave en los siguientes procesos: - -- Almacenamiento de datos para Session Replay de Yandex.Métrica. -- Procesamiento de datos intermedios. -- Creación de informes globales con Analytics. -- Ejecutar consultas para depurar el Yandex.Motor Metrica. -- Análisis de registros desde la API y la interfaz de usuario. - -Hoy en día, hay varias docenas de instalaciones de ClickHouse en otros servicios y departamentos de Yandex: verticales de búsqueda, comercio electrónico, publicidad, análisis de negocios, desarrollo móvil, servicios personales y otros. - -## Datos agregados y no agregados {#aggregated-and-non-aggregated-data} - -Existe una opinión generalizada de que para calcular las estadísticas de manera efectiva, debe agregar datos ya que esto reduce el volumen de datos. - -Pero la agregación de datos viene con muchas limitaciones: - -- Debe tener una lista predefinida de los informes necesarios. -- El usuario no puede hacer informes personalizados. -- Al agregar sobre un gran número de claves distintas, el volumen de datos apenas se reduce, por lo que la agregación es inútil. -- Para un gran número de informes, hay demasiadas variaciones de agregación (explosión combinatoria). 
-- Al agregar claves con alta cardinalidad (como las URL), el volumen de datos no se reduce en mucho (menos del doble). -- Por esta razón, el volumen de datos con agregación podría crecer en lugar de reducirse. -- Los usuarios no ven todos los informes que generamos para ellos. Una gran parte de esos cálculos es inútil. -- La integridad lógica de los datos puede ser violada para varias agregaciones. - -Si no agregamos nada y trabajamos con datos no agregados, esto podría reducir el volumen de cálculos. - -Sin embargo, con la agregación, una parte significativa del trabajo se desconecta y se completa con relativa calma. Por el contrario, los cálculos en línea requieren calcular lo más rápido posible, ya que el usuario está esperando el resultado. - -El Yandex.Metrica tiene un sistema especializado para agregar datos llamado Metrage, que se utilizó para la mayoría de los informes. -A partir de 2009, Yandex.Metrica también utilizó una base de datos OLAP especializada para datos no agregados llamada OLAPServer, que anteriormente se usaba para el generador de informes. -OLAPServer funcionó bien para datos no agregados, pero tenía muchas restricciones que no permitían que se utilizara para todos los informes según lo deseado. Estos incluyeron la falta de soporte para tipos de datos (solo números) y la incapacidad de actualizar datos de forma incremental en tiempo real (solo se podía hacer reescribiendo datos diariamente). OLAPServer no es un DBMS, sino una base de datos especializada. - -El objetivo inicial de ClickHouse era eliminar las limitaciones de OLAPServer y resolver el problema de trabajar con datos no agregados para todos los informes, pero a lo largo de los años, se ha convertido en un sistema de gestión de bases de datos de propósito general adecuado para una amplia gama de tareas analíticas. - -[Artículo Original](https://clickhouse.tech/docs/en/introduction/history/) diff --git a/docs/es/introduction/index.md b/docs/es/introduction/index.md deleted file mode 100644 index 7026dc800e4..00000000000 --- a/docs/es/introduction/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Implantaci\xF3n" -toc_priority: 1 ---- - - diff --git a/docs/es/introduction/performance.md b/docs/es/introduction/performance.md deleted file mode 100644 index 01640439128..00000000000 --- a/docs/es/introduction/performance.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 6 -toc_title: Rendimiento ---- - -# Rendimiento {#performance} - -De acuerdo con los resultados de las pruebas internas en Yandex, ClickHouse muestra el mejor rendimiento (tanto el mayor rendimiento para consultas largas como la menor latencia en consultas cortas) para escenarios operativos comparables entre los sistemas de su clase que estaban disponibles para pruebas. Puede ver los resultados de la prueba en un [página separada](https://clickhouse.tech/benchmark/dbms/). - -Numerosos puntos de referencia independientes llegaron a conclusiones similares. No son difíciles de encontrar mediante una búsqueda en Internet, o se puede ver [nuestra pequeña colección de enlaces relacionados](https://clickhouse.tech/#independent-benchmarks). - -## Rendimiento para una única consulta grande {#throughput-for-a-single-large-query} - -El rendimiento se puede medir en filas por segundo o megabytes por segundo. 
Si los datos se colocan en la caché de la página, una consulta que no es demasiado compleja se procesa en hardware moderno a una velocidad de aproximadamente 2-10 GB / s de datos sin comprimir en un solo servidor (para los casos más sencillos, la velocidad puede alcanzar 30 GB / s). Si los datos no se colocan en la memoria caché de la página, la velocidad depende del subsistema de disco y la velocidad de compresión de datos. Por ejemplo, si el subsistema de disco permite leer datos a 400 MB/s y la tasa de compresión de datos es 3, se espera que la velocidad sea de alrededor de 1,2 GB/s. Para obtener la velocidad en filas por segundo, divida la velocidad en bytes por segundo por el tamaño total de las columnas utilizadas en la consulta. Por ejemplo, si se extraen 10 bytes de columnas, se espera que la velocidad sea de alrededor de 100-200 millones de filas por segundo. - -La velocidad de procesamiento aumenta casi linealmente para el procesamiento distribuido, pero solo si el número de filas resultantes de la agregación o la clasificación no es demasiado grande. - -## Latencia al procesar consultas cortas {#latency-when-processing-short-queries} - -Si una consulta usa una clave principal y no selecciona demasiadas columnas y filas para procesar (cientos de miles), puede esperar menos de 50 milisegundos de latencia (dígitos individuales de milisegundos en el mejor de los casos) si los datos se colocan en la memoria caché de la página. De lo contrario, la latencia está dominada principalmente por el número de búsquedas. Si utiliza unidades de disco giratorias, para un sistema que no está sobrecargado, la latencia se puede estimar con esta fórmula: `seek time (10 ms) * count of columns queried * count of data parts`. - -## Rendimiento al procesar una gran cantidad de consultas cortas {#throughput-when-processing-a-large-quantity-of-short-queries} - -En las mismas condiciones, ClickHouse puede manejar varios cientos de consultas por segundo en un solo servidor (hasta varios miles en el mejor de los casos). Dado que este escenario no es típico para DBMS analíticos, se recomienda esperar un máximo de 100 consultas por segundo. - -## Rendimiento al insertar datos {#performance-when-inserting-data} - -Recomendamos insertar datos en paquetes de al menos 1000 filas o no más de una sola solicitud por segundo. Al insertar en una tabla MergeTree desde un volcado separado por tabuladores, la velocidad de inserción puede ser de 50 a 200 MB/s. Si las filas insertadas tienen alrededor de 1 Kb de tamaño, la velocidad será de 50,000 a 200,000 filas por segundo. Si las filas son pequeñas, el rendimiento puede ser mayor en filas por segundo (en los datos del sistema Banner -`>` 500.000 filas por segundo; en datos de grafito -`>` 1.000.000 de filas por segundo). Para mejorar el rendimiento, puede realizar varias consultas INSERT en paralelo, que se escala linealmente. 
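As a back-of-the-envelope check of the seek-time formula given in the short-query latency section above, assuming hypothetical values of 5 queried columns and 4 data parts on a spinning disk:

``` sql
-- seek time (10 ms) * count of columns queried * count of data parts
SELECT 0.010 * 5 * 4 AS estimated_latency_seconds;  -- ≈ 0.2 s
```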
- -[Artículo Original](https://clickhouse.tech/docs/en/introduction/performance/) diff --git a/docs/es/operations/access-rights.md b/docs/es/operations/access-rights.md deleted file mode 100644 index 6c777d9f081..00000000000 --- a/docs/es/operations/access-rights.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 48 -toc_title: "Control de acceso y gesti\xF3n de cuentas" ---- - -# Control de acceso y gestión de cuentas {#access-control} - -ClickHouse admite la administración de control de acceso basada en [RBAC](https://en.wikipedia.org/wiki/Role-based_access_control) enfoque. - -Entidades de acceso de ClickHouse: -- [Cuenta de usuario](#user-account-management) -- [Rol](#role-management) -- [Política de fila](#row-policy-management) -- [Perfil de configuración](#settings-profiles-management) -- [Cuota](#quotas-management) - -Puede configurar entidades de acceso utilizando: - -- Flujo de trabajo controlado por SQL. - - Es necesario [permitir](#enabling-access-control) esta funcionalidad. - -- Servidor [archivos de configuración](configuration-files.md) `users.xml` y `config.xml`. - -Se recomienda utilizar el flujo de trabajo controlado por SQL. Ambos métodos de configuración funcionan simultáneamente, por lo que si utiliza los archivos de configuración del servidor para administrar cuentas y derechos de acceso, puede pasar suavemente al flujo de trabajo controlado por SQL. - -!!! note "Advertencia" - No puede administrar la misma entidad de acceso mediante ambos métodos de configuración simultáneamente. - -## Uso {#access-control-usage} - -De forma predeterminada, el servidor ClickHouse proporciona la cuenta de usuario `default` que no está permitido usar control de acceso controlado por SQL y administración de cuentas, pero tiene todos los derechos y permisos. El `default` cuenta de usuario se utiliza en cualquier caso cuando el nombre de usuario no está definido, por ejemplo, al iniciar sesión desde el cliente o en consultas distribuidas. En el procesamiento de consultas distribuidas se utiliza una cuenta de usuario predeterminada, si la configuración del servidor o clúster no [usuario y contraseña](../engines/table-engines/special/distributed.md) propiedad. - -Si acaba de comenzar a usar ClickHouse, puede usar el siguiente escenario: - -1. [Permitir](#enabling-access-control) Control de acceso basado en SQL y gestión de cuentas `default` usuario. -2. Inicie sesión bajo el `default` cuenta de usuario y crear todos los usuarios. No olvides crear una cuenta de administrador (`GRANT ALL ON *.* WITH GRANT OPTION TO admin_user_account`). -3. [Restringir permisos](settings/permissions-for-queries.md#permissions_for_queries) para el `default` usuario y deshabilitar el control de acceso impulsado por SQL y la administración de cuentas para ello. - -### Propiedades de la solución actual {#access-control-properties} - -- Puede conceder permisos para bases de datos y tablas incluso si no existen. -- Si se eliminó una tabla, no se revocarán todos los privilegios que corresponden a esta tabla. Por lo tanto, si se crea una nueva tabla más tarde con el mismo nombre, todos los privilegios vuelven a ser reales. Para revocar los privilegios correspondientes a la tabla eliminada, debe realizar, por ejemplo, el `REVOKE ALL PRIVILEGES ON db.table FROM ALL` consulta. -- No hay ninguna configuración de por vida para los privilegios. 
- -## Cuenta de usuario {#user-account-management} - -Una cuenta de usuario es una entidad de acceso que permite autorizar a alguien en ClickHouse. Una cuenta de usuario contiene: - -- Información de identificación. -- [Privilegio](../sql-reference/statements/grant.md#grant-privileges) que definen un ámbito de consultas que el usuario puede realizar. -- Hosts desde los que se permite la conexión al servidor ClickHouse. -- Roles otorgados y predeterminados. -- Configuración con sus restricciones que se aplican de forma predeterminada en el inicio de sesión del usuario. -- Perfiles de configuración asignados. - -Los privilegios a una cuenta de usuario pueden ser otorgados por el [GRANT](../sql-reference/statements/grant.md) consulta o asignando [rol](#role-management). Para revocar privilegios de un usuario, ClickHouse proporciona el [REVOKE](../sql-reference/statements/revoke.md) consulta. Para listar los privilegios de un usuario, utilice - [SHOW GRANTS](../sql-reference/statements/show.md#show-grants-statement) instrucción. - -Consultas de gestión: - -- [CREATE USER](../sql-reference/statements/create.md#create-user-statement) -- [ALTER USER](../sql-reference/statements/alter.md#alter-user-statement) -- [DROP USER](../sql-reference/statements/misc.md#drop-user-statement) -- [SHOW CREATE USER](../sql-reference/statements/show.md#show-create-user-statement) - -### Ajustes Aplicación {#access-control-settings-applying} - -Los ajustes se pueden establecer de diferentes maneras: para una cuenta de usuario, en sus roles y perfiles de configuración concedidos. En un inicio de sesión de usuario, si se establece una configuración en diferentes entidades de acceso, el valor y las restricciones de esta configuración se aplican mediante las siguientes prioridades (de mayor a menor): - -1. Configuración de la cuenta de usuario. -2. La configuración de los roles predeterminados de la cuenta de usuario. Si se establece una configuración en algunos roles, el orden de la configuración que se aplica no está definido. -3. La configuración de los perfiles de configuración asignados a un usuario o a sus roles predeterminados. Si se establece una configuración en algunos perfiles, el orden de aplicación de la configuración no está definido. -4. Ajustes aplicados a todo el servidor de forma predeterminada o desde el [perfil predeterminado](server-configuration-parameters/settings.md#default-profile). - -## Rol {#role-management} - -Role es un contenedor para las entidades de acceso que se pueden conceder a una cuenta de usuario. - -El rol contiene: - -- [Privilegio](../sql-reference/statements/grant.md#grant-privileges) -- Configuración y restricciones -- Lista de funciones concedidas - -Consultas de gestión: - -- [CREATE ROLE](../sql-reference/statements/create.md#create-role-statement) -- [ALTER ROLE](../sql-reference/statements/alter.md#alter-role-statement) -- [DROP ROLE](../sql-reference/statements/misc.md#drop-role-statement) -- [SET ROLE](../sql-reference/statements/misc.md#set-role-statement) -- [SET DEFAULT ROLE](../sql-reference/statements/misc.md#set-default-role-statement) -- [SHOW CREATE ROLE](../sql-reference/statements/show.md#show-create-role-statement) - -Los privilegios a un rol pueden ser otorgados por el [GRANT](../sql-reference/statements/grant.md) consulta. Para revocar privilegios de un rol, ClickHouse proporciona el [REVOKE](../sql-reference/statements/revoke.md) consulta. 
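A minimal sketch of the user/role workflow described above, using hypothetical names (`analyst_user`, the role `readonly_analytics`, and a database `analytics`):

``` sql
CREATE USER analyst_user IDENTIFIED WITH sha256_password BY 'secret';
CREATE ROLE readonly_analytics;
GRANT SELECT ON analytics.* TO readonly_analytics;    -- privileges are granted to the role
GRANT readonly_analytics TO analyst_user;             -- the role is granted to the user
SET DEFAULT ROLE readonly_analytics TO analyst_user;
SHOW GRANTS FOR analyst_user;
```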
- -## Política de fila {#row-policy-management} - -La directiva de filas es un filtro que define qué filas está disponible para un usuario o para un rol. La directiva de filas contiene filtros para una tabla específica y una lista de roles y/o usuarios que deben usar esta directiva de filas. - -Consultas de gestión: - -- [CREATE ROW POLICY](../sql-reference/statements/create.md#create-row-policy-statement) -- [ALTER ROW POLICY](../sql-reference/statements/alter.md#alter-row-policy-statement) -- [DROP ROW POLICY](../sql-reference/statements/misc.md#drop-row-policy-statement) -- [SHOW CREATE ROW POLICY](../sql-reference/statements/show.md#show-create-row-policy-statement) - -## Perfil de configuración {#settings-profiles-management} - -El perfil de configuración es una colección de [configuración](settings/index.md). El perfil de configuración contiene configuraciones y restricciones, y una lista de roles y/o usuarios a los que se aplica esta cuota. - -Consultas de gestión: - -- [CREATE SETTINGS PROFILE](../sql-reference/statements/create.md#create-settings-profile-statement) -- [ALTER SETTINGS PROFILE](../sql-reference/statements/alter.md#alter-settings-profile-statement) -- [DROP SETTINGS PROFILE](../sql-reference/statements/misc.md#drop-settings-profile-statement) -- [SHOW CREATE SETTINGS PROFILE](../sql-reference/statements/show.md#show-create-settings-profile-statement) - -## Cuota {#quotas-management} - -La cuota limita el uso de recursos. Ver [Cuota](quotas.md). - -La cuota contiene un conjunto de límites para algunas duraciones y una lista de roles y / o usuarios que deben usar esta cuota. - -Consultas de gestión: - -- [CREATE QUOTA](../sql-reference/statements/create.md#create-quota-statement) -- [ALTER QUOTA](../sql-reference/statements/alter.md#alter-quota-statement) -- [DROP QUOTA](../sql-reference/statements/misc.md#drop-quota-statement) -- [SHOW CREATE QUOTA](../sql-reference/statements/show.md#show-create-quota-statement) - -## Habilitación del control de acceso basado en SQL y la administración de cuentas {#enabling-access-control} - -- Configure un directorio para el almacenamiento de configuraciones. - - ClickHouse almacena las configuraciones de entidades de acceso en la carpeta [access_control_path](server-configuration-parameters/settings.md#access_control_path) parámetro de configuración del servidor. - -- Habilite el control de acceso controlado por SQL y la administración de cuentas para al menos una cuenta de usuario. - - De forma predeterminada, el control de acceso controlado por SQL y la administración de cuentas se activan para todos los usuarios. Debe configurar al menos un usuario en el `users.xml` archivo de configuración y asigne 1 al [access_management](settings/settings-users.md#access_management-user-setting) configuración. 
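A minimal sketch of the row-policy, settings-profile and quota statements listed above, again with hypothetical names and limits:

``` sql
-- Only rows with region = 'EU' become visible to the role
CREATE ROW POLICY eu_only ON analytics.events FOR SELECT USING region = 'EU' TO readonly_analytics;
-- Cap memory usage for the role via a settings profile
CREATE SETTINGS PROFILE low_memory SETTINGS max_memory_usage = 10000000000 TO readonly_analytics;
-- Limit the role to 1000 queries per hour
CREATE QUOTA hourly_limit FOR INTERVAL 1 hour MAX queries = 1000 TO readonly_analytics;
```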
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/access_rights/) diff --git a/docs/es/operations/backup.md b/docs/es/operations/backup.md deleted file mode 100644 index be33851574a..00000000000 --- a/docs/es/operations/backup.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -toc_priority: 49 -toc_title: Copia de seguridad de datos ---- - -# Copia de seguridad de datos {#data-backup} - -Mientras que la [replicación](../engines/table-engines/mergetree-family/replication.md) proporciona protección contra fallos de hardware, no protege de errores humanos: el borrado accidental de datos, elminar la tabla equivocada o una tabla en el clúster equivocado, y bugs de software que dan como resultado un procesado incorrecto de los datos o la corrupción de los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse dispone de salvaguardas para prevenir algunos tipos de errores — por ejemplo, por defecto [no se puede simplemente eliminar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](server-configuration-parameters/settings.md#max-table-size-to-drop). Sin embargo, estas salvaguardas no cubren todos los casos posibles y pueden eludirse. - -Para mitigar eficazmente los posibles errores humanos, debe preparar cuidadosamente una estrategia para realizar copias de seguridad y restaurar sus datos **previamente**. - -Cada empresa tiene diferentes recursos disponibles y requisitos comerciales, por lo que no existe una solución universal para las copias de seguridad y restauraciones de ClickHouse que se adapten a cada situación. Lo que funciona para un gigabyte de datos probablemente no funcionará para decenas de petabytes. Hay una variedad de posibles enfoques con sus propios pros y contras, que se discutirán a continuación. Es una buena idea utilizar varios enfoques en lugar de uno solo para compensar sus diversas deficiencias. - -!!! note "Nota" - Tenga en cuenta que si realizó una copia de seguridad de algo y nunca intentó restaurarlo, es probable que la restauración no funcione correctamente cuando realmente la necesite (o al menos tomará más tiempo de lo que las empresas pueden tolerar). Por lo tanto, cualquiera que sea el enfoque de copia de seguridad que elija, asegúrese de automatizar el proceso de restauración también y ponerlo en practica en un clúster de ClickHouse de repuesto regularmente. - -## Duplicar datos de origen en otro lugar {#duplicating-source-data-somewhere-else} - -A menudo, los datos que se ingieren en ClickHouse se entregan a través de algún tipo de cola persistente, como [Acerca de nosotros](https://kafka.apache.org). En este caso, es posible configurar un conjunto adicional de suscriptores que leerá el mismo flujo de datos mientras se escribe en ClickHouse y lo almacenará en almacenamiento en frío en algún lugar. La mayoría de las empresas ya tienen algún almacenamiento en frío recomendado por defecto, que podría ser un almacén de objetos o un sistema de archivos distribuido como [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). - -## Instantáneas del sistema de archivos {#filesystem-snapshots} - -Algunos sistemas de archivos locales proporcionan funcionalidad de instantánea (por ejemplo, [ZFS](https://en.wikipedia.org/wiki/ZFS)), pero podrían no ser la mejor opción para servir consultas en vivo. 
Una posible solución es crear réplicas adicionales con este tipo de sistema de archivos y excluirlas del [Distribuido](../engines/table-engines/special/distributed.md) tablas que se utilizan para `SELECT` consulta. Las instantáneas en tales réplicas estarán fuera del alcance de cualquier consulta que modifique los datos. Como beneficio adicional, estas réplicas podrían tener configuraciones de hardware especiales con más discos conectados por servidor, lo que sería rentable. - -## Método de codificación de datos: {#clickhouse-copier} - -[Método de codificación de datos:](utilities/clickhouse-copier.md) es una herramienta versátil que se creó inicialmente para volver a dividir tablas de tamaño petabyte. También se puede usar con fines de copia de seguridad y restauración porque copia datos de forma fiable entre tablas y clústeres de ClickHouse. - -Para volúmenes de datos más pequeños, un simple `INSERT INTO ... SELECT ...` a tablas remotas podría funcionar también. - -## Manipulaciones con piezas {#manipulations-with-parts} - -ClickHouse permite usar la consulta `ALTER TABLE ... FREEZE PARTITION ...` para crear una copia local de particiones de tabla. Esto se implementa utilizando enlaces duros a la carpeta `/var/lib/clickhouse/shadow/`, por lo que generalmente no consume espacio adicional en disco para datos antiguos. Las copias creadas de archivos no son manejadas por el servidor ClickHouse, por lo que puede dejarlas allí: tendrá una copia de seguridad simple que no requiere ningún sistema externo adicional, pero seguirá siendo propenso a problemas de hardware. Por esta razón, es mejor copiarlos de forma remota en otra ubicación y luego eliminar las copias locales. Los sistemas de archivos distribuidos y los almacenes de objetos siguen siendo una buena opción para esto, pero los servidores de archivos conectados normales con una capacidad lo suficientemente grande podrían funcionar también (en este caso, la transferencia ocurrirá a través del sistema de archivos de red o tal vez [rsync](https://en.wikipedia.org/wiki/Rsync)). - -Para obtener más información sobre las consultas relacionadas con las manipulaciones de particiones, consulte [Documentación de ALTER](../sql-reference/statements/alter.md#alter_manipulations-with-partitions). - -Una herramienta de terceros está disponible para automatizar este enfoque: [Haga clic en el botón de copia de seguridad](https://github.com/AlexAkulov/clickhouse-backup). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/backup/) diff --git a/docs/es/operations/configuration-files.md b/docs/es/operations/configuration-files.md deleted file mode 100644 index d9aa8567868..00000000000 --- a/docs/es/operations/configuration-files.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 50 -toc_title: "Archivos de configuraci\xF3n" ---- - -# Archivos de configuración {#configuration_files} - -ClickHouse admite la administración de configuración de varios archivos. El archivo de configuración del servidor principal es `/etc/clickhouse-server/config.xml`. Otros archivos deben estar en el `/etc/clickhouse-server/config.d` directorio. - -!!! note "Nota" - Todos los archivos de configuración deben estar en formato XML. Además, deben tener el mismo elemento raíz, generalmente ``. - -Algunos valores especificados en el archivo de configuración principal se pueden anular en otros archivos de configuración. 
El `replace` o `remove` se pueden especificar atributos para los elementos de estos archivos de configuración. - -Si no se especifica ninguno, combina el contenido de los elementos de forma recursiva, reemplazando los valores de los elementos secundarios duplicados. - -Si `replace` se especifica, reemplaza todo el elemento por el especificado. - -Si `remove` se especifica, elimina el elemento. - -La configuración también puede definir “substitutions”. Si un elemento tiene el `incl` atributo, la sustitución correspondiente del archivo se utilizará como el valor. De forma predeterminada, la ruta al archivo con sustituciones es `/etc/metrika.xml`. Esto se puede cambiar en el [include_from](server-configuration-parameters/settings.md#server_configuration_parameters-include_from) elemento en la configuración del servidor. Los valores de sustitución se especifican en `/yandex/substitution_name` elementos en este archivo. Si una sustitución especificada en `incl` no existe, se registra en el registro. Para evitar que ClickHouse registre las sustituciones que faltan, especifique `optional="true"` atributo (por ejemplo, ajustes para [macro](server-configuration-parameters/settings.md)). - -Las sustituciones también se pueden realizar desde ZooKeeper. Para hacer esto, especifique el atributo `from_zk = "/path/to/node"`. El valor del elemento se sustituye por el contenido del nodo en `/path/to/node` en ZooKeeper. También puede colocar un subárbol XML completo en el nodo ZooKeeper y se insertará completamente en el elemento de origen. - -El `config.xml` file puede especificar una configuración separada con configuraciones de usuario, perfiles y cuotas. La ruta relativa a esta configuración se establece en el `users_config` elemento. Por defecto, es `users.xml`. Si `users_config` se omite, la configuración de usuario, los perfiles y las cuotas se especifican directamente en `config.xml`. - -La configuración de los usuarios se puede dividir en archivos separados similares a `config.xml` y `config.d/`. -El nombre del directorio se define como `users_config` sin `.xml` postfix concatenado con `.d`. -Directorio `users.d` se utiliza por defecto, como `users_config` por defecto `users.xml`. -Por ejemplo, puede tener un archivo de configuración separado para cada usuario como este: - -``` bash -$ cat /etc/clickhouse-server/users.d/alice.xml -``` - -``` xml - - - - analytics - - ::/0 - - ... - analytics - - - -``` - -Para cada archivo de configuración, el servidor también genera `file-preprocessed.xml` archivos al iniciar. Estos archivos contienen todas las sustituciones y anulaciones completadas, y están destinados para uso informativo. Si se utilizaron sustituciones de ZooKeeper en los archivos de configuración pero ZooKeeper no está disponible en el inicio del servidor, el servidor carga la configuración desde el archivo preprocesado. - -El servidor realiza un seguimiento de los cambios en los archivos de configuración, así como archivos y nodos ZooKeeper que se utilizaron al realizar sustituciones y anulaciones, y vuelve a cargar la configuración de los usuarios y clústeres sobre la marcha. Esto significa que puede modificar el clúster, los usuarios y su configuración sin reiniciar el servidor. 
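In addition to the automatic reload described above, recent ClickHouse versions can be asked to re-read the configuration explicitly; a minimal sketch (the `max_memory_usage` lookup is only an illustrative check):

``` sql
SYSTEM RELOAD CONFIG;
-- Confirm that a changed setting is now visible
SELECT name, value FROM system.settings WHERE name = 'max_memory_usage';
```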
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/configuration_files/) diff --git a/docs/es/operations/index.md b/docs/es/operations/index.md deleted file mode 100644 index 9a928fa0f01..00000000000 --- a/docs/es/operations/index.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Operaci\xF3n" -toc_priority: 41 -toc_title: "Implantaci\xF3n" ---- - -# Operación {#operations} - -El manual de operaciones de ClickHouse consta de las siguientes secciones principales: - -- [Requisito](requirements.md) -- [Monitoreo](monitoring.md) -- [Solución de problemas](troubleshooting.md) -- [Recomendaciones de uso](tips.md) -- [Procedimiento de actualización](update.md) -- [Derechos de acceso](access-rights.md) -- [Copia de seguridad de datos](backup.md) -- [Archivos de configuración](configuration-files.md) -- [Cuota](quotas.md) -- [Tablas del sistema](system-tables.md) -- [Parámetros de configuración del servidor](server-configuration-parameters/index.md) -- [Cómo probar su hardware con ClickHouse](performance-test.md) -- [Configuración](settings/index.md) -- [Utilidad](utilities/index.md) - -{## [Artículo Original](https://clickhouse.tech/docs/en/operations/) ##} diff --git a/docs/es/operations/monitoring.md b/docs/es/operations/monitoring.md deleted file mode 100644 index 19912d23f3b..00000000000 --- a/docs/es/operations/monitoring.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: Monitoreo ---- - -# Monitoreo {#monitoring} - -Usted puede monitorear: - -- Utilización de recursos de hardware. -- Métricas del servidor ClickHouse. - -## Utilización de recursos {#resource-utilization} - -ClickHouse no supervisa el estado de los recursos de hardware por sí mismo. - -Se recomienda encarecidamente configurar la supervisión para: - -- Carga y temperatura en los procesadores. - - Usted puede utilizar [dmesg](https://en.wikipedia.org/wiki/Dmesg), [Turbostat](https://www.linux.org/docs/man8/turbostat.html) u otros instrumentos. - -- Utilización del sistema de almacenamiento, RAM y red. - -## Métricas del servidor ClickHouse {#clickhouse-server-metrics} - -El servidor ClickHouse tiene instrumentos integrados para el monitoreo de estado propio. - -Para realizar un seguimiento de los eventos del servidor, use los registros del servidor. Ver el [registrador](server-configuration-parameters/settings.md#server_configuration_parameters-logger) sección del archivo de configuración. - -ClickHouse recoge: - -- Diferentes métricas de cómo el servidor utiliza recursos computacionales. -- Estadísticas comunes sobre el procesamiento de consultas. - -Puede encontrar métricas en el [sistema.métricas](../operations/system-tables.md#system_tables-metrics), [sistema.evento](../operations/system-tables.md#system_tables-events), y [sistema.asynchronous_metrics](../operations/system-tables.md#system_tables-asynchronous_metrics) tabla. - -Puede configurar ClickHouse para exportar métricas a [Grafito](https://github.com/graphite-project). Ver el [Sección de grafito](server-configuration-parameters/settings.md#server_configuration_parameters-graphite) en el archivo de configuración del servidor ClickHouse. Antes de configurar la exportación de métricas, debe configurar Graphite siguiendo sus [guiar](https://graphite.readthedocs.io/en/latest/install.html). 
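Before wiring up Graphite or Prometheus, the built-in metrics mentioned above can be inspected directly with ordinary queries; a minimal sketch:

``` sql
SELECT metric, value FROM system.metrics ORDER BY value DESC LIMIT 10;
SELECT event, value FROM system.events WHERE event LIKE '%Query%';
SELECT metric, value FROM system.asynchronous_metrics WHERE metric LIKE '%Memory%';
```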
- -Puede configurar ClickHouse para exportar métricas a [Prometeo](https://prometheus.io). Ver el [Sección Prometheus](server-configuration-parameters/settings.md#server_configuration_parameters-prometheus) en el archivo de configuración del servidor ClickHouse. Antes de configurar la exportación de métricas, debe configurar Prometheus siguiendo su oficial [guiar](https://prometheus.io/docs/prometheus/latest/installation/). - -Además, puede supervisar la disponibilidad del servidor a través de la API HTTP. Enviar el `HTTP GET` solicitud de `/ping`. Si el servidor está disponible, responde con `200 OK`. - -Para supervisar servidores en una configuración de clúster, debe establecer [max_replica_delay_for_distributed_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) parámetro y utilizar el recurso HTTP `/replicas_status`. Una solicitud para `/replicas_status` devoluciones `200 OK` si la réplica está disponible y no se retrasa detrás de las otras réplicas. Si una réplica se retrasa, devuelve `503 HTTP_SERVICE_UNAVAILABLE` con información sobre la brecha. diff --git a/docs/es/operations/optimizing-performance/index.md b/docs/es/operations/optimizing-performance/index.md deleted file mode 100644 index d2796c6e0d3..00000000000 --- a/docs/es/operations/optimizing-performance/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Optimizaci\xF3n del rendimiento" -toc_priority: 52 ---- - - diff --git a/docs/es/operations/optimizing-performance/sampling-query-profiler.md b/docs/es/operations/optimizing-performance/sampling-query-profiler.md deleted file mode 100644 index a474dde6af2..00000000000 --- a/docs/es/operations/optimizing-performance/sampling-query-profiler.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 54 -toc_title: "Generaci\xF3n de perfiles de consultas" ---- - -# Analizador de consultas de muestreo {#sampling-query-profiler} - -ClickHouse ejecuta el generador de perfiles de muestreo que permite analizar la ejecución de consultas. Utilizando el generador de perfiles puede encontrar rutinas de código fuente que se utilizan con más frecuencia durante la ejecución de la consulta. Puede rastrear el tiempo de CPU y el tiempo de reloj de pared invertido, incluido el tiempo de inactividad. - -Para usar el generador de perfiles: - -- Configurar el [trace_log](../server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) sección de la configuración del servidor. - - Esta sección configura la [trace_log](../../operations/system-tables.md#system_tables-trace_log) tabla del sistema que contiene los resultados del funcionamiento del generador de perfiles. Está configurado de forma predeterminada. Recuerde que los datos de esta tabla solo son válidos para un servidor en ejecución. Después de reiniciar el servidor, ClickHouse no limpia la tabla y toda la dirección de memoria virtual almacenada puede dejar de ser válida. - -- Configurar el [Los resultados de la prueba](../settings/settings.md#query_profiler_cpu_time_period_ns) o [query_profiler_real_time_period_ns](../settings/settings.md#query_profiler_real_time_period_ns) configuración. Ambos ajustes se pueden utilizar simultáneamente. - - Estas opciones le permiten configurar temporizadores del generador de perfiles. 
Como estos son los ajustes de sesión, puede obtener diferentes frecuencias de muestreo para todo el servidor, usuarios individuales o perfiles de usuario, para su sesión interactiva y para cada consulta individual. - -La frecuencia de muestreo predeterminada es una muestra por segundo y tanto la CPU como los temporizadores reales están habilitados. Esta frecuencia permite recopilar suficiente información sobre el clúster ClickHouse. Al mismo tiempo, al trabajar con esta frecuencia, el generador de perfiles no afecta el rendimiento del servidor ClickHouse. Si necesita perfilar cada consulta individual, intente usar una mayor frecuencia de muestreo. - -Para analizar el `trace_log` tabla del sistema: - -- Instale el `clickhouse-common-static-dbg` paquete. Ver [Instalar desde paquetes DEB](../../getting-started/install.md#install-from-deb-packages). - -- Permitir funciones de introspección [allow_introspection_functions](../settings/settings.md#settings-allow_introspection_functions) configuración. - - Por razones de seguridad, las funciones de introspección están deshabilitadas de forma predeterminada. - -- Utilice el `addressToLine`, `addressToSymbol` y `demangle` [funciones de la introspección](../../sql-reference/functions/introspection.md) para obtener nombres de funciones y sus posiciones en el código ClickHouse. Para obtener un perfil para alguna consulta, debe agregar datos del `trace_log` tabla. Puede agregar datos por funciones individuales o por los seguimientos de pila completos. - -Si necesita visualizar `trace_log` información, intente [Flamegraph](../../interfaces/third-party/gui/#clickhouse-flamegraph) y [Nivel de Cifrado WEP](https://github.com/laplab/clickhouse-speedscope). - -## Ejemplo {#example} - -En este ejemplo nos: - -- Filtrado `trace_log` datos por un identificador de consulta y la fecha actual. - -- Agregando por seguimiento de pila. - -- Usando funciones de introspección, obtendremos un informe de: - - - Nombres de símbolos y funciones de código fuente correspondientes. - - Ubicaciones del código fuente de estas funciones. - - - -``` sql -SELECT - count(), - arrayStringConcat(arrayMap(x -> concat(demangle(addressToSymbol(x)), '\n ', addressToLine(x)), trace), '\n') AS sym -FROM system.trace_log -WHERE (query_id = 'ebca3574-ad0a-400a-9cbc-dca382f5998c') AND (event_date = today()) -GROUP BY trace -ORDER BY count() DESC -LIMIT 10 -``` - -``` text -{% include "examples/sampling_query_profiler_result.txt" %} -``` diff --git a/docs/es/operations/performance-test.md b/docs/es/operations/performance-test.md deleted file mode 100644 index 97444f339cd..00000000000 --- a/docs/es/operations/performance-test.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 54 -toc_title: Prueba de hardware ---- - -# Cómo probar su hardware con ClickHouse {#how-to-test-your-hardware-with-clickhouse} - -Con esta instrucción, puede ejecutar una prueba de rendimiento básica de ClickHouse en cualquier servidor sin instalar paquetes de ClickHouse. - -1. Ir a “commits” página: https://github.com/ClickHouse/ClickHouse/commits/master - -2. Haga clic en la primera marca de verificación verde o cruz roja con verde “ClickHouse Build Check” y haga clic en el “Details” enlace cerca “ClickHouse Build Check”. No existe tal enlace en algunas confirmaciones, por ejemplo, confirmaciones con documentación. En este caso, elija la confirmación más cercana que tenga este enlace. - -3. 
Copie el enlace a “clickhouse” binario para amd64 o aarch64. - -4. ssh al servidor y descargarlo con wget: - - - - # For amd64: - wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578163263_binary/clickhouse - # For aarch64: - wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578161264_binary/clickhouse - # Then do: - chmod a+x clickhouse - -1. Descargar configs: - - - - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.xml - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/users.xml - mkdir config.d - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.d/path.xml -O config.d/path.xml - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.d/log_to_console.xml -O config.d/log_to_console.xml - -1. Descargar archivos de referencia: - - - - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/benchmark-new.sh - chmod a+x benchmark-new.sh - wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql - -1. Descargue los datos de prueba de acuerdo con el [El Yandex.Conjunto de datos de Metrica](../getting-started/example-datasets/metrica.md) instrucción (“hits” tabla que contiene 100 millones de filas). - - - - wget https://datasets.clickhouse.tech/hits/partitions/hits_100m_obfuscated_v1.tar.xz - tar xvf hits_100m_obfuscated_v1.tar.xz -C . - mv hits_100m_obfuscated_v1/* . - -1. Ejecute el servidor: - - - - ./clickhouse server - -1. Verifique los datos: ssh al servidor en otro terminal - - - - ./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated" - 100000000 - -1. Edite el benchmark-new.sh, cambie `clickhouse-client` a `./clickhouse client` y añadir `–-max_memory_usage 100000000000` parámetro. - - - - mcedit benchmark-new.sh - -1. Ejecute el punto de referencia: - - - - ./benchmark-new.sh hits_100m_obfuscated - -1. Envíe los números y la información sobre la configuración de su hardware a clickhouse-feedback@yandex-team.com - -Todos los resultados se publican aquí: https://clickhouse.tecnología/punto de referencia/hardware/ diff --git a/docs/es/operations/quotas.md b/docs/es/operations/quotas.md deleted file mode 100644 index 9d84ce21339..00000000000 --- a/docs/es/operations/quotas.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 51 -toc_title: Cuota ---- - -# Cuota {#quotas} - -Las cuotas le permiten limitar el uso de recursos durante un período de tiempo o realizar un seguimiento del uso de recursos. -Las cuotas se configuran en la configuración del usuario, que generalmente ‘users.xml’. - -El sistema también tiene una característica para limitar la complejidad de una sola consulta. Vea la sección “Restrictions on query complexity”). - -A diferencia de las restricciones de complejidad de consultas, las cuotas: - -- Coloque restricciones en un conjunto de consultas que se pueden ejecutar durante un período de tiempo, en lugar de limitar una sola consulta. -- Tenga en cuenta los recursos gastados en todos los servidores remotos para el procesamiento de consultas distribuidas. - -Veamos la sección del ‘users.xml’ fichero que define las cuotas. 
- -``` xml - - - - - - - - 3600 - - - 0 - 0 - 0 - 0 - 0 - - -``` - -De forma predeterminada, la cuota realiza un seguimiento del consumo de recursos para cada hora, sin limitar el uso. -El consumo de recursos calculado para cada intervalo se envía al registro del servidor después de cada solicitud. - -``` xml - - - - - 3600 - - 1000 - 100 - 1000000000 - 100000000000 - 900 - - - - 86400 - - 10000 - 1000 - 5000000000 - 500000000000 - 7200 - - -``` - -Para el ‘statbox’ Las restricciones se establecen por cada hora y por cada 24 horas (86.400 segundos). El intervalo de tiempo se cuenta, a partir de un momento fijo definido por la implementación en el tiempo. En otras palabras, el intervalo de 24 horas no necesariamente comienza a medianoche. - -Cuando finaliza el intervalo, se borran todos los valores recopilados. Para la siguiente hora, el cálculo de la cuota comienza de nuevo. - -Estas son las cantidades que se pueden restringir: - -`queries` – The total number of requests. - -`errors` – The number of queries that threw an exception. - -`result_rows` – The total number of rows given as a result. - -`read_rows` – The total number of source rows read from tables for running the query on all remote servers. - -`execution_time` – The total query execution time, in seconds (wall time). - -Si se excede el límite durante al menos un intervalo de tiempo, se lanza una excepción con un texto sobre qué restricción se excedió, para qué intervalo y cuándo comienza el nuevo intervalo (cuando se pueden enviar consultas nuevamente). - -Las cuotas pueden usar el “quota key” característica para informar sobre los recursos para múltiples claves de forma independiente. Aquí hay un ejemplo de esto: - -``` xml - - - - -``` - -La cuota se asigna a los usuarios ‘users’ sección de la configuración. Vea la sección “Access rights”. - -Para el procesamiento de consultas distribuidas, los importes acumulados se almacenan en el servidor del solicitante. Entonces, si el usuario va a otro servidor, la cuota allí “start over”. - -Cuando se reinicia el servidor, las cuotas se restablecen. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/quotas/) diff --git a/docs/es/operations/requirements.md b/docs/es/operations/requirements.md deleted file mode 100644 index d6f0f25cf21..00000000000 --- a/docs/es/operations/requirements.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: Requisito ---- - -# Requisito {#requirements} - -## CPU {#cpu} - -Para la instalación desde paquetes deb precompilados, utilice una CPU con arquitectura x86_64 y soporte para las instrucciones de SSE 4.2. Para ejecutar ClickHouse con procesadores que no admiten SSE 4.2 o tienen arquitectura AArch64 o PowerPC64LE, debe compilar ClickHouse a partir de fuentes. - -ClickHouse implementa el procesamiento de datos paralelo y utiliza todos los recursos de hardware disponibles. Al elegir un procesador, tenga en cuenta que ClickHouse funciona de manera más eficiente en configuraciones con un gran número de núcleos pero con una velocidad de reloj más baja que en configuraciones con menos núcleos y una velocidad de reloj más alta. Por ejemplo, 16 núcleos con 2600 MHz es preferible a 8 núcleos con 3600 MHz. - -Se recomienda usar **Impulso de Turbo** y **hiper-threading** tecnología. Mejora significativamente el rendimiento con una carga de trabajo típica. 
- -## RAM {#ram} - -Recomendamos utilizar un mínimo de 4 GB de RAM para realizar consultas no triviales. El servidor ClickHouse puede ejecutarse con una cantidad mucho menor de RAM, pero requiere memoria para procesar consultas. - -El volumen requerido de RAM depende de: - -- La complejidad de las consultas. -- La cantidad de datos que se procesan en las consultas. - -Para calcular el volumen requerido de RAM, debe estimar el tamaño de los datos temporales para [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) y otras operaciones que utilice. - -ClickHouse puede usar memoria externa para datos temporales. Ver [GROUP BY en memoria externa](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) para más detalles. - -## Archivo de intercambio {#swap-file} - -Deshabilite el archivo de intercambio para entornos de producción. - -## Subsistema de almacenamiento {#storage-subsystem} - -Necesita tener 2 GB de espacio libre en disco para instalar ClickHouse. - -El volumen de almacenamiento requerido para sus datos debe calcularse por separado. La evaluación debe incluir: - -- Estimación del volumen de datos. - - Puede tomar una muestra de los datos y obtener el tamaño promedio de una fila de ella. Luego multiplique el valor por el número de filas que planea almacenar. - -- El coeficiente de compresión de datos. - - Para estimar el coeficiente de compresión de datos, cargue una muestra de sus datos en ClickHouse y compare el tamaño real de los datos con el tamaño de la tabla almacenada. Por ejemplo, los datos de clickstream generalmente se comprimen de 6 a 10 veces. - -Para calcular el volumen final de datos que se almacenarán, aplique el coeficiente de compresión al volumen de datos estimado. Si planea almacenar datos en varias réplicas, multiplique el volumen estimado por el número de réplicas. - -## Red {#network} - -Si es posible, use redes de 10G o clase superior. - -El ancho de banda de la red es fundamental para procesar consultas distribuidas con una gran cantidad de datos intermedios. Además, la velocidad de la red afecta a los procesos de replicación. - -## Software {#software} - -ClickHouse está desarrollado principalmente para la familia de sistemas operativos Linux. La distribución de Linux recomendada es Ubuntu. El `tzdata` paquete debe ser instalado en el sistema. - -ClickHouse también puede funcionar en otras familias de sistemas operativos. Ver detalles en el [Primeros pasos](../getting-started/index.md) sección de la documentación. diff --git a/docs/es/operations/server-configuration-parameters/index.md b/docs/es/operations/server-configuration-parameters/index.md deleted file mode 100644 index e1e2e777b94..00000000000 --- a/docs/es/operations/server-configuration-parameters/index.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Par\xE1metros de configuraci\xF3n del servidor" -toc_priority: 54 -toc_title: "Implantaci\xF3n" ---- - -# Parámetros de configuración del servidor {#server-settings} - -Esta sección contiene descripciones de la configuración del servidor que no se puede cambiar en el nivel de sesión o consulta. - -Estos ajustes se almacenan en el `config.xml` archivo en el servidor ClickHouse. 
- -Otros ajustes se describen en el “[Configuración](../settings/index.md#session-settings-intro)” apartado. - -Antes de estudiar la configuración, lea el [Archivos de configuración](../configuration-files.md#configuration_files) sección y tomar nota del uso de sustituciones (el `incl` y `optional` atributo). - -[Artículo Original](https://clickhouse.tech/docs/en/operations/server_configuration_parameters/) diff --git a/docs/es/operations/server-configuration-parameters/settings.md b/docs/es/operations/server-configuration-parameters/settings.md deleted file mode 100644 index 86264ed0440..00000000000 --- a/docs/es/operations/server-configuration-parameters/settings.md +++ /dev/null @@ -1,906 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 57 -toc_title: "Configuraci\xF3n del servidor" ---- - -# Configuración del servidor {#server-settings} - -## builtin_dictionaries_reload_interval {#builtin-dictionaries-reload-interval} - -El intervalo en segundos antes de volver a cargar los diccionarios integrados. - -ClickHouse recarga los diccionarios incorporados cada x segundos. Esto hace posible editar diccionarios “on the fly” sin reiniciar el servidor. - -Valor predeterminado: 3600. - -**Ejemplo** - -``` xml -3600 -``` - -## compresión {#server-settings-compression} - -Ajustes de compresión de datos para [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md)-mesas de motor. - -!!! warning "Advertencia" - No lo use si acaba de comenzar a usar ClickHouse. - -Plantilla de configuración: - -``` xml - - - ... - ... - ... - - ... - -``` - -`` campo: - -- `min_part_size` – The minimum size of a data part. -- `min_part_size_ratio` – The ratio of the data part size to the table size. -- `method` – Compression method. Acceptable values: `lz4` o `zstd`. - -Puede configurar múltiples `` apartado. - -Acciones cuando se cumplen las condiciones: - -- Si un elemento de datos coincide con un conjunto de condiciones, ClickHouse utiliza el método de compresión especificado. -- Si un elemento de datos coincide con varios conjuntos de condiciones, ClickHouse utiliza el primer conjunto de condiciones coincidente. - -Si no se cumplen condiciones para un elemento de datos, ClickHouse utiliza el `lz4` compresión. - -**Ejemplo** - -``` xml - - - 10000000000 - 0.01 - zstd - - -``` - -## default_database {#default-database} - -La base de datos predeterminada. - -Para obtener una lista de bases de datos, [SHOW DATABASES](../../sql-reference/statements/show.md#show-databases) consulta. - -**Ejemplo** - -``` xml -default -``` - -## default_profile {#default-profile} - -Perfil de configuración predeterminado. - -Los perfiles de configuración se encuentran en el archivo especificado en el parámetro `user_config`. - -**Ejemplo** - -``` xml -default -``` - -## Diccionarios_config {#server_configuration_parameters-dictionaries_config} - -La ruta de acceso al archivo de configuración para diccionarios externos. - -Camino: - -- Especifique la ruta absoluta o la ruta relativa al archivo de configuración del servidor. -- La ruta puede contener comodines \* y ?. - -Ver también “[Diccionarios externos](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”. - -**Ejemplo** - -``` xml -*_dictionary.xml -``` - -## Diccionarios_lazy_load {#server_configuration_parameters-dictionaries_lazy_load} - -La carga perezosa de los diccionarios. - -Si `true`, entonces cada diccionario es creado en el primer uso. 
Si se produce un error en la creación del diccionario, la función que estaba utilizando el diccionario produce una excepción. - -Si `false`, todos los diccionarios se crean cuando se inicia el servidor, y si hay un error, el servidor se apaga. - -El valor predeterminado es `true`. - -**Ejemplo** - -``` xml -true -``` - -## format_schema_path {#server_configuration_parameters-format_schema_path} - -La ruta de acceso al directorio con los esquemas para los datos de entrada, como los esquemas [CapnProto](../../interfaces/formats.md#capnproto) formato. - -**Ejemplo** - -``` xml - - format_schemas/ -``` - -## grafito {#server_configuration_parameters-graphite} - -Envío de datos a [Grafito](https://github.com/graphite-project). - -Configuración: - -- host – The Graphite server. -- port – The port on the Graphite server. -- interval – The interval for sending, in seconds. -- timeout – The timeout for sending data, in seconds. -- root_path – Prefix for keys. -- metrics – Sending data from the [sistema.métricas](../../operations/system-tables.md#system_tables-metrics) tabla. -- events – Sending deltas data accumulated for the time period from the [sistema.evento](../../operations/system-tables.md#system_tables-events) tabla. -- events_cumulative – Sending cumulative data from the [sistema.evento](../../operations/system-tables.md#system_tables-events) tabla. -- asynchronous_metrics – Sending data from the [sistema.asynchronous_metrics](../../operations/system-tables.md#system_tables-asynchronous_metrics) tabla. - -Puede configurar múltiples `` clausula. Por ejemplo, puede usar esto para enviar datos diferentes a intervalos diferentes. - -**Ejemplo** - -``` xml - - localhost - 42000 - 0.1 - 60 - one_min - true - true - false - true - -``` - -## graphite_rollup {#server_configuration_parameters-graphite-rollup} - -Ajustes para reducir los datos de grafito. - -Para obtener más información, consulte [GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md). - -**Ejemplo** - -``` xml - - - max - - 0 - 60 - - - 3600 - 300 - - - 86400 - 3600 - - - -``` - -## http_port/https_port {#http-porthttps-port} - -El puerto para conectarse al servidor a través de HTTP(s). - -Si `https_port` se especifica, [openSSL](#server_configuration_parameters-openssl) debe ser configurado. - -Si `http_port` se especifica, la configuración de OpenSSL se ignora incluso si está establecida. - -**Ejemplo** - -``` xml -9999 -``` - -## http_server_default_response {#server_configuration_parameters-http_server_default_response} - -La página que se muestra de forma predeterminada al acceder al servidor HTTP de ClickHouse. -El valor predeterminado es “Ok.” (con un avance de línea al final) - -**Ejemplo** - -Abrir `https://tabix.io/` al acceder `http://localhost: http_port`. - -``` xml - -
]]> -</http_server_default_response>
-``` - -## include_from {#server_configuration_parameters-include_from} - -La ruta al archivo con sustituciones. - -Para obtener más información, consulte la sección “[Archivos de configuración](../configuration-files.md#configuration_files)”. - -**Ejemplo** - -``` xml -/etc/metrica.xml -``` - -## Interesante {#interserver-http-port} - -Puerto para el intercambio de datos entre servidores ClickHouse. - -**Ejemplo** - -``` xml -9009 -``` - -## Sistema abierto {#interserver-http-host} - -El nombre de host que pueden utilizar otros servidores para acceder a este servidor. - -Si se omite, se define de la misma manera que el `hostname-f` comando. - -Útil para separarse de una interfaz de red específica. - -**Ejemplo** - -``` xml -example.yandex.ru -``` - -## interserver_http_credentials {#server-settings-interserver-http-credentials} - -El nombre de usuario y la contraseña utilizados para [replicación](../../engines/table-engines/mergetree-family/replication.md) con los motores Replicated\*. Estas credenciales sólo se utilizan para la comunicación entre réplicas y no están relacionadas con las credenciales de los clientes de ClickHouse. El servidor está comprobando estas credenciales para conectar réplicas y utiliza las mismas credenciales cuando se conecta a otras réplicas. Por lo tanto, estas credenciales deben establecerse igual para todas las réplicas de un clúster. -De forma predeterminada, la autenticación no se utiliza. - -Esta sección contiene los siguientes parámetros: - -- `user` — username. -- `password` — password. - -**Ejemplo** - -``` xml - - admin - 222 - -``` - -## keep_alive_timeout {#keep-alive-timeout} - -El número de segundos que ClickHouse espera las solicitudes entrantes antes de cerrar la conexión. El valor predeterminado es de 3 segundos. - -**Ejemplo** - -``` xml -3 -``` - -## listen_host {#server_configuration_parameters-listen_host} - -Restricción en hosts de los que pueden provenir las solicitudes. Si desea que el servidor responda a todos ellos, especifique `::`. - -Ejemplos: - -``` xml -::1 -127.0.0.1 -``` - -## registrador {#server_configuration_parameters-logger} - -Configuración de registro. - -Claves: - -- level – Logging level. Acceptable values: `trace`, `debug`, `information`, `warning`, `error`. -- log – The log file. Contains all the entries according to `level`. -- errorlog – Error log file. -- size – Size of the file. Applies to `log`y`errorlog`. Una vez que el archivo alcanza `size`, ClickHouse archiva y cambia el nombre, y crea un nuevo archivo de registro en su lugar. -- count – The number of archived log files that ClickHouse stores. - -**Ejemplo** - -``` xml - - trace - /var/log/clickhouse-server/clickhouse-server.log - /var/log/clickhouse-server/clickhouse-server.err.log - 1000M - 10 - -``` - -También se admite la escritura en el syslog. Config ejemplo: - -``` xml - - 1 - -
<address>syslog.remote:10514</address>
- <hostname>myhost.local</hostname> - <facility>LOG_LOCAL6</facility> - <format>syslog</format> -</syslog>
-</logger>
-``` - -Claves: - -- use_syslog — Required setting if you want to write to the syslog. -- address — The host\[:port\] of syslogd. If omitted, the local daemon is used. -- hostname — Optional. The name of the host that logs are sent from. -- facility — [La palabra clave syslog facility](https://en.wikipedia.org/wiki/Syslog#Facility) en letras mayúsculas con el “LOG_” prefijo: (`LOG_USER`, `LOG_DAEMON`, `LOG_LOCAL3` y así sucesivamente). - Valor predeterminado: `LOG_USER` si `address` se especifica, `LOG_DAEMON otherwise.` -- format – Message format. Possible values: `bsd` y `syslog.` - -## macro {#macros} - -Sustituciones de parámetros para tablas replicadas. - -Se puede omitir si no se utilizan tablas replicadas. - -Para obtener más información, consulte la sección “[Creación de tablas replicadas](../../engines/table-engines/mergetree-family/replication.md)”. - -**Ejemplo** - -``` xml - -``` - -## Método de codificación de datos: {#server-mark-cache-size} - -Tamaño aproximado (en bytes) de la memoria caché de marcas utilizadas por los motores de [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md) familia. - -La memoria caché se comparte para el servidor y la memoria se asigna según sea necesario. El tamaño de la memoria caché debe ser al menos 5368709120. - -**Ejemplo** - -``` xml -5368709120 -``` - -## max_concurrent_queries {#max-concurrent-queries} - -El número máximo de solicitudes procesadas simultáneamente. - -**Ejemplo** - -``` xml -100 -``` - -## max_connections {#max-connections} - -El número máximo de conexiones entrantes. - -**Ejemplo** - -``` xml -4096 -``` - -## max_open_files {#max-open-files} - -El número máximo de archivos abiertos. - -Predeterminada: `maximum`. - -Recomendamos usar esta opción en Mac OS X desde el `getrlimit()` función devuelve un valor incorrecto. - -**Ejemplo** - -``` xml -262144 -``` - -## max_table_size_to_drop {#max-table-size-to-drop} - -Restricción en la eliminación de tablas. - -Si el tamaño de un [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md) mesa excede `max_table_size_to_drop` (en bytes), no puede eliminarlo usando una consulta DROP. - -Si aún necesita eliminar la tabla sin reiniciar el servidor ClickHouse, cree el `/flags/force_drop_table` y ejecute la consulta DROP. - -Valor predeterminado: 50 GB. - -El valor 0 significa que puede eliminar todas las tablas sin restricciones. - -**Ejemplo** - -``` xml -0 -``` - -## merge_tree {#server_configuration_parameters-merge_tree} - -Ajuste fino para tablas en el [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md). - -Para obtener más información, vea MergeTreeSettings.h archivo de encabezado. - -**Ejemplo** - -``` xml - - 5 - -``` - -## openSSL {#server_configuration_parameters-openssl} - -Configuración cliente/servidor SSL. - -El soporte para SSL es proporcionado por el `libpoco` biblioteca. La interfaz se describe en el archivo [Nombre de la red inalámbrica (SSID):h](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h) - -Claves para la configuración del servidor/cliente: - -- privateKeyFile – The path to the file with the secret key of the PEM certificate. The file may contain a key and certificate at the same time. -- certificateFile – The path to the client/server certificate file in PEM format. You can omit it if `privateKeyFile` contiene el certificado. 
-- caConfig – The path to the file or directory that contains trusted root certificates. -- verificationMode – The method for checking the node's certificates. Details are in the description of the [Contexto](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h) clase. Valores posibles: `none`, `relaxed`, `strict`, `once`. -- verificationDepth – The maximum length of the verification chain. Verification will fail if the certificate chain length exceeds the set value. -- loadDefaultCAFile – Indicates that built-in CA certificates for OpenSSL will be used. Acceptable values: `true`, `false`. \| -- cipherList – Supported OpenSSL encryptions. For example: `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. -- cacheSessions – Enables or disables caching sessions. Must be used in combination with `sessionIdContext`. Valores aceptables: `true`, `false`. -- sessionIdContext – A unique set of random characters that the server appends to each generated identifier. The length of the string must not exceed `SSL_MAX_SSL_SESSION_ID_LENGTH`. Este parámetro siempre se recomienda ya que ayuda a evitar problemas tanto si el servidor almacena en caché la sesión como si el cliente solicita el almacenamiento en caché. Valor predeterminado: `${application.name}`. -- sessionCacheSize – The maximum number of sessions that the server caches. Default value: 1024\*20. 0 – Unlimited sessions. -- sessionTimeout – Time for caching the session on the server. -- extendedVerification – Automatically extended verification of certificates after the session ends. Acceptable values: `true`, `false`. -- requireTLSv1 – Require a TLSv1 connection. Acceptable values: `true`, `false`. -- requireTLSv1_1 – Require a TLSv1.1 connection. Acceptable values: `true`, `false`. -- requireTLSv1 – Require a TLSv1.2 connection. Acceptable values: `true`, `false`. -- fips – Activates OpenSSL FIPS mode. Supported if the library's OpenSSL version supports FIPS. -- privateKeyPassphraseHandler – Class (PrivateKeyPassphraseHandler subclass) that requests the passphrase for accessing the private key. For example: ``, `KeyFileHandler`, `test`, ``. -- invalidCertificateHandler – Class (a subclass of CertificateHandler) for verifying invalid certificates. For example: ` ConsoleCertificateHandler ` . -- disableProtocols – Protocols that are not allowed to use. -- preferServerCiphers – Preferred server ciphers on the client. - -**Ejemplo de configuración:** - -``` xml - - - - /etc/clickhouse-server/server.crt - /etc/clickhouse-server/server.key - - /etc/clickhouse-server/dhparam.pem - none - true - true - sslv2,sslv3 - true - - - true - true - sslv2,sslv3 - true - - - - RejectCertificateHandler - - - -``` - -## part_log {#server_configuration_parameters-part-log} - -Registro de eventos asociados con [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md). Por ejemplo, agregar o fusionar datos. Puede utilizar el registro para simular algoritmos de combinación y comparar sus características. Puede visualizar el proceso de fusión. - -Las consultas se registran en el [sistema.part_log](../../operations/system-tables.md#system_tables-part-log) tabla, no en un archivo separado. Puede configurar el nombre de esta tabla en el `table` parámetro (ver más abajo). - -Utilice los siguientes parámetros para configurar el registro: - -- `database` – Name of the database. -- `table` – Name of the system table. 
-- `partition_by` – Sets a [clave de partición personalizada](../../engines/table-engines/mergetree-family/custom-partitioning-key.md). -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. - -**Ejemplo** - -``` xml - - system - part_log
- <partition_by>toMonday(event_date)</partition_by> - <flush_interval_milliseconds>7500</flush_interval_milliseconds> -</part_log>
-``` - -## camino {#server_configuration_parameters-path} - -La ruta de acceso al directorio que contiene los datos. - -!!! note "Nota" - La barra diagonal es obligatoria. - -**Ejemplo** - -``` xml -/var/lib/clickhouse/ -``` - -## prometeo {#server_configuration_parameters-prometheus} - -Exponer datos de métricas para raspar desde [Prometeo](https://prometheus.io). - -Configuración: - -- `endpoint` – HTTP endpoint for scraping metrics by prometheus server. Start from ‘/’. -- `port` – Port for `endpoint`. -- `metrics` – Flag that sets to expose metrics from the [sistema.métricas](../system-tables.md#system_tables-metrics) tabla. -- `events` – Flag that sets to expose metrics from the [sistema.evento](../system-tables.md#system_tables-events) tabla. -- `asynchronous_metrics` – Flag that sets to expose current metrics values from the [sistema.asynchronous_metrics](../system-tables.md#system_tables-asynchronous_metrics) tabla. - -**Ejemplo** - -``` xml - - /metrics - 8001 - true - true - true - -``` - -## query_log {#server_configuration_parameters-query-log} - -Configuración de las consultas de registro recibidas con [log_queries=1](../settings/settings.md) configuración. - -Las consultas se registran en el [sistema.query_log](../../operations/system-tables.md#system_tables-query_log) tabla, no en un archivo separado. Puede cambiar el nombre de la tabla en el `table` parámetro (ver más abajo). - -Utilice los siguientes parámetros para configurar el registro: - -- `database` – Name of the database. -- `table` – Name of the system table the queries will be logged in. -- `partition_by` – Sets a [clave de partición personalizada](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) para una mesa. -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. - -Si la tabla no existe, ClickHouse la creará. Si la estructura del registro de consultas cambió cuando se actualizó el servidor ClickHouse, se cambia el nombre de la tabla con la estructura anterior y se crea una nueva tabla automáticamente. - -**Ejemplo** - -``` xml - - system - query_log
- <partition_by>toMonday(event_date)</partition_by> - <flush_interval_milliseconds>7500</flush_interval_milliseconds> -</query_log>
-``` - -## Sistema abierto {#server_configuration_parameters-query-thread-log} - -Configuración de subprocesos de registro de consultas recibidas con [Log_query_threads = 1](../settings/settings.md#settings-log-query-threads) configuración. - -Las consultas se registran en el [sistema.Sistema abierto.](../../operations/system-tables.md#system_tables-query-thread-log) tabla, no en un archivo separado. Puede cambiar el nombre de la tabla en el `table` parámetro (ver más abajo). - -Utilice los siguientes parámetros para configurar el registro: - -- `database` – Name of the database. -- `table` – Name of the system table the queries will be logged in. -- `partition_by` – Sets a [clave de partición personalizada](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) para una tabla del sistema. -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. - -Si la tabla no existe, ClickHouse la creará. Si la estructura del registro de subprocesos de consulta cambió cuando se actualizó el servidor ClickHouse, se cambia el nombre de la tabla con la estructura anterior y se crea una nueva tabla automáticamente. - -**Ejemplo** - -``` xml - - system - query_thread_log
- <partition_by>toMonday(event_date)</partition_by> - <flush_interval_milliseconds>7500</flush_interval_milliseconds> -</query_thread_log>
-``` - -## trace_log {#server_configuration_parameters-trace_log} - -Ajustes para el [trace_log](../../operations/system-tables.md#system_tables-trace_log) operación de la tabla del sistema. - -Parámetros: - -- `database` — Database for storing a table. -- `table` — Table name. -- `partition_by` — [Clave de partición personalizada](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) para una tabla del sistema. -- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. - -El archivo de configuración del servidor predeterminado `config.xml` contiene la siguiente sección de configuración: - -``` xml - - system - trace_log
- <partition_by>toYYYYMM(event_date)</partition_by> - <flush_interval_milliseconds>7500</flush_interval_milliseconds> -</trace_log>
-``` - -## query_masking_rules {#query-masking-rules} - -Reglas basadas en Regexp, que se aplicarán a las consultas, así como a todos los mensajes de registro antes de almacenarlos en los registros del servidor, -`system.query_log`, `system.text_log`, `system.processes` tabla, y en los registros enviados al cliente. Eso permite prevenir -fuga de datos sensible de consultas SQL (como nombres, correos electrónicos, -identificadores o números de tarjetas de crédito) a los registros. - -**Ejemplo** - -``` xml - - - hide SSN - (^|\D)\d{3}-\d{2}-\d{4}($|\D) - 000-00-0000 - - -``` - -Campos de configuración: -- `name` - nombre de la regla (opcional) -- `regexp` - Expresión regular compatible con RE2 (obligatoria) -- `replace` - cadena de sustitución para datos confidenciales (opcional, por defecto - seis asteriscos) - -Las reglas de enmascaramiento se aplican a toda la consulta (para evitar fugas de datos confidenciales de consultas mal formadas / no analizables). - -`system.events` la tabla tiene contador `QueryMaskingRulesMatch` que tienen un número total de coincidencias de reglas de enmascaramiento de consultas. - -Para consultas distribuidas, cada servidor debe configurarse por separado; de lo contrario, las subconsultas pasan a otros -los nodos se almacenarán sin enmascarar. - -## remote_servers {#server-settings-remote-servers} - -Configuración de los clústeres utilizados por [Distribuido](../../engines/table-engines/special/distributed.md) motor de mesa y por el `cluster` función de la tabla. - -**Ejemplo** - -``` xml - -``` - -Para el valor de la `incl` atributo, consulte la sección “[Archivos de configuración](../configuration-files.md#configuration_files)”. - -**Ver también** - -- [skip_unavailable_shards](../settings/settings.md#settings-skip_unavailable_shards) - -## Zona horaria {#server_configuration_parameters-timezone} - -La zona horaria del servidor. - -Especificado como un identificador de la IANA para la zona horaria UTC o la ubicación geográfica (por ejemplo, África/Abidjan). - -La zona horaria es necesaria para las conversiones entre los formatos String y DateTime cuando los campos DateTime se envían al formato de texto (impreso en la pantalla o en un archivo) y cuando se obtiene DateTime de una cadena. Además, la zona horaria se usa en funciones que funcionan con la hora y la fecha si no recibieron la zona horaria en los parámetros de entrada. - -**Ejemplo** - -``` xml -Europe/Moscow -``` - -## Tcp_port {#server_configuration_parameters-tcp_port} - -Puerto para comunicarse con clientes a través del protocolo TCP. - -**Ejemplo** - -``` xml -9000 -``` - -## Tcp_port_secure {#server_configuration_parameters-tcp_port_secure} - -Puerto TCP para una comunicación segura con los clientes. Úselo con [OpenSSL](#server_configuration_parameters-openssl) configuración. - -**Valores posibles** - -Entero positivo. - -**Valor predeterminado** - -``` xml -9440 -``` - -## mysql_port {#server_configuration_parameters-mysql_port} - -Puerto para comunicarse con clientes a través del protocolo MySQL. - -**Valores posibles** - -Entero positivo. - -Ejemplo - -``` xml -9004 -``` - -## tmp_path {#server-settings-tmp_path} - -Ruta de acceso a datos temporales para procesar consultas grandes. - -!!! note "Nota" - La barra diagonal es obligatoria. 
- -**Ejemplo** - -``` xml -/var/lib/clickhouse/tmp/ -``` - -## tmp_policy {#server-settings-tmp-policy} - -Política de [`storage_configuration`](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) para almacenar archivos temporales. -Si no se establece [`tmp_path`](#server-settings-tmp_path) se utiliza, de lo contrario se ignora. - -!!! note "Nota" - - `move_factor` se ignora -- `keep_free_space_bytes` se ignora -- `max_data_part_size_bytes` se ignora -- debe tener exactamente un volumen en esa política - -## Uncompressed_cache_size {#server-settings-uncompressed_cache_size} - -Tamaño de la memoria caché (en bytes) para los datos sin comprimir utilizados por los motores de [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md). - -Hay una caché compartida para el servidor. La memoria se asigna a pedido. La caché se usa si la opción [Use_uncompressed_cache](../settings/settings.md#setting-use_uncompressed_cache) está habilitado. - -La caché sin comprimir es ventajosa para consultas muy cortas en casos individuales. - -**Ejemplo** - -``` xml -8589934592 -``` - -## user_files_path {#server_configuration_parameters-user_files_path} - -El directorio con archivos de usuario. Utilizado en la función de tabla [file()](../../sql-reference/table-functions/file.md). - -**Ejemplo** - -``` xml -/var/lib/clickhouse/user_files/ -``` - -## users_config {#users-config} - -Ruta de acceso al archivo que contiene: - -- Configuraciones de usuario. -- Derechos de acceso. -- Perfiles de configuración. -- Configuración de cuota. - -**Ejemplo** - -``` xml -users.xml -``` - -## Zookeeper {#server-settings_zookeeper} - -Contiene la configuración que permite a ClickHouse interactuar con [ZooKeeper](http://zookeeper.apache.org/) Cluster. - -ClickHouse utiliza ZooKeeper para almacenar metadatos de réplicas cuando se utilizan tablas replicadas. Si no se utilizan tablas replicadas, se puede omitir esta sección de parámetros. - -Esta sección contiene los siguientes parámetros: - -- `node` — ZooKeeper endpoint. You can set multiple endpoints. - - Por ejemplo: - - - -``` xml - - example_host - 2181 - -``` - - The `index` attribute specifies the node order when trying to connect to the ZooKeeper cluster. - -- `session_timeout` — Maximum timeout for the client session in milliseconds. -- `root` — The [Znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) que se utiliza como la raíz de los znodes utilizados por el servidor ClickHouse. Opcional. -- `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional. - -**Ejemplo de configuración** - -``` xml - - - example1 - 2181 - - - example2 - 2181 - - 30000 - 10000 - - /path/to/zookeeper/node - - user:password - -``` - -**Ver también** - -- [Replicación](../../engines/table-engines/mergetree-family/replication.md) -- [Guía del programador ZooKeeper](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) - -## use_minimalistic_part_header_in_zookeeper {#server-settings-use_minimalistic_part_header_in_zookeeper} - -Método de almacenamiento para encabezados de parte de datos en ZooKeeper. - -Esta configuración sólo se aplica a `MergeTree` familia. Se puede especificar: - -- A nivel mundial en el [merge_tree](#server_configuration_parameters-merge_tree) sección de la `config.xml` file. - - ClickHouse utiliza la configuración para todas las tablas del servidor. 
Puede cambiar la configuración en cualquier momento. Las tablas existentes cambian su comportamiento cuando cambia la configuración. - -- Para cada tabla. - - Al crear una tabla, especifique la correspondiente [ajuste del motor](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table). El comportamiento de una tabla existente con esta configuración no cambia, incluso si la configuración global cambia. - -**Valores posibles** - -- 0 — Functionality is turned off. -- 1 — Functionality is turned on. - -Si `use_minimalistic_part_header_in_zookeeper = 1`, entonces [repetición](../../engines/table-engines/mergetree-family/replication.md) las tablas almacenan los encabezados de las partes de datos de forma compacta `znode`. Si la tabla contiene muchas columnas, este método de almacenamiento reduce significativamente el volumen de los datos almacenados en Zookeeper. - -!!! attention "Atención" - Después de aplicar `use_minimalistic_part_header_in_zookeeper = 1`, no puede degradar el servidor ClickHouse a una versión que no admite esta configuración. Tenga cuidado al actualizar ClickHouse en servidores de un clúster. No actualice todos los servidores a la vez. Es más seguro probar nuevas versiones de ClickHouse en un entorno de prueba o solo en unos pocos servidores de un clúster. - - Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. - -**Valor predeterminado:** 0. - -## disable_internal_dns_cache {#server-settings-disable-internal-dns-cache} - -Deshabilita la memoria caché DNS interna. Recomendado para operar ClickHouse en sistemas -con infraestructura que cambia frecuentemente como Kubernetes. - -**Valor predeterminado:** 0. - -## dns_cache_update_period {#server-settings-dns-cache-update-period} - -El período de actualización de las direcciones IP almacenadas en la caché DNS interna de ClickHouse (en segundos). -La actualización se realiza de forma asíncrona, en un subproceso del sistema separado. - -**Valor predeterminado**: 15. - -## access_control_path {#access_control_path} - -Ruta de acceso a una carpeta donde un servidor ClickHouse almacena configuraciones de usuario y rol creadas por comandos SQL. - -Valor predeterminado: `/var/lib/clickhouse/access/`. - -**Ver también** - -- [Control de acceso y gestión de cuentas](../access-rights.md#access-control) - -[Artículo Original](https://clickhouse.tech/docs/en/operations/server_configuration_parameters/settings/) diff --git a/docs/es/operations/settings/constraints-on-settings.md b/docs/es/operations/settings/constraints-on-settings.md deleted file mode 100644 index fe385f6ddbb..00000000000 --- a/docs/es/operations/settings/constraints-on-settings.md +++ /dev/null @@ -1,75 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 62 -toc_title: "Restricciones en la configuraci\xF3n" ---- - -# Restricciones en la configuración {#constraints-on-settings} - -Las restricciones en los ajustes se pueden definir en el `profiles` sección de la `user.xml` el archivo de configuración y prohíba a los usuarios cambiar algunos de los ajustes `SET` consulta. -Las restricciones se definen como las siguientes: - -``` xml - - - - - lower_boundary - - - upper_boundary - - - lower_boundary - upper_boundary - - - - - - - -``` - -Si el usuario intenta violar las restricciones, se lanza una excepción y la configuración no se cambia. 
-Se admiten tres tipos de restricciones: `min`, `max`, `readonly`. El `min` y `max` Las restricciones especifican los límites superior e inferior para una configuración numérica y se pueden usar en combinación. El `readonly` constraint especifica que el usuario no puede cambiar la configuración correspondiente en absoluto. - -**Ejemplo:** Dejar `users.xml` incluye líneas: - -``` xml - - - 10000000000 - 0 - ... - - - 5000000000 - 20000000000 - - - - - - - -``` - -Las siguientes consultas arrojan excepciones: - -``` sql -SET max_memory_usage=20000000001; -SET max_memory_usage=4999999999; -SET force_index_by_date=1; -``` - -``` text -Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. -Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. -Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. -``` - -**Nota:** el `default` perfil tiene un manejo especial: todas las restricciones definidas para el `default` profile se convierten en las restricciones predeterminadas, por lo que restringen a todos los usuarios hasta que se anulan explícitamente para estos usuarios. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/constraints_on_settings/) diff --git a/docs/es/operations/settings/index.md b/docs/es/operations/settings/index.md deleted file mode 100644 index 37aab0a7e1b..00000000000 --- a/docs/es/operations/settings/index.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Configuraci\xF3n" -toc_priority: 55 -toc_title: "Implantaci\xF3n" ---- - -# Configuración {#session-settings-intro} - -Hay varias maneras de realizar todos los ajustes descritos en esta sección de documentación. - -Los ajustes se configuran en capas, por lo que cada capa subsiguiente redefine los ajustes anteriores. - -Formas de configurar los ajustes, por orden de prioridad: - -- Ajustes en el `users.xml` archivo de configuración del servidor. - - Establecer en el elemento ``. - -- Configuración de la sesión. - - Enviar `SET setting=value` desde el cliente de consola ClickHouse en modo interactivo. - Del mismo modo, puede utilizar sesiones ClickHouse en el protocolo HTTP. Para hacer esto, debe especificar el `session_id` Parámetro HTTP. - -- Configuración de consulta. - - - Al iniciar el cliente de consola de ClickHouse en modo no interactivo, establezca el parámetro de inicio `--setting=value`. - - Al usar la API HTTP, pase los parámetros CGI (`URL?setting_1=value&setting_2=value...`). - -Los ajustes que solo se pueden realizar en el archivo de configuración del servidor no se tratan en esta sección. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/) diff --git a/docs/es/operations/settings/permissions-for-queries.md b/docs/es/operations/settings/permissions-for-queries.md deleted file mode 100644 index f9f669b876e..00000000000 --- a/docs/es/operations/settings/permissions-for-queries.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 58 -toc_title: Permisos para consultas ---- - -# Permisos para consultas {#permissions_for_queries} - -Las consultas en ClickHouse se pueden dividir en varios tipos: - -1. Leer consultas de datos: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. -2. Escribir consultas de datos: `INSERT`, `OPTIMIZE`. -3. 
Cambiar la consulta de configuración: `SET`, `USE`. -4. [DDL](https://en.wikipedia.org/wiki/Data_definition_language) consulta: `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. -5. `KILL QUERY`. - -La siguiente configuración regula los permisos de usuario según el tipo de consulta: - -- [sólo lectura](#settings_readonly) — Restricts permissions for all types of queries except DDL queries. -- [Método de codificación de datos:](#settings_allow_ddl) — Restricts permissions for DDL queries. - -`KILL QUERY` se puede realizar con cualquier configuración. - -## sólo lectura {#settings_readonly} - -Restringe los permisos para leer datos, escribir datos y cambiar las consultas de configuración. - -Vea cómo las consultas se dividen en tipos [arriba](#permissions_for_queries). - -Valores posibles: - -- 0 — All queries are allowed. -- 1 — Only read data queries are allowed. -- 2 — Read data and change settings queries are allowed. - -Después de configurar `readonly = 1` el usuario no puede cambiar `readonly` y `allow_ddl` configuración en la sesión actual. - -Cuando se utiliza el `GET` método en el [Interfaz HTTP](../../interfaces/http.md), `readonly = 1` se establece automáticamente. Para modificar los datos, `POST` método. - -Configuración `readonly = 1` prohibir al usuario cambiar todas las configuraciones. Hay una manera de prohibir al usuario -de cambiar sólo ajustes específicos, para más detalles ver [restricciones en la configuración](constraints-on-settings.md). - -Valor predeterminado: 0 - -## Método de codificación de datos: {#settings_allow_ddl} - -Permite o niega [DDL](https://en.wikipedia.org/wiki/Data_definition_language) consulta. - -Vea cómo las consultas se dividen en tipos [arriba](#permissions_for_queries). - -Valores posibles: - -- 0 — DDL queries are not allowed. -- 1 — DDL queries are allowed. - -No se puede ejecutar `SET allow_ddl = 1` si `allow_ddl = 0` para la sesión actual. - -Valor predeterminado: 1 - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/permissions_for_queries/) diff --git a/docs/es/operations/settings/query-complexity.md b/docs/es/operations/settings/query-complexity.md deleted file mode 100644 index 82bc235c30d..00000000000 --- a/docs/es/operations/settings/query-complexity.md +++ /dev/null @@ -1,300 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 59 -toc_title: Restricciones en la complejidad de consultas ---- - -# Restricciones en la complejidad de consultas {#restrictions-on-query-complexity} - -Las restricciones en la complejidad de la consulta forman parte de la configuración. -Se utilizan para proporcionar una ejecución más segura desde la interfaz de usuario. -Casi todas las restricciones solo se aplican a `SELECT`. Para el procesamiento de consultas distribuidas, las restricciones se aplican en cada servidor por separado. - -ClickHouse comprueba las restricciones para las partes de datos, no para cada fila. Significa que puede exceder el valor de restricción con el tamaño de la parte de datos. - -Restricciones en el “maximum amount of something” puede tomar el valor 0, lo que significa “unrestricted”. -La mayoría de las restricciones también tienen un ‘overflow_mode’ establecer, lo que significa qué hacer cuando se excede el límite. -Puede tomar uno de dos valores: `throw` o `break`. Las restricciones en la agregación (group_by_overflow_mode) también tienen el valor `any`. - -`throw` – Throw an exception (default). 
- -`break` – Stop executing the query and return the partial result, as if the source data ran out. - -`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but don't add new keys to the set. - -## Método de codificación de datos: {#settings_max_memory_usage} - -La cantidad máxima de RAM que se utiliza para ejecutar una consulta en un único servidor. - -En el archivo de configuración predeterminado, el máximo es de 10 GB. - -La configuración no tiene en cuenta el volumen de memoria disponible ni el volumen total de memoria en la máquina. -La restricción se aplica a una sola consulta dentro de un único servidor. -Usted puede utilizar `SHOW PROCESSLIST` para ver el consumo de memoria actual para cada consulta. -Además, el consumo máximo de memoria se rastrea para cada consulta y se escribe en el registro. - -El uso de memoria no se supervisa para los estados de ciertas funciones agregadas. - -El uso de memoria no se realiza un seguimiento completo de los estados de las funciones agregadas `min`, `max`, `any`, `anyLast`, `argMin`, `argMax` de `String` y `Array` argumento. - -El consumo de memoria también está restringido por los parámetros `max_memory_usage_for_user` y `max_memory_usage_for_all_queries`. - -## Max_memory_usage_for_user {#max-memory-usage-for-user} - -La cantidad máxima de RAM que se utilizará para ejecutar las consultas de un usuario en un único servidor. - -Los valores predeterminados se definen en [Configuración.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Core/Settings.h#L288). De forma predeterminada, el importe no está restringido (`max_memory_usage_for_user = 0`). - -Ver también la descripción de [Método de codificación de datos:](#settings_max_memory_usage). - -## Todos los derechos reservados {#max-memory-usage-for-all-queries} - -La cantidad máxima de RAM que se utilizará para ejecutar todas las consultas en un único servidor. - -Los valores predeterminados se definen en [Configuración.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Core/Settings.h#L289). De forma predeterminada, el importe no está restringido (`max_memory_usage_for_all_queries = 0`). - -Ver también la descripción de [Método de codificación de datos:](#settings_max_memory_usage). - -## ¿Qué puedes encontrar en Neodigit {#max-rows-to-read} - -Las siguientes restricciones se pueden verificar en cada bloque (en lugar de en cada fila). Es decir, las restricciones se pueden romper un poco. - -Un número máximo de filas que se pueden leer de una tabla al ejecutar una consulta. - -## ¿Qué puedes encontrar en Neodigit {#max-bytes-to-read} - -Un número máximo de bytes (datos sin comprimir) que se pueden leer de una tabla al ejecutar una consulta. - -## Método de codificación de datos: {#read-overflow-mode} - -Qué hacer cuando el volumen de datos leídos excede uno de los límites: ‘throw’ o ‘break’. Por defecto, throw. - -## Método de codificación de datos: {#settings-max-rows-to-group-by} - -Un número máximo de claves únicas recibidas de la agregación. Esta configuración le permite limitar el consumo de memoria al agregar. - -## Grupo_by_overflow_mode {#group-by-overflow-mode} - -Qué hacer cuando el número de claves únicas para la agregación excede el límite: ‘throw’, ‘break’, o ‘any’. Por defecto, throw. -Uso de la ‘any’ valor le permite ejecutar una aproximación de GROUP BY. La calidad de esta aproximación depende de la naturaleza estadística de los datos. 
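
A hedged sketch of requesting an approximate GROUP BY for one session (the limit value is arbitrary, and the `hits_100m_obfuscated` table and its `UserID` column are assumed from the hardware-test section above):

``` sql
-- Keep only the aggregation keys seen before the limit is reached;
-- 'any' turns the query into an approximate GROUP BY instead of failing it.
SET max_rows_to_group_by = 100000, group_by_overflow_mode = 'any';

SELECT UserID, count() AS hits
FROM hits_100m_obfuscated
GROUP BY UserID
ORDER BY hits DESC
LIMIT 10;
```
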
- -## max_bytes_before_external_group_by {#settings-max_bytes_before_external_group_by} - -Habilita o deshabilita la ejecución de `GROUP BY` en la memoria externa. Ver [GROUP BY en memoria externa](../../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory). - -Valores posibles: - -- Volumen máximo de RAM (en bytes) que puede ser utilizado por el único [GROUP BY](../../sql-reference/statements/select/group-by.md#select-group-by-clause) operación. -- 0 — `GROUP BY` en la memoria externa deshabilitada. - -Valor predeterminado: 0. - -## Método de codificación de datos: {#max-rows-to-sort} - -Un número máximo de filas antes de ordenar. Esto le permite limitar el consumo de memoria al ordenar. - -## Método de codificación de datos: {#max-bytes-to-sort} - -Un número máximo de bytes antes de ordenar. - -## sort_overflow_mode {#sort-overflow-mode} - -Qué hacer si el número de filas recibidas antes de ordenar excede uno de los límites: ‘throw’ o ‘break’. Por defecto, throw. - -## max_result_rows {#setting-max_result_rows} - -Límite en el número de filas en el resultado. También se comprueba si hay subconsultas y en servidores remotos cuando se ejecutan partes de una consulta distribuida. - -## max_result_bytes {#max-result-bytes} - -Límite en el número de bytes en el resultado. Lo mismo que el ajuste anterior. - -## result_overflow_mode {#result-overflow-mode} - -Qué hacer si el volumen del resultado excede uno de los límites: ‘throw’ o ‘break’. Por defecto, throw. - -Utilizar ‘break’ es similar a usar LIMIT. `Break` interrumpe la ejecución sólo en el nivel de bloque. Esto significa que la cantidad de filas devueltas es mayor que [max_result_rows](#setting-max_result_rows), múltiplo de [max_block_size](settings.md#setting-max_block_size) y depende de [max_threads](settings.md#settings-max_threads). - -Ejemplo: - -``` sql -SET max_threads = 3, max_block_size = 3333; -SET max_result_rows = 3334, result_overflow_mode = 'break'; - -SELECT * -FROM numbers_mt(100000) -FORMAT Null; -``` - -Resultado: - -``` text -6666 rows in set. ... -``` - -## max_execution_time {#max-execution-time} - -Tiempo máximo de ejecución de la consulta en segundos. -En este momento, no se comprueba una de las etapas de clasificación, o al fusionar y finalizar funciones agregadas. - -## timeout_overflow_mode {#timeout-overflow-mode} - -Qué hacer si la consulta se ejecuta más de ‘max_execution_time’: ‘throw’ o ‘break’. Por defecto, throw. - -## Método de codificación de datos: {#min-execution-speed} - -Velocidad de ejecución mínima en filas por segundo. Comprobado en cada bloque de datos cuando ‘timeout_before_checking_execution_speed’ expirar. Si la velocidad de ejecución es menor, se produce una excepción. - -## Todos los derechos reservados {#min-execution-speed-bytes} - -Un número mínimo de bytes de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout_before_checking_execution_speed’ expirar. Si la velocidad de ejecución es menor, se produce una excepción. - -## Max_execution_speed {#max-execution-speed} - -Un número máximo de filas de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout_before_checking_execution_speed’ expirar. Si la velocidad de ejecución es alta, la velocidad de ejecución se reducirá. - -## Max_execution_speed_bytes {#max-execution-speed-bytes} - -Un número máximo de bytes de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout_before_checking_execution_speed’ expirar. 
-
-## timeout_before_checking_execution_speed {#timeout-before-checking-execution-speed}
-
-Checks that the execution speed is not too slow (no less than ‘min_execution_speed’) after the specified time in seconds has expired.
-
-## max_columns_to_read {#max-columns-to-read}
-
-A maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception.
-
-## max_temporary_columns {#max-temporary-columns}
-
-A maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception.
-
-## max_temporary_non_const_columns {#max-temporary-non-const-columns}
-
-The same as ‘max_temporary_columns’, but without counting constant columns.
-Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources.
-
-## max_subquery_depth {#max-subquery-depth}
-
-Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. By default, 100.
-
-## max_pipeline_depth {#max-pipeline-depth}
-
-Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000.
-
-## max_ast_depth {#max-ast-depth}
-
-Maximum nesting depth of a query syntactic tree. If exceeded, an exception is thrown.
-At this time, it isn't checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000.
-
-## max_ast_elements {#max-ast-elements}
-
-A maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown.
-In the same way as the previous setting, it is checked only after parsing the query. By default, 50,000.
-
-## max_rows_in_set {#max-rows-in-set}
-
-A maximum number of rows for a data set in the IN clause created from a subquery.
-
-## max_bytes_in_set {#max-bytes-in-set}
-
-A maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery.
-
-## set_overflow_mode {#set-overflow-mode}
-
-What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw.
-
-## max_rows_in_distinct {#max-rows-in-distinct}
-
-A maximum number of different rows when using DISTINCT.
-
-## max_bytes_in_distinct {#max-bytes-in-distinct}
-
-A maximum number of bytes used by a hash table when using DISTINCT.
-
-## distinct_overflow_mode {#distinct-overflow-mode}
-
-What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw.
-
-## max_rows_to_transfer {#max-rows-to-transfer}
-
-A maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN.
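A minimal sketch of the transfer limit in practice, assuming a hypothetical distributed table `dist_hits` with `UserID` and `RegionID` columns (both names are illustrative); if the subquery result that has to be shipped for `GLOBAL IN` exceeds the limit, the query is handled according to `transfer_overflow_mode`, described below:

``` sql
-- Cap the number of rows the GLOBAL IN subquery may transfer to remote servers
-- or materialize in a temporary table.
SET max_rows_to_transfer = 1000000;

SELECT count()
FROM dist_hits
WHERE UserID GLOBAL IN (SELECT UserID FROM dist_hits WHERE RegionID = 42);
```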
-
-## max_bytes_to_transfer {#max-bytes-to-transfer}
-
-A maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN.
-
-## transfer_overflow_mode {#transfer-overflow-mode}
-
-What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw.
-
-## max_rows_in_join {#settings-max_rows_in_join}
-
-Limits the number of rows in the hash table that is used when joining tables.
-
-This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine.
-
-If a query contains multiple joins, ClickHouse checks this setting for every intermediate result.
-
-ClickHouse can proceed with different actions when the limit is reached. Use the [join_overflow_mode](#settings-join_overflow_mode) setting to choose the action.
-
-Possible values:
-
-- Positive integer.
-- 0 — Unlimited number of rows.
-
-Default value: 0.
-
-## max_bytes_in_join {#settings-max_bytes_in_join}
-
-Limits the size in bytes of the hash table used when joining tables.
-
-This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join table engine](../../engines/table-engines/special/join.md).
-
-If the query contains joins, ClickHouse checks this setting for every intermediate result.
-
-ClickHouse can proceed with different actions when the limit is reached. Use [join_overflow_mode](#settings-join_overflow_mode) to choose the action.
-
-Possible values:
-
-- Positive integer.
-- 0 — Memory control is disabled.
-
-Default value: 0.
-
-## join_overflow_mode {#settings-join_overflow_mode}
-
-Defines what action ClickHouse performs when any of the following join limits is reached:
-
-- [max_bytes_in_join](#settings-max_bytes_in_join)
-- [max_rows_in_join](#settings-max_rows_in_join)
-
-Possible values:
-
-- `THROW` — ClickHouse throws an exception and breaks operation.
-- `BREAK` — ClickHouse breaks operation and doesn't throw an exception.
-
-Default value: `THROW`.
-
-**See also**
-
-- [JOIN clause](../../sql-reference/statements/select/join.md#select-join)
-- [Join table engine](../../engines/table-engines/special/join.md)
-
-## max_partitions_per_insert_block {#max-partitions-per-insert-block}
-
-Limits the maximum number of partitions in a single inserted block.
-
-- Positive integer.
-- 0 — Unlimited number of partitions.
-
-Default value: 100.
-
-**Details**
-
-When inserting data, ClickHouse calculates the number of partitions in the inserted block. If the number of partitions is more than `max_partitions_per_insert_block`, ClickHouse throws an exception with the following text:
-
-> “Too many partitions for single INSERT block (more than” + toString(max_parts) + “). The limit is controlled by ‘max_partitions_per_insert_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast).
Partitions are intended for data manipulation (DROP PARTITION, etc).” - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/query_complexity/) diff --git a/docs/es/operations/settings/settings-profiles.md b/docs/es/operations/settings/settings-profiles.md deleted file mode 100644 index 3d96a2c8fba..00000000000 --- a/docs/es/operations/settings/settings-profiles.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 61 -toc_title: "Perfiles de configuraci\xF3n" ---- - -# Perfiles de configuración {#settings-profiles} - -Un perfil de configuración es una colección de configuraciones agrupadas con el mismo nombre. - -!!! note "Información" - ClickHouse también es compatible [Flujo de trabajo controlado por SQL](../access-rights.md#access-control) para administrar perfiles de configuración. Recomendamos usarlo. - -Un perfil puede tener cualquier nombre. El perfil puede tener cualquier nombre. Puede especificar el mismo perfil para diferentes usuarios. Lo más importante que puede escribir en el perfil de configuración es `readonly=1`, que asegura el acceso de sólo lectura. - -Los perfiles de configuración pueden heredar unos de otros. Para usar la herencia, indique una o varias `profile` configuraciones antes de las demás configuraciones que se enumeran en el perfil. En caso de que se defina una configuración en diferentes perfiles, se utiliza la última definida. - -Para aplicar todos los ajustes de un perfil, establezca el `profile` configuración. - -Ejemplo: - -Instale el `web` perfil. - -``` sql -SET profile = 'web' -``` - -Los perfiles de configuración se declaran en el archivo de configuración del usuario. Esto suele ser `users.xml`. - -Ejemplo: - -``` xml - - - - - - 8 - - - - - 1000000000 - 100000000000 - - 1000000 - any - - 1000000 - 1000000000 - - 100000 - 100000000 - break - - 600 - 1000000 - 15 - - 25 - 100 - 50 - - 2 - 25 - 50 - 100 - - 1 - - -``` - -El ejemplo especifica dos perfiles: `default` y `web`. - -El `default` tiene un propósito especial: siempre debe estar presente y se aplica al iniciar el servidor. En otras palabras, el `default` perfil contiene la configuración predeterminada. - -El `web` profile es un perfil regular que se puede establecer utilizando el `SET` consulta o utilizando un parámetro URL en una consulta HTTP. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/settings_profiles/) diff --git a/docs/es/operations/settings/settings-users.md b/docs/es/operations/settings/settings-users.md deleted file mode 100644 index 1c1ac7914f0..00000000000 --- a/docs/es/operations/settings/settings-users.md +++ /dev/null @@ -1,164 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 63 -toc_title: "Configuraci\xF3n del usuario" ---- - -# Configuración del usuario {#user-settings} - -El `users` sección de la `user.xml` el archivo de configuración contiene la configuración del usuario. - -!!! note "Información" - ClickHouse también es compatible [Flujo de trabajo controlado por SQL](../access-rights.md#access-control) para la gestión de usuarios. Recomendamos usarlo. - -Estructura del `users` apartado: - -``` xml - - - - - - - - 0|1 - - - - - profile_name - - default - - - - - expression - - - - - - -``` - -### user_name/contraseña {#user-namepassword} - -La contraseña se puede especificar en texto sin formato o en SHA256 (formato hexagonal). 
- -- Para asignar una contraseña en texto sin formato (**no se recomienda**), colóquelo en un `password` elemento. - - Por ejemplo, `qwerty`. La contraseña se puede dejar en blanco. - - - -- Para asignar una contraseña utilizando su hash SHA256, colóquela en un `password_sha256_hex` elemento. - - Por ejemplo, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. - - Ejemplo de cómo generar una contraseña desde el shell: - - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' - - La primera línea del resultado es la contraseña. La segunda línea es el hash SHA256 correspondiente. - - - -- Para la compatibilidad con los clientes MySQL, la contraseña se puede especificar en doble hash SHA1. Colóquelo en `password_double_sha1_hex` elemento. - - Por ejemplo, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. - - Ejemplo de cómo generar una contraseña desde el shell: - - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' - - La primera línea del resultado es la contraseña. La segunda línea es el hash SHA1 doble correspondiente. - -### access_management {#access_management-user-setting} - -Esta configuración habilita deshabilita el uso de [control de acceso y gestión de cuentas](../access-rights.md#access-control) para el usuario. - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -### user_name/redes {#user-namenetworks} - -Lista de redes desde las que el usuario puede conectarse al servidor ClickHouse. - -Cada elemento de la lista puede tener una de las siguientes formas: - -- `` — IP address or network mask. - - Ejemplos: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. - -- `` — Hostname. - - Ejemplo: `example01.host.ru`. - - Para comprobar el acceso, se realiza una consulta DNS y todas las direcciones IP devueltas se comparan con la dirección del mismo nivel. - -- `` — Regular expression for hostnames. - - Ejemplo, `^example\d\d-\d\d-\d\.host\.ru$` - - Para comprobar el acceso, un [Consulta de DNS PTR](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) se realiza para la dirección del mismo nivel y luego se aplica la expresión regular especificada. A continuación, se realiza otra consulta DNS para los resultados de la consulta PTR y todas las direcciones recibidas se comparan con la dirección del mismo nivel. Recomendamos encarecidamente que regexp termine con $ . - -Todos los resultados de las solicitudes DNS se almacenan en caché hasta que el servidor se reinicia. - -**Ejemplos** - -Para abrir el acceso del usuario desde cualquier red, especifique: - -``` xml -::/0 -``` - -!!! warning "Advertencia" - No es seguro abrir el acceso desde cualquier red a menos que tenga un firewall configurado correctamente o el servidor no esté conectado directamente a Internet. - -Para abrir el acceso solo desde localhost, especifique: - -``` xml -::1 -127.0.0.1 -``` - -### user_name/perfil {#user-nameprofile} - -Puede asignar un perfil de configuración para el usuario. Los perfiles de configuración se configuran en una sección separada del `users.xml` file. Para obtener más información, consulte [Perfiles de configuración](settings-profiles.md). - -### user_name/cuota {#user-namequota} - -Las cuotas le permiten realizar un seguimiento o limitar el uso de recursos durante un período de tiempo. 
Las cuotas se configuran en el `quotas` -sección de la `users.xml` archivo de configuración. - -Puede asignar un conjunto de cuotas para el usuario. Para obtener una descripción detallada de la configuración de las cuotas, consulte [Cuota](../quotas.md#quotas). - -### nombre_usuario/bases de datos {#user-namedatabases} - -En esta sección, puede limitar las filas devueltas por ClickHouse para `SELECT` consultas realizadas por el usuario actual, implementando así la seguridad básica a nivel de fila. - -**Ejemplo** - -La siguiente configuración obliga a que el usuario `user1` sólo puede ver las filas de `table1` como resultado de `SELECT` consultas, donde el valor de la `id` campo es 1000. - -``` xml - - - - - id = 1000 - - - - -``` - -El `filter` puede ser cualquier expresión que resulte en un [UInt8](../../sql-reference/data-types/int-uint.md)-tipo de valor. Por lo general, contiene comparaciones y operadores lógicos. Filas de `database_name.table1` donde los resultados del filtro a 0 no se devuelven para este usuario. El filtrado es incompatible con `PREWHERE` operaciones y desactiva `WHERE→PREWHERE` optimización. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/settings_users/) diff --git a/docs/es/operations/settings/settings.md b/docs/es/operations/settings/settings.md deleted file mode 100644 index 62511dd9fc0..00000000000 --- a/docs/es/operations/settings/settings.md +++ /dev/null @@ -1,1254 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Configuración {#settings} - -## distributed_product_mode {#distributed-product-mode} - -Cambia el comportamiento de [subconsultas distribuidas](../../sql-reference/operators/in.md). - -ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. - -Restricción: - -- Solo se aplica para las subconsultas IN y JOIN. -- Solo si la sección FROM utiliza una tabla distribuida que contiene más de un fragmento. -- Si la subconsulta se refiere a una tabla distribuida que contiene más de un fragmento. -- No se usa para un valor de tabla [remoto](../../sql-reference/table-functions/remote.md) función. - -Valores posibles: - -- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” salvedad). -- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` -- `global` — Replaces the `IN`/`JOIN` consulta con `GLOBAL IN`/`GLOBAL JOIN.` -- `allow` — Allows the use of these types of subqueries. - -## enable_optimize_predicate_expression {#enable-optimize-predicate-expression} - -Activa el pushdown de predicado en `SELECT` consulta. - -La extracción de predicados puede reducir significativamente el tráfico de red para consultas distribuidas. - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 1. - -Uso - -Considere las siguientes consultas: - -1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` -2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` - -Si `enable_optimize_predicate_expression = 1`, entonces el tiempo de ejecución de estas consultas es igual porque se aplica ClickHouse `WHERE` a la subconsulta al procesarla. 
- -Si `enable_optimize_predicate_expression = 0`, entonces el tiempo de ejecución de la segunda consulta es mucho más largo, porque el `WHERE` cláusula se aplica a todos los datos después de que finalice la subconsulta. - -## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries} - -Fuerza una consulta a una réplica obsoleta si los datos actualizados no están disponibles. Ver [Replicación](../../engines/table-engines/mergetree-family/replication.md). - -ClickHouse selecciona la más relevante de las réplicas obsoletas de la tabla. - -Se utiliza al realizar `SELECT` desde una tabla distribuida que apunta a tablas replicadas. - -De forma predeterminada, 1 (habilitado). - -## Fecha de nacimiento {#settings-force_index_by_date} - -Deshabilita la ejecución de consultas si el índice no se puede usar por fecha. - -Funciona con tablas de la familia MergeTree. - -Si `force_index_by_date=1`, ClickHouse comprueba si la consulta tiene una condición de clave de fecha que se puede usar para restringir intervalos de datos. Si no hay una condición adecuada, arroja una excepción. Sin embargo, no comprueba si la condición reduce la cantidad de datos a leer. Por ejemplo, la condición `Date != ' 2000-01-01 '` es aceptable incluso cuando coincide con todos los datos de la tabla (es decir, ejecutar la consulta requiere un escaneo completo). Para obtener más información acerca de los intervalos de datos en las tablas MergeTree, vea [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md). - -## force_primary_key {#force-primary-key} - -Deshabilita la ejecución de consultas si no es posible la indexación mediante la clave principal. - -Funciona con tablas de la familia MergeTree. - -Si `force_primary_key=1`, ClickHouse comprueba si la consulta tiene una condición de clave principal que se puede usar para restringir rangos de datos. Si no hay una condición adecuada, arroja una excepción. Sin embargo, no comprueba si la condición reduce la cantidad de datos a leer. Para obtener más información acerca de los intervalos de datos en las tablas MergeTree, consulte [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md). - -## Formato_esquema {#format-schema} - -Este parámetro es útil cuando se utilizan formatos que requieren una definición de esquema, como [Cap'n Proto](https://capnproto.org/) o [Protobuf](https://developers.google.com/protocol-buffers/). El valor depende del formato. - -## fsync_metadata {#fsync-metadata} - -Habilita o deshabilita [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) al escribir `.sql` file. Habilitado de forma predeterminada. - -Tiene sentido desactivarlo si el servidor tiene millones de pequeñas tablas que se crean y destruyen constantemente. - -## enable_http_compression {#settings-enable_http_compression} - -Habilita o deshabilita la compresión de datos en la respuesta a una solicitud HTTP. - -Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -## http_zlib_compression_level {#settings-http_zlib_compression_level} - -Establece el nivel de compresión de datos en la respuesta a una solicitud HTTP si [enable_http_compression = 1](#settings-enable_http_compression). - -Valores posibles: Números del 1 al 9. - -Valor predeterminado: 3. 
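To make the `force_index_by_date` check described earlier in this section concrete, here is a minimal sketch assuming a hypothetical MergeTree table `visits` whose partitioning/primary key uses an `EventDate` column; both the table and the column name are illustrative only:

``` sql
SET force_index_by_date = 1;

-- The date-key condition lets ClickHouse restrict the data ranges, so the query runs.
SELECT count() FROM visits WHERE EventDate = '2018-10-10';

-- No condition on the date key: instead of falling back to a full scan,
-- ClickHouse throws an exception.
SELECT count() FROM visits;
```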
- -## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress} - -Habilita o deshabilita la verificación de suma de comprobación al descomprimir los datos HTTP POST del cliente. Se usa solo para el formato de compresión nativa ClickHouse (no se usa con `gzip` o `deflate`). - -Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -## send_progress_in_http_headers {#settings-send_progress_in_http_headers} - -Habilita o deshabilita `X-ClickHouse-Progress` Encabezados de respuesta HTTP en `clickhouse-server` respuesta. - -Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -## Nombre de la red inalámbrica (SSID): {#setting-max_http_get_redirects} - -Limita el número máximo de saltos de redirección HTTP GET para [URL](../../engines/table-engines/special/url.md)-mesas de motor. La configuración se aplica a ambos tipos de tablas: las creadas por [CREATE TABLE](../../sql-reference/statements/create.md#create-table-query) consulta y por el [URL](../../sql-reference/table-functions/url.md) función de la tabla. - -Valores posibles: - -- Cualquier número entero positivo de saltos. -- 0 — No hops allowed. - -Valor predeterminado: 0. - -## Entrada_format_allow_errors_num {#settings-input_format_allow_errors_num} - -Establece el número máximo de errores aceptables al leer desde formatos de texto (CSV, TSV, etc.). - -El valor predeterminado es 0. - -Siempre emparejarlo con `input_format_allow_errors_ratio`. - -Si se produjo un error al leer filas, pero el contador de errores sigue siendo menor que `input_format_allow_errors_num`, ClickHouse ignora la fila y pasa a la siguiente. - -Si ambos `input_format_allow_errors_num` y `input_format_allow_errors_ratio` se exceden, ClickHouse lanza una excepción. - -## Entrada_format_allow_errors_ratio {#settings-input_format_allow_errors_ratio} - -Establece el porcentaje máximo de errores permitidos al leer desde formatos de texto (CSV, TSV, etc.). -El porcentaje de errores se establece como un número de punto flotante entre 0 y 1. - -El valor predeterminado es 0. - -Siempre emparejarlo con `input_format_allow_errors_num`. - -Si se produjo un error al leer filas, pero el contador de errores sigue siendo menor que `input_format_allow_errors_ratio`, ClickHouse ignora la fila y pasa a la siguiente. - -Si ambos `input_format_allow_errors_num` y `input_format_allow_errors_ratio` se exceden, ClickHouse lanza una excepción. - -## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} - -Habilita o deshabilita el analizador SQL completo si el analizador de secuencias rápidas no puede analizar los datos. Esta configuración sólo se utiliza para [Valor](../../interfaces/formats.md#data-format-values) formato en la inserción de datos. Para obtener más información sobre el análisis de sintaxis, consulte [Sintaxis](../../sql-reference/syntax.md) apartado. - -Valores posibles: - -- 0 — Disabled. - - En este caso, debe proporcionar datos con formato. Ver el [Formato](../../interfaces/formats.md) apartado. - -- 1 — Enabled. - - En este caso, puede usar una expresión SQL como valor, pero la inserción de datos es mucho más lenta de esta manera. 
Si inserta solo datos con formato, ClickHouse se comporta como si el valor de configuración fuera 0. - -Valor predeterminado: 1. - -Ejemplo de uso - -Inserte el [FechaHora](../../sql-reference/data-types/datetime.md) valor de tipo con los diferentes ajustes. - -``` sql -SET input_format_values_interpret_expressions = 0; -INSERT INTO datetime_t VALUES (now()) -``` - -``` text -Exception on client: -Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) -``` - -``` sql -SET input_format_values_interpret_expressions = 1; -INSERT INTO datetime_t VALUES (now()) -``` - -``` text -Ok. -``` - -La última consulta es equivalente a la siguiente: - -``` sql -SET input_format_values_interpret_expressions = 0; -INSERT INTO datetime_t SELECT now() -``` - -``` text -Ok. -``` - -## input_format_values_deduce_templates_of_expressions {#settings-input_format_values_deduce_templates_of_expressions} - -Habilita o deshabilita la deducción de plantilla para expresiones SQL en [Valor](../../interfaces/formats.md#data-format-values) formato. Permite analizar e interpretar expresiones en `Values` mucho más rápido si las expresiones en filas consecutivas tienen la misma estructura. ClickHouse intenta deducir la plantilla de una expresión, analizar las siguientes filas utilizando esta plantilla y evaluar la expresión en un lote de filas analizadas correctamente. - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 1. - -Para la siguiente consulta: - -``` sql -INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (upper('Values')), ... -``` - -- Si `input_format_values_interpret_expressions=1` y `format_values_deduce_templates_of_expressions=0`, las expresiones se interpretan por separado para cada fila (esto es muy lento para un gran número de filas). -- Si `input_format_values_interpret_expressions=0` y `format_values_deduce_templates_of_expressions=1`, las expresiones en la primera, segunda y tercera filas se analizan usando la plantilla `lower(String)` e interpretados juntos, la expresión en la cuarta fila se analiza con otra plantilla (`upper(String)`). -- Si `input_format_values_interpret_expressions=1` y `format_values_deduce_templates_of_expressions=1`, lo mismo que en el caso anterior, pero también permite la alternativa a la interpretación de expresiones por separado si no es posible deducir la plantilla. - -## Entrada_format_values_accurate_types_of_literals {#settings-input-format-values-accurate-types-of-literals} - -Esta configuración sólo se utiliza cuando `input_format_values_deduce_templates_of_expressions = 1`. Puede suceder que las expresiones para alguna columna tengan la misma estructura, pero contengan literales numéricos de diferentes tipos, por ejemplo - -``` sql -(..., abs(0), ...), -- UInt64 literal -(..., abs(3.141592654), ...), -- Float64 literal -(..., abs(-1), ...), -- Int64 literal -``` - -Valores posibles: - -- 0 — Disabled. - - In this case, ClickHouse may use a more general type for some literals (e.g., `Float64` o `Int64` en lugar de `UInt64` para `42`), pero puede causar problemas de desbordamiento y precisión. - -- 1 — Enabled. - - En este caso, ClickHouse comprueba el tipo real de literal y utiliza una plantilla de expresión del tipo correspondiente. En algunos casos, puede ralentizar significativamente la evaluación de expresiones en `Values`. - -Valor predeterminado: 1. 
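Continuing the `test` table example above, a minimal sketch of how the two settings are typically combined so that consecutive rows sharing the `lower(String)` template are parsed and evaluated as one batch:

``` sql
SET input_format_values_interpret_expressions = 0;
SET input_format_values_deduce_templates_of_expressions = 1;

-- All three expressions match the lower(String) template, so ClickHouse deduces
-- the template once and evaluates the batch of parsed rows together.
INSERT INTO test VALUES (lower('Hello')), (lower('World')), (lower('ClickHouse'));
```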
- -## Entrada_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} - -Al realizar `INSERT` consultas, reemplace los valores de columna de entrada omitidos con valores predeterminados de las columnas respectivas. Esta opción sólo se aplica a [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) y [TabSeparated](../../interfaces/formats.md#tabseparated) formato. - -!!! note "Nota" - Cuando esta opción está habilitada, los metadatos de la tabla extendida se envían del servidor al cliente. Consume recursos informáticos adicionales en el servidor y puede reducir el rendimiento. - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 1. - -## input_format_tsv_empty_as_default {#settings-input-format-tsv-empty-as-default} - -Cuando esté habilitado, reemplace los campos de entrada vacíos en TSV con valores predeterminados. Para expresiones predeterminadas complejas `input_format_defaults_for_omitted_fields` debe estar habilitado también. - -Deshabilitado de forma predeterminada. - -## input_format_null_as_default {#settings-input-format-null-as-default} - -Habilita o deshabilita el uso de valores predeterminados si los datos de entrada `NULL`, pero el tipo de datos de la columna correspondiente en no `Nullable(T)` (para formatos de entrada de texto). - -## input_format_skip_unknown_fields {#settings-input-format-skip-unknown-fields} - -Habilita o deshabilita omitir la inserción de datos adicionales. - -Al escribir datos, ClickHouse produce una excepción si los datos de entrada contienen columnas que no existen en la tabla de destino. Si la omisión está habilitada, ClickHouse no inserta datos adicionales y no lanza una excepción. - -Formatos soportados: - -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) -- [TSKV](../../interfaces/formats.md#tskv) - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -## Entrada_format_import_nested_json {#settings-input_format_import_nested_json} - -Habilita o deshabilita la inserción de datos JSON con objetos anidados. - -Formatos soportados: - -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -Ver también: - -- [Uso de estructuras anidadas](../../interfaces/formats.md#jsoneachrow-nested) con el `JSONEachRow` formato. - -## Entrada_format_with_names_use_header {#settings-input-format-with-names-use-header} - -Habilita o deshabilita la comprobación del orden de las columnas al insertar datos. - -Para mejorar el rendimiento de la inserción, se recomienda deshabilitar esta comprobación si está seguro de que el orden de columna de los datos de entrada es el mismo que en la tabla de destino. - -Formatos soportados: - -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 1. - -## Date_time_input_format {#settings-date_time_input_format} - -Permite elegir un analizador de la representación de texto de fecha y hora. - -La configuración no se aplica a [Funciones de fecha y hora](../../sql-reference/functions/date-time-functions.md). - -Valores posibles: - -- `'best_effort'` — Enables extended parsing. 
- - ClickHouse puede analizar el básico `YYYY-MM-DD HH:MM:SS` formato y todo [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) formatos de fecha y hora. Por ejemplo, `'2018-06-08T01:02:03.000Z'`. - -- `'basic'` — Use basic parser. - - ClickHouse puede analizar solo lo básico `YYYY-MM-DD HH:MM:SS` formato. Por ejemplo, `'2019-08-20 10:18:56'`. - -Valor predeterminado: `'basic'`. - -Ver también: - -- [Tipo de datos DateTime.](../../sql-reference/data-types/datetime.md) -- [Funciones para trabajar con fechas y horas.](../../sql-reference/functions/date-time-functions.md) - -## Por favor, introduzca su dirección de correo electrónico {#settings-join_default_strictness} - -Establece el rigor predeterminado para [Cláusulas JOIN](../../sql-reference/statements/select/join.md#select-join). - -Valores posibles: - -- `ALL` — If the right table has several matching rows, ClickHouse creates a [Producto cartesiano](https://en.wikipedia.org/wiki/Cartesian_product) de filas coincidentes. Esta es la normal `JOIN` comportamiento de SQL estándar. -- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` y `ALL` son los mismos. -- `ASOF` — For joining sequences with an uncertain match. -- `Empty string` — If `ALL` o `ANY` no se especifica en la consulta, ClickHouse produce una excepción. - -Valor predeterminado: `ALL`. - -## join_any_take_last_row {#settings-join_any_take_last_row} - -Cambia el comportamiento de las operaciones de unión con `ANY` rigor. - -!!! warning "Atención" - Esta configuración sólo se aplica a `JOIN` operaciones con [Unir](../../engines/table-engines/special/join.md) mesas de motores. - -Valores posibles: - -- 0 — If the right table has more than one matching row, only the first one found is joined. -- 1 — If the right table has more than one matching row, only the last one found is joined. - -Valor predeterminado: 0. - -Ver también: - -- [Cláusula JOIN](../../sql-reference/statements/select/join.md#select-join) -- [Unirse al motor de tabla](../../engines/table-engines/special/join.md) -- [Por favor, introduzca su dirección de correo electrónico](#settings-join_default_strictness) - -## Sistema abierto {#join_use_nulls} - -Establece el tipo de [JOIN](../../sql-reference/statements/select/join.md) comportamiento. Al fusionar tablas, pueden aparecer celdas vacías. ClickHouse los rellena de manera diferente según esta configuración. - -Valores posibles: - -- 0 — The empty cells are filled with the default value of the corresponding field type. -- 1 — `JOIN` se comporta de la misma manera que en SQL estándar. El tipo del campo correspondiente se convierte en [NULL](../../sql-reference/data-types/nullable.md#data_type-nullable), y las celdas vacías se llenan con [NULL](../../sql-reference/syntax.md). - -Valor predeterminado: 0. - -## max_block_size {#setting-max_block_size} - -En ClickHouse, los datos se procesan mediante bloques (conjuntos de partes de columna). Los ciclos de procesamiento interno para un solo bloque son lo suficientemente eficientes, pero hay gastos notables en cada bloque. El `max_block_size` set es una recomendación para el tamaño del bloque (en un recuento de filas) para cargar desde las tablas. El tamaño del bloque no debe ser demasiado pequeño, por lo que los gastos en cada bloque aún se notan, pero no demasiado grande para que la consulta con LIMIT que se complete después del primer bloque se procese rápidamente. 
El objetivo es evitar consumir demasiada memoria al extraer un gran número de columnas en múltiples subprocesos y preservar al menos alguna localidad de caché. - -Valor predeterminado: 65,536. - -Bloquea el tamaño de `max_block_size` no siempre se cargan desde la tabla. Si es obvio que se deben recuperar menos datos, se procesa un bloque más pequeño. - -## preferred_block_size_bytes {#preferred-block-size-bytes} - -Utilizado para el mismo propósito que `max_block_size`, pero establece el tamaño de bloque recomendado en bytes adaptándolo al número de filas en el bloque. -Sin embargo, el tamaño del bloque no puede ser más que `max_block_size` filas. -Por defecto: 1,000,000. Solo funciona cuando se lee desde los motores MergeTree. - -## merge_tree_min_rows_for_concurrent_read {#setting-merge-tree-min-rows-for-concurrent-read} - -Si el número de filas que se leerán de un fichero [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md) mesa excede `merge_tree_min_rows_for_concurrent_read` luego ClickHouse intenta realizar una lectura simultánea de este archivo en varios hilos. - -Valores posibles: - -- Cualquier entero positivo. - -Valor predeterminado: 163840. - -## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} - -Si el número de bytes a leer de un archivo de un [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md)-La tabla del motor excede `merge_tree_min_bytes_for_concurrent_read`, entonces ClickHouse intenta leer simultáneamente este archivo en varios subprocesos. - -Valor posible: - -- Cualquier entero positivo. - -Valor predeterminado: 251658240. - -## Método de codificación de datos: {#setting-merge-tree-min-rows-for-seek} - -Si la distancia entre dos bloques de datos que se leen en un archivo es menor que `merge_tree_min_rows_for_seek` filas, luego ClickHouse no busca a través del archivo, sino que lee los datos secuencialmente. - -Valores posibles: - -- Cualquier entero positivo. - -Valor predeterminado: 0. - -## merge_tree_min_bytes_for_seek {#setting-merge-tree-min-bytes-for-seek} - -Si la distancia entre dos bloques de datos que se leen en un archivo es menor que `merge_tree_min_bytes_for_seek` bytes, luego ClickHouse lee secuencialmente un rango de archivos que contiene ambos bloques, evitando así la búsqueda adicional. - -Valores posibles: - -- Cualquier entero positivo. - -Valor predeterminado: 0. - -## merge_tree_coarse_index_granularity {#setting-merge-tree-coarse-index-granularity} - -Al buscar datos, ClickHouse comprueba las marcas de datos en el archivo de índice. Si ClickHouse encuentra que las claves requeridas están en algún rango, divide este rango en `merge_tree_coarse_index_granularity` subintervalos y busca las claves necesarias allí de forma recursiva. - -Valores posibles: - -- Cualquier entero incluso positivo. - -Valor predeterminado: 8. - -## merge_tree_max_rows_to_use_cache {#setting-merge-tree-max-rows-to-use-cache} - -Si ClickHouse debería leer más de `merge_tree_max_rows_to_use_cache` en una consulta, no usa la memoria caché de bloques sin comprimir. - -La memoria caché de bloques sin comprimir almacena datos extraídos para consultas. ClickHouse utiliza esta memoria caché para acelerar las respuestas a pequeñas consultas repetidas. Esta configuración protege la memoria caché del deterioro de las consultas que leen una gran cantidad de datos. 
El [Uncompressed_cache_size](../server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) configuración del servidor define el tamaño de la memoria caché de bloques sin comprimir. - -Valores posibles: - -- Cualquier entero positivo. - -Default value: 128 ✕ 8192. - -## merge_tree_max_bytes_to_use_cache {#setting-merge-tree-max-bytes-to-use-cache} - -Si ClickHouse debería leer más de `merge_tree_max_bytes_to_use_cache` bytes en una consulta, no usa el caché de bloques sin comprimir. - -La memoria caché de bloques sin comprimir almacena datos extraídos para consultas. ClickHouse utiliza esta memoria caché para acelerar las respuestas a pequeñas consultas repetidas. Esta configuración protege la memoria caché del deterioro de las consultas que leen una gran cantidad de datos. El [Uncompressed_cache_size](../server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) configuración del servidor define el tamaño de la memoria caché de bloques sin comprimir. - -Valor posible: - -- Cualquier entero positivo. - -Valor predeterminado: 2013265920. - -## Todos los derechos reservados {#settings-min-bytes-to-use-direct-io} - -El volumen de datos mínimo necesario para utilizar el acceso directo de E/S al disco de almacenamiento. - -ClickHouse usa esta configuración al leer datos de tablas. Si el volumen total de almacenamiento de todos los datos a leer excede `min_bytes_to_use_direct_io` luego ClickHouse lee los datos del disco de almacenamiento con el `O_DIRECT` opcion. - -Valores posibles: - -- 0 — Direct I/O is disabled. -- Entero positivo. - -Valor predeterminado: 0. - -## Log_queries {#settings-log-queries} - -Configuración del registro de consultas. - -Las consultas enviadas a ClickHouse con esta configuración se registran de acuerdo con las reglas [query_log](../server-configuration-parameters/settings.md#server_configuration_parameters-query-log) parámetro de configuración del servidor. - -Ejemplo: - -``` text -log_queries=1 -``` - -## Nombre de la red inalámbrica (SSID): {#settings-log-queries-min-type} - -`query_log` tipo mínimo para iniciar sesión. - -Valores posibles: -- `QUERY_START` (`=1`) -- `QUERY_FINISH` (`=2`) -- `EXCEPTION_BEFORE_START` (`=3`) -- `EXCEPTION_WHILE_PROCESSING` (`=4`) - -Valor predeterminado: `QUERY_START`. - -Se puede usar para limitar a qué entiries va `query_log`, digamos que eres interesante solo en errores, entonces puedes usar `EXCEPTION_WHILE_PROCESSING`: - -``` text -log_queries_min_type='EXCEPTION_WHILE_PROCESSING' -``` - -## Log_query_threads {#settings-log-query-threads} - -Configuración del registro de subprocesos de consulta. - -Los subprocesos de consultas ejecutados por ClickHouse con esta configuración se registran de acuerdo con las reglas en el [Sistema abierto.](../server-configuration-parameters/settings.md#server_configuration_parameters-query-thread-log) parámetro de configuración del servidor. - -Ejemplo: - -``` text -log_query_threads=1 -``` - -## Max_insert_block_size {#settings-max_insert_block_size} - -El tamaño de los bloques a formar para su inserción en una tabla. -Esta configuración solo se aplica en los casos en que el servidor forma los bloques. -Por ejemplo, para un INSERT a través de la interfaz HTTP, el servidor analiza el formato de datos y forma bloques del tamaño especificado. -Pero al usar clickhouse-client, el cliente analiza los datos en sí, y el ‘max_insert_block_size’ configuración en el servidor no afecta el tamaño de los bloques insertados. 
-La configuración tampoco tiene un propósito cuando se usa INSERT SELECT , ya que los datos se insertan usando los mismos bloques que se forman después de SELECT . - -Valor predeterminado: 1.048.576. - -El valor predeterminado es ligeramente más que `max_block_size`. La razón de esto se debe a que ciertos motores de mesa (`*MergeTree`) formar una parte de datos en el disco para cada bloque insertado, que es una entidad bastante grande. Similar, `*MergeTree` las tablas ordenan los datos durante la inserción y un tamaño de bloque lo suficientemente grande permiten clasificar más datos en la RAM. - -## Nombre de la red inalámbrica (SSID): {#min-insert-block-size-rows} - -Establece el número mínimo de filas en el bloque que se pueden insertar en una tabla `INSERT` consulta. Los bloques de menor tamaño se aplastan en otros más grandes. - -Valores posibles: - -- Entero positivo. -- 0 — Squashing disabled. - -Valor predeterminado: 1048576. - -## Todos los derechos reservados {#min-insert-block-size-bytes} - -Establece el número mínimo de bytes en el bloque que se pueden insertar en una tabla `INSERT` consulta. Los bloques de menor tamaño se aplastan en otros más grandes. - -Valores posibles: - -- Entero positivo. -- 0 — Squashing disabled. - -Valor predeterminado: 268435456. - -## max_replica_delay_for_distributed_queries {#settings-max_replica_delay_for_distributed_queries} - -Deshabilita las réplicas rezagadas para consultas distribuidas. Ver [Replicación](../../engines/table-engines/mergetree-family/replication.md). - -Establece el tiempo en segundos. Si una réplica tiene un retraso superior al valor establecido, no se utiliza esta réplica. - -Valor predeterminado: 300. - -Se utiliza al realizar `SELECT` desde una tabla distribuida que apunta a tablas replicadas. - -## max_threads {#settings-max_threads} - -El número máximo de subprocesos de procesamiento de consultas, excluyendo subprocesos para recuperar datos de servidores ‘max_distributed_connections’ parámetro). - -Este parámetro se aplica a los subprocesos que realizan las mismas etapas de la canalización de procesamiento de consultas en paralelo. -Por ejemplo, al leer desde una tabla, si es posible evaluar expresiones con funciones, filtre con WHERE y preagregue para GROUP BY en paralelo usando al menos ‘max_threads’ número de hilos, entonces ‘max_threads’ se utilizan. - -Valor predeterminado: el número de núcleos de CPU físicos. - -Si normalmente se ejecuta menos de una consulta SELECT en un servidor a la vez, establezca este parámetro en un valor ligeramente inferior al número real de núcleos de procesador. - -Para las consultas que se completan rápidamente debido a un LIMIT, puede establecer un ‘max_threads’. Por ejemplo, si el número necesario de entradas se encuentra en cada bloque y max_threads = 8, entonces se recuperan 8 bloques, aunque hubiera sido suficiente leer solo uno. - -Cuanto menor sea el `max_threads` valor, menos memoria se consume. - -## Método de codificación de datos: {#settings-max-insert-threads} - -El número máximo de subprocesos para ejecutar el `INSERT SELECT` consulta. - -Valores posibles: - -- 0 (or 1) — `INSERT SELECT` sin ejecución paralela. -- Entero positivo. Más grande que 1. - -Valor predeterminado: 0. - -Paralelo `INSERT SELECT` sólo tiene efecto si el `SELECT` parte se ejecuta en paralelo, ver [max_threads](#settings-max_threads) configuración. -Los valores más altos conducirán a un mayor uso de memoria. 
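A short sketch of the parallel `INSERT SELECT` described above, using hypothetical `target` and `source` tables; the extra insert threads only pay off when the `SELECT` part itself runs in parallel, and higher values increase memory usage:

``` sql
-- Allow up to 4 threads for the INSERT stage; the SELECT stage is bounded by max_threads.
SET max_threads = 8, max_insert_threads = 4;

INSERT INTO target SELECT * FROM source;
```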
- -## max_compress_block_size {#max-compress-block-size} - -El tamaño máximo de bloques de datos sin comprimir antes de comprimir para escribir en una tabla. De forma predeterminada, 1.048.576 (1 MiB). Si se reduce el tamaño, la tasa de compresión se reduce significativamente, la velocidad de compresión y descompresión aumenta ligeramente debido a la localidad de la memoria caché, y se reduce el consumo de memoria. Por lo general, no hay ninguna razón para cambiar esta configuración. - -No confunda bloques para la compresión (un fragmento de memoria que consta de bytes) con bloques para el procesamiento de consultas (un conjunto de filas de una tabla). - -## Descripción del producto {#min-compress-block-size} - -Para [Método de codificación de datos:](../../engines/table-engines/mergetree-family/mergetree.md)" tabla. Para reducir la latencia al procesar consultas, un bloque se comprime al escribir la siguiente marca si su tamaño es al menos ‘min_compress_block_size’. De forma predeterminada, 65.536. - -El tamaño real del bloque, si los datos sin comprimir son menores que ‘max_compress_block_size’, no es menor que este valor y no menor que el volumen de datos para una marca. - -Veamos un ejemplo. Supongamos que ‘index_granularity’ se estableció en 8192 durante la creación de la tabla. - -Estamos escribiendo una columna de tipo UInt32 (4 bytes por valor). Al escribir 8192 filas, el total será de 32 KB de datos. Como min_compress_block_size = 65,536, se formará un bloque comprimido por cada dos marcas. - -Estamos escribiendo una columna URL con el tipo String (tamaño promedio de 60 bytes por valor). Al escribir 8192 filas, el promedio será ligeramente inferior a 500 KB de datos. Como esto es más de 65,536, se formará un bloque comprimido para cada marca. En este caso, al leer datos del disco en el rango de una sola marca, los datos adicionales no se descomprimirán. - -Por lo general, no hay ninguna razón para cambiar esta configuración. - -## max_query_size {#settings-max_query_size} - -La parte máxima de una consulta que se puede llevar a la RAM para analizar con el analizador SQL. -La consulta INSERT también contiene datos para INSERT que es procesado por un analizador de secuencias independiente (que consume O(1) RAM), que no está incluido en esta restricción. - -Valor predeterminado: 256 KiB. - -## interactive_delay {#interactive-delay} - -El intervalo en microsegundos para comprobar si la ejecución de la solicitud se ha cancelado y enviar el progreso. - -Valor predeterminado: 100.000 (comprueba la cancelación y envía el progreso diez veces por segundo). - -## ¿Cómo puedo hacerlo? {#connect-timeout-receive-timeout-send-timeout} - -Tiempos de espera en segundos en el socket utilizado para comunicarse con el cliente. - -Valor predeterminado: 10, 300, 300. - -## Cancel_http_readonly_queries_on_client_close {#cancel-http-readonly-queries-on-client-close} - -Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. - -Valor predeterminado: 0 - -## poll_interval {#poll-interval} - -Bloquear en un bucle de espera durante el número especificado de segundos. - -Valor predeterminado: 10. - -## max_distributed_connections {#max-distributed-connections} - -El número máximo de conexiones simultáneas con servidores remotos para el procesamiento distribuido de una única consulta a una única tabla distribuida. Se recomienda establecer un valor no menor que el número de servidores en el clúster. - -Valor predeterminado: 1024. 
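The current per-session values of the limits discussed above can be checked at runtime; a small sketch querying the `system.settings` table:

``` sql
-- Show the effective values of a few of the settings described in this section.
SELECT name, value
FROM system.settings
WHERE name IN ('max_query_size', 'interactive_delay', 'poll_interval', 'max_distributed_connections');
```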
- -Los siguientes parámetros solo se usan al crear tablas distribuidas (y al iniciar un servidor), por lo que no hay ninguna razón para cambiarlas en tiempo de ejecución. - -## Distributed_connections_pool_size {#distributed-connections-pool-size} - -El número máximo de conexiones simultáneas con servidores remotos para el procesamiento distribuido de todas las consultas a una única tabla distribuida. Se recomienda establecer un valor no menor que el número de servidores en el clúster. - -Valor predeterminado: 1024. - -## Conecte_timeout_with_failover_ms {#connect-timeout-with-failover-ms} - -El tiempo de espera en milisegundos para conectarse a un servidor remoto para un motor de tablas distribuidas ‘shard’ y ‘replica’ secciones se utilizan en la definición de clúster. -Si no tiene éxito, se realizan varios intentos para conectarse a varias réplicas. - -Valor predeterminado: 50. - -## connections_with_failover_max_tries {#connections-with-failover-max-tries} - -El número máximo de intentos de conexión con cada réplica para el motor de tablas distribuidas. - -Valor predeterminado: 3. - -## extremo {#extremes} - -Ya sea para contar valores extremos (los mínimos y máximos en columnas de un resultado de consulta). Acepta 0 o 1. De forma predeterminada, 0 (deshabilitado). -Para obtener más información, consulte la sección “Extreme values”. - -## Use_uncompressed_cache {#setting-use_uncompressed_cache} - -Si se debe usar una memoria caché de bloques sin comprimir. Acepta 0 o 1. De forma predeterminada, 0 (deshabilitado). -El uso de la memoria caché sin comprimir (solo para tablas de la familia MergeTree) puede reducir significativamente la latencia y aumentar el rendimiento cuando se trabaja con un gran número de consultas cortas. Habilite esta configuración para los usuarios que envían solicitudes cortas frecuentes. También preste atención al [Uncompressed_cache_size](../server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. - -Para consultas que leen al menos un volumen algo grande de datos (un millón de filas o más), la memoria caché sin comprimir se desactiva automáticamente para ahorrar espacio para consultas realmente pequeñas. Esto significa que puede mantener el ‘use_uncompressed_cache’ ajuste siempre establecido en 1. - -## Reemplazar_running_query {#replace-running-query} - -Cuando se utiliza la interfaz HTTP, el ‘query_id’ parámetro puede ser pasado. Se trata de cualquier cadena que sirva como identificador de consulta. -Si una consulta del mismo usuario ‘query_id’ que ya existe en este momento, el comportamiento depende de la ‘replace_running_query’ parámetro. - -`0` (default) – Throw an exception (don't allow the query to run if a query with the same ‘query_id’ ya se está ejecutando). - -`1` – Cancel the old query and start running the new one. - -El Yandex.Metrica utiliza este parámetro establecido en 1 para implementar sugerencias para las condiciones de segmentación. Después de ingresar el siguiente carácter, si la consulta anterior aún no ha finalizado, debe cancelarse. - -## Nombre de la red inalámbrica (SSID): {#stream-flush-interval-ms} - -Funciona para tablas con streaming en el caso de un tiempo de espera, o cuando un subproceso genera [Max_insert_block_size](#settings-max_insert_block_size) filas. 
- -El valor predeterminado es 7500. - -Cuanto menor sea el valor, más a menudo los datos se vacían en la tabla. Establecer el valor demasiado bajo conduce a un rendimiento deficiente. - -## load_balancing {#settings-load_balancing} - -Especifica el algoritmo de selección de réplicas que se utiliza para el procesamiento de consultas distribuidas. - -ClickHouse admite los siguientes algoritmos para elegir réplicas: - -- [Aleatorio](#load_balancing-random) (predeterminada) -- [Nombre de host más cercano](#load_balancing-nearest_hostname) -- [En orden](#load_balancing-in_order) -- [Primero o aleatorio](#load_balancing-first_or_random) - -### Aleatorio (por defecto) {#load_balancing-random} - -``` sql -load_balancing = random -``` - -El número de errores se cuenta para cada réplica. La consulta se envía a la réplica con el menor número de errores, y si hay varios de estos, a cualquiera de ellos. -Desventajas: La proximidad del servidor no se tiene en cuenta; si las réplicas tienen datos diferentes, también obtendrá datos diferentes. - -### Nombre de host más cercano {#load_balancing-nearest_hostname} - -``` sql -load_balancing = nearest_hostname -``` - -The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the server's hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames). - -Por ejemplo, example01-01-1 y example01-01-2.yandex.ru son diferentes en una posición, mientras que example01-01-1 y example01-02-2 difieren en dos lugares. -Este método puede parecer primitivo, pero no requiere datos externos sobre la topología de red, y no compara las direcciones IP, lo que sería complicado para nuestras direcciones IPv6. - -Por lo tanto, si hay réplicas equivalentes, se prefiere la más cercana por nombre. -También podemos suponer que al enviar una consulta al mismo servidor, en ausencia de fallas, una consulta distribuida también irá a los mismos servidores. Por lo tanto, incluso si se colocan datos diferentes en las réplicas, la consulta devolverá principalmente los mismos resultados. - -### En orden {#load_balancing-in_order} - -``` sql -load_balancing = in_order -``` - -Se accede a las réplicas con el mismo número de errores en el mismo orden en que se especifican en la configuración. -Este método es apropiado cuando se sabe exactamente qué réplica es preferible. - -### Primero o aleatorio {#load_balancing-first_or_random} - -``` sql -load_balancing = first_or_random -``` - -Este algoritmo elige la primera réplica del conjunto o una réplica aleatoria si la primera no está disponible. Es efectivo en configuraciones de topología de replicación cruzada, pero inútil en otras configuraciones. - -El `first_or_random` resuelve el problema del algoritmo `in_order` algoritmo. Con `in_order`, si una réplica se cae, la siguiente obtiene una carga doble mientras que las réplicas restantes manejan la cantidad habitual de tráfico. Cuando se utiliza el `first_or_random` algoritmo, la carga se distribuye uniformemente entre las réplicas que todavía están disponibles. 
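Switching between the replica-selection algorithms above is an ordinary session setting; for example:

``` sql
-- Prefer the replica whose hostname differs least from this server's hostname.
SET load_balancing = 'nearest_hostname';

-- Or walk the replicas strictly in the order they are listed in the cluster configuration.
SET load_balancing = 'in_order';
```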
- -## prefer_localhost_replica {#settings-prefer-localhost-replica} - -Habilita/deshabilita el uso preferible de la réplica localhost al procesar consultas distribuidas. - -Valores posibles: - -- 1 — ClickHouse always sends a query to the localhost replica if it exists. -- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#settings-load_balancing) configuración. - -Valor predeterminado: 1. - -!!! warning "Advertencia" - Deshabilite esta configuración si usa [max_parallel_replicas](#settings-max_parallel_replicas). - -## totals_mode {#totals-mode} - -Cómo calcular TOTALS cuando HAVING está presente, así como cuando max_rows_to_group_by y group_by_overflow_mode = ‘any’ están presentes. -Vea la sección “WITH TOTALS modifier”. - -## totals_auto_threshold {#totals-auto-threshold} - -El umbral para `totals_mode = 'auto'`. -Vea la sección “WITH TOTALS modifier”. - -## max_parallel_replicas {#settings-max_parallel_replicas} - -El número máximo de réplicas para cada fragmento al ejecutar una consulta. -Para obtener coherencia (para obtener diferentes partes de la misma división de datos), esta opción solo funciona cuando se establece la clave de muestreo. -El retraso de réplica no está controlado. - -## compilar {#compile} - -Habilitar la compilación de consultas. De forma predeterminada, 0 (deshabilitado). - -La compilación solo se usa para parte de la canalización de procesamiento de consultas: para la primera etapa de agregación (GROUP BY). -Si se compiló esta parte de la canalización, la consulta puede ejecutarse más rápido debido a la implementación de ciclos cortos y a las llamadas de función agregadas en línea. La mejora del rendimiento máximo (hasta cuatro veces más rápido en casos excepcionales) se ve para consultas con múltiples funciones agregadas simples. Por lo general, la ganancia de rendimiento es insignificante. En casos muy raros, puede ralentizar la ejecución de la consulta. - -## min_count_to_compile {#min-count-to-compile} - -¿Cuántas veces usar potencialmente un fragmento de código compilado antes de ejecutar la compilación? Por defecto, 3. -For testing, the value can be set to 0: compilation runs synchronously and the query waits for the end of the compilation process before continuing execution. For all other cases, use values ​​starting with 1. Compilation normally takes about 5-10 seconds. -Si el valor es 1 o más, la compilación se produce de forma asíncrona en un subproceso independiente. El resultado se utilizará tan pronto como esté listo, incluidas las consultas que se están ejecutando actualmente. - -Se requiere código compilado para cada combinación diferente de funciones agregadas utilizadas en la consulta y el tipo de claves en la cláusula GROUP BY. -The results of the compilation are saved in the build directory in the form of .so files. There is no restriction on the number of compilation results since they don't use very much space. Old results will be used after server restarts, except in the case of a server upgrade – in this case, the old results are deleted. - -## output_format_json_quote_64bit_integers {#session_settings-output_format_json_quote_64bit_integers} - -Si el valor es true, los enteros aparecen entre comillas cuando se usan los formatos JSON\* Int64 y UInt64 (por compatibilidad con la mayoría de las implementaciones de JavaScript); de lo contrario, los enteros se generan sin las comillas. - -## Formato_csv_delimiter {#settings-format_csv_delimiter} - -El carácter interpretado como un delimitador en los datos CSV. 
De forma predeterminada, el delimitador es `,`. - -## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null} - -Para el formato de entrada CSV, habilita o deshabilita el análisis de `NULL` como literal (sinónimo de `\N`). - -## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line} - -Utilice el separador de línea de estilo DOS / Windows (CRLF) en CSV en lugar de estilo Unix (LF). - -## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line} - -Utilice el separador de línea de estilo DOC / Windows (CRLF) en TSV en lugar del estilo Unix (LF). - -## insert_quorum {#settings-insert_quorum} - -Habilita las escrituras de quórum. - -- Si `insert_quorum < 2`, las escrituras de quórum están deshabilitadas. -- Si `insert_quorum >= 2`, las escrituras de quórum están habilitadas. - -Valor predeterminado: 0. - -Quorum escribe - -`INSERT` solo tiene éxito cuando ClickHouse logra escribir correctamente datos en el `insert_quorum` de réplicas durante el `insert_quorum_timeout`. Si por alguna razón el número de réplicas con escrituras exitosas no alcanza el `insert_quorum`, la escritura se considera fallida y ClickHouse eliminará el bloque insertado de todas las réplicas donde los datos ya se han escrito. - -Todas las réplicas del quórum son consistentes, es decir, contienen datos de todas las réplicas anteriores `INSERT` consulta. El `INSERT` la secuencia está linealizada. - -Al leer los datos escritos desde el `insert_quorum` usted puede utilizar el [select_sequential_consistency](#settings-select_sequential_consistency) opcion. - -ClickHouse genera una excepción - -- Si el número de réplicas disponibles en el momento de la consulta es `insert_quorum`. -- En un intento de escribir datos cuando el bloque anterior aún no se ha insertado en el `insert_quorum` de réplicas. Esta situación puede ocurrir si el usuario intenta realizar una `INSERT` antes de la anterior con el `insert_quorum` se ha completado. - -Ver también: - -- [insert_quorum_timeout](#settings-insert_quorum_timeout) -- [select_sequential_consistency](#settings-select_sequential_consistency) - -## insert_quorum_timeout {#settings-insert_quorum_timeout} - -Escribir en tiempo de espera de quórum en segundos. Si el tiempo de espera ha pasado y aún no se ha realizado ninguna escritura, ClickHouse generará una excepción y el cliente debe repetir la consulta para escribir el mismo bloque en la misma réplica o en cualquier otra réplica. - -Valor predeterminado: 60 segundos. - -Ver también: - -- [insert_quorum](#settings-insert_quorum) -- [select_sequential_consistency](#settings-select_sequential_consistency) - -## select_sequential_consistency {#settings-select_sequential_consistency} - -Habilita o deshabilita la coherencia secuencial para `SELECT` consulta: - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 0. - -Uso - -Cuando se habilita la coherencia secuencial, ClickHouse permite al cliente ejecutar el `SELECT` consulta sólo para aquellas réplicas que contienen datos de todas las `INSERT` consultas ejecutadas con `insert_quorum`. Si el cliente hace referencia a una réplica parcial, ClickHouse generará una excepción. La consulta SELECT no incluirá datos que aún no se hayan escrito en el quórum de réplicas. 
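-
-A minimal sketch of the quorum workflow described above (illustrative only; `test.replicated` is a hypothetical ReplicatedMergeTree table with at least two replicas):
-
-``` sql
-SET insert_quorum = 2;                   -- the INSERT succeeds only after 2 replicas confirm the write
-INSERT INTO test.replicated VALUES (1, 'a');
-
-SET select_sequential_consistency = 1;   -- read only from replicas that already contain all quorum writes
-SELECT count() FROM test.replicated;
-```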
- -Ver también: - -- [insert_quorum](#settings-insert_quorum) -- [insert_quorum_timeout](#settings-insert_quorum_timeout) - -## insert_deduplicate {#settings-insert-deduplicate} - -Habilita o deshabilita la desduplicación de bloques `INSERT` (para tablas replicadas\* - -Valores posibles: - -- 0 — Disabled. -- 1 — Enabled. - -Valor predeterminado: 1. - -De forma predeterminada, los bloques insertados en tablas replicadas `INSERT` declaración se deduplican (ver [Replicación de datos](../../engines/table-engines/mergetree-family/replication.md)). - -## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate-blocks-in-dependent-materialized-views} - -Habilita o deshabilita la comprobación de desduplicación para las vistas materializadas que reciben datos de tablas replicadas\*. - -Valores posibles: - - 0 — Disabled. - 1 — Enabled. - -Valor predeterminado: 0. - -Uso - -De forma predeterminada, la desduplicación no se realiza para las vistas materializadas, sino que se realiza en sentido ascendente, en la tabla de origen. -Si se omite un bloque INSERTed debido a la desduplicación en la tabla de origen, no habrá inserción en las vistas materializadas adjuntas. Este comportamiento existe para permitir la inserción de datos altamente agregados en vistas materializadas, para los casos en que los bloques insertados son los mismos después de la agregación de vistas materializadas pero derivados de diferentes INSERT en la tabla de origen. -Al mismo tiempo, este comportamiento “breaks” `INSERT` idempotencia. Si una `INSERT` en la mesa principal fue exitoso y `INSERT` into a materialized view failed (e.g. because of communication failure with Zookeeper) a client will get an error and can retry the operation. However, the materialized view won't receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` permite cambiar este comportamiento. Al reintentar, una vista materializada recibirá la inserción de repetición y realizará la comprobación de desduplicación por sí misma, -ignorando el resultado de la comprobación para la tabla de origen, e insertará filas perdidas debido a la primera falla. - -## Método de codificación de datos: {#settings-max-network-bytes} - -Limita el volumen de datos (en bytes) que se recibe o se transmite a través de la red al ejecutar una consulta. Esta configuración se aplica a cada consulta individual. - -Valores posibles: - -- Entero positivo. -- 0 — Data volume control is disabled. - -Valor predeterminado: 0. - -## Método de codificación de datos: {#settings-max-network-bandwidth} - -Limita la velocidad del intercambio de datos a través de la red en bytes por segundo. Esta configuración se aplica a todas las consultas. - -Valores posibles: - -- Entero positivo. -- 0 — Bandwidth control is disabled. - -Valor predeterminado: 0. - -## Todos los derechos reservados {#settings-max-network-bandwidth-for-user} - -Limita la velocidad del intercambio de datos a través de la red en bytes por segundo. Esta configuración se aplica a todas las consultas que se ejecutan simultáneamente realizadas por un único usuario. - -Valores posibles: - -- Entero positivo. -- 0 — Control of the data speed is disabled. - -Valor predeterminado: 0. - -## Todos los derechos reservados {#settings-max-network-bandwidth-for-all-users} - -Limita la velocidad a la que se intercambian datos a través de la red en bytes por segundo. 
Esta configuración se aplica a todas las consultas que se ejecutan simultáneamente en el servidor. - -Valores posibles: - -- Entero positivo. -- 0 — Control of the data speed is disabled. - -Valor predeterminado: 0. - -## count_distinct_implementation {#settings-count_distinct_implementation} - -Especifica cuál de las `uniq*` se deben utilizar para realizar el [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference.md#agg_function-count) construcción. - -Valores posibles: - -- [uniq](../../sql-reference/aggregate-functions/reference.md#agg_function-uniq) -- [uniqCombined](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined) -- [UniqCombined64](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined64) -- [uniqHLL12](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqhll12) -- [uniqExact](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqexact) - -Valor predeterminado: `uniqExact`. - -## skip_unavailable_shards {#settings-skip_unavailable_shards} - -Habilita o deshabilita la omisión silenciosa de fragmentos no disponibles. - -El fragmento se considera no disponible si todas sus réplicas no están disponibles. Una réplica no está disponible en los siguientes casos: - -- ClickHouse no puede conectarse a la réplica por ningún motivo. - - Al conectarse a una réplica, ClickHouse realiza varios intentos. Si todos estos intentos fallan, la réplica se considera que no está disponible. - -- La réplica no se puede resolver a través de DNS. - - Si el nombre de host de la réplica no se puede resolver a través de DNS, puede indicar las siguientes situaciones: - - - El host de Replica no tiene registro DNS. Puede ocurrir en sistemas con DNS dinámico, por ejemplo, [Kubernetes](https://kubernetes.io), donde los nodos pueden ser irresolubles durante el tiempo de inactividad, y esto no es un error. - - - Error de configuración. El archivo de configuración de ClickHouse contiene un nombre de host incorrecto. - -Valores posibles: - -- 1 — skipping enabled. - - Si un fragmento no está disponible, ClickHouse devuelve un resultado basado en datos parciales y no informa de problemas de disponibilidad de nodos. - -- 0 — skipping disabled. - - Si un fragmento no está disponible, ClickHouse produce una excepción. - -Valor predeterminado: 0. - -## Optize_skip_unused_shards {#settings-optimize_skip_unused_shards} - -Habilita o deshabilita la omisión de fragmentos no utilizados para las consultas SELECT que tienen la condición de clave de fragmentación en PREWHERE / WHERE (supone que los datos se distribuyen mediante clave de fragmentación, de lo contrario no hacer nada). - -Valor predeterminado: 0 - -## Fuerza_optimize_skip_unused_shards {#settings-force_optimize_skip_unused_shards} - -Habilita o deshabilita la ejecución de consultas si [`optimize_skip_unused_shards`](#settings-optimize_skip_unused_shards) no es posible omitir fragmentos no utilizados. Si la omisión no es posible y la configuración está habilitada, se lanzará una excepción. 
- -Valores posibles: - -- 0 - Discapacitados (no lanza) -- 1: deshabilite la ejecución de consultas solo si la tabla tiene una clave de fragmentación -- 2: deshabilita la ejecución de consultas independientemente de que se haya definido la clave de fragmentación para la tabla - -Valor predeterminado: 0 - -## Optize_throw_if_noop {#setting-optimize_throw_if_noop} - -Habilita o deshabilita el lanzamiento de una excepción [OPTIMIZE](../../sql-reference/statements/misc.md#misc_operations-optimize) la consulta no realizó una fusión. - -Predeterminada, `OPTIMIZE` devuelve con éxito incluso si no hizo nada. Esta configuración le permite diferenciar estas situaciones y obtener el motivo en un mensaje de excepción. - -Valores posibles: - -- 1 — Throwing an exception is enabled. -- 0 — Throwing an exception is disabled. - -Valor predeterminado: 0. - -## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life} - -- Tipo: segundos -- Valor predeterminado: 60 segundos - -Controla la rapidez con la que se ponen a cero los errores en las tablas distribuidas. Si una réplica no está disponible durante algún tiempo, acumula 5 errores y distribut_replica_error_half_life se establece en 1 segundo, la réplica se considera normal 3 segundos después del último error. - -Ver también: - -- [Motor de tabla distribuido](../../engines/table-engines/special/distributed.md) -- [distributed_replica_error_cap](#settings-distributed_replica_error_cap) - -## distributed_replica_error_cap {#settings-distributed_replica_error_cap} - -- Tipo: unsigned int -- Valor predeterminado: 1000 - -El recuento de errores de cada réplica está limitado a este valor, lo que impide que una sola réplica acumule demasiados errores. - -Ver también: - -- [Motor de tabla distribuido](../../engines/table-engines/special/distributed.md) -- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life) - -## Distributed_directory_monitor_sleep_time_ms {#distributed_directory_monitor_sleep_time_ms} - -Intervalo base para el [Distribuido](../../engines/table-engines/special/distributed.md) motor de tabla para enviar datos. El intervalo real crece exponencialmente en caso de errores. - -Valores posibles: - -- Un número entero positivo de milisegundos. - -Valor predeterminado: 100 milisegundos. - -## Distributed_directory_monitor_max_sleep_time_ms {#distributed_directory_monitor_max_sleep_time_ms} - -Intervalo máximo para el [Distribuido](../../engines/table-engines/special/distributed.md) motor de tabla para enviar datos. Limita el crecimiento exponencial del intervalo establecido en el [Distributed_directory_monitor_sleep_time_ms](#distributed_directory_monitor_sleep_time_ms) configuración. - -Valores posibles: - -- Un número entero positivo de milisegundos. - -Valor predeterminado: 30000 milisegundos (30 segundos). - -## distributed_directory_monitor_batch_inserts {#distributed_directory_monitor_batch_inserts} - -Habilita/deshabilita el envío de datos insertados en lotes. - -Cuando el envío por lotes está habilitado, el [Distribuido](../../engines/table-engines/special/distributed.md) El motor de tabla intenta enviar varios archivos de datos insertados en una operación en lugar de enviarlos por separado. El envío por lotes mejora el rendimiento del clúster al utilizar mejor los recursos del servidor y de la red. - -Valores posibles: - -- 1 — Enabled. -- 0 — Disabled. - -Valor predeterminado: 0. 
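-
-To illustrate the shard-pruning settings described above, a hedged sketch; `dist_hits` is a hypothetical Distributed table sharded by `user_id`:
-
-``` sql
-SET optimize_skip_unused_shards = 1;        -- prune shards using the sharding-key condition in WHERE
-SET force_optimize_skip_unused_shards = 2;  -- throw instead of silently querying every shard
-SELECT count() FROM dist_hits WHERE user_id = 42;
-```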
- -## os_thread_priority {#setting-os-thread-priority} - -Establece la prioridad ([agradable](https://en.wikipedia.org/wiki/Nice_(Unix))) para subprocesos que ejecutan consultas. El programador del sistema operativo considera esta prioridad al elegir el siguiente hilo para ejecutar en cada núcleo de CPU disponible. - -!!! warning "Advertencia" - Para utilizar esta configuración, debe establecer el `CAP_SYS_NICE` capacidad. El `clickhouse-server` paquete lo configura durante la instalación. Algunos entornos virtuales no le permiten establecer `CAP_SYS_NICE` capacidad. En este caso, `clickhouse-server` muestra un mensaje al respecto al principio. - -Valores posibles: - -- Puede establecer valores en el rango `[-20, 19]`. - -Los valores más bajos significan mayor prioridad. Hilos con bajo `nice` Los valores de prioridad se ejecutan con más frecuencia que los subprocesos con valores altos. Los valores altos son preferibles para consultas no interactivas de larga ejecución porque les permite renunciar rápidamente a recursos en favor de consultas interactivas cortas cuando llegan. - -Valor predeterminado: 0. - -## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns} - -Establece el período para un temporizador de reloj real del [perfilador de consultas](../../operations/optimizing-performance/sampling-query-profiler.md). El temporizador de reloj real cuenta el tiempo del reloj de pared. - -Valores posibles: - -- Número entero positivo, en nanosegundos. - - Valores recomendados: - - - 10000000 (100 times a second) nanoseconds and less for single queries. - - 1000000000 (once a second) for cluster-wide profiling. - -- 0 para apagar el temporizador. - -Tipo: [UInt64](../../sql-reference/data-types/int-uint.md). - -Valor predeterminado: 1000000000 nanosegundos (una vez por segundo). - -Ver también: - -- Tabla del sistema [trace_log](../../operations/system-tables.md#system_tables-trace_log) - -## Los resultados de la prueba {#query_profiler_cpu_time_period_ns} - -Establece el período para un temporizador de reloj de CPU [perfilador de consultas](../../operations/optimizing-performance/sampling-query-profiler.md). Este temporizador solo cuenta el tiempo de CPU. - -Valores posibles: - -- Un número entero positivo de nanosegundos. - - Valores recomendados: - - - 10000000 (100 times a second) nanoseconds and more for single queries. - - 1000000000 (once a second) for cluster-wide profiling. - -- 0 para apagar el temporizador. - -Tipo: [UInt64](../../sql-reference/data-types/int-uint.md). - -Valor predeterminado: 1000000000 nanosegundos. - -Ver también: - -- Tabla del sistema [trace_log](../../operations/system-tables.md#system_tables-trace_log) - -## allow_introspection_functions {#settings-allow_introspection_functions} - -Habilita deshabilita [funciones de introspecciones](../../sql-reference/functions/introspection.md) para la creación de perfiles de consultas. - -Valores posibles: - -- 1 — Introspection functions enabled. -- 0 — Introspection functions disabled. - -Valor predeterminado: 0. - -**Ver también** - -- [Analizador de consultas de muestreo](../optimizing-performance/sampling-query-profiler.md) -- Tabla del sistema [trace_log](../../operations/system-tables.md#system_tables-trace_log) - -## input_format_parallel_parsing {#input-format-parallel-parsing} - -- Tipo: bool -- Valor predeterminado: True - -Habilitar el análisis paralelo de los formatos de datos para preservar el orden. Solo se admite para los formatos TSV, TKSV, CSV y JSONEachRow. 
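-
-Putting the profiler and introspection settings from the previous sections together, a hedged sketch (it assumes the `trace_log` section is enabled in the server configuration; the profiled query is arbitrary):
-
-``` sql
-SET query_profiler_real_time_period_ns = 10000000;   -- sample the query about 100 times per second
-SET allow_introspection_functions = 1;
-
-SELECT count() FROM numbers(100000000) WHERE number % 3 = 1;   -- any workload to profile
-
-SELECT arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS stack
-FROM system.trace_log
-ORDER BY event_time DESC
-LIMIT 1;
-```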
- -## También puede utilizar los siguientes métodos de envío: {#min-chunk-bytes-for-parallel-parsing} - -- Tipo: unsigned int -- Valor predeterminado: 1 MiB - -El tamaño mínimo de fragmento en bytes, que cada subproceso analizará en paralelo. - -## Sistema abierto {#settings-output_format_avro_codec} - -Establece el códec de compresión utilizado para el archivo Avro de salida. - -Tipo: cadena - -Valores posibles: - -- `null` — No compression -- `deflate` — Compress with Deflate (zlib) -- `snappy` — Compress with [Rápido](https://google.github.io/snappy/) - -Valor predeterminado: `snappy` (si está disponible) o `deflate`. - -## Sistema abierto {#settings-output_format_avro_sync_interval} - -Establece el tamaño mínimo de datos (en bytes) entre los marcadores de sincronización para el archivo Avro de salida. - -Tipo: unsigned int - -Valores posibles: 32 (32 bytes) - 1073741824 (1 GiB) - -Valor predeterminado: 32768 (32 KiB) - -## Todos los derechos reservados {#settings-format_avro_schema_registry_url} - -Establece la URL del Registro de esquemas confluentes para usar con [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) formato - -Tipo: URL - -Valor predeterminado: Vacío - -## background_pool_size {#background_pool_size} - -Establece el número de subprocesos que realizan operaciones en segundo plano en motores de tabla (por ejemplo, fusiona [Motor MergeTree](../../engines/table-engines/mergetree-family/index.md) tabla). Esta configuración se aplica al inicio del servidor ClickHouse y no se puede cambiar en una sesión de usuario. Al ajustar esta configuración, puede administrar la carga de la CPU y el disco. Un tamaño de grupo más pequeño utiliza menos recursos de CPU y disco, pero los procesos en segundo plano avanzan más lentamente, lo que eventualmente podría afectar el rendimiento de la consulta. - -Valores posibles: - -- Cualquier entero positivo. - -Valor predeterminado: 16. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/settings/settings/) diff --git a/docs/es/operations/system-tables.md b/docs/es/operations/system-tables.md deleted file mode 100644 index 18e7f7227da..00000000000 --- a/docs/es/operations/system-tables.md +++ /dev/null @@ -1,1168 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 52 -toc_title: Tablas del sistema ---- - -# Tablas del sistema {#system-tables} - -Las tablas del sistema se utilizan para implementar parte de la funcionalidad del sistema y para proporcionar acceso a información sobre cómo funciona el sistema. -No puede eliminar una tabla del sistema (pero puede realizar DETACH). -Las tablas del sistema no tienen archivos con datos en el disco o archivos con metadatos. El servidor crea todas las tablas del sistema cuando se inicia. -Las tablas del sistema son de solo lectura. -Están ubicados en el ‘system’ base. - -## sistema.asynchronous_metrics {#system_tables-asynchronous_metrics} - -Contiene métricas que se calculan periódicamente en segundo plano. Por ejemplo, la cantidad de RAM en uso. - -Columna: - -- `metric` ([Cadena](../sql-reference/data-types/string.md)) — Metric name. -- `value` ([Float64](../sql-reference/data-types/float.md)) — Metric value. 
- -**Ejemplo** - -``` sql -SELECT * FROM system.asynchronous_metrics LIMIT 10 -``` - -``` text -┌─metric──────────────────────────────────┬──────value─┐ -│ jemalloc.background_thread.run_interval │ 0 │ -│ jemalloc.background_thread.num_runs │ 0 │ -│ jemalloc.background_thread.num_threads │ 0 │ -│ jemalloc.retained │ 422551552 │ -│ jemalloc.mapped │ 1682989056 │ -│ jemalloc.resident │ 1656446976 │ -│ jemalloc.metadata_thp │ 0 │ -│ jemalloc.metadata │ 10226856 │ -│ UncompressedCacheCells │ 0 │ -│ MarkCacheFiles │ 0 │ -└─────────────────────────────────────────┴────────────┘ -``` - -**Ver también** - -- [Monitoreo](monitoring.md) — Base concepts of ClickHouse monitoring. -- [sistema.métricas](#system_tables-metrics) — Contains instantly calculated metrics. -- [sistema.evento](#system_tables-events) — Contains a number of events that have occurred. -- [sistema.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. - -## sistema.Cluster {#system-clusters} - -Contiene información sobre los clústeres disponibles en el archivo de configuración y los servidores que contienen. - -Columna: - -- `cluster` (String) — The cluster name. -- `shard_num` (UInt32) — The shard number in the cluster, starting from 1. -- `shard_weight` (UInt32) — The relative weight of the shard when writing data. -- `replica_num` (UInt32) — The replica number in the shard, starting from 1. -- `host_name` (String) — The host name, as specified in the config. -- `host_address` (String) — The host IP address obtained from DNS. -- `port` (UInt16) — The port to use for connecting to the server. -- `user` (String) — The name of the user for connecting to the server. -- `errors_count` (UInt32): número de veces que este host no pudo alcanzar la réplica. -- `estimated_recovery_time` (UInt32): quedan segundos hasta que el recuento de errores de réplica se ponga a cero y se considere que vuelve a la normalidad. - -Tenga en cuenta que `errors_count` se actualiza una vez por consulta al clúster, pero `estimated_recovery_time` se vuelve a calcular bajo demanda. Entonces podría haber un caso distinto de cero `errors_count` y cero `estimated_recovery_time`, esa próxima consulta será cero `errors_count` e intente usar la réplica como si no tuviera errores. - -**Ver también** - -- [Motor de tabla distribuido](../engines/table-engines/special/distributed.md) -- [distributed_replica_error_cap configuración](settings/settings.md#settings-distributed_replica_error_cap) -- [distributed_replica_error_half_life configuración](settings/settings.md#settings-distributed_replica_error_half_life) - -## sistema.columna {#system-columns} - -Contiene información sobre las columnas de todas las tablas. - -Puede utilizar esta tabla para obtener información similar a la [DESCRIBE TABLE](../sql-reference/statements/misc.md#misc-describe-table) consulta, pero para varias tablas a la vez. - -El `system.columns` tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): - -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) para el valor predeterminado, o una cadena vacía si no está definida. -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. 
-- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. - -## sistema.colaborador {#system-contributors} - -Contiene información sobre los colaboradores. Todos los constributores en orden aleatorio. El orden es aleatorio en el momento de la ejecución de la consulta. - -Columna: - -- `name` (String) — Contributor (author) name from git log. - -**Ejemplo** - -``` sql -SELECT * FROM system.contributors LIMIT 10 -``` - -``` text -┌─name─────────────┐ -│ Olga Khvostikova │ -│ Max Vetrov │ -│ LiuYangkuan │ -│ svladykin │ -│ zamulla │ -│ Šimon Podlipský │ -│ BayoNet │ -│ Ilya Khomutov │ -│ Amy Krishnevsky │ -│ Loud_Scream │ -└──────────────────┘ -``` - -Para descubrirlo en la tabla, use una consulta: - -``` sql -SELECT * FROM system.contributors WHERE name='Olga Khvostikova' -``` - -``` text -┌─name─────────────┐ -│ Olga Khvostikova │ -└──────────────────┘ -``` - -## sistema.base {#system-databases} - -Esta tabla contiene una sola columna String llamada ‘name’ – the name of a database. -Cada base de datos que el servidor conoce tiene una entrada correspondiente en la tabla. -Esta tabla del sistema se utiliza para implementar el `SHOW DATABASES` consulta. - -## sistema.detached_parts {#system_tables-detached_parts} - -Contiene información sobre piezas separadas de [Método de codificación de datos:](../engines/table-engines/mergetree-family/mergetree.md) tabla. El `reason` columna especifica por qué se separó la pieza. Para las piezas separadas por el usuario, el motivo está vacío. Tales partes se pueden unir con [ALTER TABLE ATTACH PARTITION\|PART](../sql-reference/statements/alter.md#alter_attach-partition) comando. Para obtener la descripción de otras columnas, consulte [sistema.parte](#system_tables-parts). Si el nombre de la pieza no es válido, los valores de algunas columnas pueden ser `NULL`. Tales partes se pueden eliminar con [ALTER TABLE DROP DETACHED PART](../sql-reference/statements/alter.md#alter_drop-detached). - -## sistema.diccionario {#system_tables-dictionaries} - -Contiene información sobre [diccionarios externos](../sql-reference/dictionaries/external-dictionaries/external-dicts.md). - -Columna: - -- `database` ([Cadena](../sql-reference/data-types/string.md)) — Name of the database containing the dictionary created by DDL query. Empty string for other dictionaries. -- `name` ([Cadena](../sql-reference/data-types/string.md)) — [Nombre del diccionario](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md). -- `status` ([Enum8](../sql-reference/data-types/enum.md)) — Dictionary status. Possible values: - - `NOT_LOADED` — Dictionary was not loaded because it was not used. - - `LOADED` — Dictionary loaded successfully. - - `FAILED` — Unable to load the dictionary as a result of an error. - - `LOADING` — Dictionary is loading now. 
- - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../sql-reference/statements/system.md#query_language-system-reload-dictionary) consulta, tiempo de espera, configuración del diccionario ha cambiado). - - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. -- `origin` ([Cadena](../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([Cadena](../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Almacenamiento de diccionarios en la memoria](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). -- `key` — [Tipo de llave](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key): Clave numérica ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([Cadena](../sql-reference/data-types/string.md)) — form “(type 1, type 2, …, type n)”. -- `attribute.names` ([Matriz](../sql-reference/data-types/array.md)([Cadena](../sql-reference/data-types/string.md))) — Array of [nombres de atributos](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) proporcionada por el diccionario. -- `attribute.types` ([Matriz](../sql-reference/data-types/array.md)([Cadena](../sql-reference/data-types/string.md))) — Corresponding array of [tipos de atributos](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) que son proporcionados por el diccionario. -- `bytes_allocated` ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) — Amount of RAM allocated for the dictionary. -- `query_count` ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of queries since the dictionary was loaded or since the last successful reboot. -- `hit_rate` ([Float64](../sql-reference/data-types/float.md)) — For cache dictionaries, the percentage of uses for which the value was in the cache. -- `element_count` ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of items stored in the dictionary. -- `load_factor` ([Float64](../sql-reference/data-types/float.md)) — Percentage filled in the dictionary (for a hashed dictionary, the percentage filled in the hash table). -- `source` ([Cadena](../sql-reference/data-types/string.md)) — Text describing the [fuente de datos](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) para el diccionario. -- `lifetime_min` ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [vida](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) del diccionario en la memoria, después de lo cual ClickHouse intenta volver a cargar el diccionario (si `invalidate_query` está configurado, entonces solo si ha cambiado). Establecer en segundos. -- `lifetime_max` ([UInt64](../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [vida](../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) del diccionario en la memoria, después de lo cual ClickHouse intenta volver a cargar el diccionario (si `invalidate_query` está configurado, entonces solo si ha cambiado). Establecer en segundos. -- `loading_start_time` ([FechaHora](../sql-reference/data-types/datetime.md)) — Start time for loading the dictionary. 
-- `last_successful_update_time` ([FechaHora](../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with external sources and investigate causes. -- `loading_duration` ([Float32](../sql-reference/data-types/float.md)) — Duration of a dictionary loading. -- `last_exception` ([Cadena](../sql-reference/data-types/string.md)) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn't be created. - -**Ejemplo** - -Configurar el diccionario. - -``` sql -CREATE DICTIONARY dictdb.dict -( - `key` Int64 DEFAULT -1, - `value_default` String DEFAULT 'world', - `value_expression` String DEFAULT 'xxx' EXPRESSION 'toString(127 * 172)' -) -PRIMARY KEY key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'dicttbl' DB 'dictdb')) -LIFETIME(MIN 0 MAX 1) -LAYOUT(FLAT()) -``` - -Asegúrese de que el diccionario esté cargado. - -``` sql -SELECT * FROM system.dictionaries -``` - -``` text -┌─database─┬─name─┬─status─┬─origin──────┬─type─┬─key────┬─attribute.names──────────────────────┬─attribute.types─────┬─bytes_allocated─┬─query_count─┬─hit_rate─┬─element_count─┬───────────load_factor─┬─source─────────────────────┬─lifetime_min─┬─lifetime_max─┬──loading_start_time─┌──last_successful_update_time─┬──────loading_duration─┬─last_exception─┐ -│ dictdb │ dict │ LOADED │ dictdb.dict │ Flat │ UInt64 │ ['value_default','value_expression'] │ ['String','String'] │ 74032 │ 0 │ 1 │ 1 │ 0.0004887585532746823 │ ClickHouse: dictdb.dicttbl │ 0 │ 1 │ 2020-03-04 04:17:34 │ 2020-03-04 04:30:34 │ 0.002 │ │ -└──────────┴──────┴────────┴─────────────┴──────┴────────┴──────────────────────────────────────┴─────────────────────┴─────────────────┴─────────────┴──────────┴───────────────┴───────────────────────┴────────────────────────────┴──────────────┴──────────────┴─────────────────────┴──────────────────────────────┘───────────────────────┴────────────────┘ -``` - -## sistema.evento {#system_tables-events} - -Contiene información sobre el número de eventos que se han producido en el sistema. Por ejemplo, en la tabla, puede encontrar cuántos `SELECT` las consultas se procesaron desde que se inició el servidor ClickHouse. - -Columna: - -- `event` ([Cadena](../sql-reference/data-types/string.md)) — Event name. -- `value` ([UInt64](../sql-reference/data-types/int-uint.md)) — Number of events occurred. -- `description` ([Cadena](../sql-reference/data-types/string.md)) — Event description. - -**Ejemplo** - -``` sql -SELECT * FROM system.events LIMIT 5 -``` - -``` text -┌─event─────────────────────────────────┬─value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Query │ 12 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │ -│ SelectQuery │ 8 │ Same as Query, but only for SELECT queries. │ -│ FileOpen │ 73 │ Number of files opened. │ -│ ReadBufferFromFileDescriptorRead │ 155 │ Number of reads (read/pread) from a file descriptor. Does not include sockets. │ -│ ReadBufferFromFileDescriptorReadBytes │ 9931 │ Number of bytes read from file descriptors. 
If the file is compressed, this will show the compressed data size. │ -└───────────────────────────────────────┴───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -**Ver también** - -- [sistema.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [sistema.métricas](#system_tables-metrics) — Contains instantly calculated metrics. -- [sistema.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [Monitoreo](monitoring.md) — Base concepts of ClickHouse monitoring. - -## sistema.función {#system-functions} - -Contiene información sobre funciones normales y agregadas. - -Columna: - -- `name`(`String`) – The name of the function. -- `is_aggregate`(`UInt8`) — Whether the function is aggregate. - -## sistema.graphite_retentions {#system-graphite-retentions} - -Contiene información sobre los parámetros [graphite_rollup](server-configuration-parameters/settings.md#server_configuration_parameters-graphite) que se utilizan en tablas con [\*GraphiteMergeTree](../engines/table-engines/mergetree-family/graphitemergetree.md) motor. - -Columna: - -- `config_name` (Cadena) - `graphite_rollup` nombre del parámetro. -- `regexp` (Cadena) - Un patrón para el nombre de la métrica. -- `function` (String) - El nombre de la función de agregación. -- `age` (UInt64) - La edad mínima de los datos en segundos. -- `precision` (UInt64) - Cómo definir con precisión la edad de los datos en segundos. -- `priority` (UInt16) - Prioridad de patrón. -- `is_default` (UInt8) - Si el patrón es el predeterminado. -- `Tables.database` (Array(String)) - Matriz de nombres de tablas de base de datos que utilizan `config_name` parámetro. -- `Tables.table` (Array(String)) - Matriz de nombres de tablas que utilizan `config_name` parámetro. - -## sistema.fusionar {#system-merges} - -Contiene información sobre fusiones y mutaciones de piezas actualmente en proceso para tablas de la familia MergeTree. - -Columna: - -- `database` (String) — The name of the database the table is in. -- `table` (String) — Table name. -- `elapsed` (Float64) — The time elapsed (in seconds) since the merge started. -- `progress` (Float64) — The percentage of completed work from 0 to 1. -- `num_parts` (UInt64) — The number of pieces to be merged. -- `result_part_name` (String) — The name of the part that will be formed as the result of merging. -- `is_mutation` (UInt8) - 1 si este proceso es una mutación parte. -- `total_size_bytes_compressed` (UInt64) — The total size of the compressed data in the merged chunks. -- `total_size_marks` (UInt64) — The total number of marks in the merged parts. -- `bytes_read_uncompressed` (UInt64) — Number of bytes read, uncompressed. -- `rows_read` (UInt64) — Number of rows read. -- `bytes_written_uncompressed` (UInt64) — Number of bytes written, uncompressed. -- `rows_written` (UInt64) — Number of rows written. - -## sistema.métricas {#system_tables-metrics} - -Contiene métricas que pueden calcularse instantáneamente o tener un valor actual. Por ejemplo, el número de consultas procesadas simultáneamente o el retraso de réplica actual. Esta tabla está siempre actualizada. - -Columna: - -- `metric` ([Cadena](../sql-reference/data-types/string.md)) — Metric name. 
-- `value` ([Int64](../sql-reference/data-types/int-uint.md)) — Metric value. -- `description` ([Cadena](../sql-reference/data-types/string.md)) — Metric description. - -La lista de métricas admitidas que puede encontrar en el [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) archivo fuente de ClickHouse. - -**Ejemplo** - -``` sql -SELECT * FROM system.metrics LIMIT 10 -``` - -``` text -┌─metric─────────────────────┬─value─┬─description──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Query │ 1 │ Number of executing queries │ -│ Merge │ 0 │ Number of executing background merges │ -│ PartMutation │ 0 │ Number of mutations (ALTER DELETE/UPDATE) │ -│ ReplicatedFetch │ 0 │ Number of data parts being fetched from replicas │ -│ ReplicatedSend │ 0 │ Number of data parts being sent to replicas │ -│ ReplicatedChecks │ 0 │ Number of data parts checking for consistency │ -│ BackgroundPoolTask │ 0 │ Number of active tasks in BackgroundProcessingPool (merges, mutations, fetches, or replication queue bookkeeping) │ -│ BackgroundSchedulePoolTask │ 0 │ Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc. │ -│ DiskSpaceReservedForMerge │ 0 │ Disk space reserved for currently running background merges. It is slightly more than the total size of currently merging parts. │ -│ DistributedSend │ 0 │ Number of connections to remote servers sending data that was INSERTed into Distributed tables. Both synchronous and asynchronous mode. │ -└────────────────────────────┴───────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -**Ver también** - -- [sistema.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [sistema.evento](#system_tables-events) — Contains a number of events that occurred. -- [sistema.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [Monitoreo](monitoring.md) — Base concepts of ClickHouse monitoring. - -## sistema.metric_log {#system_tables-metric_log} - -Contiene el historial de valores de métricas de tablas `system.metrics` y `system.events`, periódicamente enjuagado al disco. -Para activar la recopilación de historial de métricas en `system.metric_log`, crear `/etc/clickhouse-server/config.d/metric_log.xml` con el siguiente contenido: - -``` xml - - - system - metric_log
-        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
-        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
-    </metric_log>
-</yandex>
-``` - -**Ejemplo** - -``` sql -SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical; -``` - -``` text -Row 1: -────── -event_date: 2020-02-18 -event_time: 2020-02-18 07:15:33 -milliseconds: 554 -ProfileEvent_Query: 0 -ProfileEvent_SelectQuery: 0 -ProfileEvent_InsertQuery: 0 -ProfileEvent_FileOpen: 0 -ProfileEvent_Seek: 0 -ProfileEvent_ReadBufferFromFileDescriptorRead: 1 -ProfileEvent_ReadBufferFromFileDescriptorReadFailed: 0 -ProfileEvent_ReadBufferFromFileDescriptorReadBytes: 0 -ProfileEvent_WriteBufferFromFileDescriptorWrite: 1 -ProfileEvent_WriteBufferFromFileDescriptorWriteFailed: 0 -ProfileEvent_WriteBufferFromFileDescriptorWriteBytes: 56 -... -CurrentMetric_Query: 0 -CurrentMetric_Merge: 0 -CurrentMetric_PartMutation: 0 -CurrentMetric_ReplicatedFetch: 0 -CurrentMetric_ReplicatedSend: 0 -CurrentMetric_ReplicatedChecks: 0 -... -``` - -**Ver también** - -- [sistema.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [sistema.evento](#system_tables-events) — Contains a number of events that occurred. -- [sistema.métricas](#system_tables-metrics) — Contains instantly calculated metrics. -- [Monitoreo](monitoring.md) — Base concepts of ClickHouse monitoring. - -## sistema.numero {#system-numbers} - -Esta tabla contiene una única columna UInt64 llamada ‘number’ que contiene casi todos los números naturales a partir de cero. -Puede usar esta tabla para pruebas, o si necesita hacer una búsqueda de fuerza bruta. -Las lecturas de esta tabla no están paralelizadas. - -## sistema.Números_mt {#system-numbers-mt} - -Lo mismo que ‘system.numbers’ pero las lecturas están paralelizadas. Los números se pueden devolver en cualquier orden. -Se utiliza para pruebas. - -## sistema.una {#system-one} - -Esta tabla contiene una sola fila con una ‘dummy’ Columna UInt8 que contiene el valor 0. -Esta tabla se utiliza si una consulta SELECT no especifica la cláusula FROM. -Esto es similar a la tabla DUAL que se encuentra en otros DBMS. - -## sistema.parte {#system_tables-parts} - -Contiene información sobre partes de [Método de codificación de datos:](../engines/table-engines/mergetree-family/mergetree.md) tabla. - -Cada fila describe una parte de datos. - -Columna: - -- `partition` (String) – The partition name. To learn what a partition is, see the description of the [ALTER](../sql-reference/statements/alter.md#query_language_queries_alter) consulta. - - Formato: - - - `YYYYMM` para la partición automática por mes. - - `any_string` al particionar manualmente. - -- `name` (`String`) – Name of the data part. - -- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it's used in a table. Otherwise, it's deleted. Inactive data parts remain after merging. - -- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` por la granularidad del índice (generalmente 8192) (esta sugerencia no funciona para la granularidad adaptativa). - -- `rows` (`UInt64`) – The number of rows. - -- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. - -- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. - -- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. - -- `marks_bytes` (`UInt64`) – The size of the file with marks. 
- -- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.\| - -- `remove_time` (`DateTime`) – The time when the data part became inactive. - -- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. - -- `min_date` (`Date`) – The minimum value of the date key in the data part. - -- `max_date` (`Date`) – The maximum value of the date key in the data part. - -- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. - -- `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. - -- `partition_id` (`String`) – ID of the partition. - -- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. - -- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. - -- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. - -- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). - -- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. - -- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. - -- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../sql-reference/statements/alter.md#alter_freeze-partition) - -- `database` (`String`) – Name of the database. - -- `table` (`String`) – Name of the table. - -- `engine` (`String`) – Name of the table engine without parameters. - -- `path` (`String`) – Absolute path to the folder with data part files. - -- `disk` (`String`) – Name of a disk that stores the data part. - -- `hash_of_all_files` (`String`) – [sipHash128](../sql-reference/functions/hash-functions.md#hash_functions-siphash128) de archivos comprimidos. - -- `hash_of_uncompressed_files` (`String`) – [sipHash128](../sql-reference/functions/hash-functions.md#hash_functions-siphash128) de archivos sin comprimir (archivos con marcas, archivo de índice, etc.). - -- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../sql-reference/functions/hash-functions.md#hash_functions-siphash128) de datos en los archivos comprimidos como si estuvieran descomprimidos. - -- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. - -- `marks_size` (`UInt64`) – Alias for `marks_bytes`. - -## sistema.part_log {#system_tables-part-log} - -El `system.part_log` se crea sólo si el [part_log](server-configuration-parameters/settings.md#server_configuration_parameters-part-log) se especifica la configuración del servidor. - -Esta tabla contiene información sobre eventos que ocurrieron con [partes de datos](../engines/table-engines/mergetree-family/custom-partitioning-key.md) en el [Método de codificación de datos:](../engines/table-engines/mergetree-family/mergetree.md) tablas familiares, como agregar o fusionar datos. - -El `system.part_log` contiene las siguientes columnas: - -- `event_type` (Enum) — Type of the event that occurred with the data part. Can have one of the following values: - - `NEW_PART` — Inserting of a new data part. 
- - `MERGE_PARTS` — Merging of data parts. - - `DOWNLOAD_PART` — Downloading a data part. - - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../sql-reference/statements/alter.md#alter_detach-partition). - - `MUTATE_PART` — Mutating of a data part. - - `MOVE_PART` — Moving the data part from the one disk to another one. -- `event_date` (Date) — Event date. -- `event_time` (DateTime) — Event time. -- `duration_ms` (UInt64) — Duration. -- `database` (String) — Name of the database the data part is in. -- `table` (String) — Name of the table the data part is in. -- `part_name` (String) — Name of the data part. -- `partition_id` (String) — ID of the partition that the data part was inserted to. The column takes the ‘all’ valor si la partición es por `tuple()`. -- `rows` (UInt64) — The number of rows in the data part. -- `size_in_bytes` (UInt64) — Size of the data part in bytes. -- `merged_from` (Array(String)) — An array of names of the parts which the current part was made up from (after the merge). -- `bytes_uncompressed` (UInt64) — Size of uncompressed bytes. -- `read_rows` (UInt64) — The number of rows was read during the merge. -- `read_bytes` (UInt64) — The number of bytes was read during the merge. -- `error` (UInt16) — The code number of the occurred error. -- `exception` (String) — Text message of the occurred error. - -El `system.part_log` se crea después de la primera inserción de datos `MergeTree` tabla. - -## sistema.procesa {#system_tables-processes} - -Esta tabla del sistema se utiliza para implementar el `SHOW PROCESSLIST` consulta. - -Columna: - -- `user` (String) – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the `default` usuario. El campo contiene el nombre de usuario para una consulta específica, no para una consulta que esta consulta inició. -- `address` (String) – The IP address the request was made from. The same for distributed processing. To track where a distributed query was originally made from, look at `system.processes` en el servidor de solicitud de consulta. -- `elapsed` (Float64) – The time in seconds since request execution started. -- `rows_read` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. -- `bytes_read` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. -- `total_rows_approx` (UInt64) – The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. -- `memory_usage` (UInt64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [Método de codificación de datos:](../operations/settings/query-complexity.md#settings_max_memory_usage) configuración. -- `query` (String) – The query text. For `INSERT`, no incluye los datos para insertar. -- `query_id` (String) – Query ID, if defined. - -## sistema.text_log {#system_tables-text_log} - -Contiene entradas de registro. El nivel de registro que va a esta tabla se puede limitar con `text_log.level` configuración del servidor. - -Columna: - -- `event_date` (`Date`) - Fecha de la entrada. -- `event_time` (`DateTime`) - Hora de la entrada. 
-- `microseconds` (`UInt32`) - Microsegundos de la entrada. -- `thread_name` (String) — Name of the thread from which the logging was done. -- `thread_id` (UInt64) — OS thread ID. -- `level` (`Enum8`) - Nivel de entrada. - - `'Fatal' = 1` - - `'Critical' = 2` - - `'Error' = 3` - - `'Warning' = 4` - - `'Notice' = 5` - - `'Information' = 6` - - `'Debug' = 7` - - `'Trace' = 8` -- `query_id` (`String`) - ID de la consulta. -- `logger_name` (`LowCardinality(String)`) - Name of the logger (i.e. `DDLWorker`) -- `message` (`String`) - El mensaje en sí. -- `revision` (`UInt32`) - Revisión de ClickHouse. -- `source_file` (`LowCardinality(String)`) - Archivo de origen desde el que se realizó el registro. -- `source_line` (`UInt64`) - Línea de origen desde la que se realizó el registro. - -## sistema.query_log {#system_tables-query_log} - -Contiene información sobre la ejecución de consultas. Para cada consulta, puede ver la hora de inicio del procesamiento, la duración del procesamiento, los mensajes de error y otra información. - -!!! note "Nota" - La tabla no contiene datos de entrada para `INSERT` consulta. - -ClickHouse crea esta tabla sólo si el [query_log](server-configuration-parameters/settings.md#server_configuration_parameters-query-log) se especifica el parámetro server. Este parámetro establece las reglas de registro, como el intervalo de registro o el nombre de la tabla en la que se registrarán las consultas. - -Para habilitar el registro de consultas, [Log_queries](settings/settings.md#settings-log-queries) parámetro a 1. Para obtener más información, consulte el [Configuración](settings/settings.md) apartado. - -El `system.query_log` tabla registra dos tipos de consultas: - -1. Consultas iniciales ejecutadas directamente por el cliente. -2. Consultas secundarias iniciadas por otras consultas (para la ejecución de consultas distribuidas). Para estos tipos de consultas, la información sobre las consultas principales se muestra en el `initial_*` columna. - -Columna: - -- `type` (`Enum8`) — Type of event that occurred when executing the query. Values: - - `'QueryStart' = 1` — Successful start of query execution. - - `'QueryFinish' = 2` — Successful end of query execution. - - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. - - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. -- `event_date` (Date) — Query starting date. -- `event_time` (DateTime) — Query starting time. -- `query_start_time` (DateTime) — Start time of query execution. -- `query_duration_ms` (UInt64) — Duration of query execution. -- `read_rows` (UInt64) — Number of read rows. -- `read_bytes` (UInt64) — Number of read bytes. -- `written_rows` (UInt64) — For `INSERT` consultas, el número de filas escritas. Para otras consultas, el valor de la columna es 0. -- `written_bytes` (UInt64) — For `INSERT` consultas, el número de bytes escritos. Para otras consultas, el valor de la columna es 0. -- `result_rows` (UInt64) — Number of rows in the result. -- `result_bytes` (UInt64) — Number of bytes in the result. -- `memory_usage` (UInt64) — Memory consumption by the query. -- `query` (String) — Query string. -- `exception` (String) — Exception message. -- `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully. -- `is_initial_query` (UInt8) — Query type. Possible values: - - 1 — Query was initiated by the client. - - 0 — Query was initiated by another query for distributed query execution. 
-- `user` (String) — Name of the user who initiated the current query. -- `query_id` (String) — ID of the query. -- `address` (IPv6) — IP address that was used to make the query. -- `port` (UInt16) — The client port that was used to make the query. -- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). -- `initial_query_id` (String) — ID of the initial query (for distributed query execution). -- `initial_address` (IPv6) — IP address that the parent query was launched from. -- `initial_port` (UInt16) — The client port that was used to make the parent query. -- `interface` (UInt8) — Interface that the query was initiated from. Possible values: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — OS's username who runs [Casa de clics-cliente](../interfaces/cli.md). -- `client_hostname` (String) — Hostname of the client machine where the [Casa de clics-cliente](../interfaces/cli.md) o se ejecuta otro cliente TCP. -- `client_name` (String) — The [Casa de clics-cliente](../interfaces/cli.md) o otro nombre de cliente TCP. -- `client_revision` (UInt32) — Revision of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_major` (UInt32) — Major version of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_minor` (UInt32) — Minor version of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_patch` (UInt32) — Patch component of the [Casa de clics-cliente](../interfaces/cli.md) o otra versión de cliente TCP. -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` se utilizó el método. - - 2 — `POST` se utilizó el método. -- `http_user_agent` (String) — The `UserAgent` encabezado pasado en la solicitud HTTP. -- `quota_key` (String) — The “quota key” especificado en el [cuota](quotas.md) ajuste (ver `keyed`). -- `revision` (UInt32) — ClickHouse revision. -- `thread_numbers` (Array(UInt32)) — Number of threads that are participating in query execution. -- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics. The description of them could be found in the table [sistema.evento](#system_tables-events) -- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics that are listed in the `ProfileEvents.Names` columna. -- `Settings.Names` (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parámetro a 1. -- `Settings.Values` (Array(String)) — Values of settings that are listed in the `Settings.Names` columna. - -Cada consulta crea una o dos filas en el `query_log` tabla, dependiendo del estado de la consulta: - -1. Si la ejecución de la consulta se realiza correctamente, se crean dos eventos con los tipos 1 y 2 (consulte `type` columna). -2. Si se produjo un error durante el procesamiento de la consulta, se crean dos eventos con los tipos 1 y 4. -3. Si se produjo un error antes de iniciar la consulta, se crea un solo evento con el tipo 3. - -De forma predeterminada, los registros se agregan a la tabla a intervalos de 7,5 segundos. Puede establecer este intervalo en el [query_log](server-configuration-parameters/settings.md#server_configuration_parameters-query-log) configuración del servidor (consulte el `flush_interval_milliseconds` parámetro). 
Para vaciar los registros a la fuerza desde el búfer de memoria a la tabla, utilice la consulta `SYSTEM FLUSH LOGS`.

Cuando la tabla se elimina manualmente, se creará automáticamente sobre la marcha. Tenga en cuenta que se eliminarán todos los registros anteriores.

!!! note "Nota"
    El período de almacenamiento para los registros es ilimitado. Los registros no se eliminan automáticamente de la tabla. Debe organizar la eliminación de registros obsoletos usted mismo.

Puede especificar una clave de partición arbitraria para la tabla `system.query_log` en la configuración del servidor [query_log](server-configuration-parameters/settings.md#server_configuration_parameters-query-log) (consulte el parámetro `partition_by`).

## sistema.query_thread_log {#system_tables-query-thread-log}

La tabla contiene información sobre cada subproceso de ejecución de consultas.

ClickHouse crea esta tabla sólo si se especifica el parámetro de servidor [query_thread_log](server-configuration-parameters/settings.md#server_configuration_parameters-query-thread-log). Este parámetro establece las reglas de registro, como el intervalo de registro o el nombre de la tabla en la que se registrarán las consultas.

Para habilitar el registro de consultas, establezca el parámetro [log_query_threads](settings/settings.md#settings-log-query-threads) en 1. Para obtener más información, consulte el apartado [Configuración](settings/settings.md).

Columna:

- `event_date` (Date) — the date when the thread has finished execution of the query.
- `event_time` (DateTime) — the date and time when the thread has finished execution of the query.
- `query_start_time` (DateTime) — Start time of query execution.
- `query_duration_ms` (UInt64) — Duration of query execution.
- `read_rows` (UInt64) — Number of read rows.
- `read_bytes` (UInt64) — Number of read bytes.
- `written_rows` (UInt64) — For `INSERT` consultas, el número de filas escritas. Para otras consultas, el valor de la columna es 0.
- `written_bytes` (UInt64) — For `INSERT` consultas, el número de bytes escritos. Para otras consultas, el valor de la columna es 0.
- `memory_usage` (Int64) — The difference between the amount of allocated and freed memory in context of this thread.
- `peak_memory_usage` (Int64) — The maximum difference between the amount of allocated and freed memory in context of this thread.
- `thread_name` (String) — Name of the thread.
- `thread_number` (UInt32) — Internal thread ID.
- `os_thread_id` (Int32) — OS thread ID.
- `master_thread_id` (UInt64) — OS initial ID of initial thread.
- `query` (String) — Query string.
- `is_initial_query` (UInt8) — Query type. Possible values:
    - 1 — Query was initiated by the client.
    - 0 — Query was initiated by another query for distributed query execution.
- `user` (String) — Name of the user who initiated the current query.
- `query_id` (String) — ID of the query.
- `address` (IPv6) — IP address that was used to make the query.
- `port` (UInt16) — The client port that was used to make the query.
- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution).
- `initial_query_id` (String) — ID of the initial query (for distributed query execution).
- `initial_address` (IPv6) — IP address that the parent query was launched from.
- `initial_port` (UInt16) — The client port that was used to make the parent query.
- `interface` (UInt8) — Interface that the query was initiated from. Possible values:
    - 1 — TCP.
    - 2 — HTTP.
-- `os_user` (String) — OS's username who runs [Casa de clics-cliente](../interfaces/cli.md). -- `client_hostname` (String) — Hostname of the client machine where the [Casa de clics-cliente](../interfaces/cli.md) o se ejecuta otro cliente TCP. -- `client_name` (String) — The [Casa de clics-cliente](../interfaces/cli.md) o otro nombre de cliente TCP. -- `client_revision` (UInt32) — Revision of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_major` (UInt32) — Major version of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_minor` (UInt32) — Minor version of the [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. -- `client_version_patch` (UInt32) — Patch component of the [Casa de clics-cliente](../interfaces/cli.md) o otra versión de cliente TCP. -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` se utilizó el método. - - 2 — `POST` se utilizó el método. -- `http_user_agent` (String) — The `UserAgent` encabezado pasado en la solicitud HTTP. -- `quota_key` (String) — The “quota key” especificado en el [cuota](quotas.md) ajuste (ver `keyed`). -- `revision` (UInt32) — ClickHouse revision. -- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics for this thread. The description of them could be found in the table [sistema.evento](#system_tables-events) -- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics for this thread that are listed in the `ProfileEvents.Names` columna. - -De forma predeterminada, los registros se agregan a la tabla a intervalos de 7,5 segundos. Puede establecer este intervalo en el [Sistema abierto.](server-configuration-parameters/settings.md#server_configuration_parameters-query-thread-log) configuración del servidor (consulte el `flush_interval_milliseconds` parámetro). Para vaciar los registros a la fuerza desde el búfer de memoria a la tabla, utilice `SYSTEM FLUSH LOGS` consulta. - -Cuando la tabla se elimina manualmente, se creará automáticamente sobre la marcha. Tenga en cuenta que se eliminarán todos los registros anteriores. - -!!! note "Nota" - El período de almacenamiento para los registros es ilimitado. Los registros no se eliminan automáticamente de la tabla. Debe organizar la eliminación de registros obsoletos usted mismo. - -Puede especificar una clave de partición arbitraria `system.query_thread_log` mesa en el [Sistema abierto.](server-configuration-parameters/settings.md#server_configuration_parameters-query-thread-log) configuración del servidor (consulte el `partition_by` parámetro). - -## sistema.trace_log {#system_tables-trace_log} - -Contiene seguimientos de pila recopilados por el generador de perfiles de consultas de muestreo. - -ClickHouse crea esta tabla cuando el [trace_log](server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) se establece la sección de configuración del servidor. También el [query_profiler_real_time_period_ns](settings/settings.md#query_profiler_real_time_period_ns) y [Los resultados de la prueba](settings/settings.md#query_profiler_cpu_time_period_ns) los ajustes deben establecerse. - -Para analizar los registros, utilice el `addressToLine`, `addressToSymbol` y `demangle` funciones de inspección. - -Columna: - -- `event_date` ([Fecha](../sql-reference/data-types/date.md)) — Date of sampling moment. 
- -- `event_time` ([FechaHora](../sql-reference/data-types/datetime.md)) — Timestamp of the sampling moment. - -- `timestamp_ns` ([UInt64](../sql-reference/data-types/int-uint.md)) — Timestamp of the sampling moment in nanoseconds. - -- `revision` ([UInt32](../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision. - - Cuando se conecta al servidor por `clickhouse-client`, ves la cadena similar a `Connected to ClickHouse server version 19.18.1 revision 54429.`. Este campo contiene el `revision`, pero no el `version` de un servidor. - -- `timer_type` ([Enum8](../sql-reference/data-types/enum.md)) — Timer type: - - - `Real` representa el tiempo del reloj de pared. - - `CPU` representa el tiempo de CPU. - -- `thread_number` ([UInt32](../sql-reference/data-types/int-uint.md)) — Thread identifier. - -- `query_id` ([Cadena](../sql-reference/data-types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](#system_tables-query_log) tabla del sistema. - -- `trace` ([Matriz (UInt64)](../sql-reference/data-types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. - -**Ejemplo** - -``` sql -SELECT * FROM system.trace_log LIMIT 1 \G -``` - -``` text -Row 1: -────── -event_date: 2019-11-15 -event_time: 2019-11-15 15:09:38 -revision: 54428 -timer_type: Real -thread_number: 48 -query_id: acc4d61f-5bd1-4a3e-bc91-2180be37c915 -trace: [94222141367858,94222152240175,94222152325351,94222152329944,94222152330796,94222151449980,94222144088167,94222151682763,94222144088167,94222151682763,94222144088167,94222144058283,94222144059248,94222091840750,94222091842302,94222091831228,94222189631488,140509950166747,140509942945935] -``` - -## sistema.Replica {#system_tables-replicas} - -Contiene información y estado de las tablas replicadas que residen en el servidor local. -Esta tabla se puede utilizar para el monitoreo. La tabla contiene una fila para cada tabla Replicated\*. - -Ejemplo: - -``` sql -SELECT * -FROM system.replicas -WHERE table = 'visits' -FORMAT Vertical -``` - -``` text -Row 1: -────── -database: merge -table: visits -engine: ReplicatedCollapsingMergeTree -is_leader: 1 -can_become_leader: 1 -is_readonly: 0 -is_session_expired: 0 -future_parts: 1 -parts_to_check: 0 -zookeeper_path: /clickhouse/tables/01-06/visits -replica_name: example01-06-1.yandex.ru -replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru -columns_version: 9 -queue_size: 1 -inserts_in_queue: 0 -merges_in_queue: 1 -part_mutations_in_queue: 0 -queue_oldest_time: 2020-02-20 08:34:30 -inserts_oldest_time: 1970-01-01 00:00:00 -merges_oldest_time: 2020-02-20 08:34:30 -part_mutations_oldest_time: 1970-01-01 00:00:00 -oldest_part_to_get: -oldest_part_to_merge_to: 20200220_20284_20840_7 -oldest_part_to_mutate_to: -log_max_index: 596273 -log_pointer: 596274 -last_queue_update: 2020-02-20 08:34:32 -absolute_delay: 0 -total_replicas: 2 -active_replicas: 2 -``` - -Columna: - -- `database` (`String`) - Nombre de la base de datos -- `table` (`String`) - Nombre de la tabla -- `engine` (`String`) - Nombre del motor de tabla -- `is_leader` (`UInt8`) - Si la réplica es la líder. - Sólo una réplica a la vez puede ser el líder. El líder es responsable de seleccionar las fusiones de fondo para realizar. - Tenga en cuenta que las escrituras se pueden realizar en cualquier réplica que esté disponible y tenga una sesión en ZK, independientemente de si es un líder. 
-- `can_become_leader` (`UInt8`) - Si la réplica puede ser elegida como líder. -- `is_readonly` (`UInt8`) - Si la réplica está en modo de sólo lectura. - Este modo se activa si la configuración no tiene secciones con ZooKeeper, si se produce un error desconocido al reinicializar sesiones en ZooKeeper y durante la reinicialización de sesiones en ZooKeeper. -- `is_session_expired` (`UInt8`) - la sesión con ZooKeeper ha expirado. Básicamente lo mismo que `is_readonly`. -- `future_parts` (`UInt32`) - El número de partes de datos que aparecerán como resultado de INSERTs o fusiones que aún no se han realizado. -- `parts_to_check` (`UInt32`) - El número de partes de datos en la cola para la verificación. Una pieza se coloca en la cola de verificación si existe la sospecha de que podría estar dañada. -- `zookeeper_path` (`String`) - Ruta de acceso a los datos de la tabla en ZooKeeper. -- `replica_name` (`String`) - Nombre de réplica en ZooKeeper. Diferentes réplicas de la misma tabla tienen diferentes nombres. -- `replica_path` (`String`) - Ruta de acceso a los datos de réplica en ZooKeeper. Lo mismo que concatenar ‘zookeeper_path/replicas/replica_path’. -- `columns_version` (`Int32`) - Número de versión de la estructura de la tabla. Indica cuántas veces se realizó ALTER. Si las réplicas tienen versiones diferentes, significa que algunas réplicas aún no han hecho todas las ALTER. -- `queue_size` (`UInt32`) - Tamaño de la cola para las operaciones en espera de ser realizadas. Las operaciones incluyen insertar bloques de datos, fusiones y otras acciones. Por lo general, coincide con `future_parts`. -- `inserts_in_queue` (`UInt32`) - Número de inserciones de bloques de datos que deben realizarse. Las inserciones generalmente se replican con bastante rapidez. Si este número es grande, significa que algo anda mal. -- `merges_in_queue` (`UInt32`) - El número de fusiones en espera de hacerse. A veces las fusiones son largas, por lo que este valor puede ser mayor que cero durante mucho tiempo. -- `part_mutations_in_queue` (`UInt32`) - El número de mutaciones a la espera de hacerse. -- `queue_oldest_time` (`DateTime`) - Si `queue_size` mayor que 0, muestra cuándo se agregó la operación más antigua a la cola. -- `inserts_oldest_time` (`DateTime`) - Ver `queue_oldest_time` -- `merges_oldest_time` (`DateTime`) - Ver `queue_oldest_time` -- `part_mutations_oldest_time` (`DateTime`) - Ver `queue_oldest_time` - -Las siguientes 4 columnas tienen un valor distinto de cero solo cuando hay una sesión activa con ZK. - -- `log_max_index` (`UInt64`) - Número máximo de inscripción en el registro de actividad general. -- `log_pointer` (`UInt64`) - Número máximo de entrada en el registro de actividad general que la réplica copió en su cola de ejecución, más uno. Si `log_pointer` es mucho más pequeño que `log_max_index`, algo está mal. -- `last_queue_update` (`DateTime`) - Cuando la cola se actualizó la última vez. -- `absolute_delay` (`UInt64`) - ¿Qué tan grande retraso en segundos tiene la réplica actual. -- `total_replicas` (`UInt8`) - El número total de réplicas conocidas de esta tabla. -- `active_replicas` (`UInt8`) - El número de réplicas de esta tabla que tienen una sesión en ZooKeeper (es decir, el número de réplicas en funcionamiento). - -Si solicita todas las columnas, la tabla puede funcionar un poco lentamente, ya que se realizan varias lecturas de ZooKeeper para cada fila. -Si no solicita las últimas 4 columnas (log_max_index, log_pointer, total_replicas, active_replicas), la tabla funciona rápidamente. 
- -Por ejemplo, puede verificar que todo funcione correctamente de esta manera: - -``` sql -SELECT - database, - table, - is_leader, - is_readonly, - is_session_expired, - future_parts, - parts_to_check, - columns_version, - queue_size, - inserts_in_queue, - merges_in_queue, - log_max_index, - log_pointer, - total_replicas, - active_replicas -FROM system.replicas -WHERE - is_readonly - OR is_session_expired - OR future_parts > 20 - OR parts_to_check > 10 - OR queue_size > 20 - OR inserts_in_queue > 10 - OR log_max_index - log_pointer > 10 - OR total_replicas < 2 - OR active_replicas < total_replicas -``` - -Si esta consulta no devuelve nada, significa que todo está bien. - -## sistema.configuración {#system-tables-system-settings} - -Contiene información sobre la configuración de sesión para el usuario actual. - -Columna: - -- `name` ([Cadena](../sql-reference/data-types/string.md)) — Setting name. -- `value` ([Cadena](../sql-reference/data-types/string.md)) — Setting value. -- `changed` ([UInt8](../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether a setting is changed from its default value. -- `description` ([Cadena](../sql-reference/data-types/string.md)) — Short setting description. -- `min` ([NULL](../sql-reference/data-types/nullable.md)([Cadena](../sql-reference/data-types/string.md))) — Minimum value of the setting, if any is set via [limitación](settings/constraints-on-settings.md#constraints-on-settings). Si la configuración no tiene ningún valor mínimo, contiene [NULL](../sql-reference/syntax.md#null-literal). -- `max` ([NULL](../sql-reference/data-types/nullable.md)([Cadena](../sql-reference/data-types/string.md))) — Maximum value of the setting, if any is set via [limitación](settings/constraints-on-settings.md#constraints-on-settings). Si la configuración no tiene ningún valor máximo, contiene [NULL](../sql-reference/syntax.md#null-literal). -- `readonly` ([UInt8](../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the current user can change the setting: - - `0` — Current user can change the setting. - - `1` — Current user can't change the setting. - -**Ejemplo** - -En el ejemplo siguiente se muestra cómo obtener información sobre la configuración cuyo nombre contiene `min_i`. - -``` sql -SELECT * -FROM system.settings -WHERE name LIKE '%min_i%' -``` - -``` text -┌─name────────────────────────────────────────┬─value─────┬─changed─┬─description───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─min──┬─max──┬─readonly─┐ -│ min_insert_block_size_rows │ 1048576 │ 0 │ Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough. │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ -│ min_insert_block_size_bytes │ 268435456 │ 0 │ Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough. │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ -│ read_backoff_min_interval_between_events_ms │ 1000 │ 0 │ Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. 
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ -└─────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────┴──────┴──────────┘ -``` - -Uso de `WHERE changed` puede ser útil, por ejemplo, cuando se desea comprobar: - -- Si los ajustes de los archivos de configuración se cargan correctamente y están en uso. -- Configuración que cambió en la sesión actual. - - - -``` sql -SELECT * FROM system.settings WHERE changed AND name='load_balancing' -``` - -**Ver también** - -- [Configuración](settings/index.md#session-settings-intro) -- [Permisos para consultas](settings/permissions-for-queries.md#settings_readonly) -- [Restricciones en la configuración](settings/constraints-on-settings.md) - -## sistema.table_engines {#system.table_engines} - -``` text -┌─name───────────────────┬─value───────┐ -│ max_threads │ 8 │ -│ use_uncompressed_cache │ 0 │ -│ load_balancing │ random │ -│ max_memory_usage │ 10000000000 │ -└────────────────────────┴─────────────┘ -``` - -## sistema.merge_tree_settings {#system-merge_tree_settings} - -Contiene información sobre la configuración `MergeTree` tabla. - -Columna: - -- `name` (String) — Setting name. -- `value` (String) — Setting value. -- `description` (String) — Setting description. -- `type` (String) — Setting type (implementation specific string value). -- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. - -## sistema.table_engines {#system-table-engines} - -Contiene la descripción de los motores de tablas admitidos por el servidor y su información de soporte de características. - -Esta tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): - -- `name` (String) — The name of table engine. -- `supports_settings` (UInt8) — Flag that indicates if table engine supports `SETTINGS` clausula. -- `supports_skipping_indices` (UInt8) — Flag that indicates if table engine supports [Índices de saltos](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes). -- `supports_ttl` (UInt8) — Flag that indicates if table engine supports [TTL](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). -- `supports_sort_order` (UInt8) — Flag that indicates if table engine supports clauses `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` y `SAMPLE_BY`. -- `supports_replication` (UInt8) — Flag that indicates if table engine supports [Replicación de datos](../engines/table-engines/mergetree-family/replication.md). -- `supports_duduplication` (UInt8) — Flag that indicates if table engine supports data deduplication. 
- -Ejemplo: - -``` sql -SELECT * -FROM system.table_engines -WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') -``` - -``` text -┌─name──────────────────────────┬─supports_settings─┬─supports_skipping_indices─┬─supports_sort_order─┬─supports_ttl─┬─supports_replication─┬─supports_deduplication─┐ -│ Kafka │ 1 │ 0 │ 0 │ 0 │ 0 │ 0 │ -│ MergeTree │ 1 │ 1 │ 1 │ 1 │ 0 │ 0 │ -│ ReplicatedCollapsingMergeTree │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ -└───────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────────┴──────────────┴──────────────────────┴────────────────────────┘ -``` - -**Ver también** - -- Familia MergeTree [cláusulas de consulta](../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) -- Kafka [configuración](../engines/table-engines/integrations/kafka.md#table_engine-kafka-creating-a-table) -- Unir [configuración](../engines/table-engines/special/join.md#join-limitations-and-settings) - -## sistema.tabla {#system-tables} - -Contiene metadatos de cada tabla que el servidor conoce. Las tablas separadas no se muestran en `system.tables`. - -Esta tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): - -- `database` (String) — The name of the database the table is in. - -- `name` (String) — Table name. - -- `engine` (String) — Table engine name (without parameters). - -- `is_temporary` (UInt8): marca que indica si la tabla es temporal. - -- `data_path` (String) - Ruta de acceso a los datos de la tabla en el sistema de archivos. - -- `metadata_path` (String) - Ruta de acceso a los metadatos de la tabla en el sistema de archivos. - -- `metadata_modification_time` (DateTime) - Hora de la última modificación de los metadatos de la tabla. - -- `dependencies_database` (Array(String)) - Dependencias de base de datos. - -- `dependencies_table` (Array(String)) - Dependencias de tabla ([Método de codificación de datos:](../engines/table-engines/special/materializedview.md) tablas basadas en la tabla actual). - -- `create_table_query` (String) - La consulta que se utilizó para crear la tabla. - -- `engine_full` (String) - Parámetros del motor de tabla. - -- `partition_key` (String) - La expresión de clave de partición especificada en la tabla. - -- `sorting_key` (String) - La expresión de clave de ordenación especificada en la tabla. - -- `primary_key` (String) - La expresión de clave principal especificada en la tabla. - -- `sampling_key` (String) - La expresión de clave de muestreo especificada en la tabla. - -- `storage_policy` (String) - La política de almacenamiento: - - - [Método de codificación de datos:](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - - [Distribuido](../engines/table-engines/special/distributed.md#distributed) - -- `total_rows` (Nullable(UInt64)) - Número total de filas, si es posible determinar rápidamente el número exacto de filas en la tabla, de lo contrario `Null` (incluyendo underying `Buffer` tabla). - -- `total_bytes` (Nullable(UInt64)) - Número total de bytes, si es posible determinar rápidamente el número exacto de bytes para la tabla en el almacenamiento, de lo contrario `Null` (**no** incluye cualquier almacenamiento subyacente). - - - If the table stores data on disk, returns used space on disk (i.e. compressed). - - Si la tabla almacena datos en la memoria, devuelve el número aproximado de bytes utilizados en la memoria. - -El `system.tables` se utiliza en `SHOW TABLES` implementación de consultas. 
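A modo de referencia, una consulta mínima de ejemplo sobre `system.tables` que usa únicamente las columnas descritas arriba (el nombre de base de datos `default` es solo ilustrativo; ajústelo a su entorno):

``` sql
SELECT
    database,
    name,
    engine,
    total_rows,
    total_bytes
FROM system.tables
WHERE database = 'default' AND is_temporary = 0
ORDER BY total_bytes DESC
LIMIT 10
```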
- -## sistema.Zookeeper {#system-zookeeper} - -La tabla no existe si ZooKeeper no está configurado. Permite leer datos del clúster ZooKeeper definido en la configuración. -La consulta debe tener un ‘path’ condición de igualdad en la cláusula WHERE. Este es el camino en ZooKeeper para los niños para los que desea obtener datos. - -Consulta `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` salidas de datos para todos los niños en el `/clickhouse` nodo. -Para generar datos para todos los nodos raíz, escriba path = ‘/’. -Si la ruta especificada en ‘path’ no existe, se lanzará una excepción. - -Columna: - -- `name` (String) — The name of the node. -- `path` (String) — The path to the node. -- `value` (String) — Node value. -- `dataLength` (Int32) — Size of the value. -- `numChildren` (Int32) — Number of descendants. -- `czxid` (Int64) — ID of the transaction that created the node. -- `mzxid` (Int64) — ID of the transaction that last changed the node. -- `pzxid` (Int64) — ID of the transaction that last deleted or added descendants. -- `ctime` (DateTime) — Time of node creation. -- `mtime` (DateTime) — Time of the last modification of the node. -- `version` (Int32) — Node version: the number of times the node was changed. -- `cversion` (Int32) — Number of added or removed descendants. -- `aversion` (Int32) — Number of changes to the ACL. -- `ephemeralOwner` (Int64) — For ephemeral nodes, the ID of the session that owns this node. - -Ejemplo: - -``` sql -SELECT * -FROM system.zookeeper -WHERE path = '/clickhouse/tables/01-08/visits/replicas' -FORMAT Vertical -``` - -``` text -Row 1: -────── -name: example01-08-1.yandex.ru -value: -czxid: 932998691229 -mzxid: 932998691229 -ctime: 2015-03-27 16:49:51 -mtime: 2015-03-27 16:49:51 -version: 0 -cversion: 47 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021031383 -path: /clickhouse/tables/01-08/visits/replicas - -Row 2: -────── -name: example01-08-2.yandex.ru -value: -czxid: 933002738135 -mzxid: 933002738135 -ctime: 2015-03-27 16:57:01 -mtime: 2015-03-27 16:57:01 -version: 0 -cversion: 37 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021252247 -path: /clickhouse/tables/01-08/visits/replicas -``` - -## sistema.mutación {#system_tables-mutations} - -La tabla contiene información sobre [mutación](../sql-reference/statements/alter.md#alter-mutations) de las tablas MergeTree y su progreso. Cada comando de mutación está representado por una sola fila. La tabla tiene las siguientes columnas: - -**base**, **tabla** - El nombre de la base de datos y la tabla a la que se aplicó la mutación. - -**mutation_id** - La identificación de la mutación. Para las tablas replicadas, estos identificadores corresponden a los nombres de znode `/mutations/` directorio en ZooKeeper. Para las tablas no duplicadas, los ID corresponden a los nombres de archivo en el directorio de datos de la tabla. - -**comando** - La cadena de comandos de mutación (la parte de la consulta después de `ALTER TABLE [db.]table`). - -**create_time** - Cuando este comando de mutación fue enviado para su ejecución. - -**block_numbers.partition_id**, **block_numbers.numero** - Una columna anidada. Para las mutaciones de tablas replicadas, contiene un registro para cada partición: el ID de partición y el número de bloque que fue adquirido por la mutación (en cada partición, solo se mutarán las partes que contienen bloques con números menores que el número de bloque adquirido por la mutación en esa partición). 
En tablas no replicadas, los números de bloque en todas las particiones forman una sola secuencia. Esto significa que, para las mutaciones de tablas no replicadas, la columna contendrá un registro con un solo número de bloque adquirido por la mutación.

**parts_to_do** - El número de partes de datos que deben mutarse para que finalice la mutación.

**is_done** - ¿Se ha completado la mutación? Tenga en cuenta que, incluso si `parts_to_do = 0`, es posible que una mutación de una tabla replicada aún no se haya completado debido a un `INSERT` de larga ejecución que creará una nueva parte de datos que deberá mutarse.

Si hubo problemas con la mutación de algunas partes, las siguientes columnas contienen información adicional:

**latest_failed_part** - El nombre de la parte más reciente que no se pudo mutar.

**latest_fail_time** - El momento del fallo de mutación de la parte más reciente.

**latest_fail_reason** - El mensaje de excepción que causó el fallo de mutación más reciente.

## sistema.disks {#system_tables-disks}

Contiene información sobre los discos definidos en la [configuración del servidor](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure).

Columna:

- `name` ([Cadena](../sql-reference/data-types/string.md)) — Name of a disk in the server configuration.
- `path` ([Cadena](../sql-reference/data-types/string.md)) — Path to the mount point in the file system.
- `free_space` ([UInt64](../sql-reference/data-types/int-uint.md)) — Free space on disk in bytes.
- `total_space` ([UInt64](../sql-reference/data-types/int-uint.md)) — Disk volume in bytes.
- `keep_free_space` ([UInt64](../sql-reference/data-types/int-uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parámetro de configuración del disco.

## sistema.storage_policies {#system_tables-storage_policies}

Contiene información sobre las directivas de almacenamiento y los volúmenes definidos en la [configuración del servidor](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure).

Columna:

- `policy_name` ([Cadena](../sql-reference/data-types/string.md)) — Name of the storage policy.
- `volume_name` ([Cadena](../sql-reference/data-types/string.md)) — Volume name defined in the storage policy.
- `volume_priority` ([UInt64](../sql-reference/data-types/int-uint.md)) — Volume order number in the configuration.
- `disks` ([Array(Cadena)](../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy.
- `max_data_part_size` ([UInt64](../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit).
- `move_factor` ([Float64](../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order.

Si la directiva de almacenamiento contiene más de un volumen, la información de cada volumen se almacena en una fila individual de la tabla.
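A modo de ejemplo (boceto mínimo; los nombres de políticas, volúmenes y discos dependen de su propia configuración), la siguiente consulta despliega el array `disks` para mostrar qué discos componen cada volumen de cada política:

``` sql
SELECT
    policy_name,
    volume_name,
    volume_priority,
    disk_name
FROM system.storage_policies
ARRAY JOIN disks AS disk_name
ORDER BY policy_name, volume_priority
```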
- -[Artículo Original](https://clickhouse.tech/docs/en/operations/system_tables/) diff --git a/docs/es/operations/tips.md b/docs/es/operations/tips.md deleted file mode 100644 index deb226450aa..00000000000 --- a/docs/es/operations/tips.md +++ /dev/null @@ -1,251 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 58 -toc_title: Recomendaciones de uso ---- - -# Recomendaciones de uso {#usage-recommendations} - -## CPU Scaling Governor {#cpu-scaling-governor} - -Utilice siempre el `performance` gobernador de escala. El `on-demand` regulador de escala funciona mucho peor con una demanda constante. - -``` bash -$ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor -``` - -## Limitaciones de la CPU {#cpu-limitations} - -Los procesadores pueden sobrecalentarse. Utilizar `dmesg` para ver si la velocidad de reloj de la CPU era limitada debido al sobrecalentamiento. -La restricción también se puede establecer externamente en el nivel del centro de datos. Usted puede utilizar `turbostat` para controlarlo bajo una carga. - -## RAM {#ram} - -Para pequeñas cantidades de datos (hasta ~200 GB comprimidos), es mejor usar tanta memoria como el volumen de datos. -Para grandes cantidades de datos y al procesar consultas interactivas (en línea), debe usar una cantidad razonable de RAM (128 GB o más) para que el subconjunto de datos en caliente quepa en la memoria caché de páginas. -Incluso para volúmenes de datos de ~ 50 TB por servidor, el uso de 128 GB de RAM mejora significativamente el rendimiento de las consultas en comparación con 64 GB. - -No deshabilite el sobrecompromiso. Valor `cat /proc/sys/vm/overcommit_memory` debe ser 0 o 1. Ejecutar - -``` bash -$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory -``` - -## Páginas enormes {#huge-pages} - -Siempre deshabilite las páginas enormes transparentes. Interfiere con los asignadores de memoria, lo que conduce a una degradación significativa del rendimiento. - -``` bash -$ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled -``` - -Utilizar `perf top` para ver el tiempo pasado en el kernel para la administración de memoria. -Las páginas enormes permanentes tampoco necesitan ser asignadas. - -## Subsistema de almacenamiento {#storage-subsystem} - -Si su presupuesto le permite usar SSD, use SSD. -Si no, use HDD. Los discos duros SATA 7200 RPM servirán. - -Dar preferencia a una gran cantidad de servidores con discos duros locales sobre un número menor de servidores con estantes de discos conectados. -Pero para almacenar archivos con consultas raras, los estantes funcionarán. - -## RAID {#raid} - -Al usar HDD, puede combinar su RAID-10, RAID-5, RAID-6 o RAID-50. -Para Linux, el software RAID es mejor (con `mdadm`). No recomendamos usar LVM. -Al crear RAID-10, seleccione el `far` diseño. -Si su presupuesto lo permite, elija RAID-10. - -Si tiene más de 4 discos, utilice RAID-6 (preferido) o RAID-50, en lugar de RAID-5. -Cuando use RAID-5, RAID-6 o RAID-50, siempre aumente stripe_cache_size, ya que el valor predeterminado generalmente no es la mejor opción. - -``` bash -$ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size -``` - -Calcule el número exacto a partir del número de dispositivos y el tamaño del bloque, utilizando la fórmula: `2 * num_devices * chunk_size_in_bytes / 4096`. - -Un tamaño de bloque de 1024 KB es suficiente para todas las configuraciones RAID. -Nunca ajuste el tamaño del bloque demasiado pequeño o demasiado grande. 
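Como ilustración de la fórmula anterior (cálculo hipotético: se asume una matriz de 8 dispositivos con el tamaño de bloque de 1024 KB ya recomendado):

``` bash
# 2 * num_devices * chunk_size_in_bytes / 4096, con 8 dispositivos y bloques de 1048576 bytes
$ echo $((2 * 8 * 1048576 / 4096))
4096
# el resultado es el valor que se escribiría en /sys/block/md2/md/stripe_cache_size
```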
- -Puede usar RAID-0 en SSD. -Independientemente del uso de RAID, utilice siempre la replicación para la seguridad de los datos. - -Habilite NCQ con una cola larga. Para HDD, elija el programador CFQ, y para SSD, elija noop. No reduzca el ‘readahead’ configuración. -Para HDD, habilite la memoria caché de escritura. - -## Sistema de archivos {#file-system} - -Ext4 es la opción más confiable. Establecer las opciones de montaje `noatime, nobarrier`. -XFS también es adecuado, pero no ha sido probado tan a fondo con ClickHouse. -La mayoría de los otros sistemas de archivos también deberían funcionar bien. Los sistemas de archivos con asignación retrasada funcionan mejor. - -## Núcleo de Linux {#linux-kernel} - -No use un kernel de Linux obsoleto. - -## Red {#network} - -Si está utilizando IPv6, aumente el tamaño de la caché de ruta. -El kernel de Linux anterior a 3.2 tenía una multitud de problemas con la implementación de IPv6. - -Utilice al menos una red de 10 GB, si es posible. 1 Gb también funcionará, pero será mucho peor para parchear réplicas con decenas de terabytes de datos, o para procesar consultas distribuidas con una gran cantidad de datos intermedios. - -## ZooKeeper {#zookeeper} - -Probablemente ya esté utilizando ZooKeeper para otros fines. Puede usar la misma instalación de ZooKeeper, si aún no está sobrecargada. - -It's best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. - -Nunca debe usar scripts escritos manualmente para transferir datos entre diferentes clústeres de ZooKeeper, ya que el resultado será incorrecto para los nodos secuenciales. Nunca utilice el “zkcopy” utilidad por la misma razón: https://github.com/ksprojects/zkcopy/issues/15 - -Si desea dividir un clúster ZooKeeper existente en dos, la forma correcta es aumentar el número de sus réplicas y, a continuación, volver a configurarlo como dos clústeres independientes. - -No ejecute ZooKeeper en los mismos servidores que ClickHouse. Porque ZooKeeper es muy sensible a la latencia y ClickHouse puede utilizar todos los recursos del sistema disponibles. - -Con la configuración predeterminada, ZooKeeper es una bomba de tiempo: - -> El servidor ZooKeeper no eliminará archivos de instantáneas y registros antiguos cuando utilice la configuración predeterminada (consulte autopurge), y esto es responsabilidad del operador. - -Esta bomba debe ser desactivada. - -La configuración ZooKeeper (3.5.1) a continuación se usa en Yandex.Entorno de producción de Métrica al 20 de mayo de 2017: - -zoológico.Cómo: - -``` bash -# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html - -# The number of milliseconds of each tick -tickTime=2000 -# The number of ticks that the initial -# synchronization phase can take -initLimit=30000 -# The number of ticks that can pass between -# sending a request and getting an acknowledgement -syncLimit=10 - -maxClientCnxns=2000 - -maxSessionTimeout=60000000 -# the directory where the snapshot is stored. -dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/data -# Place the dataLogDir to a separate physical disc for better performance -dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/logs - -autopurge.snapRetainCount=10 -autopurge.purgeInterval=1 - - -# To avoid seeks ZooKeeper allocates space in the transaction log file in -# blocks of preAllocSize kilobytes. The default block size is 64M. One reason -# for changing the size of the blocks is to reduce the block size if snapshots -# are taken more often. 
(Also, see snapCount). -preAllocSize=131072 - -# Clients can submit requests faster than ZooKeeper can process them, -# especially if there are a lot of clients. To prevent ZooKeeper from running -# out of memory due to queued requests, ZooKeeper will throttle clients so that -# there is no more than globalOutstandingLimit outstanding requests in the -# system. The default limit is 1,000.ZooKeeper logs transactions to a -# transaction log. After snapCount transactions are written to a log file a -# snapshot is started and a new transaction log file is started. The default -# snapCount is 10,000. -snapCount=3000000 - -# If this option is defined, requests will be will logged to a trace file named -# traceFile.year.month.day. -#traceFile= - -# Leader accepts client connections. Default value is "yes". The leader machine -# coordinates updates. For higher update throughput at thes slight expense of -# read throughput the leader can be configured to not accept clients and focus -# on coordination. -leaderServes=yes - -standaloneEnabled=false -dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo.cfg.dynamic -``` - -Versión Java: - -``` text -Java(TM) SE Runtime Environment (build 1.8.0_25-b17) -Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) -``` - -Parámetros de JVM: - -``` bash -NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} -ZOOCFGDIR=/etc/$NAME/conf - -# TODO this is really ugly -# How to find out, which jars are needed? -# seems, that log4j requires the log4j.properties file to be in the classpath -CLASSPATH="$ZOOCFGDIR:/usr/build/classes:/usr/build/lib/*.jar:/usr/share/zookeeper/zookeeper-3.5.1-metrika.jar:/usr/share/zookeeper/slf4j-log4j12-1.7.5.jar:/usr/share/zookeeper/slf4j-api-1.7.5.jar:/usr/share/zookeeper/servlet-api-2.5-20081211.jar:/usr/share/zookeeper/netty-3.7.0.Final.jar:/usr/share/zookeeper/log4j-1.2.16.jar:/usr/share/zookeeper/jline-2.11.jar:/usr/share/zookeeper/jetty-util-6.1.26.jar:/usr/share/zookeeper/jetty-6.1.26.jar:/usr/share/zookeeper/javacc.jar:/usr/share/zookeeper/jackson-mapper-asl-1.9.11.jar:/usr/share/zookeeper/jackson-core-asl-1.9.11.jar:/usr/share/zookeeper/commons-cli-1.2.jar:/usr/src/java/lib/*.jar:/usr/etc/zookeeper" - -ZOOCFG="$ZOOCFGDIR/zoo.cfg" -ZOO_LOG_DIR=/var/log/$NAME -USER=zookeeper -GROUP=zookeeper -PIDDIR=/var/run/$NAME -PIDFILE=$PIDDIR/$NAME.pid -SCRIPTNAME=/etc/init.d/$NAME -JAVA=/usr/bin/java -ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" -ZOO_LOG4J_PROP="INFO,ROLLINGFILE" -JMXLOCALONLY=false -JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ - -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ - -Xloggc:/var/log/$NAME/zookeeper-gc.log \ - -XX:+UseGCLogFileRotation \ - -XX:NumberOfGCLogFiles=16 \ - -XX:GCLogFileSize=16M \ - -verbose:gc \ - -XX:+PrintGCTimeStamps \ - -XX:+PrintGCDateStamps \ - -XX:+PrintGCDetails - -XX:+PrintTenuringDistribution \ - -XX:+PrintGCApplicationStoppedTime \ - -XX:+PrintGCApplicationConcurrentTime \ - -XX:+PrintSafepointStatistics \ - -XX:+UseParNewGC \ - -XX:+UseConcMarkSweepGC \ --XX:+CMSParallelRemarkEnabled" -``` - -Sal init: - -``` text -description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" - -start on runlevel [2345] -stop on runlevel [!2345] - -respawn - -limit nofile 8192 8192 - -pre-start script - [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment" ] || exit 0 - . 
/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment - [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR - chown $USER:$GROUP $ZOO_LOG_DIR -end script - -script - . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment - [ -r /etc/default/zookeeper ] && . /etc/default/zookeeper - if [ -z "$JMXDISABLE" ]; then - JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" - fi - exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} \ - -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ - -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG -end script -``` - -{## [Artículo Original](https://clickhouse.tech/docs/en/operations/tips/) ##} diff --git a/docs/es/operations/troubleshooting.md b/docs/es/operations/troubleshooting.md deleted file mode 100644 index 9e8d2caca59..00000000000 --- a/docs/es/operations/troubleshooting.md +++ /dev/null @@ -1,146 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 46 -toc_title: "Soluci\xF3n de problemas" ---- - -# Solución de problemas {#troubleshooting} - -- [Instalación](#troubleshooting-installation-errors) -- [Conexión al servidor](#troubleshooting-accepts-no-connections) -- [Procesamiento de consultas](#troubleshooting-does-not-process-queries) -- [Eficiencia del procesamiento de consultas](#troubleshooting-too-slow) - -## Instalación {#troubleshooting-installation-errors} - -### No puede obtener paquetes Deb del repositorio ClickHouse con Apt-get {#you-cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} - -- Compruebe la configuración del firewall. -- Si no puede acceder al repositorio por cualquier motivo, descargue los paquetes como se describe en el [Primeros pasos](../getting-started/index.md) artículo e instálelos manualmente usando el `sudo dpkg -i ` comando. También necesitará el `tzdata` paquete. - -## Conexión al servidor {#troubleshooting-accepts-no-connections} - -Posibles problemas: - -- El servidor no se está ejecutando. -- Parámetros de configuración inesperados o incorrectos. - -### El servidor no se está ejecutando {#server-is-not-running} - -**Compruebe si el servidor está ejecutado** - -Comando: - -``` bash -$ sudo service clickhouse-server status -``` - -Si el servidor no se está ejecutando, inícielo con el comando: - -``` bash -$ sudo service clickhouse-server start -``` - -**Comprobar registros** - -El registro principal de `clickhouse-server` está en `/var/log/clickhouse-server/clickhouse-server.log` predeterminada. - -Si el servidor se inició correctamente, debería ver las cadenas: - -- ` Application: starting up.` — Server started. -- ` Application: Ready for connections.` — Server is running and ready for connections. - -Si `clickhouse-server` error de inicio con un error de configuración, debería ver el `` cadena con una descripción de error. Por ejemplo: - -``` text -2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused -``` - -Si no ve un error al final del archivo, revise todo el archivo a partir de la cadena: - -``` text - Application: starting up. 
-``` - -Si intenta iniciar una segunda instancia de `clickhouse-server` en el servidor, verá el siguiente registro: - -``` text -2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 -2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up -2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: -PID: 8510 -Started at: 2019-01-11 15:24:23 -Revision: 54413 - -2019.01.11 15:25:11.156673 [ 1 ] {} Application: DB::Exception: Cannot lock file ./status. Another server instance in same directory is already running. -2019.01.11 15:25:11.156682 [ 1 ] {} Application: shutting down -2019.01.11 15:25:11.156686 [ 1 ] {} Application: Uninitializing subsystem: Logging Subsystem -2019.01.11 15:25:11.156716 [ 2 ] {} BaseDaemon: Stop SignalListener thread -``` - -**Ver sistema.d registros** - -Si no encuentra ninguna información útil en `clickhouse-server` registros o no hay registros, puede ver `system.d` registros usando el comando: - -``` bash -$ sudo journalctl -u clickhouse-server -``` - -**Iniciar clickhouse-server en modo interactivo** - -``` bash -$ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml -``` - -Este comando inicia el servidor como una aplicación interactiva con parámetros estándar del script de inicio automático. En este modo `clickhouse-server` imprime todos los mensajes de eventos en la consola. - -### Parámetros de configuración {#configuration-parameters} - -Comprobar: - -- Configuración de Docker. - - Si ejecuta ClickHouse en Docker en una red IPv6, asegúrese de que `network=host` se establece. - -- Configuración del punto final. - - Comprobar [listen_host](server-configuration-parameters/settings.md#server_configuration_parameters-listen_host) y [Tcp_port](server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) configuración. - - El servidor ClickHouse acepta conexiones localhost solo de forma predeterminada. - -- Configuración del protocolo HTTP. - - Compruebe la configuración del protocolo para la API HTTP. - -- Configuración de conexión segura. - - Comprobar: - - - El [Tcp_port_secure](server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) configuración. - - Ajustes para [Sertificados SSL](server-configuration-parameters/settings.md#server_configuration_parameters-openssl). - - Utilice los parámetros adecuados mientras se conecta. Por ejemplo, utilice el `port_secure` parámetro con `clickhouse_client`. - -- Configuración del usuario. - - Es posible que esté utilizando el nombre de usuario o la contraseña incorrectos. - -## Procesamiento de consultas {#troubleshooting-does-not-process-queries} - -Si ClickHouse no puede procesar la consulta, envía una descripción de error al cliente. En el `clickhouse-client` obtienes una descripción del error en la consola. Si está utilizando la interfaz HTTP, ClickHouse envía la descripción del error en el cuerpo de la respuesta. Por ejemplo: - -``` bash -$ curl 'http://localhost:8123/' --data-binary "SELECT a" -Code: 47, e.displayText() = DB::Exception: Unknown identifier: a. 
Note that there are no tables (FROM clause) in your query, context: required_names: 'a' source_tables: table_aliases: private_aliases: column_aliases: public_columns: 'a' masked_columns: array_join_columns: source_columns: , e.what() = DB::Exception -``` - -Si empiezas `clickhouse-client` con el `stack-trace` parámetro, ClickHouse devuelve el seguimiento de la pila del servidor con la descripción de un error. - -Es posible que vea un mensaje sobre una conexión rota. En este caso, puede repetir la consulta. Si la conexión se rompe cada vez que realiza la consulta, compruebe si hay errores en los registros del servidor. - -## Eficiencia del procesamiento de consultas {#troubleshooting-too-slow} - -Si ve que ClickHouse funciona demasiado lentamente, debe perfilar la carga en los recursos del servidor y la red para sus consultas. - -Puede utilizar la utilidad clickhouse-benchmark para crear perfiles de consultas. Muestra el número de consultas procesadas por segundo, el número de filas procesadas por segundo y percentiles de tiempos de procesamiento de consultas. diff --git a/docs/es/operations/update.md b/docs/es/operations/update.md deleted file mode 100644 index 11d15381d72..00000000000 --- a/docs/es/operations/update.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 47 -toc_title: "Actualizaci\xF3n de ClickHouse" ---- - -# Actualización de ClickHouse {#clickhouse-update} - -Si se instaló ClickHouse desde paquetes deb, ejecute los siguientes comandos en el servidor: - -``` bash -$ sudo apt-get update -$ sudo apt-get install clickhouse-client clickhouse-server -$ sudo service clickhouse-server restart -``` - -Si ha instalado ClickHouse utilizando algo distinto de los paquetes deb recomendados, utilice el método de actualización adecuado. - -ClickHouse no admite una actualización distribuida. La operación debe realizarse consecutivamente en cada servidor separado. No actualice todos los servidores de un clúster simultáneamente, o el clúster no estará disponible durante algún tiempo. diff --git a/docs/es/operations/utilities/clickhouse-benchmark.md b/docs/es/operations/utilities/clickhouse-benchmark.md deleted file mode 100644 index 9bcafa40dfe..00000000000 --- a/docs/es/operations/utilities/clickhouse-benchmark.md +++ /dev/null @@ -1,156 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 61 -toc_title: Sistema abierto. ---- - -# Sistema abierto {#clickhouse-benchmark} - -Se conecta a un servidor ClickHouse y envía repetidamente las consultas especificadas. - -Sintaxis: - -``` bash -$ echo "single query" | clickhouse-benchmark [keys] -``` - -o - -``` bash -$ clickhouse-benchmark [keys] <<< "single query" -``` - -Si desea enviar un conjunto de consultas, cree un archivo de texto y coloque cada consulta en la cadena individual de este archivo. Por ejemplo: - -``` sql -SELECT * FROM system.numbers LIMIT 10000000 -SELECT 1 -``` - -Luego pase este archivo a una entrada estándar de `clickhouse-benchmark`. - -``` bash -clickhouse-benchmark [keys] < queries_file -``` - -## Claves {#clickhouse-benchmark-keys} - -- `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` se envía simultáneamente. Valor predeterminado: 1. -- `-d N`, `--delay=N` — Interval in seconds between intermediate reports (set 0 to disable reports). Default value: 1. -- `-h WORD`, `--host=WORD` — Server host. Default value: `localhost`. 
Para el [modo de comparación](#clickhouse-benchmark-comparison-mode) puedes usar múltiples `-h` claves. -- `-p N`, `--port=N` — Server port. Default value: 9000. For the [modo de comparación](#clickhouse-benchmark-comparison-mode) puedes usar múltiples `-p` claves. -- `-i N`, `--iterations=N` — Total number of queries. Default value: 0. -- `-r`, `--randomize` — Random order of queries execution if there is more then one input query. -- `-s`, `--secure` — Using TLS connection. -- `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` detiene el envío de consultas cuando se alcanza el límite de tiempo especificado. Valor predeterminado: 0 (límite de tiempo desactivado). -- `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [modo de comparación](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` realiza el [Prueba t independiente de dos muestras para estudiantes](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) prueba para determinar si las dos distribuciones no son diferentes con el nivel de confianza seleccionado. -- `--cumulative` — Printing cumulative data instead of data per interval. -- `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`. -- `--json=FILEPATH` — JSON output. When the key is set, `clickhouse-benchmark` emite un informe al archivo JSON especificado. -- `--user=USERNAME` — ClickHouse user name. Default value: `default`. -- `--password=PSWD` — ClickHouse user password. Default value: empty string. -- `--stacktrace` — Stack traces output. When the key is set, `clickhouse-bencmark` las salidas acumulan rastros de excepciones. -- `--stage=WORD` — Query processing stage at server. ClickHouse stops query processing and returns answer to `clickhouse-benchmark` en la etapa especificada. Valores posibles: `complete`, `fetch_columns`, `with_mergeable_state`. Valor predeterminado: `complete`. -- `--help` — Shows the help message. - -Si desea aplicar alguna [configuración](../../operations/settings/index.md) para consultas, páselas como una clave `--= SETTING_VALUE`. Por ejemplo, `--max_memory_usage=1048576`. - -## Salida {#clickhouse-benchmark-output} - -Predeterminada, `clickhouse-benchmark` informes para cada `--delay` intervalo. - -Ejemplo del informe: - -``` text -Queries executed: 10. - -localhost:9000, queries 10, QPS: 6.772, RPS: 67904487.440, MiB/s: 518.070, result RPS: 67721584.984, result MiB/s: 516.675. - -0.000% 0.145 sec. -10.000% 0.146 sec. -20.000% 0.146 sec. -30.000% 0.146 sec. -40.000% 0.147 sec. -50.000% 0.148 sec. -60.000% 0.148 sec. -70.000% 0.148 sec. -80.000% 0.149 sec. -90.000% 0.150 sec. -95.000% 0.150 sec. -99.000% 0.150 sec. -99.900% 0.150 sec. -99.990% 0.150 sec. -``` - -En el informe puedes encontrar: - -- Número de consultas en el `Queries executed:` campo. - -- Cadena de estado que contiene (en orden): - - - Punto final del servidor ClickHouse. - - Número de consultas procesadas. - - QPS: QPS: ¿Cuántas consultas realizó el servidor por segundo durante un período `--delay` argumento. - - RPS: ¿Cuántas filas lee el servidor por segundo durante un período `--delay` argumento. - - MiB/s: ¿Cuántos mebibytes servidor leído por segundo durante un período especificado en el `--delay` argumento. - - resultado RPS: ¿Cuántas filas colocadas por el servidor al resultado de una consulta por segundo durante un período `--delay` argumento. - - resultado MiB/s. 
¿Cuántos mebibytes colocados por el servidor al resultado de una consulta por segundo durante un período especificado en el `--delay` argumento. - -- Percentiles de tiempo de ejecución de consultas. - -## Modo de comparación {#clickhouse-benchmark-comparison-mode} - -`clickhouse-benchmark` puede comparar el rendimiento de dos servidores ClickHouse en ejecución. - -Para utilizar el modo de comparación, especifique los puntos finales de ambos servidores `--host`, `--port` claves. Las claves coinciden entre sí por posición en la lista de argumentos, la primera `--host` se empareja con la primera `--port` y así sucesivamente. `clickhouse-benchmark` establece conexiones a ambos servidores, luego envía consultas. Cada consulta dirigida a un servidor seleccionado al azar. Los resultados se muestran para cada servidor por separado. - -## Ejemplo {#clickhouse-benchmark-example} - -``` bash -$ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark -i 10 -``` - -``` text -Loaded 1 queries. - -Queries executed: 6. - -localhost:9000, queries 6, QPS: 6.153, RPS: 123398340.957, MiB/s: 941.455, result RPS: 61532982.200, result MiB/s: 469.459. - -0.000% 0.159 sec. -10.000% 0.159 sec. -20.000% 0.159 sec. -30.000% 0.160 sec. -40.000% 0.160 sec. -50.000% 0.162 sec. -60.000% 0.164 sec. -70.000% 0.165 sec. -80.000% 0.166 sec. -90.000% 0.166 sec. -95.000% 0.167 sec. -99.000% 0.167 sec. -99.900% 0.167 sec. -99.990% 0.167 sec. - - - -Queries executed: 10. - -localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, result RPS: 60815551.642, result MiB/s: 463.986. - -0.000% 0.159 sec. -10.000% 0.159 sec. -20.000% 0.160 sec. -30.000% 0.163 sec. -40.000% 0.164 sec. -50.000% 0.165 sec. -60.000% 0.166 sec. -70.000% 0.166 sec. -80.000% 0.167 sec. -90.000% 0.167 sec. -95.000% 0.170 sec. -99.000% 0.172 sec. -99.900% 0.172 sec. -99.990% 0.172 sec. -``` diff --git a/docs/es/operations/utilities/clickhouse-copier.md b/docs/es/operations/utilities/clickhouse-copier.md deleted file mode 100644 index 5717ffaa737..00000000000 --- a/docs/es/operations/utilities/clickhouse-copier.md +++ /dev/null @@ -1,176 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 59 -toc_title: "M\xE9todo de codificaci\xF3n de datos:" ---- - -# Método de codificación de datos: {#clickhouse-copier} - -Copia datos de las tablas de un clúster en tablas de otro (o del mismo) clúster. - -Puede ejecutar varios `clickhouse-copier` instancias en diferentes servidores para realizar el mismo trabajo. ZooKeeper se utiliza para sincronizar los procesos. - -Después de comenzar, `clickhouse-copier`: - -- Se conecta a ZooKeeper y recibe: - - - Copia de trabajos. - - El estado de los trabajos de copia. - -- Realiza los trabajos. - - Cada proceso en ejecución elige el “closest” el fragmento del clúster de origen y copia los datos en el clúster de destino, reafirmando los datos si es necesario. - -`clickhouse-copier` realiza un seguimiento de los cambios en ZooKeeper y los aplica sobre la marcha. - -Para reducir el tráfico de red, recomendamos ejecutar `clickhouse-copier` en el mismo servidor donde se encuentran los datos de origen. - -## Ejecución de Clickhouse-copiadora {#running-clickhouse-copier} - -La utilidad debe ejecutarse manualmente: - -``` bash -$ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir -``` - -Parámetros: - -- `daemon` — Starts `clickhouse-copier` en modo daemon. 
-- `config` — The path to the `zookeeper.xml` con los parámetros para la conexión a ZooKeeper. -- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` procesos y tareas de almacenamiento. Las tareas se almacenan en `$task-path/description`. -- `task-file` — Optional path to file with task configuration for initial upload to ZooKeeper. -- `task-upload-force` — Force upload `task-file` incluso si el nodo ya existe. -- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` crear `clickhouse-copier_YYYYMMHHSS_` subdirectorios en `$base-dir`. Si se omite este parámetro, los directorios se crean en el directorio donde `clickhouse-copier` se puso en marcha. - -## Formato de Zookeeper.XML {#format-of-zookeeper-xml} - -``` xml - - - trace - 100M - 3 - - - - - 127.0.0.1 - 2181 - - - -``` - -## Configuración de tareas de copia {#configuration-of-copying-tasks} - -``` xml - - - - - - false - - 127.0.0.1 - 9000 - - - ... - - - - ... - - - - - 2 - - - - 1 - - - - - 0 - - - - - 3 - - 1 - - - - - - - - source_cluster - test - hits - - - destination_cluster - test - hits2 - - - - ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}') - PARTITION BY toMonday(date) - ORDER BY (CounterID, EventDate) - - - - jumpConsistentHash(intHash64(UserID), 2) - - - CounterID != 0 - - - - '2018-02-26' - '2018-03-05' - ... - - - - - - ... - - ... - - -``` - -`clickhouse-copier` seguimiento de los cambios en `/task/path/description` y los aplica sobre la marcha. Por ejemplo, si cambia el valor de `max_workers`, el número de procesos que ejecutan tareas también cambiará. - -[Artículo Original](https://clickhouse.tech/docs/en/operations/utils/clickhouse-copier/) diff --git a/docs/es/operations/utilities/clickhouse-local.md b/docs/es/operations/utilities/clickhouse-local.md deleted file mode 100644 index e122f668f53..00000000000 --- a/docs/es/operations/utilities/clickhouse-local.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 60 -toc_title: clickhouse-local ---- - -# clickhouse-local {#clickhouse-local} - -El `clickhouse-local` El programa le permite realizar un procesamiento rápido en archivos locales, sin tener que implementar y configurar el servidor ClickHouse. - -Acepta datos que representan tablas y las consulta usando [Nombre de la red inalámbrica (SSID):](../../sql-reference/index.md). - -`clickhouse-local` utiliza el mismo núcleo que el servidor ClickHouse, por lo que es compatible con la mayoría de las características y el mismo conjunto de formatos y motores de tabla. - -Predeterminada `clickhouse-local` no tiene acceso a los datos en el mismo host, pero admite la carga de la configuración del servidor `--config-file` argumento. - -!!! warning "Advertencia" - No se recomienda cargar la configuración del servidor de producción en `clickhouse-local` Porque los datos pueden dañarse en caso de error humano. - -## Uso {#usage} - -Uso básico: - -``` bash -$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" -``` - -Argumento: - -- `-S`, `--structure` — table structure for input data. -- `-if`, `--input-format` — input format, `TSV` predeterminada. -- `-f`, `--file` — path to data, `stdin` predeterminada. -- `-q` `--query` — queries to execute with `;` como delimitador. -- `-N`, `--table` — table name where to put output data, `table` predeterminada. 
-- `-of`, `--format`, `--output-format` — output format, `TSV` by default.
-- `--stacktrace` — whether to dump debug output in case of exception.
-- `--verbose` — more details on query execution.
-- `-s` — disables `stderr` logging.
-- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration is empty.
-- `--help` — arguments references for `clickhouse-local`.
-
-There are also arguments for each ClickHouse configuration variable, which are more commonly used instead of `--config-file`.
-
-## Examples {#examples}
-
-``` bash
-$ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table"
-Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec.
-1 2
-3 4
-```
-
-The previous example is the same as:
-
-``` bash
-$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table"
-Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec.
-1 2
-3 4
-```
-
-Now let's output the memory usage of each Unix user:
-
-``` bash
-$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty"
-```
-
-``` text
-Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
-┏━━━━━━━━━━┳━━━━━━━━━━┓
-┃ user     ┃ memTotal ┃
-┡━━━━━━━━━━╇━━━━━━━━━━┩
-│ bayonet  │    113.5 │
-├──────────┼──────────┤
-│ root     │      8.8 │
-├──────────┼──────────┤
-...
-```
-
-[Original article](https://clickhouse.tech/docs/en/operations/utils/clickhouse-local/)
diff --git a/docs/es/operations/utilities/index.md b/docs/es/operations/utilities/index.md
deleted file mode 100644
index a69397a326c..00000000000
--- a/docs/es/operations/utilities/index.md
+++ /dev/null
@@ -1,15 +0,0 @@
----
-machine_translated: true
-machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd
-toc_folder_title: Utilities
-toc_priority: 56
-toc_title: Overview
----
-
-# ClickHouse Utility {#clickhouse-utility}
-
-- [clickhouse-local](clickhouse-local.md#clickhouse-local) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this.
-- [clickhouse-copier](clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster.
-- [clickhouse-benchmark](clickhouse-benchmark.md) — Loads server with the custom queries and settings.
-
-[Original article](https://clickhouse.tech/docs/en/operations/utils/)
diff --git a/docs/es/roadmap.md b/docs/es/roadmap.md
deleted file mode 100644
index 60db1c608df..00000000000
--- a/docs/es/roadmap.md
+++ /dev/null
@@ -1,16 +0,0 @@
----
-machine_translated: true
----
-
-# Roadmap {#roadmap}
-
-## Q1 2020 {#q1-2020}
-
-- Role-based access control
-
-## Q2 2020 {#q2-2020}
-
-- Integration with external authentication services
-- Resource pools for more precise distribution of cluster capacity between users
-
-{## [Original article](https://clickhouse.tech/docs/es/roadmap/) ##}
diff --git a/docs/es/sql-reference/aggregate-functions/combinators.md b/docs/es/sql-reference/aggregate-functions/combinators.md
deleted file mode 100644
index c9fdcb9478f..00000000000
--- a/docs/es/sql-reference/aggregate-functions/combinators.md
+++ /dev/null
@@ -1,245 +0,0 @@
----
-machine_translated: true
-machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd
-toc_priority: 37
-toc_title: Combinators
----
-
-# Aggregate function combinators {#aggregate_functions_combinators}
-
-The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works.
-
-## -If {#agg-functions-combinator-if}
-
-The suffix -If can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument – a condition (UInt8 type). The aggregate function processes only the rows that trigger the condition. If the condition was not triggered even once, it returns a default value (usually zeros or empty strings).
-
-Examples: `sumIf(column, cond)`, `countIf(cond)`, `avgIf(x, cond)`, `quantilesTimingIf(level1, level2)(x, cond)`, `argMinIf(arg, val, cond)` and so on.
-
-With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, in Yandex.Metrica, conditional aggregate functions are used to implement the segment comparison functionality.
-
-## -Array {#agg-functions-combinator-array}
-
-The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the ‘Array(T)’ type (arrays) instead of ‘T’ type arguments. If the aggregate function accepts multiple arguments, they must be arrays of equal length. When processing arrays, the aggregate function works like the original aggregate function across all array elements.
-
-Example 1: `sumArray(arr)` – totals all the elements of all ‘arr’ arrays. In this example, it could have been written more simply: `sum(arraySum(arr))`.
-
-Example 2: `uniqArray(arr)` – counts the number of unique elements in all ‘arr’ arrays. This could be done in an easier way: `uniq(arrayJoin(arr))`, but it is not always possible to add ‘arrayJoin’ to a query.
-
--If and -Array can be combined. However, ‘Array’ must come first, then ‘If’. Examples: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Due to this order, the ‘cond’ argument will not be an array.
-
-## -State {#agg-functions-combinator-state}
-
-If you apply this combinator, the aggregate function does not return the resulting value (such as the number of unique values for the [uniq](reference.md#agg_function-uniq) function), but an intermediate state of the aggregation (for `uniq`, this is the hash table for calculating the number of unique values).
Este es un `AggregateFunction(...)` que puede ser utilizado para su posterior procesamiento o almacenado en una tabla para terminar de agregar más tarde. - -Para trabajar con estos estados, use: - -- [AgregaciónMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) motor de mesa. -- [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) función. -- [runningAccumulate](../../sql-reference/functions/other-functions.md#function-runningaccumulate) función. -- [-Fusionar](#aggregate_functions_combinators-merge) combinador. -- [-MergeState](#aggregate_functions_combinators-mergestate) combinador. - -## -Fusionar {#aggregate_functions_combinators-merge} - -Si aplica este combinador, la función de agregado toma el estado de agregación intermedio como argumento, combina los estados para finalizar la agregación y devuelve el valor resultante. - -## -MergeState {#aggregate_functions_combinators-mergestate} - -Combina los estados de agregación intermedios de la misma manera que el combinador -Merge. Sin embargo, no devuelve el valor resultante, sino un estado de agregación intermedio, similar al combinador -State. - -## -ForEach {#agg-functions-combinator-foreach} - -Convierte una función de agregado para tablas en una función de agregado para matrices que agrega los elementos de matriz correspondientes y devuelve una matriz de resultados. Por ejemplo, `sumForEach` para las matrices `[1, 2]`, `[3, 4, 5]`y`[6, 7]`devuelve el resultado `[10, 13, 5]` después de agregar los elementos de la matriz correspondientes. - -## -OPor defecto {#agg-functions-combinator-ordefault} - -Cambia el comportamiento de una función agregada. - -Si una función agregada no tiene valores de entrada, con este combinador devuelve el valor predeterminado para su tipo de datos de retorno. Se aplica a las funciones agregadas que pueden tomar datos de entrada vacíos. - -`-OrDefault` se puede utilizar con otros combinadores. - -**Sintaxis** - -``` sql -OrDefault(x) -``` - -**Parámetros** - -- `x` — Aggregate function parameters. - -**Valores devueltos** - -Devuelve el valor predeterminado del tipo devuelto de una función de agregado si no hay nada que agregar. - -El tipo depende de la función de agregado utilizada. - -**Ejemplo** - -Consulta: - -``` sql -SELECT avg(number), avgOrDefault(number) FROM numbers(0) -``` - -Resultado: - -``` text -┌─avg(number)─┬─avgOrDefault(number)─┐ -│ nan │ 0 │ -└─────────────┴──────────────────────┘ -``` - -También `-OrDefault` se puede utilizar con otros combinadores. Es útil cuando la función de agregado no acepta la entrada vacía. - -Consulta: - -``` sql -SELECT avgOrDefaultIf(x, x > 10) -FROM -( - SELECT toDecimal32(1.23, 2) AS x -) -``` - -Resultado: - -``` text -┌─avgOrDefaultIf(x, greater(x, 10))─┐ -│ 0.00 │ -└───────────────────────────────────┘ -``` - -## -OrNull {#agg-functions-combinator-ornull} - -Cambia el comportamiento de una función agregada. - -Este combinador convierte un resultado de una función agregada en [NULL](../data-types/nullable.md) tipo de datos. Si la función de agregado no tiene valores para calcular devuelve [NULL](../syntax.md#null-literal). - -`-OrNull` se puede utilizar con otros combinadores. - -**Sintaxis** - -``` sql -OrNull(x) -``` - -**Parámetros** - -- `x` — Aggregate function parameters. - -**Valores devueltos** - -- El resultado de la función de agregado, convertida a la `Nullable` tipo de datos. -- `NULL`, si no hay nada que agregar. - -Tipo: `Nullable(aggregate function return type)`. 
- -**Ejemplo** - -Añadir `-orNull` hasta el final de la función agregada. - -Consulta: - -``` sql -SELECT sumOrNull(number), toTypeName(sumOrNull(number)) FROM numbers(10) WHERE number > 10 -``` - -Resultado: - -``` text -┌─sumOrNull(number)─┬─toTypeName(sumOrNull(number))─┐ -│ ᴺᵁᴸᴸ │ Nullable(UInt64) │ -└───────────────────┴───────────────────────────────┘ -``` - -También `-OrNull` se puede utilizar con otros combinadores. Es útil cuando la función de agregado no acepta la entrada vacía. - -Consulta: - -``` sql -SELECT avgOrNullIf(x, x > 10) -FROM -( - SELECT toDecimal32(1.23, 2) AS x -) -``` - -Resultado: - -``` text -┌─avgOrNullIf(x, greater(x, 10))─┐ -│ ᴺᵁᴸᴸ │ -└────────────────────────────────┘ -``` - -## -Remuestrear {#agg-functions-combinator-resample} - -Permite dividir los datos en grupos y, a continuación, agregar por separado los datos de esos grupos. Los grupos se crean dividiendo los valores de una columna en intervalos. - -``` sql -Resample(start, end, step)(, resampling_key) -``` - -**Parámetros** - -- `start` — Starting value of the whole required interval for `resampling_key` valor. -- `stop` — Ending value of the whole required interval for `resampling_key` valor. Todo el intervalo no incluye el `stop` valor `[start, stop)`. -- `step` — Step for separating the whole interval into subintervals. The `aggFunction` se ejecuta sobre cada uno de esos subintervalos de forma independiente. -- `resampling_key` — Column whose values are used for separating data into intervals. -- `aggFunction_params` — `aggFunction` parámetros. - -**Valores devueltos** - -- Matriz de `aggFunction` resultados para cada subintervalo. - -**Ejemplo** - -Considere el `people` con los siguientes datos: - -``` text -┌─name───┬─age─┬─wage─┐ -│ John │ 16 │ 10 │ -│ Alice │ 30 │ 15 │ -│ Mary │ 35 │ 8 │ -│ Evelyn │ 48 │ 11.5 │ -│ David │ 62 │ 9.9 │ -│ Brian │ 60 │ 16 │ -└────────┴─────┴──────┘ -``` - -Obtengamos los nombres de las personas cuya edad se encuentra en los intervalos de `[30,60)` y `[60,75)`. Como usamos la representación entera para la edad, obtenemos edades en el `[30, 59]` y `[60,74]` intervalo. - -Para agregar nombres en una matriz, usamos el [Método de codificación de datos:](reference.md#agg_function-grouparray) función de agregado. Se necesita un argumento. En nuestro caso, es el `name` columna. El `groupArrayResample` función debe utilizar el `age` columna para agregar nombres por edad. Para definir los intervalos requeridos, pasamos el `30, 75, 30` discusiones sobre el `groupArrayResample` función. - -``` sql -SELECT groupArrayResample(30, 75, 30)(name, age) FROM people -``` - -``` text -┌─groupArrayResample(30, 75, 30)(name, age)─────┐ -│ [['Alice','Mary','Evelyn'],['David','Brian']] │ -└───────────────────────────────────────────────┘ -``` - -Considera los resultados. - -`Jonh` est? fuera de la muestra porque es demasiado joven. Otras personas se distribuyen de acuerdo con los intervalos de edad especificados. - -Ahora vamos a contar el número total de personas y su salario promedio en los intervalos de edad especificados. 
- -``` sql -SELECT - countResample(30, 75, 30)(name, age) AS amount, - avgResample(30, 75, 30)(wage, age) AS avg_wage -FROM people -``` - -``` text -┌─amount─┬─avg_wage──────────────────┐ -│ [3,2] │ [11.5,12.949999809265137] │ -└────────┴───────────────────────────┘ -``` - -[Artículo Original](https://clickhouse.tech/docs/en/query_language/agg_functions/combinators/) diff --git a/docs/es/sql-reference/aggregate-functions/index.md b/docs/es/sql-reference/aggregate-functions/index.md deleted file mode 100644 index 7c7d58d5f94..00000000000 --- a/docs/es/sql-reference/aggregate-functions/index.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Funciones agregadas -toc_priority: 33 -toc_title: "Implantaci\xF3n" ---- - -# Funciones agregadas {#aggregate-functions} - -Las funciones agregadas funcionan en el [normal](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) forma esperada por los expertos en bases de datos. - -ClickHouse también es compatible: - -- [Funciones agregadas paramétricas](parametric-functions.md#aggregate_functions_parametric) que aceptan otros parámetros además de las columnas. -- [Combinadores](combinators.md#aggregate_functions_combinators), que cambian el comportamiento de las funciones agregadas. - -## Procesamiento NULL {#null-processing} - -Durante la agregación, todos `NULL`s se omiten. - -**Ejemplos:** - -Considere esta tabla: - -``` text -┌─x─┬────y─┐ -│ 1 │ 2 │ -│ 2 │ ᴺᵁᴸᴸ │ -│ 3 │ 2 │ -│ 3 │ 3 │ -│ 3 │ ᴺᵁᴸᴸ │ -└───┴──────┘ -``` - -Digamos que necesita sumar los valores en el `y` columna: - -``` sql -SELECT sum(y) FROM t_null_big -``` - - ┌─sum(y)─┐ - │ 7 │ - └────────┘ - -El `sum` función interpreta `NULL` como `0`. En particular, esto significa que si la función recibe la entrada de una selección donde todos los valores son `NULL`, entonces el resultado será `0`, ni `NULL`. - -Ahora puedes usar el `groupArray` función para crear una matriz a partir de la `y` columna: - -``` sql -SELECT groupArray(y) FROM t_null_big -``` - -``` text -┌─groupArray(y)─┐ -│ [2,2,3] │ -└───────────────┘ -``` - -`groupArray` no incluye `NULL` en la matriz resultante. - -[Artículo Original](https://clickhouse.tech/docs/en/query_language/agg_functions/) diff --git a/docs/es/sql-reference/aggregate-functions/parametric-functions.md b/docs/es/sql-reference/aggregate-functions/parametric-functions.md deleted file mode 100644 index ea32920401b..00000000000 --- a/docs/es/sql-reference/aggregate-functions/parametric-functions.md +++ /dev/null @@ -1,499 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: "Param\xE9trico" ---- - -# Funciones agregadas paramétricas {#aggregate_functions_parametric} - -Some aggregate functions can accept not only argument columns (used for compression), but a set of parameters – constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments. - -## histograma {#histogram} - -Calcula un histograma adaptativo. No garantiza resultados precisos. - -``` sql -histogram(number_of_bins)(values) -``` - -Las funciones utiliza [Un algoritmo de árbol de decisión paralelo de transmisión](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). Los bordes de los contenedores de histograma se ajustan a medida que los nuevos datos entran en una función. 
En caso común, los anchos de los contenedores no son iguales. - -**Parámetros** - -`number_of_bins` — Upper limit for the number of bins in the histogram. The function automatically calculates the number of bins. It tries to reach the specified number of bins, but if it fails, it uses fewer bins. -`values` — [Expresion](../syntax.md#syntax-expressions) resultando en valores de entrada. - -**Valores devueltos** - -- [Matriz](../../sql-reference/data-types/array.md) de [Tuples](../../sql-reference/data-types/tuple.md) del siguiente formato: - - ``` - [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] - ``` - - - `lower` — Lower bound of the bin. - - `upper` — Upper bound of the bin. - - `height` — Calculated height of the bin. - -**Ejemplo** - -``` sql -SELECT histogram(5)(number + 1) -FROM ( - SELECT * - FROM system.numbers - LIMIT 20 -) -``` - -``` text -┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ -│ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ -└─────────────────────────────────────────────────────────────────────────┘ -``` - -Puede visualizar un histograma con el [Bar](../../sql-reference/functions/other-functions.md#function-bar) función, por ejemplo: - -``` sql -WITH histogram(5)(rand() % 100) AS hist -SELECT - arrayJoin(hist).3 AS height, - bar(height, 0, 6, 5) AS bar -FROM -( - SELECT * - FROM system.numbers - LIMIT 20 -) -``` - -``` text -┌─height─┬─bar───┐ -│ 2.125 │ █▋ │ -│ 3.25 │ ██▌ │ -│ 5.625 │ ████▏ │ -│ 5.625 │ ████▏ │ -│ 3.375 │ ██▌ │ -└────────┴───────┘ -``` - -En este caso, debe recordar que no conoce los bordes del contenedor del histograma. - -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} - -Comprueba si la secuencia contiene una cadena de eventos que coincida con el patrón. - -``` sql -sequenceMatch(pattern)(timestamp, cond1, cond2, ...) -``` - -!!! warning "Advertencia" - Los eventos que ocurren en el mismo segundo pueden estar en la secuencia en un orden indefinido que afecta el resultado. - -**Parámetros** - -- `pattern` — Pattern string. See [Sintaxis de patrón](#sequence-function-pattern-syntax). - -- `timestamp` — Column considered to contain time data. Typical data types are `Date` y `DateTime`. También puede utilizar cualquiera de los [UInt](../../sql-reference/data-types/int-uint.md) tipos de datos. - -- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. Puede pasar hasta 32 argumentos de condición. La función sólo tiene en cuenta los eventos descritos en estas condiciones. Si la secuencia contiene datos que no se describen en una condición, la función los omite. - -**Valores devueltos** - -- 1, si el patrón coincide. -- 0, si el patrón no coincide. - -Tipo: `UInt8`. - - -**Sintaxis de patrón** - -- `(?N)` — Matches the condition argument at position `N`. Las condiciones están numeradas en el `[1, 32]` gama. Por ejemplo, `(?1)` coincide con el argumento pasado al `cond1` parámetro. - -- `.*` — Matches any number of events. You don't need conditional arguments to match this element of the pattern. - -- `(?t operator value)` — Sets the time in seconds that should separate two events. For example, pattern `(?1)(?t>1800)(?2)` coincide con los eventos que ocurren a más de 1800 segundos el uno del otro. Un número arbitrario de cualquier evento puede estar entre estos eventos. Puede usar el `>=`, `>`, `<`, `<=` operador. 
- -**Ejemplos** - -Considere los datos en el `t` tabla: - -``` text -┌─time─┬─number─┐ -│ 1 │ 1 │ -│ 2 │ 3 │ -│ 3 │ 2 │ -└──────┴────────┘ -``` - -Realizar la consulta: - -``` sql -SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t -``` - -``` text -┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ -│ 1 │ -└───────────────────────────────────────────────────────────────────────┘ -``` - -La función encontró la cadena de eventos donde el número 2 sigue al número 1. Se saltó el número 3 entre ellos, porque el número no se describe como un evento. Si queremos tener en cuenta este número al buscar la cadena de eventos dada en el ejemplo, debemos establecer una condición para ello. - -``` sql -SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t -``` - -``` text -┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ -│ 0 │ -└──────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -En este caso, la función no pudo encontrar la cadena de eventos que coincida con el patrón, porque el evento para el número 3 ocurrió entre 1 y 2. Si en el mismo caso comprobamos la condición para el número 4, la secuencia coincidiría con el patrón. - -``` sql -SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t -``` - -``` text -┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ -│ 1 │ -└──────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -**Ver también** - -- [sequenceCount](#function-sequencecount) - -## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} - -Cuenta el número de cadenas de eventos que coinciden con el patrón. La función busca cadenas de eventos que no se superponen. Comienza a buscar la siguiente cadena después de que se haga coincidir la cadena actual. - -!!! warning "Advertencia" - Los eventos que ocurren en el mismo segundo pueden estar en la secuencia en un orden indefinido que afecta el resultado. - -``` sql -sequenceCount(pattern)(timestamp, cond1, cond2, ...) -``` - -**Parámetros** - -- `pattern` — Pattern string. See [Sintaxis de patrón](#sequence-function-pattern-syntax). - -- `timestamp` — Column considered to contain time data. Typical data types are `Date` y `DateTime`. También puede utilizar cualquiera de los [UInt](../../sql-reference/data-types/int-uint.md) tipos de datos. - -- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. Puede pasar hasta 32 argumentos de condición. La función sólo tiene en cuenta los eventos descritos en estas condiciones. Si la secuencia contiene datos que no se describen en una condición, la función los omite. - -**Valores devueltos** - -- Número de cadenas de eventos no superpuestas que coinciden. - -Tipo: `UInt64`. 
- -**Ejemplo** - -Considere los datos en el `t` tabla: - -``` text -┌─time─┬─number─┐ -│ 1 │ 1 │ -│ 2 │ 3 │ -│ 3 │ 2 │ -│ 4 │ 1 │ -│ 5 │ 3 │ -│ 6 │ 2 │ -└──────┴────────┘ -``` - -Cuente cuántas veces ocurre el número 2 después del número 1 con cualquier cantidad de otros números entre ellos: - -``` sql -SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t -``` - -``` text -┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ -│ 2 │ -└─────────────────────────────────────────────────────────────────────────┘ -``` - -**Ver también** - -- [sequenceMatch](#function-sequencematch) - -## ventanaEmbudo {#windowfunnel} - -Busca cadenas de eventos en una ventana de tiempo deslizante y calcula el número máximo de eventos que ocurrieron desde la cadena. - -La función funciona de acuerdo con el algoritmo: - -- La función busca datos que desencadenan la primera condición en la cadena y establece el contador de eventos en 1. Este es el momento en que comienza la ventana deslizante. - -- Si los eventos de la cadena ocurren secuencialmente dentro de la ventana, el contador se incrementa. Si se interrumpe la secuencia de eventos, el contador no se incrementa. - -- Si los datos tienen varias cadenas de eventos en diferentes puntos de finalización, la función solo generará el tamaño de la cadena más larga. - -**Sintaxis** - -``` sql -windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) -``` - -**Parámetros** - -- `window` — Length of the sliding window in seconds. -- `mode` - Es un argumento opcional. - - `'strict'` - Cuando el `'strict'` se establece, windowFunnel() aplica condiciones solo para los valores únicos. -- `timestamp` — Name of the column containing the timestamp. Data types supported: [Fecha](../../sql-reference/data-types/date.md), [FechaHora](../../sql-reference/data-types/datetime.md#data_type-datetime) y otros tipos de enteros sin signo (tenga en cuenta que aunque timestamp admite el `UInt64` tipo, su valor no puede exceder el máximo de Int64, que es 2 ^ 63 - 1). -- `cond` — Conditions or data describing the chain of events. [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valor devuelto** - -El número máximo de condiciones desencadenadas consecutivas de la cadena dentro de la ventana de tiempo deslizante. -Se analizan todas las cadenas en la selección. - -Tipo: `Integer`. - -**Ejemplo** - -Determine si un período de tiempo establecido es suficiente para que el usuario seleccione un teléfono y lo compre dos veces en la tienda en línea. - -Establezca la siguiente cadena de eventos: - -1. El usuario inició sesión en su cuenta en la tienda (`eventID = 1003`). -2. El usuario busca un teléfono (`eventID = 1007, product = 'phone'`). -3. El usuario realizó un pedido (`eventID = 1009`). -4. El usuario volvió a realizar el pedido (`eventID = 1010`). 
-
-Input table:
-
-``` text
-┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐
-│ 2019-01-28 │       1 │ 2019-01-29 10:00:00 │    1003 │ phone   │
-└────────────┴─────────┴─────────────────────┴─────────┴─────────┘
-┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐
-│ 2019-01-31 │       1 │ 2019-01-31 09:00:00 │    1007 │ phone   │
-└────────────┴─────────┴─────────────────────┴─────────┴─────────┘
-┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐
-│ 2019-01-30 │       1 │ 2019-01-30 08:00:00 │    1009 │ phone   │
-└────────────┴─────────┴─────────────────────┴─────────┴─────────┘
-┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐
-│ 2019-02-01 │       1 │ 2019-02-01 08:00:00 │    1010 │ phone   │
-└────────────┴─────────┴─────────────────────┴─────────┴─────────┘
-```
-
-Find out how far the user `user_id` could get through the chain in a period from January to February 2019.
-
-Query:
-
-``` sql
-SELECT
-    level,
-    count() AS c
-FROM
-(
-    SELECT
-        user_id,
-        windowFunnel(6048000000000000)(timestamp, eventID = 1003, eventID = 1009, eventID = 1007, eventID = 1010) AS level
-    FROM trend
-    WHERE (event_date >= '2019-01-01') AND (event_date <= '2019-02-02')
-    GROUP BY user_id
-)
-GROUP BY level
-ORDER BY level ASC
-```
-
-Result:
-
-``` text
-┌─level─┬─c─┐
-│     4 │ 1 │
-└───────┴───┘
-```
-
-## retention {#retention}
-
-The function takes as arguments a set of conditions from 1 to 32 arguments of type `UInt8` that indicate whether a certain condition was met for the event.
-Any condition can be specified as an argument (as in [WHERE](../../sql-reference/statements/select/where.md#select-where)).
-
-The conditions, except the first, apply in pairs: the result of the second will be true if the first and second are true, of the third if the first and third are true, etc.
-
-**Syntax**
-
-``` sql
-retention(cond1, cond2, ..., cond32);
-```
-
-**Parameters**
-
-- `cond` — an expression that returns a `UInt8` result (1 or 0).
-
-**Returned value**
-
-The array of 1 or 0.
-
-- 1 — condition was met for the event.
-- 0 — condition wasn't met for the event.
-
-Type: `UInt8`.
-
-**Example**
-
-Let's consider an example of calculating the `retention` function to determine site traffic.
-
-**1.** Create a table to illustrate an example.
- -``` sql -CREATE TABLE retention_test(date Date, uid Int32) ENGINE = Memory; - -INSERT INTO retention_test SELECT '2020-01-01', number FROM numbers(5); -INSERT INTO retention_test SELECT '2020-01-02', number FROM numbers(10); -INSERT INTO retention_test SELECT '2020-01-03', number FROM numbers(15); -``` - -Tabla de entrada: - -Consulta: - -``` sql -SELECT * FROM retention_test -``` - -Resultado: - -``` text -┌───────date─┬─uid─┐ -│ 2020-01-01 │ 0 │ -│ 2020-01-01 │ 1 │ -│ 2020-01-01 │ 2 │ -│ 2020-01-01 │ 3 │ -│ 2020-01-01 │ 4 │ -└────────────┴─────┘ -┌───────date─┬─uid─┐ -│ 2020-01-02 │ 0 │ -│ 2020-01-02 │ 1 │ -│ 2020-01-02 │ 2 │ -│ 2020-01-02 │ 3 │ -│ 2020-01-02 │ 4 │ -│ 2020-01-02 │ 5 │ -│ 2020-01-02 │ 6 │ -│ 2020-01-02 │ 7 │ -│ 2020-01-02 │ 8 │ -│ 2020-01-02 │ 9 │ -└────────────┴─────┘ -┌───────date─┬─uid─┐ -│ 2020-01-03 │ 0 │ -│ 2020-01-03 │ 1 │ -│ 2020-01-03 │ 2 │ -│ 2020-01-03 │ 3 │ -│ 2020-01-03 │ 4 │ -│ 2020-01-03 │ 5 │ -│ 2020-01-03 │ 6 │ -│ 2020-01-03 │ 7 │ -│ 2020-01-03 │ 8 │ -│ 2020-01-03 │ 9 │ -│ 2020-01-03 │ 10 │ -│ 2020-01-03 │ 11 │ -│ 2020-01-03 │ 12 │ -│ 2020-01-03 │ 13 │ -│ 2020-01-03 │ 14 │ -└────────────┴─────┘ -``` - -**2.** Agrupar usuarios por ID único `uid` utilizando el `retention` función. - -Consulta: - -``` sql -SELECT - uid, - retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r -FROM retention_test -WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') -GROUP BY uid -ORDER BY uid ASC -``` - -Resultado: - -``` text -┌─uid─┬─r───────┐ -│ 0 │ [1,1,1] │ -│ 1 │ [1,1,1] │ -│ 2 │ [1,1,1] │ -│ 3 │ [1,1,1] │ -│ 4 │ [1,1,1] │ -│ 5 │ [0,0,0] │ -│ 6 │ [0,0,0] │ -│ 7 │ [0,0,0] │ -│ 8 │ [0,0,0] │ -│ 9 │ [0,0,0] │ -│ 10 │ [0,0,0] │ -│ 11 │ [0,0,0] │ -│ 12 │ [0,0,0] │ -│ 13 │ [0,0,0] │ -│ 14 │ [0,0,0] │ -└─────┴─────────┘ -``` - -**3.** Calcule el número total de visitas al sitio por día. - -Consulta: - -``` sql -SELECT - sum(r[1]) AS r1, - sum(r[2]) AS r2, - sum(r[3]) AS r3 -FROM -( - SELECT - uid, - retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r - FROM retention_test - WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') - GROUP BY uid -) -``` - -Resultado: - -``` text -┌─r1─┬─r2─┬─r3─┐ -│ 5 │ 5 │ 5 │ -└────┴────┴────┘ -``` - -Donde: - -- `r1`- el número de visitantes únicos que visitaron el sitio durante 2020-01-01 (la `cond1` condición). -- `r2`- el número de visitantes únicos que visitaron el sitio durante un período de tiempo específico entre 2020-01-01 y 2020-01-02 (`cond1` y `cond2` condición). -- `r3`- el número de visitantes únicos que visitaron el sitio durante un período de tiempo específico entre 2020-01-01 y 2020-01-03 (`cond1` y `cond3` condición). - -## UniqUpTo(N)(x) {#uniquptonx} - -Calculates the number of different argument values ​​if it is less than or equal to N. If the number of different argument values is greater than N, it returns N + 1. - -Recomendado para usar con Ns pequeños, hasta 10. El valor máximo de N es 100. - -Para el estado de una función agregada, utiliza la cantidad de memoria igual a 1 + N \* el tamaño de un valor de bytes. -Para las cadenas, almacena un hash no criptográfico de 8 bytes. Es decir, el cálculo se aproxima a las cadenas. - -La función también funciona para varios argumentos. - -Funciona lo más rápido posible, excepto en los casos en que se usa un valor N grande y el número de valores únicos es ligeramente menor que N. - -Ejemplo de uso: - -``` text -Problem: Generate a report that shows only keywords that produced at least 5 unique users. 
-Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 -``` - -[Artículo Original](https://clickhouse.tech/docs/en/query_language/agg_functions/parametric_functions/) - -## sumMapFiltered(keys_to_keep)(claves, valores) {#summapfilteredkeys-to-keepkeys-values} - -El mismo comportamiento que [sumMap](reference.md#agg_functions-summap) excepto que una matriz de claves se pasa como un parámetro. Esto puede ser especialmente útil cuando se trabaja con una alta cardinalidad de claves. diff --git a/docs/es/sql-reference/aggregate-functions/reference.md b/docs/es/sql-reference/aggregate-functions/reference.md deleted file mode 100644 index 572c4d01051..00000000000 --- a/docs/es/sql-reference/aggregate-functions/reference.md +++ /dev/null @@ -1,1914 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: Referencia ---- - -# Referencia de función agregada {#aggregate-functions-reference} - -## contar {#agg_function-count} - -Cuenta el número de filas o valores no NULL. - -ClickHouse admite las siguientes sintaxis para `count`: -- `count(expr)` o `COUNT(DISTINCT expr)`. -- `count()` o `COUNT(*)`. El `count()` la sintaxis es específica de ClickHouse. - -**Parámetros** - -La función puede tomar: - -- Cero parámetros. -- Una [expresion](../syntax.md#syntax-expressions). - -**Valor devuelto** - -- Si se llama a la función sin parámetros, cuenta el número de filas. -- Si el [expresion](../syntax.md#syntax-expressions) se pasa, entonces la función cuenta cuántas veces esta expresión devuelve no nula. Si la expresión devuelve un [NULL](../../sql-reference/data-types/nullable.md)-type valor, entonces el resultado de `count` no se queda `Nullable`. La función devuelve 0 si la expresión devuelta `NULL` para todas las filas. - -En ambos casos el tipo del valor devuelto es [UInt64](../../sql-reference/data-types/int-uint.md). - -**Detalles** - -ClickHouse soporta el `COUNT(DISTINCT ...)` sintaxis. El comportamiento de esta construcción depende del [count_distinct_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation) configuración. Define cuál de las [uniq\*](#agg_function-uniq) se utiliza para realizar la operación. El valor predeterminado es el [uniqExact](#agg_function-uniqexact) función. - -El `SELECT count() FROM table` consulta no está optimizado, porque el número de entradas en la tabla no se almacena por separado. Elige una pequeña columna de la tabla y cuenta el número de valores en ella. - -**Ejemplos** - -Ejemplo 1: - -``` sql -SELECT count() FROM t -``` - -``` text -┌─count()─┐ -│ 5 │ -└─────────┘ -``` - -Ejemplo 2: - -``` sql -SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' -``` - -``` text -┌─name──────────────────────────┬─value─────┐ -│ count_distinct_implementation │ uniqExact │ -└───────────────────────────────┴───────────┘ -``` - -``` sql -SELECT count(DISTINCT num) FROM t -``` - -``` text -┌─uniqExact(num)─┐ -│ 3 │ -└────────────────┘ -``` - -Este ejemplo muestra que `count(DISTINCT num)` se realiza por el `uniqExact` función según el `count_distinct_implementation` valor de ajuste. - -## cualquiera (x) {#agg_function-any} - -Selecciona el primer valor encontrado. -La consulta se puede ejecutar en cualquier orden e incluso en un orden diferente cada vez, por lo que el resultado de esta función es indeterminado. -Para obtener un resultado determinado, puede usar el ‘min’ o ‘max’ función en lugar de ‘any’. 
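A minimal sketch of the difference between `any` and the deterministic `min`, using a hypothetical `visits` table that is assumed here purely for illustration:

``` sql
-- Hypothetical table, created only to contrast any() with the deterministic min().
CREATE TABLE visits (user_id UInt64, browser String) ENGINE = Memory;
INSERT INTO visits VALUES (1, 'chrome'), (1, 'safari'), (2, 'firefox');

SELECT
    user_id,
    any(browser) AS some_browser, -- may be 'chrome' or 'safari' for user 1, depending on processing order
    min(browser) AS min_browser   -- always 'chrome' for user 1: the smallest value is deterministic
FROM visits
GROUP BY user_id
ORDER BY user_id;
```

Here `min` trades the arbitrary pick of `any` for a well-defined one, which is usually what you want when the query result must be reproducible.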
- -En algunos casos, puede confiar en el orden de ejecución. Esto se aplica a los casos en que SELECT proviene de una subconsulta que usa ORDER BY. - -Cuando un `SELECT` consulta tiene el `GROUP BY` cláusula o al menos una función agregada, ClickHouse (en contraste con MySQL) requiere que todas las expresiones `SELECT`, `HAVING`, y `ORDER BY` las cláusulas pueden calcularse a partir de claves o de funciones agregadas. En otras palabras, cada columna seleccionada de la tabla debe usarse en claves o dentro de funciones agregadas. Para obtener un comportamiento como en MySQL, puede colocar las otras columnas en el `any` función de agregado. - -## Cualquier pesado (x) {#anyheavyx} - -Selecciona un valor que ocurre con frecuencia [pesos pesados](http://www.cs.umd.edu/~samir/498/karp.pdf) algoritmo. Si hay un valor que se produce más de la mitad de los casos en cada uno de los subprocesos de ejecución de la consulta, se devuelve este valor. Normalmente, el resultado es no determinista. - -``` sql -anyHeavy(column) -``` - -**Argumento** - -- `column` – The column name. - -**Ejemplo** - -Tome el [A tiempo](../../getting-started/example-datasets/ontime.md) conjunto de datos y seleccione cualquier valor que ocurra con frecuencia `AirlineID` columna. - -``` sql -SELECT anyHeavy(AirlineID) AS res -FROM ontime -``` - -``` text -┌───res─┐ -│ 19690 │ -└───────┘ -``` - -## Cualquier último (x) {#anylastx} - -Selecciona el último valor encontrado. -El resultado es tan indeterminado como para el `any` función. - -## Método de codificación de datos: {#groupbitand} - -Se aplica bit a bit `AND` para la serie de números. - -``` sql -groupBitAnd(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `UInt*` tipo. - -**Valor de retorno** - -Valor de la `UInt*` tipo. - -**Ejemplo** - -Datos de prueba: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -Consulta: - -``` sql -SELECT groupBitAnd(num) FROM t -``` - -Donde `num` es la columna con los datos de prueba. - -Resultado: - -``` text -binary decimal -00000100 = 4 -``` - -## GrupoBitO {#groupbitor} - -Se aplica bit a bit `OR` para la serie de números. - -``` sql -groupBitOr(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `UInt*` tipo. - -**Valor de retorno** - -Valor de la `UInt*` tipo. - -**Ejemplo** - -Datos de prueba: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -Consulta: - -``` sql -SELECT groupBitOr(num) FROM t -``` - -Donde `num` es la columna con los datos de prueba. - -Resultado: - -``` text -binary decimal -01111101 = 125 -``` - -## GrupoBitXor {#groupbitxor} - -Se aplica bit a bit `XOR` para la serie de números. - -``` sql -groupBitXor(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `UInt*` tipo. - -**Valor de retorno** - -Valor de la `UInt*` tipo. - -**Ejemplo** - -Datos de prueba: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -Consulta: - -``` sql -SELECT groupBitXor(num) FROM t -``` - -Donde `num` es la columna con los datos de prueba. - -Resultado: - -``` text -binary decimal -01101000 = 104 -``` - -## Método de codificación de datos: {#groupbitmap} - -Mapa de bits o cálculos agregados de una columna entera sin signo, devuelve cardinalidad de tipo UInt64, si agrega el sufijo -State, luego devuelve [objeto de mapa de bits](../../sql-reference/functions/bitmap-functions.md). 
- -``` sql -groupBitmap(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `UInt*` tipo. - -**Valor de retorno** - -Valor de la `UInt64` tipo. - -**Ejemplo** - -Datos de prueba: - -``` text -UserID -1 -1 -2 -3 -``` - -Consulta: - -``` sql -SELECT groupBitmap(UserID) as num FROM t -``` - -Resultado: - -``` text -num -3 -``` - -## Mínimo (x) {#agg_function-min} - -Calcula el mínimo. - -## máximo (x) {#agg_function-max} - -Calcula el máximo. - -## ¿Cómo puedo hacerlo?) {#agg-function-argmin} - -Calcula el ‘arg’ para un valor mínimo ‘val’ valor. Si hay varios valores diferentes de ‘arg’ para valores mínimos de ‘val’, el primero de estos valores encontrados es la salida. - -**Ejemplo:** - -``` text -┌─user─────┬─salary─┐ -│ director │ 5000 │ -│ manager │ 3000 │ -│ worker │ 1000 │ -└──────────┴────────┘ -``` - -``` sql -SELECT argMin(user, salary) FROM salary -``` - -``` text -┌─argMin(user, salary)─┐ -│ worker │ -└──────────────────────┘ -``` - -## Descripción) {#agg-function-argmax} - -Calcula el ‘arg’ para un valor máximo ‘val’ valor. Si hay varios valores diferentes de ‘arg’ para valores máximos de ‘val’, el primero de estos valores encontrados es la salida. - -## suma (x) {#agg_function-sum} - -Calcula la suma. -Solo funciona para números. - -## ¿Cómo puedo obtener más información?) {#sumwithoverflowx} - -Calcula la suma de los números, utilizando el mismo tipo de datos para el resultado que para los parámetros de entrada. Si la suma supera el valor máximo para este tipo de datos, la función devuelve un error. - -Solo funciona para números. - -## Por ejemplo, el valor es el siguiente:)) {#agg_functions-summap} - -Totals el ‘value’ matriz de acuerdo con las claves especificadas en el ‘key’ matriz. -Pasar una tupla de matrices de claves y valores es sinónimo de pasar dos matrices de claves y valores. -El número de elementos en ‘key’ y ‘value’ debe ser el mismo para cada fila que se sume. -Returns a tuple of two arrays: keys in sorted order, and values ​​summed for the corresponding keys. - -Ejemplo: - -``` sql -CREATE TABLE sum_map( - date Date, - timeslot DateTime, - statusMap Nested( - status UInt16, - requests UInt64 - ), - statusMapTuple Tuple(Array(Int32), Array(Int32)) -) ENGINE = Log; -INSERT INTO sum_map VALUES - ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); - -SELECT - timeslot, - sumMap(statusMap.status, statusMap.requests), - sumMap(statusMapTuple) -FROM sum_map -GROUP BY timeslot -``` - -``` text -┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ -│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ -│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ -└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ -``` - -## SkewPop {#skewpop} - -Calcula el [la asimetría](https://en.wikipedia.org/wiki/Skewness) de una secuencia. - -``` sql -skewPop(expr) -``` - -**Parámetros** - -`expr` — [Expresion](../syntax.md#syntax-expressions) devolviendo un número. - -**Valor devuelto** - -The skewness of the given distribution. 
Type — [Float64](../../sql-reference/data-types/float.md) - -**Ejemplo** - -``` sql -SELECT skewPop(value) FROM series_with_value_column -``` - -## Sistema abierto {#skewsamp} - -Calcula el [asimetría de la muestra](https://en.wikipedia.org/wiki/Skewness) de una secuencia. - -Representa una estimación imparcial de la asimetría de una variable aleatoria si los valores pasados forman su muestra. - -``` sql -skewSamp(expr) -``` - -**Parámetros** - -`expr` — [Expresion](../syntax.md#syntax-expressions) devolviendo un número. - -**Valor devuelto** - -The skewness of the given distribution. Type — [Float64](../../sql-reference/data-types/float.md). Si `n <= 1` (`n` es el tamaño de la muestra), luego la función devuelve `nan`. - -**Ejemplo** - -``` sql -SELECT skewSamp(value) FROM series_with_value_column -``` - -## KurtPop {#kurtpop} - -Calcula el [curtosis](https://en.wikipedia.org/wiki/Kurtosis) de una secuencia. - -``` sql -kurtPop(expr) -``` - -**Parámetros** - -`expr` — [Expresion](../syntax.md#syntax-expressions) devolviendo un número. - -**Valor devuelto** - -The kurtosis of the given distribution. Type — [Float64](../../sql-reference/data-types/float.md) - -**Ejemplo** - -``` sql -SELECT kurtPop(value) FROM series_with_value_column -``` - -## KurtSamp {#kurtsamp} - -Calcula el [curtosis muestra](https://en.wikipedia.org/wiki/Kurtosis) de una secuencia. - -Representa una estimación imparcial de la curtosis de una variable aleatoria si los valores pasados forman su muestra. - -``` sql -kurtSamp(expr) -``` - -**Parámetros** - -`expr` — [Expresion](../syntax.md#syntax-expressions) devolviendo un número. - -**Valor devuelto** - -The kurtosis of the given distribution. Type — [Float64](../../sql-reference/data-types/float.md). Si `n <= 1` (`n` es un tamaño de la muestra), luego la función devuelve `nan`. - -**Ejemplo** - -``` sql -SELECT kurtSamp(value) FROM series_with_value_column -``` - -## Acerca de) {#agg_function-avg} - -Calcula el promedio. -Solo funciona para números. -El resultado es siempre Float64. - -## avgPonderado {#avgweighted} - -Calcula el [media aritmética ponderada](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). - -**Sintaxis** - -``` sql -avgWeighted(x, weight) -``` - -**Parámetros** - -- `x` — Values. [Entero](../data-types/int-uint.md) o [punto flotante](../data-types/float.md). -- `weight` — Weights of the values. [Entero](../data-types/int-uint.md) o [punto flotante](../data-types/float.md). - -Tipo de `x` y `weight` debe ser el mismo. - -**Valor devuelto** - -- Media ponderada. -- `NaN`. Si todos los pesos son iguales a 0. - -Tipo: [Float64](../data-types/float.md). - -**Ejemplo** - -Consulta: - -``` sql -SELECT avgWeighted(x, w) -FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) -``` - -Resultado: - -``` text -┌─avgWeighted(x, weight)─┐ -│ 8 │ -└────────────────────────┘ -``` - -## uniq {#agg_function-uniq} - -Calcula el número aproximado de diferentes valores del argumento. - -``` sql -uniq(x[, ...]) -``` - -**Parámetros** - -La función toma un número variable de parámetros. Los parámetros pueden ser `Tuple`, `Array`, `Date`, `DateTime`, `String`, o tipos numéricos. - -**Valor devuelto** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-tipo número. - -**Detalles de implementación** - -Función: - -- Calcula un hash para todos los parámetros en el agregado, luego lo usa en los cálculos. - -- Utiliza un algoritmo de muestreo adaptativo. Para el estado de cálculo, la función utiliza una muestra de valores hash de elemento de hasta 65536. 
- - This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. - -- Proporciona el resultado de forma determinista (no depende del orden de procesamiento de la consulta). - -Recomendamos usar esta función en casi todos los escenarios. - -**Ver también** - -- [uniqCombined](#agg_function-uniqcombined) -- [UniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - -## uniqCombined {#agg_function-uniqcombined} - -Calcula el número aproximado de diferentes valores de argumento. - -``` sql -uniqCombined(HLL_precision)(x[, ...]) -``` - -El `uniqCombined` es una buena opción para calcular el número de valores diferentes. - -**Parámetros** - -La función toma un número variable de parámetros. Los parámetros pueden ser `Tuple`, `Array`, `Date`, `DateTime`, `String`, o tipos numéricos. - -`HLL_precision` es el logaritmo base-2 del número de células en [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Opcional, puede utilizar la función como `uniqCombined(x[, ...])`. El valor predeterminado para `HLL_precision` es 17, que es efectivamente 96 KiB de espacio (2 ^ 17 celdas, 6 bits cada una). - -**Valor devuelto** - -- Numero [UInt64](../../sql-reference/data-types/int-uint.md)-tipo número. - -**Detalles de implementación** - -Función: - -- Calcula un hash (hash de 64 bits para `String` y 32 bits de lo contrario) para todos los parámetros en el agregado, luego lo usa en los cálculos. - -- Utiliza una combinación de tres algoritmos: matriz, tabla hash e HyperLogLog con una tabla de corrección de errores. - - For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. - -- Proporciona el resultado de forma determinista (no depende del orden de procesamiento de la consulta). - -!!! note "Nota" - Dado que usa hash de 32 bits para no-`String` tipo, el resultado tendrá un error muy alto para cardinalidades significativamente mayores que `UINT_MAX` (el error aumentará rápidamente después de unas pocas decenas de miles de millones de valores distintos), por lo tanto, en este caso debe usar [UniqCombined64](#agg_function-uniqcombined64) - -En comparación con el [uniq](#agg_function-uniq) función, el `uniqCombined`: - -- Consume varias veces menos memoria. -- Calcula con una precisión varias veces mayor. -- Por lo general, tiene un rendimiento ligeramente menor. En algunos escenarios, `uniqCombined` puede funcionar mejor que `uniq`, por ejemplo, con consultas distribuidas que transmiten un gran número de estados de agregación a través de la red. - -**Ver también** - -- [uniq](#agg_function-uniq) -- [UniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - -## UniqCombined64 {#agg_function-uniqcombined64} - -Lo mismo que [uniqCombined](#agg_function-uniqcombined), pero utiliza hash de 64 bits para todos los tipos de datos. - -## uniqHLL12 {#agg_function-uniqhll12} - -Calcula el número aproximado de diferentes valores de argumento [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algoritmo. - -``` sql -uniqHLL12(x[, ...]) -``` - -**Parámetros** - -La función toma un número variable de parámetros. Los parámetros pueden ser `Tuple`, `Array`, `Date`, `DateTime`, `String`, o tipos numéricos. 
- -**Valor devuelto** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-tipo número. - -**Detalles de implementación** - -Función: - -- Calcula un hash para todos los parámetros en el agregado, luego lo usa en los cálculos. - -- Utiliza el algoritmo HyperLogLog para aproximar el número de valores de argumento diferentes. - - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). - -- Proporciona el resultado determinado (no depende del orden de procesamiento de la consulta). - -No recomendamos usar esta función. En la mayoría de los casos, use el [uniq](#agg_function-uniq) o [uniqCombined](#agg_function-uniqcombined) función. - -**Ver también** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqExact](#agg_function-uniqexact) - -## uniqExact {#agg_function-uniqexact} - -Calcula el número exacto de diferentes valores de argumento. - -``` sql -uniqExact(x[, ...]) -``` - -Utilice el `uniqExact` función si necesita absolutamente un resultado exacto. De lo contrario, use el [uniq](#agg_function-uniq) función. - -El `uniqExact` función utiliza más memoria que `uniq`, porque el tamaño del estado tiene un crecimiento ilimitado a medida que aumenta el número de valores diferentes. - -**Parámetros** - -La función toma un número variable de parámetros. Los parámetros pueden ser `Tuple`, `Array`, `Date`, `DateTime`, `String`, o tipos numéricos. - -**Ver también** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqHLL12](#agg_function-uniqhll12) - -## ¿Cómo puedo hacerlo?) {#agg_function-grouparray} - -Crea una matriz de valores de argumento. -Los valores se pueden agregar a la matriz en cualquier orden (indeterminado). - -La segunda versión (con el `max_size` parámetro) limita el tamaño de la matriz resultante a `max_size` elemento. -Por ejemplo, `groupArray (1) (x)` es equivalente a `[any (x)]`. - -En algunos casos, aún puede confiar en el orden de ejecución. Esto se aplica a los casos en que `SELECT` procede de una subconsulta que utiliza `ORDER BY`. - -## GrupoArrayInsertAt {#grouparrayinsertat} - -Inserta un valor en la matriz en la posición especificada. - -**Sintaxis** - -``` sql -groupArrayInsertAt(default_x, size)(x, pos); -``` - -Si en una consulta se insertan varios valores en la misma posición, la función se comporta de las siguientes maneras: - -- Si se ejecuta una consulta en un solo subproceso, se utiliza el primero de los valores insertados. -- Si una consulta se ejecuta en varios subprocesos, el valor resultante es uno indeterminado de los valores insertados. - -**Parámetros** - -- `x` — Value to be inserted. [Expresion](../syntax.md#syntax-expressions) lo que resulta en uno de los [tipos de datos compatibles](../../sql-reference/data-types/index.md). -- `pos` — Position at which the specified element `x` se va a insertar. La numeración de índices en la matriz comienza desde cero. [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). -- `default_x`— Default value for substituting in empty positions. Optional parameter. 
[Expresion](../syntax.md#syntax-expressions) dando como resultado el tipo de datos configurado para `x` parámetro. Si `default_x` no está definido, el [valores predeterminados](../../sql-reference/statements/create.md#create-default-values) se utilizan. -- `size`— Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` debe ser especificado. [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). - -**Valor devuelto** - -- Matriz con valores insertados. - -Tipo: [Matriz](../../sql-reference/data-types/array.md#data-type-array). - -**Ejemplo** - -Consulta: - -``` sql -SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); -``` - -Resultado: - -``` text -┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ -│ ['0','','1','','2','','3','','4'] │ -└───────────────────────────────────────────────────────────┘ -``` - -Consulta: - -``` sql -SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); -``` - -Resultado: - -``` text -┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2','-','3','-','4'] │ -└────────────────────────────────────────────────────────────────┘ -``` - -Consulta: - -``` sql -SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); -``` - -Resultado: - -``` text -┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2'] │ -└───────────────────────────────────────────────────────────────────┘ -``` - -Inserción multihilo de elementos en una posición. - -Consulta: - -``` sql -SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; -``` - -Como resultado de esta consulta, obtiene un entero aleatorio en el `[0,9]` gama. Por ejemplo: - -``` text -┌─groupArrayInsertAt(number, 0)─┐ -│ [7] │ -└───────────────────────────────┘ -``` - -## groupArrayMovingSum {#agg_function-grouparraymovingsum} - -Calcula la suma móvil de los valores de entrada. - -``` sql -groupArrayMovingSum(numbers_for_summing) -groupArrayMovingSum(window_size)(numbers_for_summing) -``` - -La función puede tomar el tamaño de la ventana como un parámetro. Si no se especifica, la función toma el tamaño de ventana igual al número de filas de la columna. - -**Parámetros** - -- `numbers_for_summing` — [Expresion](../syntax.md#syntax-expressions) dando como resultado un valor de tipo de datos numérico. -- `window_size` — Size of the calculation window. - -**Valores devueltos** - -- Matriz del mismo tamaño y tipo que los datos de entrada. 
- -**Ejemplo** - -La tabla de ejemplo: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -Consulta: - -``` sql -SELECT - groupArrayMovingSum(int) AS I, - groupArrayMovingSum(float) AS F, - groupArrayMovingSum(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingSum(2)(int) AS I, - groupArrayMovingSum(2)(float) AS F, - groupArrayMovingSum(2)(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - -## Método de codificación de datos: {#agg_function-grouparraymovingavg} - -Calcula la media móvil de los valores de entrada. - -``` sql -groupArrayMovingAvg(numbers_for_summing) -groupArrayMovingAvg(window_size)(numbers_for_summing) -``` - -La función puede tomar el tamaño de la ventana como un parámetro. Si no se especifica, la función toma el tamaño de ventana igual al número de filas de la columna. - -**Parámetros** - -- `numbers_for_summing` — [Expresion](../syntax.md#syntax-expressions) dando como resultado un valor de tipo de datos numérico. -- `window_size` — Size of the calculation window. - -**Valores devueltos** - -- Matriz del mismo tamaño y tipo que los datos de entrada. - -La función utiliza [redondeando hacia cero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). Trunca los decimales insignificantes para el tipo de datos resultante. - -**Ejemplo** - -La tabla de ejemplo `b`: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -Consulta: - -``` sql -SELECT - groupArrayMovingAvg(int) AS I, - groupArrayMovingAvg(float) AS F, - groupArrayMovingAvg(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ -│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ -└───────────┴─────────────────────────────────────┴───────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingAvg(2)(int) AS I, - groupArrayMovingAvg(2)(float) AS F, - groupArrayMovingAvg(2)(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ -│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ -└───────────┴──────────────────────────────────┴───────────────────────┘ -``` - -## ¿Cómo puedo obtener más información?) {#groupuniqarrayx-groupuniqarraymax-sizex} - -Crea una matriz a partir de diferentes valores de argumento. El consumo de memoria es el mismo que para el `uniqExact` función. - -La segunda versión (con el `max_size` parámetro) limita el tamaño de la matriz resultante a `max_size` elemento. -Por ejemplo, `groupUniqArray(1)(x)` es equivalente a `[any(x)]`. - -## cuantil {#quantile} - -Calcula un aproximado [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos. 
- -Esta función se aplica [muestreo de embalses](https://en.wikipedia.org/wiki/Reservoir_sampling) con un tamaño de depósito de hasta 8192 y un generador de números aleatorios para el muestreo. El resultado es no determinista. Para obtener un cuantil exacto, use el [quantileExact](#quantileexact) función. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantile(level)(expr) -``` - -Apodo: `median`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). - -**Valor devuelto** - -- Cuantil aproximado del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Tabla de entrada: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -Consulta: - -``` sql -SELECT quantile(val) FROM t -``` - -Resultado: - -``` text -┌─quantile(val)─┐ -│ 1.5 │ -└───────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileDeterminista {#quantiledeterministic} - -Calcula un aproximado [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos. - -Esta función se aplica [muestreo de embalses](https://en.wikipedia.org/wiki/Reservoir_sampling) con un tamaño de depósito de hasta 8192 y algoritmo determinista de muestreo. El resultado es determinista. Para obtener un cuantil exacto, use el [quantileExact](#quantileexact) función. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileDeterministic(level)(expr, determinator) -``` - -Apodo: `medianDeterministic`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). -- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. As a determinator you can use any deterministic positive number, for example, a user id or an event id. 
If the same determinator value occures too often, the function works incorrectly. - -**Valor devuelto** - -- Cuantil aproximado del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Tabla de entrada: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -Consulta: - -``` sql -SELECT quantileDeterministic(val, 1) FROM t -``` - -Resultado: - -``` text -┌─quantileDeterministic(val, 1)─┐ -│ 1.5 │ -└───────────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileExact {#quantileexact} - -Calcula exactamente el [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos. - -To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memoria, donde `n` es un número de valores que se pasaron. Sin embargo, para un pequeño número de valores, la función es muy efectiva. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileExact(level)(expr) -``` - -Apodo: `medianExact`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). - -**Valor devuelto** - -- Cuantil del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Consulta: - -``` sql -SELECT quantileExact(number) FROM numbers(10) -``` - -Resultado: - -``` text -┌─quantileExact(number)─┐ -│ 5 │ -└───────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileExactWeighted {#quantileexactweighted} - -Calcula exactamente el [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos, teniendo en cuenta el peso de cada elemento. - -To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Each value is counted with its weight, as if it is present `weight` times. A hash table is used in the algorithm. Because of this, if the passed values ​​are frequently repeated, the function consumes less RAM than [quantileExact](#quantileexact). Puede usar esta función en lugar de `quantileExact` y especifique el peso 1. 
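As a quick illustration of the statement above, passing a constant weight of `1` should make `quantileExactWeighted` behave like `quantileExact`. This is only a sketch (not part of the original page); it assumes a numeric column `val` such as the one in the example table below:

``` sql
SELECT
    quantileExact(val) AS exact,
    quantileExactWeighted(val, 1) AS exact_weighted
FROM t
```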
- -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileExactWeighted(level)(expr, weight) -``` - -Apodo: `medianExactWeighted`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). -- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. - -**Valor devuelto** - -- Cuantil del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Tabla de entrada: - -``` text -┌─n─┬─val─┐ -│ 0 │ 3 │ -│ 1 │ 2 │ -│ 2 │ 1 │ -│ 5 │ 4 │ -└───┴─────┘ -``` - -Consulta: - -``` sql -SELECT quantileExactWeighted(n, val) FROM t -``` - -Resultado: - -``` text -┌─quantileExactWeighted(n, val)─┐ -│ 1 │ -└───────────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileTiming {#quantiletiming} - -Con la precisión determinada calcula el [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos. - -El resultado es determinista (no depende del orden de procesamiento de la consulta). La función está optimizada para trabajar con secuencias que describen distribuciones como tiempos de carga de páginas web o tiempos de respuesta de back-end. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileTiming(level)(expr) -``` - -Apodo: `medianTiming`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). - -- `expr` — [Expresion](../syntax.md#syntax-expressions) sobre una columna valores que devuelven un [Flotante\*](../../sql-reference/data-types/float.md)-tipo número. - - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. - -**Exactitud** - -El cálculo es preciso si: - -- El número total de valores no supera los 5670. -- El número total de valores supera los 5670, pero el tiempo de carga de la página es inferior a 1024 ms. - -De lo contrario, el resultado del cálculo se redondea al múltiplo más cercano de 16 ms. - -!!! 
note "Nota" - Para calcular los cuantiles de tiempo de carga de la página, esta función es más efectiva y precisa que [cuantil](#quantile). - -**Valor devuelto** - -- Cuantil del nivel especificado. - -Tipo: `Float32`. - -!!! note "Nota" - Si no se pasan valores a la función (cuando se `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) se devuelve. El propósito de esto es diferenciar estos casos de los casos que resultan en cero. Ver [ORDER BY cláusula](../statements/select/order-by.md#select-order-by) para notas sobre la clasificación `NaN` valor. - -**Ejemplo** - -Tabla de entrada: - -``` text -┌─response_time─┐ -│ 72 │ -│ 112 │ -│ 126 │ -│ 145 │ -│ 104 │ -│ 242 │ -│ 313 │ -│ 168 │ -│ 108 │ -└───────────────┘ -``` - -Consulta: - -``` sql -SELECT quantileTiming(response_time) FROM t -``` - -Resultado: - -``` text -┌─quantileTiming(response_time)─┐ -│ 126 │ -└───────────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileTimingWeighted {#quantiletimingweighted} - -Con la precisión determinada calcula el [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos según el peso de cada miembro de secuencia. - -El resultado es determinista (no depende del orden de procesamiento de la consulta). La función está optimizada para trabajar con secuencias que describen distribuciones como tiempos de carga de páginas web o tiempos de respuesta de back-end. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileTimingWeighted(level)(expr, weight) -``` - -Apodo: `medianTimingWeighted`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). - -- `expr` — [Expresion](../syntax.md#syntax-expressions) sobre una columna valores que devuelven un [Flotante\*](../../sql-reference/data-types/float.md)-tipo número. - - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. - -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. - -**Exactitud** - -El cálculo es preciso si: - -- El número total de valores no supera los 5670. -- El número total de valores supera los 5670, pero el tiempo de carga de la página es inferior a 1024 ms. - -De lo contrario, el resultado del cálculo se redondea al múltiplo más cercano de 16 ms. - -!!! note "Nota" - Para calcular los cuantiles de tiempo de carga de la página, esta función es más efectiva y precisa que [cuantil](#quantile). - -**Valor devuelto** - -- Cuantil del nivel especificado. - -Tipo: `Float32`. - -!!! note "Nota" - Si no se pasan valores a la función (cuando se `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) se devuelve. El propósito de esto es diferenciar estos casos de los casos que resultan en cero. Ver [ORDER BY cláusula](../statements/select/order-by.md#select-order-by) para notas sobre la clasificación `NaN` valor. 
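The note above about `NaN` can be illustrated with the `-If` combinator: when the condition filters out every row, no values reach the function and, according to the note, `nan` is returned. A hedged sketch, reusing the `t` table and `response_time` column from the example below:

``` sql
SELECT quantileTimingIf(response_time, response_time > 10000) FROM t
```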
- -**Ejemplo** - -Tabla de entrada: - -``` text -┌─response_time─┬─weight─┐ -│ 68 │ 1 │ -│ 104 │ 2 │ -│ 112 │ 3 │ -│ 126 │ 2 │ -│ 138 │ 1 │ -│ 162 │ 1 │ -└───────────────┴────────┘ -``` - -Consulta: - -``` sql -SELECT quantileTimingWeighted(response_time, weight) FROM t -``` - -Resultado: - -``` text -┌─quantileTimingWeighted(response_time, weight)─┐ -│ 112 │ -└───────────────────────────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileTDigest {#quantiletdigest} - -Calcula un aproximado [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos usando el [T-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algoritmo. - -El error máximo es 1%. El consumo de memoria es `log(n)`, donde `n` es un número de valores. El resultado depende del orden de ejecución de la consulta y no es determinista. - -El rendimiento de la función es menor que el rendimiento de [cuantil](#quantile) o [quantileTiming](#quantiletiming). En términos de la relación entre el tamaño del estado y la precisión, esta función es mucho mejor que `quantile`. - -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileTDigest(level)(expr) -``` - -Apodo: `medianTDigest`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). - -**Valor devuelto** - -- Cuantil aproximado del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Consulta: - -``` sql -SELECT quantileTDigest(number) FROM numbers(10) -``` - -Resultado: - -``` text -┌─quantileTDigest(number)─┐ -│ 4.5 │ -└─────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## quantileTDigestWeighted {#quantiletdigestweighted} - -Calcula un aproximado [cuantil](https://en.wikipedia.org/wiki/Quantile) de una secuencia de datos numéricos usando el [T-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algoritmo. La función tiene en cuenta el peso de cada miembro de secuencia. El error máximo es 1%. El consumo de memoria es `log(n)`, donde `n` es un número de valores. - -El rendimiento de la función es menor que el rendimiento de [cuantil](#quantile) o [quantileTiming](#quantiletiming). En términos de la relación entre el tamaño del estado y la precisión, esta función es mucho mejor que `quantile`. - -El resultado depende del orden de ejecución de la consulta y no es determinista. 
- -Cuando se utilizan múltiples `quantile*` funciones con diferentes niveles en una consulta, los estados internos no se combinan (es decir, la consulta funciona de manera menos eficiente de lo que podría). En este caso, use el [cantiles](#quantiles) función. - -**Sintaxis** - -``` sql -quantileTDigest(level)(expr) -``` - -Apodo: `medianTDigest`. - -**Parámetros** - -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` valor en el rango de `[0.01, 0.99]`. Valor predeterminado: 0.5. En `level=0.5` la función calcula [mediana](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [tipos de datos](../../sql-reference/data-types/index.md#data_types), [Fecha](../../sql-reference/data-types/date.md) o [FechaHora](../../sql-reference/data-types/datetime.md). -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. - -**Valor devuelto** - -- Cuantil aproximado del nivel especificado. - -Tipo: - -- [Float64](../../sql-reference/data-types/float.md) para la entrada de tipo de datos numéricos. -- [Fecha](../../sql-reference/data-types/date.md) si los valores de entrada tienen `Date` tipo. -- [FechaHora](../../sql-reference/data-types/datetime.md) si los valores de entrada tienen `DateTime` tipo. - -**Ejemplo** - -Consulta: - -``` sql -SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) -``` - -Resultado: - -``` text -┌─quantileTDigestWeighted(number, 1)─┐ -│ 4.5 │ -└────────────────────────────────────┘ -``` - -**Ver también** - -- [mediana](#median) -- [cantiles](#quantiles) - -## mediana {#median} - -El `median*` funciones son los alias para el `quantile*` función. Calculan la mediana de una muestra de datos numéricos. - -Función: - -- `median` — Alias for [cuantil](#quantile). -- `medianDeterministic` — Alias for [quantileDeterminista](#quantiledeterministic). -- `medianExact` — Alias for [quantileExact](#quantileexact). -- `medianExactWeighted` — Alias for [quantileExactWeighted](#quantileexactweighted). -- `medianTiming` — Alias for [quantileTiming](#quantiletiming). -- `medianTimingWeighted` — Alias for [quantileTimingWeighted](#quantiletimingweighted). -- `medianTDigest` — Alias for [quantileTDigest](#quantiletdigest). -- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](#quantiletdigestweighted). - -**Ejemplo** - -Tabla de entrada: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -Consulta: - -``` sql -SELECT medianDeterministic(val, 1) FROM t -``` - -Resultado: - -``` text -┌─medianDeterministic(val, 1)─┐ -│ 1.5 │ -└─────────────────────────────┘ -``` - -## quantiles(level1, level2, …)(x) {#quantiles} - -Todas las funciones de cuantiles también tienen funciones de cuantiles correspondientes: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. Estas funciones calculan todos los cuantiles de los niveles enumerados en una sola pasada y devuelven una matriz de los valores resultantes. - -## Acerca de Nosotros) {#varsampx} - -Calcula la cantidad `Σ((x - x̅)^2) / (n - 1)`, donde `n` es el tamaño de la muestra y `x̅`es el valor promedio de `x`. - -Representa una estimación imparcial de la varianza de una variable aleatoria si los valores pasados forman su muestra. - -Devoluciones `Float64`. Cuando `n <= 1`, devoluciones `+∞`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. 
Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice la función `varSampStable`. Funciona más lento, pero proporciona un menor error computacional. - -## varPop(x) {#varpopx} - -Calcula la cantidad `Σ((x - x̅)^2) / n`, donde `n` es el tamaño de la muestra y `x̅` es el valor promedio de `x`. - -En otras palabras, la dispersión para un conjunto de valores. Devuelve `Float64`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice la función `varPopStable`. Funciona más lento, pero proporciona un menor error computacional. - -## stddevSamp(x) {#stddevsampx} - -El resultado es igual a la raíz cuadrada de `varSamp(x)`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice la función `stddevSampStable`. Funciona más lento, pero proporciona un menor error computacional. - -## stddevPop(x) {#stddevpopx} - -El resultado es igual a la raíz cuadrada de `varPop(x)`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice la función `stddevPopStable`. Funciona más lento, pero proporciona un menor error computacional. - -## topK(N)(x) {#topknx} - -Devuelve una matriz de los valores aproximadamente más frecuentes de la columna especificada. La matriz resultante se ordena en orden descendente de frecuencia aproximada de los valores (no por los valores mismos). - -Implementa el algoritmo [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) para analizar TopK, basado en el algoritmo de reducción y combinación de [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). - -``` sql -topK(N)(column) -``` - -Esta función no proporciona un resultado garantizado. En ciertas situaciones, pueden producirse errores y puede devolver valores frecuentes que no son los valores más frecuentes. - -Recomendamos usar valores de `N < 10`; el rendimiento se reduce con valores grandes de `N`. Valor máximo de `N = 65536`. - -**Parámetros** - -- `N` es el número de elementos a devolver. - -Si se omite el parámetro, se utiliza el valor predeterminado 10. - -**Argumento** - -- `x` – The value to calculate frequency. - -**Ejemplo** - -Tome el conjunto de datos [OnTime](../../getting-started/example-datasets/ontime.md) y seleccione los tres valores más frecuentes de la columna `AirlineID`. - -``` sql -SELECT topK(3)(AirlineID) AS res -FROM ontime -``` - -``` text -┌─res─────────────────┐ -│ [19393,19790,19805] │ -└─────────────────────┘ -``` - -## topKWeighted {#topkweighted} - -Similar a `topK` pero toma un argumento adicional de tipo entero - `weight`. Cada valor se contabiliza `weight` veces para el cálculo de la frecuencia. - -**Sintaxis** - -``` sql -topKWeighted(N)(x, weight) -``` - -**Parámetros** - -- `N` — The number of elements to return. - -**Argumento** - -- `x` – The value. -- `weight` — The weight. [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valor devuelto** - -Devuelve una matriz de los valores con la suma aproximada máxima de pesos.
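Since each value is counted `weight` times, passing a constant weight of `1` should make `topKWeighted` behave like plain `topK`. A small illustrative sketch (not from the original page), using the `numbers` table function:

``` sql
SELECT
    topK(3)(number % 10) AS by_count,
    topKWeighted(3)(number % 10, 1) AS by_unit_weight
FROM numbers(1000)
```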
- -**Ejemplo** - -Consulta: - -``` sql -SELECT topKWeighted(10)(number, number) FROM numbers(1000) -``` - -Resultado: - -``` text -┌─topKWeighted(10)(number, number)──────────┐ -│ [999,998,997,996,995,994,993,992,991,990] │ -└───────────────────────────────────────────┘ -``` - -## covarSamp(x, y) {#covarsampx-y} - -Calcula el valor de `Σ((x - x̅)(y - y̅)) / (n - 1)`. - -Devuelve Float64. Cuando `n <= 1`, returns +∞. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice el `covarSampStable` función. Funciona más lento, pero proporciona un menor error computacional. - -## covarPop(x, y) {#covarpopx-y} - -Calcula el valor de `Σ((x - x̅)(y - y̅)) / n`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice el `covarPopStable` función. Funciona más lento pero proporciona un menor error computacional. - -## corr(x, y) {#corrx-y} - -Calcula el coeficiente de correlación de Pearson: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. - -!!! note "Nota" - Esta función utiliza un algoritmo numéricamente inestable. Si necesita [estabilidad numérica](https://en.wikipedia.org/wiki/Numerical_stability) en los cálculos, utilice el `corrStable` función. Funciona más lento, pero proporciona un menor error computacional. - -## categoricalInformationValue {#categoricalinformationvalue} - -Calcula el valor de `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` para cada categoría. - -``` sql -categoricalInformationValue(category1, category2, ..., tag) -``` - -El resultado indica cómo una característica discreta (categórica `[category1, category2, ...]` contribuir a un modelo de aprendizaje que predice el valor de `tag`. - -## SimpleLinearRegression {#simplelinearregression} - -Realiza una regresión lineal simple (unidimensional). - -``` sql -simpleLinearRegression(x, y) -``` - -Parámetros: - -- `x` — Column with dependent variable values. -- `y` — Column with explanatory variable values. - -Valores devueltos: - -Constante `(a, b)` de la línea resultante `y = a*x + b`. - -**Ejemplos** - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ -│ (1,0) │ -└───────────────────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ -│ (1,3) │ -└───────────────────────────────────────────────────────────────────┘ -``` - -## stochasticLinearRegression {#agg_functions-stochasticlinearregression} - -Esta función implementa la regresión lineal estocástica. Admite parámetros personalizados para la tasa de aprendizaje, el coeficiente de regularización L2, el tamaño de mini lote y tiene pocos métodos para actualizar los pesos ([Adán](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (utilizado por defecto), [SGD simple](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Impulso](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). - -### Parámetros {#agg_functions-stochasticlinearregression-parameters} - -Hay 4 parámetros personalizables. 
Se pasan a la función secuencialmente, pero no es necesario pasar los cuatro; se usarán valores predeterminados, sin embargo, un buen modelo requirió algún ajuste de parámetros. - -``` text -stochasticLinearRegression(1.0, 1.0, 10, 'SGD') -``` - -1. `learning rate` es el coeficiente en la longitud del paso, cuando se realiza el paso de descenso de gradiente. Una tasa de aprendizaje demasiado grande puede causar pesos infinitos del modelo. El valor predeterminado es `0.00001`. -2. `l2 regularization coefficient` que puede ayudar a prevenir el sobreajuste. El valor predeterminado es `0.1`. -3. `mini-batch size` establece el número de elementos, cuyos gradientes se calcularán y sumarán para realizar un paso de descenso de gradiente. El descenso estocástico puro usa un elemento, sin embargo, tener lotes pequeños (aproximadamente 10 elementos) hace que los pasos de gradiente sean más estables. El valor predeterminado es `15`. -4. `method for updating weights`, son: `Adam` (predeterminada), `SGD`, `Momentum`, `Nesterov`. `Momentum` y `Nesterov` requieren un poco más de cálculos y memoria, sin embargo, resultan útiles en términos de velocidad de convergencia y estabilidad de los métodos de gradiente estocásticos. - -### Uso {#agg_functions-stochasticlinearregression-usage} - -`stochasticLinearRegression` se utiliza en dos pasos: ajustar el modelo y predecir nuevos datos. Para ajustar el modelo y guardar su estado para su uso posterior, utilizamos `-State` combinador, que básicamente guarda el estado (pesos del modelo, etc.). -Para predecir usamos la función [evalMLMethod](../functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), que toma un estado como argumento, así como características para predecir. - - - -**1.** Accesorio - -Dicha consulta puede ser utilizada. - -``` sql -CREATE TABLE IF NOT EXISTS train_data -( - param1 Float64, - param2 Float64, - target Float64 -) ENGINE = Memory; - -CREATE TABLE your_model ENGINE = Memory AS SELECT -stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) -AS state FROM train_data; -``` - -Aquí también tenemos que insertar datos en `train_data` tabla. El número de parámetros no es fijo, depende solo del número de argumentos, pasados a `linearRegressionState`. Todos deben ser valores numéricos. -Tenga en cuenta que la columna con valor objetivo (que nos gustaría aprender a predecir) se inserta como primer argumento. - -**2.** Predecir - -Después de guardar un estado en la tabla, podemos usarlo varias veces para la predicción, o incluso fusionarlo con otros estados y crear nuevos modelos aún mejores. - -``` sql -WITH (SELECT state FROM your_model) AS model SELECT -evalMLMethod(model, param1, param2) FROM test_data -``` - -La consulta devolverá una columna de valores predichos. Tenga en cuenta que el primer argumento de `evalMLMethod` ser `AggregateFunctionState` objeto, siguiente son columnas de características. - -`test_data` es una mesa como `train_data` pero puede no contener el valor objetivo. - -### Nota {#agg_functions-stochasticlinearregression-notes} - -1. Para fusionar dos modelos, el usuario puede crear dicha consulta: - `sql SELECT state1 + state2 FROM your_models` - donde `your_models` la tabla contiene ambos modelos. Esta consulta devolverá un nuevo `AggregateFunctionState` objeto. - -2. El usuario puede obtener pesos del modelo creado para sus propios fines sin guardar el modelo si no `-State` combinador se utiliza. 
- `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` - Dicha consulta se ajustará al modelo y devolverá sus pesos: primero son los pesos, que corresponden a los parámetros del modelo, el último es el sesgo. Entonces, en el ejemplo anterior, la consulta devolverá una columna con 3 valores. - -**Ver también** - -- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) -- [Diferencia entre regresiones lineales y logísticas](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} - -Esta función implementa la regresión logística estocástica. Se puede usar para problemas de clasificación binaria, admite los mismos parámetros personalizados que stochasticLinearRegression y funciona de la misma manera. - -### Parámetros {#agg_functions-stochasticlogisticregression-parameters} - -Los parámetros son exactamente los mismos que en stochasticLinearRegression: -`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -Para obtener más información, consulte [parámetros](#agg_functions-stochasticlinearregression-parameters). - -``` text -stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') -``` - -1. Accesorio - - - - See the `Fitting` section in the [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) description. - - Predicted labels have to be in \[-1, 1\]. - -1. Predecir - - - - Using saved state we can predict probability of object having label `1`. - - ``` sql - WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) FROM test_data - ``` - - The query will return a column of probabilities. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. - - We can also set a bound of probability, which assigns elements to different labels. - - ``` sql - SELECT ans < 1.1 AND ans > 0.5 FROM - (WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) AS ans FROM test_data) - ``` - - Then the result will be labels. - - `test_data` is a table like `train_data` but may not contain target value. - -**Ver también** - -- [stochasticLinearRegression](#agg_functions-stochasticlinearregression) -- [Diferencia entre regresiones lineales y logísticas.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## Método de codificación de datos: {#groupbitmapand} - -Calcula el AND de una columna de mapa de bits, devuelve la cardinalidad del tipo UInt64, si agrega el sufijo -State, luego devuelve [objeto de mapa de bits](../../sql-reference/functions/bitmap-functions.md). - -``` sql -groupBitmapAnd(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` tipo. - -**Valor de retorno** - -Valor de la `UInt64` tipo. 
- -**Ejemplo** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapAnd(z)─┐ -│                 3 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐ -│ [6,8,10]                                         │ -└──────────────────────────────────────────────────┘ -``` - -## groupBitmapOr {#groupbitmapor} - -Calcula el OR de una columna de mapa de bits y devuelve la cardinalidad del tipo UInt64; si agrega el sufijo -State, devuelve un [objeto de mapa de bits](../../sql-reference/functions/bitmap-functions.md). Esto es equivalente a `groupBitmapMerge`. - -``` sql -groupBitmapOr(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` tipo. - -**Valor de retorno** - -Valor del tipo `UInt64`. - -**Ejemplo** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapOr(z)─┐ -│               15 │ -└──────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐ -│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]           │ -└─────────────────────────────────────────────────┘ -``` - -## groupBitmapXor {#groupbitmapxor} - -Calcula el XOR de una columna de mapa de bits y devuelve la cardinalidad del tipo UInt64; si agrega el sufijo -State, devuelve un [objeto de mapa de bits](../../sql-reference/functions/bitmap-functions.md). - -``` sql -groupBitmapXor(expr) -``` - -**Parámetros** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` tipo. - -**Valor de retorno** - -Valor del tipo `UInt64`.
- -**Ejemplo** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapXor(z)─┐ -│ 10 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ -│ [1,3,5,6,8,10,11,13,14,15] │ -└──────────────────────────────────────────────────┘ -``` - -[Artículo Original](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/es/sql-reference/ansi.md b/docs/es/sql-reference/ansi.md deleted file mode 100644 index 29e2c5b12e9..00000000000 --- a/docs/es/sql-reference/ansi.md +++ /dev/null @@ -1,180 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: ad252bbb4f7e2899c448eb42ecc39ff195c8faa1 -toc_priority: 40 -toc_title: Compatibilidad con ANSI ---- - -# Compatibilidad de SQL ANSI de ClickHouse SQL Dialect {#ansi-sql-compatibility-of-clickhouse-sql-dialect} - -!!! note "Nota" - Este artículo se basa en la Tabla 38, “Feature taxonomy and definition for mandatory features”, Annex F of ISO/IEC CD 9075-2:2013. - -## Diferencias en el comportamiento {#differences-in-behaviour} - -En la tabla siguiente se enumeran los casos en que la característica de consulta funciona en ClickHouse, pero no se comporta como se especifica en ANSI SQL. 
- -| Feature ID | Nombre de la función | Diferencia | -|------------|----------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------| -| E011 | Tipos de datos numéricos | El literal numérico con punto se interpreta como aproximado (`Float64`) en lugar de exacta (`Decimal`) | -| E051-05 | Los elementos seleccionados pueden ser renombrados | Los cambios de nombre de los elementos tienen un alcance de visibilidad más amplio que solo el resultado SELECT | -| E141-01 | Restricciones NOT NULL | `NOT NULL` está implícito para las columnas de tabla de forma predeterminada | -| E011-04 | Operadores aritméticos | ClickHouse se desborda en lugar de la aritmética comprobada y cambia el tipo de datos de resultado en función de las reglas personalizadas | - -## Estado de la función {#feature-status} - -| Feature ID | Nombre de la función | Estatus | Comentario | -|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **E011** | **Tipos de datos numéricos** | **Parcial**{.text-warning} | | -| E011-01 | Tipos de datos INTEGER y SMALLINT | Sí {.text-success} | | -| E011-02 | REAL, DOUBLE PRECISION y FLOAT tipos de datos tipos de datos | Parcial {.text-warning} | `FLOAT()`, `REAL` y `DOUBLE PRECISION` no son compatibles | -| E011-03 | Tipos de datos DECIMAL y NUMERIC | Parcial {.text-warning} | Solo `DECIMAL(p,s)` es compatible, no `NUMERIC` | -| E011-04 | Operadores aritméticos | Sí {.text-success} | | -| E011-05 | Comparación numérica | Sí {.text-success} | | -| E011-06 | Conversión implícita entre los tipos de datos numéricos | No {.text-danger} | ANSI SQL permite la conversión implícita arbitraria entre tipos numéricos, mientras que ClickHouse se basa en funciones que tienen múltiples sobrecargas en lugar de conversión implícita | -| **E021** | **Tipos de cadena de caracteres** | **Parcial**{.text-warning} | | -| E021-01 | Tipo de datos CHARACTER | No {.text-danger} | | -| E021-02 | Tipo de datos CHARACTER VARYING | No {.text-danger} | `String` se comporta de manera similar, pero sin límite de longitud entre paréntesis | -| E021-03 | Literales de caracteres | Parcial {.text-warning} | Sin concatenación automática de literales consecutivos y compatibilidad con el conjunto de caracteres | -| E021-04 | Función CHARACTER_LENGTH | Parcial {.text-warning} | No `USING` clausula | -| E021-05 | Función OCTET_LENGTH | No {.text-danger} | `LENGTH` se comporta de manera similar | -| E021-06 | SUBSTRING | Parcial {.text-warning} | No hay soporte para `SIMILAR` y `ESCAPE` cláusulas, no `SUBSTRING_REGEX` variante | -| E021-07 | Concatenación de caracteres | Parcial {.text-warning} | No `COLLATE` clausula | -| E021-08 | Funciones SUPERIOR e INFERIOR | Sí {.text-success} | | -| E021-09 | Función TRIM | Sí {.text-success} | | -| E021-10 | Conversión implícita entre los tipos de cadena de caracteres de longitud fija y longitud variable | No {.text-danger} | ANSI SQL permite la conversión implícita arbitraria entre tipos de cadena, mientras que ClickHouse se basa en funciones que tienen múltiples sobrecargas en lugar de conversión implícita | -| 
E021-11 | Función POSITION | Parcial {.text-warning} | No hay soporte para `IN` y `USING` cláusulas, no `POSITION_REGEX` variante | -| E021-12 | Comparación de caracteres | Sí {.text-success} | | -| **E031** | **Identificador** | **Parcial**{.text-warning} | | -| E031-01 | Identificadores delimitados | Parcial {.text-warning} | El soporte literal Unicode es limitado | -| E031-02 | Identificadores de minúsculas | Sí {.text-success} | | -| E031-03 | Trailing subrayado | Sí {.text-success} | | -| **E051** | **Especificación básica de la consulta** | **Parcial**{.text-warning} | | -| E051-01 | SELECT DISTINCT | Sí {.text-success} | | -| E051-02 | Cláusula GROUP BY | Sí {.text-success} | | -| E051-04 | GROUP BY puede contener columnas que no estén en `` | Oui {.text-success} | | -| E051-05 | Les éléments sélectionnés peuvent être renommés | Oui {.text-success} | | -| E051-06 | Clause HAVING | Oui {.text-success} | | -| E051-07 | Qualifié \* dans la liste select | Oui {.text-success} | | -| E051-08 | Nom de corrélation dans la clause FROM | Oui {.text-success} | | -| E051-09 | Renommer les colonnes de la clause FROM | Aucun {.text-danger} | | -| **E061** | **Prédicats de base et conditions de recherche** | **Partiel**{.text-warning} | | -| E061-01 | Prédicat de comparaison | Oui {.text-success} | | -| E061-02 | Entre prédicat | Partiel {.text-warning} | Aucun `SYMMETRIC` et `ASYMMETRIC` clause | -| E061-03 | Dans le prédicat avec la liste des valeurs | Oui {.text-success} | | -| E061-04 | Comme prédicat | Oui {.text-success} | | -| E061-05 | Comme prédicat: clause D'échappement | Aucun {.text-danger} | | -| E061-06 | Prédicat NULL | Oui {.text-success} | | -| E061-07 | Prédicat de comparaison quantifié | Aucun {.text-danger} | | -| E061-08 | Existe prédicat | Aucun {.text-danger} | | -| E061-09 | Sous-requêtes dans le prédicat de comparaison | Oui {.text-success} | | -| E061-11 | Sous-requêtes dans dans le prédicat | Oui {.text-success} | | -| E061-12 | Sous-requêtes dans le prédicat de comparaison quantifiée | Aucun {.text-danger} | | -| E061-13 | Sous-requêtes corrélées | Aucun {.text-danger} | | -| E061-14 | Condition de recherche | Oui {.text-success} | | -| **E071** | **Expressions de requête de base** | **Partiel**{.text-warning} | | -| E071-01 | Opérateur de table distinct UNION | Aucun {.text-danger} | | -| E071-02 | Opérateur de table UNION ALL | Oui {.text-success} | | -| E071-03 | Sauf opérateur de table DISTINCT | Aucun {.text-danger} | | -| E071-05 | Les colonnes combinées via les opérateurs de table n'ont pas besoin d'avoir exactement le même type de données | Oui {.text-success} | | -| E071-06 | Tableau des opérateurs dans les sous-requêtes | Oui {.text-success} | | -| **E081** | **Les privilèges de base** | **Partiel**{.text-warning} | Les travaux en cours | -| **E091** | **Les fonctions de jeu** | **Oui**{.text-success} | | -| E091-01 | AVG | Oui {.text-success} | | -| E091-02 | COUNT | Oui {.text-success} | | -| E091-03 | MAX | Oui {.text-success} | | -| E091-04 | MIN | Oui {.text-success} | | -| E091-05 | SUM | Oui {.text-success} | | -| E091-06 | TOUS les quantificateurs | Aucun {.text-danger} | | -| E091-07 | Quantificateur DISTINCT | Partiel {.text-warning} | Toutes les fonctions d'agrégation ne sont pas prises en charge | -| **E101** | **Manipulation des données de base** | **Partiel**{.text-warning} | | -| E101-01 | Insérer une déclaration | Oui {.text-success} | Remarque: la clé primaire dans ClickHouse n'implique pas `UNIQUE` contrainte | -| E101-03 | Déclaration de 
mise à jour recherchée | Aucun {.text-danger} | Il y a un `ALTER UPDATE` déclaration pour la modification des données de lot | -| E101-04 | Requête de suppression recherchée | Aucun {.text-danger} | Il y a un `ALTER DELETE` déclaration pour la suppression de données par lots | -| **E111** | **Instruction SELECT à une ligne** | **Aucun**{.text-danger} | | -| **E121** | **Prise en charge du curseur de base** | **Aucun**{.text-danger} | | -| E121-01 | DECLARE CURSOR | Aucun {.text-danger} | | -| E121-02 | Les colonnes ORDER BY n'ont pas besoin d'être dans la liste select | Aucun {.text-danger} | | -| E121-03 | Expressions de valeur dans la clause ORDER BY | Aucun {.text-danger} | | -| E121-04 | Instruction OPEN | Aucun {.text-danger} | | -| E121-06 | Déclaration de mise à jour positionnée | Aucun {.text-danger} | | -| E121-07 | Instruction de suppression positionnée | Aucun {.text-danger} | | -| E121-08 | Déclaration de fermeture | Aucun {.text-danger} | | -| E121-10 | Instruction FETCH: implicite suivant | Aucun {.text-danger} | | -| E121-17 | Avec curseurs HOLD | Aucun {.text-danger} | | -| **E131** | **Support de valeur Null (nulls au lieu de valeurs)** | **Partiel**{.text-warning} | Certaines restrictions s'appliquent | -| **E141** | **Contraintes d'intégrité de base** | **Partiel**{.text-warning} | | -| E141-01 | Contraintes non nulles | Oui {.text-success} | Note: `NOT NULL` est implicite pour les colonnes de table par défaut | -| E141-02 | Contrainte UNIQUE de colonnes non nulles | Aucun {.text-danger} | | -| E141-03 | Contraintes de clé primaire | Aucun {.text-danger} | | -| E141-04 | Contrainte de clé étrangère de base avec la valeur par défaut NO ACTION Pour l'action de suppression référentielle et l'action de mise à jour référentielle | Aucun {.text-danger} | | -| E141-06 | Vérifier la contrainte | Oui {.text-success} | | -| E141-07 | Colonne par défaut | Oui {.text-success} | | -| E141-08 | Non NULL déduit sur la clé primaire | Oui {.text-success} | | -| E141-10 | Les noms dans une clé étrangère peut être spécifié dans n'importe quel ordre | Aucun {.text-danger} | | -| **E151** | **Support de Transaction** | **Aucun**{.text-danger} | | -| E151-01 | COMMIT déclaration | Aucun {.text-danger} | | -| E151-02 | Déclaration de restauration | Aucun {.text-danger} | | -| **E152** | **Instruction de transaction set de base** | **Aucun**{.text-danger} | | -| E152-01 | SET TRANSACTION statement: clause sérialisable de niveau D'isolement | Aucun {.text-danger} | | -| E152-02 | SET TRANSACTION statement: clauses en lecture seule et en lecture écriture | Aucun {.text-danger} | | -| **E153** | **Requêtes pouvant être mises à jour avec des sous requêtes** | **Aucun**{.text-danger} | | -| **E161** | **Commentaires SQL en utilisant le premier Double moins** | **Oui**{.text-success} | | -| **E171** | **Support SQLSTATE** | **Aucun**{.text-danger} | | -| **E182** | **Liaison du langage hôte** | **Aucun**{.text-danger} | | -| **F031** | **Manipulation de schéma de base** | **Partiel**{.text-warning} | | -| F031-01 | Instruction CREATE TABLE pour créer des tables de base persistantes | Partiel {.text-warning} | Aucun `SYSTEM VERSIONING`, `ON COMMIT`, `GLOBAL`, `LOCAL`, `PRESERVE`, `DELETE`, `REF IS`, `WITH OPTIONS`, `UNDER`, `LIKE`, `PERIOD FOR` clauses et aucun support pour les types de données résolus par l'utilisateur | -| F031-02 | Instruction créer une vue | Partiel {.text-warning} | Aucun `RECURSIVE`, `CHECK`, `UNDER`, `WITH OPTIONS` clauses et aucun support pour les types de données résolus par 
l'utilisateur | -| F031-03 | Déclaration de subvention | Oui {.text-success} | | -| F031-04 | ALTER TABLE statement: ajouter une clause de colonne | Partiel {.text-warning} | Pas de support pour `GENERATED` clause et période de temps du système | -| F031-13 | Instruction DROP TABLE: clause RESTRICT | Aucun {.text-danger} | | -| F031-16 | Instruction DROP VIEW: clause RESTRICT | Aucun {.text-danger} | | -| F031-19 | REVOKE statement: clause RESTRICT | Aucun {.text-danger} | | -| **F041** | **Table jointe de base** | **Partiel**{.text-warning} | | -| F041-01 | INNER join (mais pas nécessairement le mot-clé INNER) | Oui {.text-success} | | -| F041-02 | INTÉRIEURE mot-clé | Oui {.text-success} | | -| F041-03 | LEFT OUTER JOIN | Oui {.text-success} | | -| F041-04 | RIGHT OUTER JOIN | Oui {.text-success} | | -| F041-05 | Les jointures externes peuvent être imbriqués | Oui {.text-success} | | -| F041-07 | La table intérieure dans une jointure extérieure gauche ou droite peut également être utilisée dans une jointure intérieure | Oui {.text-success} | | -| F041-08 | Tous les opérateurs de comparaison sont pris en charge (plutôt que juste =) | Aucun {.text-danger} | | -| **F051** | **Date et heure de base** | **Partiel**{.text-warning} | | -| F051-01 | Type de données de DATE (y compris la prise en charge du littéral de DATE) | Partiel {.text-warning} | Aucun littéral | -| F051-02 | TYPE DE DONNÉES DE TEMPS (y compris la prise en charge du littéral de temps) avec une précision de secondes fractionnaires d'au moins 0 | Aucun {.text-danger} | | -| F051-03 | Type de données D'horodatage (y compris la prise en charge du littéral D'horodatage) avec une précision de secondes fractionnaires d'au moins 0 et 6 | Aucun {.text-danger} | `DateTime64` temps fournit des fonctionnalités similaires | -| F051-04 | Prédicat de comparaison sur les types de données DATE, heure et horodatage | Partiel {.text-warning} | Un seul type de données disponible | -| F051-05 | Distribution explicite entre les types datetime et les types de chaînes de caractères | Oui {.text-success} | | -| F051-06 | CURRENT_DATE | Aucun {.text-danger} | `today()` est similaire | -| F051-07 | LOCALTIME | Aucun {.text-danger} | `now()` est similaire | -| F051-08 | LOCALTIMESTAMP | Aucun {.text-danger} | | -| **F081** | **UNION et sauf dans les vues** | **Partiel**{.text-warning} | | -| **F131** | **Groupées des opérations** | **Partiel**{.text-warning} | | -| F131-01 | WHERE, GROUP BY et ayant des clauses prises en charge dans les requêtes avec des vues groupées | Oui {.text-success} | | -| F131-02 | Plusieurs tables prises en charge dans les requêtes avec des vues groupées | Oui {.text-success} | | -| F131-03 | Définir les fonctions prises en charge dans les requêtes groupées vues | Oui {.text-success} | | -| F131-04 | Sous requêtes avec des clauses GROUP BY et HAVING et des vues groupées | Oui {.text-success} | | -| F131-05 | Sélectionnez une seule ligne avec des clauses GROUP BY et HAVING et des vues groupées | Aucun {.text-danger} | | -| **F181** | **Support de module Multiple** | **Aucun**{.text-danger} | | -| **F201** | **Fonction de distribution** | **Oui**{.text-success} | | -| **F221** | **Valeurs par défaut explicites** | **Aucun**{.text-danger} | | -| **F261** | **Expression de cas** | **Oui**{.text-success} | | -| F261-01 | Cas Simple | Oui {.text-success} | | -| F261-02 | Cas recherché | Oui {.text-success} | | -| F261-03 | NULLIF | Oui {.text-success} | | -| F261-04 | COALESCE | Oui {.text-success} | | -| **F311** | **Déclaration de 
définition de schéma** | **Partiel**{.text-warning} | | -| F311-01 | CREATE SCHEMA | Aucun {.text-danger} | | -| F311-02 | Créer une TABLE pour les tables de base persistantes | Oui {.text-success} | | -| F311-03 | CREATE VIEW | Oui {.text-success} | | -| F311-04 | CREATE VIEW: WITH CHECK OPTION | Aucun {.text-danger} | | -| F311-05 | Déclaration de subvention | Oui {.text-success} | | -| **F471** | **Valeurs de sous-requête scalaire** | **Oui**{.text-success} | | -| **F481** | **Prédicat null étendu** | **Oui**{.text-success} | | -| **F812** | **Base de repérage** | **Aucun**{.text-danger} | | -| **T321** | **Routines SQL-invoked de base** | **Aucun**{.text-danger} | | -| T321-01 | Fonctions définies par l'utilisateur sans surcharge | Aucun {.text-danger} | | -| T321-02 | Procédures stockées définies par l'utilisateur sans surcharge | Aucun {.text-danger} | | -| T321-03 | L'invocation de la fonction | Aucun {.text-danger} | | -| T321-04 | L'instruction d'APPEL de | Aucun {.text-danger} | | -| T321-05 | Déclaration de retour | Aucun {.text-danger} | | -| **T631** | **Dans le prédicat avec un élément de liste** | **Oui**{.text-success} | | diff --git a/docs/fr/sql-reference/data-types/aggregatefunction.md b/docs/fr/sql-reference/data-types/aggregatefunction.md deleted file mode 100644 index 18874cd3cb7..00000000000 --- a/docs/fr/sql-reference/data-types/aggregatefunction.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 52 -toc_title: AggregateFunction (nom, types_of_arguments...) ---- - -# AggregateFunction(name, types_of_arguments…) {#data-type-aggregatefunction} - -Aggregate functions can have an implementation-defined intermediate state that can be serialized to an AggregateFunction(…) data type and stored in a table, usually, by means of [une vue matérialisée](../../sql-reference/statements/create.md#create-view). La manière courante de produire un État de fonction d'agrégat est d'appeler la fonction d'agrégat avec le `-State` suffixe. Pour obtenir le résultat final de l'agrégation dans l'avenir, vous devez utiliser la même fonction d'agrégation avec la `-Merge`suffixe. - -`AggregateFunction` — parametric data type. - -**Paramètre** - -- Nom de la fonction d'agrégation. - - If the function is parametric, specify its parameters too. - -- Types des arguments de la fonction d'agrégation. - -**Exemple** - -``` sql -CREATE TABLE t -( - column1 AggregateFunction(uniq, UInt64), - column2 AggregateFunction(anyIf, String, UInt8), - column3 AggregateFunction(quantiles(0.5, 0.9), UInt64) -) ENGINE = ... -``` - -[uniq](../../sql-reference/aggregate-functions/reference.md#agg_function-uniq), anyIf ([tout](../../sql-reference/aggregate-functions/reference.md#agg_function-any)+[Si](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-if)) et [les quantiles](../../sql-reference/aggregate-functions/reference.md) les fonctions d'agrégation sont-elles prises en charge dans ClickHouse. - -## Utilisation {#usage} - -### Insertion De Données {#data-insertion} - -Pour insérer des données, utilisez `INSERT SELECT` avec le regroupement d' `-State`- fonction. - -**Exemples de fonction** - -``` sql -uniqState(UserID) -quantilesState(0.5, 0.9)(SendTiming) -``` - -Contrairement aux fonctions correspondantes `uniq` et `quantiles`, `-State`- les fonctions renvoient l'état, au lieu de la valeur finale. En d'autres termes, ils renvoient une valeur de `AggregateFunction` type. 
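À titre d'esquisse (les noms de table `agg_visits` et `visits_source` sont hypothétiques), voici comment un état d'agrégation peut être écrit dans une colonne `AggregateFunction` avec `INSERT SELECT` et une fonction à suffixe `-State` :

``` sql
-- la table de destination stocke l'état intermédiaire de uniq, pas la valeur finale
CREATE TABLE agg_visits
(
    RegionID UInt32,
    Users AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
ORDER BY RegionID;

-- uniqState renvoie un état de type AggregateFunction(uniq, UInt64)
INSERT INTO agg_visits
SELECT RegionID, uniqState(UserID)
FROM visits_source
GROUP BY RegionID;
```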
- -Dans les résultats de `SELECT` requête, les valeurs de `AggregateFunction` type ont une représentation binaire spécifique à l'implémentation pour tous les formats de sortie ClickHouse. Si les données de vidage dans, par exemple, `TabSeparated` format avec `SELECT` requête, puis ce vidage peut être chargé en utilisant `INSERT` requête. - -### Sélection De Données {#data-selection} - -Lors de la sélection des données `AggregatingMergeTree` table, utilisez `GROUP BY` et les mêmes fonctions d'agrégat que lors de l'insertion de données, mais en utilisant `-Merge`suffixe. - -Une fonction d'agrégation avec `-Merge` suffixe prend un ensemble d'états, les combine, et renvoie le résultat complet de l'agrégation de données. - -Par exemple, les deux requêtes suivantes retournent le même résultat: - -``` sql -SELECT uniq(UserID) FROM table - -SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) -``` - -## Exemple D'Utilisation {#usage-example} - -Voir [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) Description du moteur. - -[Article Original](https://clickhouse.tech/docs/en/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/fr/sql-reference/data-types/array.md b/docs/fr/sql-reference/data-types/array.md deleted file mode 100644 index 41772cab177..00000000000 --- a/docs/fr/sql-reference/data-types/array.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 51 -toc_title: Array(T) ---- - -# Array(t) {#data-type-array} - -Un tableau de `T`les éléments de type. `T` peut être n'importe quel type de données, y compris un tableau. - -## La création d'un Tableau {#creating-an-array} - -Vous pouvez utiliser une fonction pour créer un tableau: - -``` sql -array(T) -``` - -Vous pouvez également utiliser des crochets. - -``` sql -[] -``` - -Exemple de création d'un tableau: - -``` sql -SELECT array(1, 2) AS x, toTypeName(x) -``` - -``` text -┌─x─────┬─toTypeName(array(1, 2))─┐ -│ [1,2] │ Array(UInt8) │ -└───────┴─────────────────────────┘ -``` - -``` sql -SELECT [1, 2] AS x, toTypeName(x) -``` - -``` text -┌─x─────┬─toTypeName([1, 2])─┐ -│ [1,2] │ Array(UInt8) │ -└───────┴────────────────────┘ -``` - -## Utilisation de Types de données {#working-with-data-types} - -Lors de la création d'un tableau à la volée, ClickHouse définit automatiquement le type d'argument comme le type de données le plus étroit pouvant stocker tous les arguments listés. S'il y a des [Nullable](nullable.md#data_type-nullable) ou littéral [NULL](../../sql-reference/syntax.md#null-literal) les valeurs, le type d'un élément de tableau devient également [Nullable](nullable.md). - -Si ClickHouse n'a pas pu déterminer le type de données, il génère une exception. Par exemple, cela se produit lorsque vous essayez de créer un tableau avec des chaînes et des nombres simultanément (`SELECT array(1, 'a')`). - -Exemples de détection automatique de type de données: - -``` sql -SELECT array(1, 2, NULL) AS x, toTypeName(x) -``` - -``` text -┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ -│ [1,2,NULL] │ Array(Nullable(UInt8)) │ -└────────────┴───────────────────────────────┘ -``` - -Si vous essayez de créer un tableau de types de données incompatibles, ClickHouse lève une exception: - -``` sql -SELECT array(1, 'a') -``` - -``` text -Received exception from server (version 1.1.54388): -Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. 
DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/array/) diff --git a/docs/fr/sql-reference/data-types/boolean.md b/docs/fr/sql-reference/data-types/boolean.md deleted file mode 100644 index aeb84cf1cc1..00000000000 --- a/docs/fr/sql-reference/data-types/boolean.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 43 -toc_title: "Bool\xE9en" ---- - -# Les Valeurs Booléennes {#boolean-values} - -Il n'y a pas de type distinct pour les valeurs booléennes. Utilisez le type UInt8, limité aux valeurs 0 ou 1. - -[Article Original](https://clickhouse.tech/docs/en/data_types/boolean/) diff --git a/docs/fr/sql-reference/data-types/date.md b/docs/fr/sql-reference/data-types/date.md deleted file mode 100644 index 698639f1d2f..00000000000 --- a/docs/fr/sql-reference/data-types/date.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 47 -toc_title: Date ---- - -# Date {#date} - -Date. Stocké en deux octets comme le nombre de jours depuis 1970-01-01 (non signé). Permet de stocker des valeurs juste après le début de L'époque Unix jusqu'au seuil supérieur défini par une constante au stade de la compilation (actuellement, c'est jusqu'à l'année 2106, mais l'année finale entièrement prise en charge est 2105). - -La valeur de date est stockée sans le fuseau horaire. - -[Article Original](https://clickhouse.tech/docs/en/data_types/date/) diff --git a/docs/fr/sql-reference/data-types/datetime.md b/docs/fr/sql-reference/data-types/datetime.md deleted file mode 100644 index 915270e4d2b..00000000000 --- a/docs/fr/sql-reference/data-types/datetime.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 48 -toc_title: DateTime ---- - -# Datetime {#data_type-datetime} - -Permet de stocker un instant dans le temps, qui peut être exprimé comme une date de calendrier et une heure d'une journée. - -Syntaxe: - -``` sql -DateTime([timezone]) -``` - -Plage de valeurs prise en charge: \[1970-01-01 00:00:00, 2105-12-31 23:59:59\]. - -Résolution: 1 seconde. - -## Utilisation Remarques {#usage-remarks} - -Le point dans le temps est enregistré en tant que [Le timestamp Unix](https://en.wikipedia.org/wiki/Unix_time), quel que soit le fuseau horaire ou l'heure d'été. En outre, l' `DateTime` type peut stocker le fuseau horaire qui est le même pour la colonne entière, qui affecte la façon dont les valeurs de la `DateTime` les valeurs de type sont affichées au format texte et comment les valeurs spécifiées en tant que chaînes sont analysées (‘2020-01-01 05:00:01’). Le fuseau horaire n'est pas stocké dans les lignes de la table (ou dans resultset), mais est stocké dans les métadonnées de la colonne. -Une liste des fuseaux horaires pris en charge peut être trouvée dans le [Base de données de fuseau horaire IANA](https://www.iana.org/time-zones). -Le `tzdata` paquet, contenant [Base de données de fuseau horaire IANA](https://www.iana.org/time-zones), doit être installé dans le système. L'utilisation de la `timedatectl list-timezones` commande pour lister les fuseaux horaires connus par un système local. 
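Esquisse rapide (en supposant que la fonction `timezone()` soit disponible dans votre version, elle n'est pas mentionnée ci-dessus) : le fuseau horaire effectif du serveur peut aussi être consulté directement en SQL :

``` sql
-- renvoie par exemple 'Europe/Moscow', selon la configuration du serveur
SELECT timezone()
```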
- -Vous pouvez définir explicitement un fuseau horaire `DateTime`- tapez des colonnes lors de la création d'une table. Si le fuseau horaire n'est pas défini, ClickHouse utilise la valeur [fuseau](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) paramètre dans les paramètres du serveur ou les paramètres du système d'exploitation au moment du démarrage du serveur ClickHouse. - -Le [clickhouse-client](../../interfaces/cli.md) applique le fuseau horaire du serveur par défaut si un fuseau horaire n'est pas explicitement défini lors de l'initialisation du type de données. Pour utiliser le fuseau horaire du client, exécutez `clickhouse-client` avec l' `--use_client_time_zone` paramètre. - -Clickhouse affiche les valeurs dans `YYYY-MM-DD hh:mm:ss` format de texte par défaut. Vous pouvez modifier la sortie avec le [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) fonction. - -Lorsque vous insérez des données dans ClickHouse, vous pouvez utiliser différents formats de chaînes de date et d'heure, en fonction de la valeur du [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) paramètre. - -## Exemple {#examples} - -**1.** Création d'une table avec un `DateTime`- tapez la colonne et insérez des données dedans: - -``` sql -CREATE TABLE dt -( - `timestamp` DateTime('Europe/Moscow'), - `event_id` UInt8 -) -ENGINE = TinyLog; -``` - -``` sql -INSERT INTO dt Values (1546300800, 1), ('2019-01-01 00:00:00', 2); -``` - -``` sql -SELECT * FROM dt; -``` - -``` text -┌───────────timestamp─┬─event_id─┐ -│ 2019-01-01 03:00:00 │ 1 │ -│ 2019-01-01 00:00:00 │ 2 │ -└─────────────────────┴──────────┘ -``` - -- Lors de l'insertion de datetime en tant qu'entier, il est traité comme un horodatage Unix (UTC). `1546300800` représenter `'2019-01-01 00:00:00'` L'UTC. Cependant, comme `timestamp` la colonne a `Europe/Moscow` (UTC+3) fuseau horaire spécifié, lors de la sortie en tant que chaîne, la valeur sera affichée comme `'2019-01-01 03:00:00'` -- Lors de l'insertion d'une valeur de chaîne en tant que datetime, elle est traitée comme étant dans le fuseau horaire de la colonne. `'2019-01-01 00:00:00'` sera considérée comme étant en `Europe/Moscow` fuseau horaire et enregistré sous `1546290000`. - -**2.** Le filtrage sur `DateTime` valeur - -``` sql -SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow') -``` - -``` text -┌───────────timestamp─┬─event_id─┐ -│ 2019-01-01 00:00:00 │ 2 │ -└─────────────────────┴──────────┘ -``` - -`DateTime` les valeurs de colonne peuvent être filtrées à l'aide d'une `WHERE` prédicat. 
Elle sera convertie `DateTime` automatiquement: - -``` sql -SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' -``` - -``` text -┌───────────timestamp─┬─event_id─┐ -│ 2019-01-01 03:00:00 │ 1 │ -└─────────────────────┴──────────┘ -``` - -**3.** Obtenir un fuseau horaire pour un `DateTime`colonne de type: - -``` sql -SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x -``` - -``` text -┌──────────────column─┬─x─────────────────────────┐ -│ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │ -└─────────────────────┴───────────────────────────┘ -``` - -**4.** Conversion de fuseau horaire - -``` sql -SELECT -toDateTime(timestamp, 'Europe/London') as lon_time, -toDateTime(timestamp, 'Europe/Moscow') as mos_time -FROM dt -``` - -``` text -┌───────────lon_time──┬────────────mos_time─┐ -│ 2019-01-01 00:00:00 │ 2019-01-01 03:00:00 │ -│ 2018-12-31 21:00:00 │ 2019-01-01 00:00:00 │ -└─────────────────────┴─────────────────────┘ -``` - -## Voir Aussi {#see-also} - -- [Fonctions de conversion de Type](../../sql-reference/functions/type-conversion-functions.md) -- [Fonctions pour travailler avec des dates et des heures](../../sql-reference/functions/date-time-functions.md) -- [Fonctions pour travailler avec des tableaux](../../sql-reference/functions/array-functions.md) -- [Le `date_time_input_format` paramètre](../../operations/settings/settings.md#settings-date_time_input_format) -- [Le `timezone` paramètre de configuration du serveur](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) -- [Opérateurs pour travailler avec des dates et des heures](../../sql-reference/operators/index.md#operators-datetime) -- [Le `Date` type de données](date.md) - -[Article Original](https://clickhouse.tech/docs/en/data_types/datetime/) diff --git a/docs/fr/sql-reference/data-types/datetime64.md b/docs/fr/sql-reference/data-types/datetime64.md deleted file mode 100644 index 027891c595d..00000000000 --- a/docs/fr/sql-reference/data-types/datetime64.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 49 -toc_title: DateTime64 ---- - -# Datetime64 {#data_type-datetime64} - -Permet de stocker un instant dans le temps, qui peut être exprimé comme une date de calendrier et une heure d'un jour, avec une précision de sous-seconde définie - -Tick taille (précision): 10-précision deuxième - -Syntaxe: - -``` sql -DateTime64(precision, [timezone]) -``` - -En interne, stocke les données comme un certain nombre de ‘ticks’ depuis le début de l'époque (1970-01-01 00: 00: 00 UTC) comme Int64. La résolution des tiques est déterminée par le paramètre de précision. En outre, l' `DateTime64` type peut stocker le fuseau horaire qui est le même pour la colonne entière, qui affecte la façon dont les valeurs de la `DateTime64` les valeurs de type sont affichées au format texte et comment les valeurs spécifiées en tant que chaînes sont analysées (‘2020-01-01 05:00:01.000’). Le fuseau horaire n'est pas stocké dans les lignes de la table (ou dans resultset), mais est stocké dans les métadonnées de la colonne. Voir les détails dans [DateTime](datetime.md). 
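Petite esquisse, avant les exemples complets ci-dessous : la précision détermine le nombre de chiffres de sous-seconde conservés, c'est-à-dire la taille du tick de 10^(-précision) seconde :

``` sql
SELECT
    toDateTime64('2019-01-01 00:00:00.123', 3, 'UTC') AS millisecondes,      -- précision 3 : 3 chiffres de fraction
    toDateTime64('2019-01-01 00:00:00.123456', 6, 'UTC') AS microsecondes    -- précision 6 : 6 chiffres de fraction
```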
- -## Exemple {#examples} - -**1.** Création d'une table avec `DateTime64`- tapez la colonne et insérez des données dedans: - -``` sql -CREATE TABLE dt -( - `timestamp` DateTime64(3, 'Europe/Moscow'), - `event_id` UInt8 -) -ENGINE = TinyLog -``` - -``` sql -INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) -``` - -``` sql -SELECT * FROM dt -``` - -``` text -┌───────────────timestamp─┬─event_id─┐ -│ 2019-01-01 03:00:00.000 │ 1 │ -│ 2019-01-01 00:00:00.000 │ 2 │ -└─────────────────────────┴──────────┘ -``` - -- Lors de l'insertion de datetime en tant qu'entier, il est traité comme un horodatage Unix (UTC) mis à l'échelle de manière appropriée. `1546300800000` (avec précision 3) représente `'2019-01-01 00:00:00'` L'UTC. Cependant, comme `timestamp` la colonne a `Europe/Moscow` (UTC+3) fuseau horaire spécifié, lors de la sortie sous forme de chaîne, la valeur sera affichée comme `'2019-01-01 03:00:00'` -- Lors de l'insertion d'une valeur de chaîne en tant que datetime, elle est traitée comme étant dans le fuseau horaire de la colonne. `'2019-01-01 00:00:00'` sera considérée comme étant en `Europe/Moscow` fuseau horaire et stocké comme `1546290000000`. - -**2.** Le filtrage sur `DateTime64` valeur - -``` sql -SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') -``` - -``` text -┌───────────────timestamp─┬─event_id─┐ -│ 2019-01-01 00:00:00.000 │ 2 │ -└─────────────────────────┴──────────┘ -``` - -Contrairement `DateTime`, `DateTime64` les valeurs ne sont pas converties depuis `String` automatiquement - -**3.** Obtenir un fuseau horaire pour un `DateTime64`-le type de la valeur: - -``` sql -SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x -``` - -``` text -┌──────────────────column─┬─x──────────────────────────────┐ -│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │ -└─────────────────────────┴────────────────────────────────┘ -``` - -**4.** Conversion de fuseau horaire - -``` sql -SELECT -toDateTime64(timestamp, 3, 'Europe/London') as lon_time, -toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time -FROM dt -``` - -``` text -┌───────────────lon_time──┬────────────────mos_time─┐ -│ 2019-01-01 00:00:00.000 │ 2019-01-01 03:00:00.000 │ -│ 2018-12-31 21:00:00.000 │ 2019-01-01 00:00:00.000 │ -└─────────────────────────┴─────────────────────────┘ -``` - -## Voir Aussi {#see-also} - -- [Fonctions de conversion de Type](../../sql-reference/functions/type-conversion-functions.md) -- [Fonctions pour travailler avec des dates et des heures](../../sql-reference/functions/date-time-functions.md) -- [Fonctions pour travailler avec des tableaux](../../sql-reference/functions/array-functions.md) -- [Le `date_time_input_format` paramètre](../../operations/settings/settings.md#settings-date_time_input_format) -- [Le `timezone` paramètre de configuration du serveur](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) -- [Opérateurs pour travailler avec des dates et des heures](../../sql-reference/operators/index.md#operators-datetime) -- [`Date` type de données](date.md) -- [`DateTime` type de données](datetime.md) diff --git a/docs/fr/sql-reference/data-types/decimal.md b/docs/fr/sql-reference/data-types/decimal.md deleted file mode 100644 index 171bc1cf6dd..00000000000 --- a/docs/fr/sql-reference/data-types/decimal.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 
42 -toc_title: "D\xE9cimal" ---- - -# Décimal (P, S), Décimal32 (S), Décimal64 (S), Décimal128 (S) {#decimalp-s-decimal32s-decimal64s-decimal128s} - -Nombres à points fixes signés qui conservent la précision pendant les opérations d'addition, de soustraction et de multiplication. Pour la division, les chiffres les moins significatifs sont ignorés (non arrondis). - -## Paramètre {#parameters} - -- P-précision. Plage valide: \[1: 38 \]. Détermine le nombre de chiffres décimaux nombre peut avoir (fraction y compris). -- S - échelle. Plage valide: \[0: P \]. Détermine le nombre de chiffres décimaux fraction peut avoir. - -En fonction de P Paramètre Valeur décimal (P, S) est un synonyme de: -- P à partir de \[ 1: 9\] - Pour Décimal32 (S) -- P à partir de \[10: 18\] - pour Décimal64 (S) -- P à partir de \[19: 38\] - pour Décimal128 (S) - -## Plages De Valeurs Décimales {#decimal-value-ranges} - -- Décimal32 (S) - ( -1 \* 10^(9 - S), 1 \* 10^(9-S) ) -- Décimal64 (S) - ( -1 \* 10^(18 - S), 1 \* 10^(18-S) ) -- Décimal128 (S) - ( -1 \* 10^(38 - S), 1 \* 10^(38-S) ) - -Par exemple, Decimal32(4) peut contenir des nombres de -99999.9999 à 99999.9999 avec 0,0001 étape. - -## Représentation Interne {#internal-representation} - -En interne, les données sont représentées comme des entiers signés normaux avec une largeur de bit respective. Les plages de valeurs réelles qui peuvent être stockées en mémoire sont un peu plus grandes que celles spécifiées ci-dessus, qui sont vérifiées uniquement lors de la conversion à partir d'une chaîne. - -Parce que les processeurs modernes ne prennent pas en charge les entiers 128 bits nativement, les opérations sur Decimal128 sont émulées. Pour cette raison, Decimal128 fonctionne significativement plus lentement que Decimal32 / Decimal64. - -## Opérations et type de résultat {#operations-and-result-type} - -Les opérations binaires sur le résultat décimal dans le type de résultat plus large (avec n'importe quel ordre d'arguments). - -- `Decimal64(S1) Decimal32(S2) -> Decimal64(S)` -- `Decimal128(S1) Decimal32(S2) -> Decimal128(S)` -- `Decimal128(S1) Decimal64(S2) -> Decimal128(S)` - -Règles pour l'échelle: - -- ajouter, soustraire: S = max (S1, S2). -- multuply: S = S1 + S2. -- diviser: S = S1. - -Pour des opérations similaires entre décimal et entier, le résultat est Décimal de la même taille qu'un argument. - -Les opérations entre Decimal et Float32 / Float64 ne sont pas définies. Si vous en avez besoin, vous pouvez explicitement lancer l'un des arguments en utilisant les builtins toDecimal32, toDecimal64, toDecimal128 ou toFloat32, toFloat64. Gardez à l'esprit que le résultat perdra de la précision et que la conversion de type est une opération coûteuse en calcul. - -Certaines fonctions sur le résultat de retour décimal comme Float64 (par exemple, var ou stddev). Les calculs intermédiaires peuvent toujours être effectués en décimal, ce qui peut conduire à des résultats différents entre les entrées Float64 et Decimal avec les mêmes valeurs. - -## Contrôles De Débordement {#overflow-checks} - -Pendant les calculs sur Décimal, des débordements entiers peuvent se produire. Les chiffres excessifs dans une fraction sont éliminés (non arrondis). Les chiffres excessifs dans la partie entière conduiront à une exception. 
- -``` sql -SELECT toDecimal32(2, 4) AS x, x / 3 -``` - -``` text -┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ -│ 2.0000 │ 0.6666 │ -└────────┴──────────────────────────────┘ -``` - -``` sql -SELECT toDecimal32(4.2, 8) AS x, x * x -``` - -``` text -DB::Exception: Scale is out of bounds. -``` - -``` sql -SELECT toDecimal32(4.2, 8) AS x, 6 * x -``` - -``` text -DB::Exception: Decimal math overflow. -``` - -Les contrôles de débordement entraînent un ralentissement des opérations. S'il est connu que les débordements ne sont pas possibles, il est logique de désactiver les contrôles en utilisant `decimal_check_overflow` paramètre. Lorsque des contrôles sont désactivés et le débordement se produit, le résultat sera faux: - -``` sql -SET decimal_check_overflow = 0; -SELECT toDecimal32(4.2, 8) AS x, 6 * x -``` - -``` text -┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ -│ 4.20000000 │ -17.74967296 │ -└────────────┴──────────────────────────────────┘ -``` - -Les contrôles de débordement se produisent non seulement sur les opérations arithmétiques mais aussi sur la comparaison de valeurs: - -``` sql -SELECT toDecimal32(1, 8) < 100 -``` - -``` text -DB::Exception: Can't compare. -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/decimal/) diff --git a/docs/fr/sql-reference/data-types/domains/index.md b/docs/fr/sql-reference/data-types/domains/index.md deleted file mode 100644 index 7e11f9a8a68..00000000000 --- a/docs/fr/sql-reference/data-types/domains/index.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Domaine -toc_priority: 56 -toc_title: "Aper\xE7u" ---- - -# Domaine {#domains} - -Les domaines sont des types spéciaux qui ajoutent des fonctionnalités supplémentaires au sommet du type de base existant, mais en laissant le format on-wire et on-disc du type de données sous-jacent intact. À l'heure actuelle, ClickHouse ne prend pas en charge les domaines définis par l'utilisateur. - -Vous pouvez utiliser des domaines partout type de base correspondant peut être utilisé, par exemple: - -- Créer une colonne d'un type de domaine -- Valeurs de lecture / écriture depuis / vers la colonne de domaine -- L'utiliser comme un indice si un type de base peut être utilisée comme un indice -- Fonctions d'appel avec des valeurs de colonne de domaine - -### Fonctionnalités supplémentaires des domaines {#extra-features-of-domains} - -- Nom de type de colonne explicite dans `SHOW CREATE TABLE` ou `DESCRIBE TABLE` -- Entrée du format convivial avec `INSERT INTO domain_table(domain_column) VALUES(...)` -- Sortie au format convivial pour `SELECT domain_column FROM domain_table` -- Chargement de données à partir d'une source externe dans un format convivial: `INSERT INTO domain_table FORMAT CSV ...` - -### Limitation {#limitations} - -- Impossible de convertir la colonne d'index du type de base en type de domaine via `ALTER TABLE`. -- Impossible de convertir implicitement des valeurs de chaîne en valeurs de domaine lors de l'insertion de données d'une autre colonne ou table. -- Le domaine n'ajoute aucune contrainte sur les valeurs stockées. 
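Esquisse illustrant la deuxième limitation ci-dessus (les noms `hits_ipv4`, `raw_log` et `ip_string` sont hypothétiques, et la fonction `toIPv4()` est supposée disponible) : lors d'un `INSERT ... SELECT` depuis une colonne `String`, la conversion vers le type de domaine doit être explicite :

``` sql
-- échoue : pas de conversion implicite de String vers IPv4
-- INSERT INTO hits_ipv4 (from) SELECT ip_string FROM raw_log;

-- fonctionne : conversion explicite
INSERT INTO hits_ipv4 (from) SELECT toIPv4(ip_string) FROM raw_log;
```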
- -[Article Original](https://clickhouse.tech/docs/en/data_types/domains/overview) diff --git a/docs/fr/sql-reference/data-types/domains/ipv4.md b/docs/fr/sql-reference/data-types/domains/ipv4.md deleted file mode 100644 index 12895992e77..00000000000 --- a/docs/fr/sql-reference/data-types/domains/ipv4.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 59 -toc_title: IPv4 ---- - -## IPv4 {#ipv4} - -`IPv4` est un domaine basé sur `UInt32` tapez et sert de remplacement typé pour stocker des valeurs IPv4. Il fournit un stockage compact avec le format d'entrée-sortie convivial et les informations de type de colonne sur l'inspection. - -### Utilisation De Base {#basic-usage} - -``` sql -CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; - -DESCRIBE TABLE hits; -``` - -``` text -┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ -│ url │ String │ │ │ │ │ -│ from │ IPv4 │ │ │ │ │ -└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ -``` - -Ou vous pouvez utiliser le domaine IPv4 comme clé: - -``` sql -CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from; -``` - -`IPv4` le domaine prend en charge le format d'entrée personnalisé en tant que chaînes IPv4: - -``` sql -INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.tech', '183.247.232.58')('https://clickhouse.tech/docs/en/', '116.106.34.242'); - -SELECT * FROM hits; -``` - -``` text -┌─url────────────────────────────────┬───────────from─┐ -│ https://clickhouse.tech/docs/en/ │ 116.106.34.242 │ -│ https://wikipedia.org │ 116.253.40.133 │ -│ https://clickhouse.tech │ 183.247.232.58 │ -└────────────────────────────────────┴────────────────┘ -``` - -Les valeurs sont stockées sous forme binaire compacte: - -``` sql -SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(from)─┬─hex(from)─┐ -│ IPv4 │ B7F7E83A │ -└──────────────────┴───────────┘ -``` - -Les valeurs de domaine ne sont pas implicitement convertibles en types autres que `UInt32`. -Si vous voulez convertir `IPv4` valeur à une chaîne, vous devez le faire explicitement avec `IPv4NumToString()` fonction: - -``` sql -SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; -``` - - ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ - │ String │ 183.247.232.58 │ - └───────────────────────────────────┴────────────────┘ - -Ou coulé à un `UInt32` valeur: - -``` sql -SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ -│ UInt32 │ 3086477370 │ -└──────────────────────────────────┴────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/domains/ipv4) diff --git a/docs/fr/sql-reference/data-types/domains/ipv6.md b/docs/fr/sql-reference/data-types/domains/ipv6.md deleted file mode 100644 index 77510a950cb..00000000000 --- a/docs/fr/sql-reference/data-types/domains/ipv6.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 60 -toc_title: IPv6 ---- - -## IPv6 {#ipv6} - -`IPv6` est un domaine basé sur `FixedString(16)` tapez et sert de remplacement typé pour stocker des valeurs IPv6. Il fournit un stockage compact avec le format d'entrée-sortie convivial et les informations de type de colonne sur l'inspection. 
- -### Utilisation De Base {#basic-usage} - -``` sql -CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; - -DESCRIBE TABLE hits; -``` - -``` text -┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ -│ url │ String │ │ │ │ │ -│ from │ IPv6 │ │ │ │ │ -└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ -``` - -Ou vous pouvez utiliser `IPv6` domaine comme l'un des principaux: - -``` sql -CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; -``` - -`IPv6` le domaine prend en charge l'entrée personnalisée en tant que chaînes IPv6: - -``` sql -INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.tech', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.tech/docs/en/', '2a02:e980:1e::1'); - -SELECT * FROM hits; -``` - -``` text -┌─url────────────────────────────────┬─from──────────────────────────┐ -│ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ -│ https://clickhouse.tech/docs/en/ │ 2a02:e980:1e::1 │ -│ https://wikipedia.org │ 2a02:aa08:e000:3100::2 │ -└────────────────────────────────────┴───────────────────────────────┘ -``` - -Les valeurs sont stockées sous forme binaire compacte: - -``` sql -SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(from)─┬─hex(from)────────────────────────┐ -│ IPv6 │ 200144C8012926320033000002520002 │ -└──────────────────┴──────────────────────────────────┘ -``` - -Les valeurs de domaine ne sont pas implicitement convertibles en types autres que `FixedString(16)`. -Si vous voulez convertir `IPv6` valeur à une chaîne, vous devez le faire explicitement avec `IPv6NumToString()` fonction: - -``` sql -SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ -│ String │ 2001:44c8:129:2632:33:0:252:2 │ -└───────────────────────────────────┴───────────────────────────────┘ -``` - -Ou coulé à un `FixedString(16)` valeur: - -``` sql -SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; -``` - -``` text -┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ -│ FixedString(16) │ ��� │ -└───────────────────────────────────────────┴─────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/domains/ipv6) diff --git a/docs/fr/sql-reference/data-types/enum.md b/docs/fr/sql-reference/data-types/enum.md deleted file mode 100644 index b9751c1c804..00000000000 --- a/docs/fr/sql-reference/data-types/enum.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 50 -toc_title: Enum ---- - -# Enum {#enum} - -Type énuméré composé de valeurs nommées. - -Les valeurs nommées doivent être déclarées comme `'string' = integer` pair. ClickHouse ne stocke que des nombres, mais prend en charge les opérations avec les valeurs à travers leurs noms. - -Supports ClickHouse: - -- 8-bit `Enum`. Il peut contenir jusqu'à 256 valeurs énumérées dans le `[-128, 127]` gamme. -- 16 bits `Enum`. Il peut contenir jusqu'à 65 536 valeurs énumérées dans le `[-32768, 32767]` gamme. - -Clickhouse choisit automatiquement le type de `Enum` lorsque les données sont insérées. Vous pouvez également utiliser `Enum8` ou `Enum16` types pour être sûr de la taille de stockage. 
- -## Exemples D'Utilisation {#usage-examples} - -Ici, nous créons une table avec une `Enum8('hello' = 1, 'world' = 2)` type de colonne: - -``` sql -CREATE TABLE t_enum -( - x Enum('hello' = 1, 'world' = 2) -) -ENGINE = TinyLog -``` - -Colonne `x` ne peut stocker que les valeurs répertoriées dans la définition de type: `'hello'` ou `'world'`. Si vous essayez d'enregistrer une autre valeur, ClickHouse déclenchera une exception. Taille 8 bits pour cela `Enum` est choisi automatiquement. - -``` sql -INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') -``` - -``` text -Ok. -``` - -``` sql -INSERT INTO t_enum values('a') -``` - -``` text -Exception on client: -Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) -``` - -Lorsque vous interrogez des données de la table, ClickHouse affiche les valeurs de chaîne de `Enum`. - -``` sql -SELECT * FROM t_enum -``` - -``` text -┌─x─────┐ -│ hello │ -│ world │ -│ hello │ -└───────┘ -``` - -Si vous avez besoin de voir les équivalents numériques des lignes, vous devez `Enum` valeur en type entier. - -``` sql -SELECT CAST(x, 'Int8') FROM t_enum -``` - -``` text -┌─CAST(x, 'Int8')─┐ -│ 1 │ -│ 2 │ -│ 1 │ -└─────────────────┘ -``` - -Pour créer une valeur d'Enum dans une requête, vous devez également utiliser `CAST`. - -``` sql -SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) -``` - -``` text -┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ -│ Enum8('a' = 1, 'b' = 2) │ -└─────────────────────────────────────────────────────┘ -``` - -## Règles générales et utilisation {#general-rules-and-usage} - -Chacune des valeurs se voit attribuer un nombre dans la plage `-128 ... 127` pour `Enum8` ou dans la gamme `-32768 ... 32767` pour `Enum16`. Toutes les chaînes et les nombres doivent être différents. Une chaîne vide est autorisé. Si ce type est spécifié (dans une définition de table), les nombres peuvent être dans un ordre arbitraire. Toutefois, l'ordre n'a pas d'importance. - -Ni la chaîne ni la valeur numérique dans un `Enum` peut être [NULL](../../sql-reference/syntax.md). - -Un `Enum` peut être contenue dans [Nullable](nullable.md) type. Donc, si vous créez une table en utilisant la requête - -``` sql -CREATE TABLE t_enum_nullable -( - x Nullable( Enum8('hello' = 1, 'world' = 2) ) -) -ENGINE = TinyLog -``` - -il peut stocker non seulement des `'hello'` et `'world'`, mais `NULL`, ainsi. - -``` sql -INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) -``` - -Dans la mémoire RAM, un `Enum` la colonne est stockée dans la même manière que `Int8` ou `Int16` des valeurs numériques correspondantes. - -Lors de la lecture sous forme de texte, ClickHouse analyse la valeur sous forme de chaîne et recherche la chaîne correspondante à partir de l'ensemble des valeurs Enum. Si elle n'est pas trouvée, une exception est levée. Lors de la lecture au format texte, la chaîne est lue et la valeur numérique correspondante est recherchée. Une exception sera levée si il n'est pas trouvé. -Lors de l'écriture sous forme de texte, il écrit la valeur correspondante de la chaîne. Si les données de colonne contiennent des déchets (nombres qui ne proviennent pas de l'ensemble valide), une exception est levée. Lors de la lecture et de l'écriture sous forme binaire, cela fonctionne de la même manière que pour les types de données Int8 et Int16. -La valeur implicite par défaut est la valeur avec le numéro le plus bas. 
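Esquisse minimale (le nom de table `t_enum_default` est hypothétique) illustrant cette valeur implicite par défaut :

``` sql
CREATE TABLE t_enum_default
(
    id UInt8,
    status Enum8('inconnu' = -1, 'ok' = 1, 'erreur' = 2)
)
ENGINE = TinyLog;

-- la colonne status est omise : elle reçoit la valeur dont le numéro est le plus bas
INSERT INTO t_enum_default (id) VALUES (1);

SELECT id, status FROM t_enum_default; -- attendu : 1, 'inconnu'
```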
- -Lors `ORDER BY`, `GROUP BY`, `IN`, `DISTINCT` et ainsi de suite, les Énumérations se comportent de la même façon que les nombres correspondants. Par exemple, ORDER BY les trie numériquement. Les opérateurs d'égalité et de comparaison fonctionnent de la même manière sur les énumérations que sur les valeurs numériques sous-jacentes. - -Les valeurs Enum ne peuvent pas être comparées aux nombres. Les Enums peuvent être comparés à une chaîne constante. Si la chaîne comparée à n'est pas une valeur valide pour L'énumération, une exception sera levée. L'opérateur est pris en charge avec l'Enum sur le côté gauche, et un ensemble de chaînes sur le côté droit. Les chaînes sont les valeurs de L'énumération correspondante. - -Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. -Cependant, L'énumération a un naturel `toString` fonction qui renvoie sa valeur de chaîne. - -Les valeurs Enum sont également convertibles en types numériques en utilisant `toT` fonction, où T est un type numérique. Lorsque T correspond au type numérique sous-jacent de l'énumération, cette conversion est à coût nul. -Le type Enum peut être modifié sans coût en utilisant ALTER, si seulement l'ensemble des valeurs est modifié. Il est possible d'ajouter et de supprimer des membres de L'énumération en utilisant ALTER (la suppression n'est sûre que si la valeur supprimée n'a jamais été utilisée dans la table). À titre de sauvegarde, la modification de la valeur numérique d'un membre Enum précédemment défini lancera une exception. - -En utilisant ALTER, il est possible de changer un Enum8 en Enum16 ou vice versa, tout comme changer un Int8 en Int16. - -[Article Original](https://clickhouse.tech/docs/en/data_types/enum/) diff --git a/docs/fr/sql-reference/data-types/fixedstring.md b/docs/fr/sql-reference/data-types/fixedstring.md deleted file mode 100644 index 5ba09187581..00000000000 --- a/docs/fr/sql-reference/data-types/fixedstring.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: FixedString (N) ---- - -# Fixedstring {#fixedstring} - -Une chaîne de longueur fixe de `N` octets (ni caractères ni points de code). - -Pour déclarer une colonne de `FixedString` tapez, utilisez la syntaxe suivante: - -``` sql - FixedString(N) -``` - -Où `N` est un nombre naturel. - -Le `FixedString` type est efficace lorsque les données ont la longueur de précisément `N` octet. Dans tous les autres cas, il est susceptible de réduire l'efficacité. - -Exemples de valeurs qui peuvent être stockées efficacement dans `FixedString`-tapé colonnes: - -- La représentation binaire des adresses IP (`FixedString(16)` pour IPv6). -- Language codes (ru_RU, en_US … ). -- Currency codes (USD, RUB … ). -- Représentation binaire des hachages (`FixedString(16)` pour MD5, `FixedString(32)` pour SHA256). - -Pour stocker les valeurs UUID, utilisez [UUID](uuid.md) type de données. - -Lors de l'insertion des données, ClickHouse: - -- Complète une chaîne avec des octets null si la chaîne contient moins de `N` octet. -- Jette le `Too large value for FixedString(N)` exception si la chaîne contient plus de `N` octet. - -Lors de la sélection des données, ClickHouse ne supprime pas les octets nuls à la fin de la chaîne. Si vous utilisez le `WHERE` clause, vous devez ajouter des octets null manuellement pour `FixedString` valeur. 
L'exemple suivant illustre l'utilisation de l' `WHERE` la clause de `FixedString`. - -Considérons le tableau suivant avec le seul `FixedString(2)` colonne: - -``` text -┌─name──┐ -│ b │ -└───────┘ -``` - -Requête `SELECT * FROM FixedStringTable WHERE a = 'b'` ne renvoie aucune donnée en conséquence. Nous devrions compléter le modèle de filtre avec des octets nuls. - -``` sql -SELECT * FROM FixedStringTable -WHERE a = 'b\0' -``` - -``` text -┌─a─┐ -│ b │ -└───┘ -``` - -Ce comportement diffère de MySQL pour le `CHAR` type (où les chaînes sont remplies d'espaces et les espaces sont supprimés pour la sortie). - -À noter que la longueur de la `FixedString(N)` la valeur est constante. Le [longueur](../../sql-reference/functions/array-functions.md#array_functions-length) la fonction renvoie `N` même si l' `FixedString(N)` la valeur est remplie uniquement avec des octets [vide](../../sql-reference/functions/string-functions.md#empty) la fonction renvoie `1` dans ce cas. - -[Article Original](https://clickhouse.tech/docs/en/data_types/fixedstring/) diff --git a/docs/fr/sql-reference/data-types/float.md b/docs/fr/sql-reference/data-types/float.md deleted file mode 100644 index b269b930110..00000000000 --- a/docs/fr/sql-reference/data-types/float.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: Float32, Float64 ---- - -# Float32, Float64 {#float32-float64} - -[Les nombres à virgule flottante](https://en.wikipedia.org/wiki/IEEE_754). - -Les Types sont équivalents aux types de C: - -- `Float32` - `float` -- `Float64` - `double` - -Nous vous recommandons de stocker les données sous forme entière chaque fois que possible. Par exemple, convertissez des nombres de précision fixes en valeurs entières, telles que des montants monétaires ou des temps de chargement de page en millisecondes. - -## Utilisation de nombres à virgule flottante {#using-floating-point-numbers} - -- Calculs avec des nombres à virgule flottante peut produire une erreur d'arrondi. - - - -``` sql -SELECT 1 - 0.9 -``` - -``` text -┌───────minus(1, 0.9)─┐ -│ 0.09999999999999998 │ -└─────────────────────┘ -``` - -- Le résultat du calcul dépend de la méthode de calcul (le type de processeur et de l'architecture du système informatique). -- Les calculs à virgule flottante peuvent entraîner des nombres tels que l'infini (`Inf`) et “not-a-number” (`NaN`). Cela doit être pris en compte lors du traitement des résultats de calculs. -- Lors de l'analyse de nombres à virgule flottante à partir de texte, le résultat peut ne pas être le nombre représentable par machine le plus proche. - -## NaN et Inf {#data_type-float-nan-inf} - -Contrairement à SQL standard, ClickHouse prend en charge les catégories suivantes de nombres à virgule flottante: - -- `Inf` – Infinity. - - - -``` sql -SELECT 0.5 / 0 -``` - -``` text -┌─divide(0.5, 0)─┐ -│ inf │ -└────────────────┘ -``` - -- `-Inf` – Negative infinity. - - - -``` sql -SELECT -0.5 / 0 -``` - -``` text -┌─divide(-0.5, 0)─┐ -│ -inf │ -└─────────────────┘ -``` - -- `NaN` – Not a number. - - - -``` sql -SELECT 0 / 0 -``` - -``` text -┌─divide(0, 0)─┐ -│ nan │ -└──────────────┘ -``` - - See the rules for `NaN` sorting in the section [ORDER BY clause](../sql_reference/statements/select/order-by.md). 
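Pour compléter la remarque sur les erreurs d'arrondi ci-dessus, une comparaison rapide (esquisse) entre le même calcul en virgule flottante et en `Decimal` :

``` sql
SELECT
    1 - 0.9 AS en_float,                                   -- 0.09999999999999998
    toDecimal32(1, 2) - toDecimal32(0.9, 2) AS en_decimal  -- résultat exact : 0.1
```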
- -[Article Original](https://clickhouse.tech/docs/en/data_types/float/) diff --git a/docs/fr/sql-reference/data-types/index.md b/docs/fr/sql-reference/data-types/index.md deleted file mode 100644 index 887e2efd69f..00000000000 --- a/docs/fr/sql-reference/data-types/index.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Types De Donn\xE9es" -toc_priority: 37 -toc_title: Introduction ---- - -# Types De Données {#data_types} - -ClickHouse peut stocker différents types de données dans des cellules de table. - -Cette section décrit les types de données pris en charge et les considérations spéciales pour les utiliser et/ou les implémenter le cas échéant. - -[Article Original](https://clickhouse.tech/docs/en/data_types/) diff --git a/docs/fr/sql-reference/data-types/int-uint.md b/docs/fr/sql-reference/data-types/int-uint.md deleted file mode 100644 index 9b196c164a4..00000000000 --- a/docs/fr/sql-reference/data-types/int-uint.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 ---- - -# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} - -Entiers de longueur fixe, avec ou sans signe. - -## Plages Int {#int-ranges} - -- Int8 - \[-128: 127\] -- Int16 - \[-32768: 32767\] -- Int32 - \[-2147483648: 2147483647\] -- Int64 - \[-9223372036854775808: 9223372036854775807\] - -## Plages Uint {#uint-ranges} - -- UInt8 - \[0: 255\] -- UInt16 - \[0: 65535\] -- UInt32- \[0: 4294967295\] -- UInt64- \[0: 18446744073709551615\] - -[Article Original](https://clickhouse.tech/docs/en/data_types/int_uint/) diff --git a/docs/fr/sql-reference/data-types/nested-data-structures/index.md b/docs/fr/sql-reference/data-types/nested-data-structures/index.md deleted file mode 100644 index 528e0bad0cd..00000000000 --- a/docs/fr/sql-reference/data-types/nested-data-structures/index.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Structures De Donn\xE9es Imbriqu\xE9es" -toc_hidden: true -toc_priority: 54 -toc_title: "cach\xE9s" ---- - -# Structures De Données Imbriquées {#nested-data-structures} - -[Article Original](https://clickhouse.tech/docs/en/data_types/nested_data_structures/) diff --git a/docs/fr/sql-reference/data-types/nested-data-structures/nested.md b/docs/fr/sql-reference/data-types/nested-data-structures/nested.md deleted file mode 100644 index 2805780de24..00000000000 --- a/docs/fr/sql-reference/data-types/nested-data-structures/nested.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 57 -toc_title: "Imbriqu\xE9e(Type1 Nom1, Nom2 Type2, ...)" ---- - -# Nested(name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} - -A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create.md) requête. Chaque ligne de table peut correspondre à n'importe quel nombre de lignes dans une structure de données imbriquée. 
- -Exemple: - -``` sql -CREATE TABLE test.visits -( - CounterID UInt32, - StartDate Date, - Sign Int8, - IsNew UInt8, - VisitID UInt64, - UserID UInt64, - ... - Goals Nested - ( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32 - ), - ... -) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) -``` - -Cet exemple déclare le `Goals` structure de données imbriquée, qui contient des données sur les conversions (objectifs atteints). Chaque ligne de la ‘visits’ table peut correspondre à zéro ou n'importe quel nombre de conversions. - -Un seul niveau d'imbrication est pris en charge. Les colonnes de structures imbriquées contenant des tableaux sont équivalentes à des tableaux multidimensionnels, elles ont donc un support limité (il n'y a pas de support pour stocker ces colonnes dans des tables avec le moteur MergeTree). - -Dans la plupart des cas, lorsque vous travaillez avec une structure de données imbriquée, ses colonnes sont spécifiées avec des noms de colonnes séparés par un point. Ces colonnes constituent un tableau de types correspondants. Tous les tableaux de colonnes d'une structure de données imbriquée unique ont la même longueur. - -Exemple: - -``` sql -SELECT - Goals.ID, - Goals.EventTime -FROM test.visits -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 -``` - -``` text -┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ -│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ -│ [1073752] │ ['2014-03-17 00:28:25'] │ -│ [1073752] │ ['2014-03-17 10:46:20'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ -│ [] │ [] │ -│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ -│ [] │ [] │ -│ [] │ [] │ -│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ -└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -Il est plus facile de penser à une structure de données imbriquée comme un ensemble de plusieurs tableaux de colonnes de la même longueur. - -Le seul endroit où une requête SELECT peut spécifier le nom d'une structure de données imbriquée entière au lieu de colonnes individuelles est la clause de jointure de tableau. Pour plus d'informations, voir “ARRAY JOIN clause”. Exemple: - -``` sql -SELECT - Goal.ID, - Goal.EventTime -FROM test.visits -ARRAY JOIN Goals AS Goal -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 -``` - -``` text -┌─Goal.ID─┬──────Goal.EventTime─┐ -│ 1073752 │ 2014-03-17 16:38:10 │ -│ 591325 │ 2014-03-17 16:38:48 │ -│ 591325 │ 2014-03-17 16:42:27 │ -│ 1073752 │ 2014-03-17 00:28:25 │ -│ 1073752 │ 2014-03-17 10:46:20 │ -│ 1073752 │ 2014-03-17 13:59:20 │ -│ 591325 │ 2014-03-17 22:17:55 │ -│ 591325 │ 2014-03-17 22:18:07 │ -│ 591325 │ 2014-03-17 22:18:51 │ -│ 1073752 │ 2014-03-17 11:37:06 │ -└─────────┴─────────────────────┘ -``` - -Vous ne pouvez pas effectuer SELECT pour une structure de données imbriquée entière. Vous ne pouvez lister explicitement que les colonnes individuelles qui en font partie. 
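La règle d'insertion décrite juste après peut être esquissée ainsi (table simplifiée hypothétique, distincte de `test.visits` ci-dessus) : chaque colonne composant de la structure imbriquée est passée comme un tableau séparé, et tous ces tableaux doivent avoir la même longueur.

``` sql
CREATE TABLE test.nested_demo
(
    UserID UInt64,
    Goals Nested
    (
        ID UInt32,
        EventTime DateTime
    )
)
ENGINE = MergeTree()
ORDER BY UserID;

-- deux tableaux de même longueur : une ligne de table, deux lignes imbriquées
INSERT INTO test.nested_demo (UserID, `Goals.ID`, `Goals.EventTime`)
VALUES (1, [1073752, 591325], ['2014-03-17 16:38:10', '2014-03-17 16:38:48']);
```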
- -Pour une requête INSERT, vous devez passer tous les tableaux de colonnes composant d'une structure de données imbriquée séparément (comme s'il s'agissait de tableaux de colonnes individuels). Au cours de l'insertion, le système vérifie qu'ils ont la même longueur. - -Pour une requête DESCRIBE, les colonnes d'une structure de données imbriquée sont répertoriées séparément de la même manière. - -La requête ALTER pour les éléments d'une structure de données imbriquée a des limites. - -[Article Original](https://clickhouse.tech/docs/en/data_types/nested_data_structures/nested/) diff --git a/docs/fr/sql-reference/data-types/nullable.md b/docs/fr/sql-reference/data-types/nullable.md deleted file mode 100644 index 6b37b571a96..00000000000 --- a/docs/fr/sql-reference/data-types/nullable.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 54 -toc_title: Nullable ---- - -# Nullable(typename) {#data_type-nullable} - -Permet de stocker marqueur spécial ([NULL](../../sql-reference/syntax.md)) qui dénote “missing value” aux valeurs normales autorisées par `TypeName`. Par exemple, un `Nullable(Int8)` type colonne peut stocker `Int8` type de valeurs, et les lignes qui n'ont pas de valeur magasin `NULL`. - -Pour un `TypeName` vous ne pouvez pas utiliser les types de données composites [Tableau](array.md) et [Tuple](tuple.md). Les types de données composites peuvent contenir `Nullable` valeurs de type, telles que `Array(Nullable(Int8))`. - -A `Nullable` le champ type ne peut pas être inclus dans les index de table. - -`NULL` est la valeur par défaut pour tout `Nullable` type, sauf indication contraire dans la configuration du serveur ClickHouse. - -## Caractéristiques De Stockage {#storage-features} - -Stocker `Nullable` valeurs de type dans une colonne de table, ClickHouse utilise un fichier séparé avec `NULL` masques en plus du fichier normal avec des valeurs. Les entrées du fichier masks permettent à ClickHouse de faire la distinction entre `NULL` et une valeur par défaut du type de données correspondant pour chaque ligne de table. En raison d'un fichier supplémentaire, `Nullable` colonne consomme de l'espace de stockage supplémentaire par rapport à une normale similaire. - -!!! info "Note" - Utiliser `Nullable` affecte presque toujours négativement les performances, gardez cela à l'esprit lors de la conception de vos bases de données. - -## Exemple D'Utilisation {#usage-example} - -``` sql -CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog -``` - -``` sql -INSERT INTO t_null VALUES (1, NULL), (2, 3) -``` - -``` sql -SELECT x + y FROM t_null -``` - -``` text -┌─plus(x, y)─┐ -│ ᴺᵁᴸᴸ │ -│ 5 │ -└────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/nullable/) diff --git a/docs/fr/sql-reference/data-types/simpleaggregatefunction.md b/docs/fr/sql-reference/data-types/simpleaggregatefunction.md deleted file mode 100644 index 81fcd67cfae..00000000000 --- a/docs/fr/sql-reference/data-types/simpleaggregatefunction.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# SimpleAggregateFunction {#data-type-simpleaggregatefunction} - -`SimpleAggregateFunction(name, types_of_arguments…)` le type de données stocke la valeur actuelle de la fonction d'agrégat et ne stocke pas son état complet comme [`AggregateFunction`](aggregatefunction.md) faire. 
Cette optimisation peut être appliquée aux fonctions pour lesquelles la propriété suivante est conservée: le résultat de l'application d'une fonction `f` pour un ensemble de lignes `S1 UNION ALL S2` peut être obtenu en appliquant `f` pour les parties de la ligne définie séparément, puis à nouveau l'application `f` pour les résultats: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Cette propriété garantit que les résultats d'agrégation partielle sont suffisants pour calculer le combiné, de sorte que nous n'avons pas à stocker et traiter de données supplémentaires. - -Les fonctions d'agrégation suivantes sont prises en charge: - -- [`any`](../../sql-reference/aggregate-functions/reference.md#agg_function-any) -- [`anyLast`](../../sql-reference/aggregate-functions/reference.md#anylastx) -- [`min`](../../sql-reference/aggregate-functions/reference.md#agg_function-min) -- [`max`](../../sql-reference/aggregate-functions/reference.md#agg_function-max) -- [`sum`](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) -- [`groupBitAnd`](../../sql-reference/aggregate-functions/reference.md#groupbitand) -- [`groupBitOr`](../../sql-reference/aggregate-functions/reference.md#groupbitor) -- [`groupBitXor`](../../sql-reference/aggregate-functions/reference.md#groupbitxor) - -Les valeurs de la `SimpleAggregateFunction(func, Type)` regarder et stockées de la même manière que `Type`, de sorte que vous n'avez pas besoin d'appliquer des fonctions avec `-Merge`/`-State` suffixe. `SimpleAggregateFunction` a de meilleures performances que `AggregateFunction` avec la même fonction d'agrégation. - -**Paramètre** - -- Nom de la fonction d'agrégation. -- Types des arguments de la fonction d'agrégation. - -**Exemple** - -``` sql -CREATE TABLE t -( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) diff --git a/docs/fr/sql-reference/data-types/special-data-types/expression.md b/docs/fr/sql-reference/data-types/special-data-types/expression.md deleted file mode 100644 index c3ba5e42ba1..00000000000 --- a/docs/fr/sql-reference/data-types/special-data-types/expression.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 58 -toc_title: Expression ---- - -# Expression {#expression} - -Les Expressions sont utilisées pour représenter des lambdas dans des fonctions d'ordre Élevé. - -[Article Original](https://clickhouse.tech/docs/en/data_types/special_data_types/expression/) diff --git a/docs/fr/sql-reference/data-types/special-data-types/index.md b/docs/fr/sql-reference/data-types/special-data-types/index.md deleted file mode 100644 index 6d292dc522e..00000000000 --- a/docs/fr/sql-reference/data-types/special-data-types/index.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "Types De Donn\xE9es Sp\xE9ciaux" -toc_hidden: true -toc_priority: 55 -toc_title: "cach\xE9s" ---- - -# Types De Données Spéciaux {#special-data-types} - -Les valeurs de type de données spéciales ne peuvent pas être sérialisées pour l'enregistrement dans une table ou la sortie dans les résultats de la requête, mais peuvent être utilisées comme résultat intermédiaire lors de l'exécution de la requête. 
- -[Article Original](https://clickhouse.tech/docs/en/data_types/special_data_types/) diff --git a/docs/fr/sql-reference/data-types/special-data-types/interval.md b/docs/fr/sql-reference/data-types/special-data-types/interval.md deleted file mode 100644 index 464de8a10ab..00000000000 --- a/docs/fr/sql-reference/data-types/special-data-types/interval.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 61 -toc_title: Intervalle ---- - -# Intervalle {#data-type-interval} - -Famille de types de données représentant des intervalles d'heure et de date. Les types de la [INTERVAL](../../../sql-reference/operators/index.md#operator-interval) opérateur. - -!!! warning "Avertissement" - `Interval` les valeurs de type de données ne peuvent pas être stockées dans les tables. - -Structure: - -- Intervalle de temps en tant que valeur entière non signée. -- Type de l'intervalle. - -Types d'intervalles pris en charge: - -- `SECOND` -- `MINUTE` -- `HOUR` -- `DAY` -- `WEEK` -- `MONTH` -- `QUARTER` -- `YEAR` - -Pour chaque type d'intervalle, il existe un type de données distinct. Par exemple, l' `DAY` l'intervalle correspond au `IntervalDay` type de données: - -``` sql -SELECT toTypeName(INTERVAL 4 DAY) -``` - -``` text -┌─toTypeName(toIntervalDay(4))─┐ -│ IntervalDay │ -└──────────────────────────────┘ -``` - -## Utilisation Remarques {#data-type-interval-usage-remarks} - -Vous pouvez utiliser `Interval`-tapez des valeurs dans des opérations arithmétiques avec [Date](../../../sql-reference/data-types/date.md) et [DateTime](../../../sql-reference/data-types/datetime.md)-type de valeurs. Par exemple, vous pouvez ajouter 4 jours à l'heure actuelle: - -``` sql -SELECT now() as current_date_time, current_date_time + INTERVAL 4 DAY -``` - -``` text -┌───current_date_time─┬─plus(now(), toIntervalDay(4))─┐ -│ 2019-10-23 10:58:45 │ 2019-10-27 10:58:45 │ -└─────────────────────┴───────────────────────────────┘ -``` - -Les intervalles avec différents types ne peuvent pas être combinés. Vous ne pouvez pas utiliser des intervalles comme `4 DAY 1 HOUR`. Spécifiez des intervalles en unités inférieures ou égales à la plus petite unité de l'intervalle, par exemple, l'intervalle `1 day and an hour` l'intervalle peut être exprimée comme `25 HOUR` ou `90000 SECOND`. - -Vous ne pouvez pas effectuer d'opérations arithmétiques avec `Interval`- tapez des valeurs, mais vous pouvez ajouter des intervalles de différents types par conséquent aux valeurs dans `Date` ou `DateTime` types de données. Exemple: - -``` sql -SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR -``` - -``` text -┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ -│ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ -└─────────────────────┴────────────────────────────────────────────────────────┘ -``` - -La requête suivante provoque une exception: - -``` sql -select now() AS current_date_time, current_date_time + (INTERVAL 4 DAY + INTERVAL 3 HOUR) -``` - -``` text -Received exception from server (version 19.14.1): -Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. 
-``` - -## Voir Aussi {#see-also} - -- [INTERVAL](../../../sql-reference/operators/index.md#operator-interval) opérateur -- [toInterval](../../../sql-reference/functions/type-conversion-functions.md#function-tointerval) type fonctions de conversion diff --git a/docs/fr/sql-reference/data-types/special-data-types/nothing.md b/docs/fr/sql-reference/data-types/special-data-types/nothing.md deleted file mode 100644 index 2e3d76b7207..00000000000 --- a/docs/fr/sql-reference/data-types/special-data-types/nothing.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 60 -toc_title: Rien ---- - -# Rien {#nothing} - -Le seul but de ce type de données est de représenter les cas où une valeur n'est pas prévu. Donc vous ne pouvez pas créer un `Nothing` type de valeur. - -Par exemple, littéral [NULL](../../../sql-reference/syntax.md#null-literal) a type de `Nullable(Nothing)`. Voir plus sur [Nullable](../../../sql-reference/data-types/nullable.md). - -Le `Nothing` type peut également être utilisé pour désigner des tableaux vides: - -``` sql -SELECT toTypeName(array()) -``` - -``` text -┌─toTypeName(array())─┐ -│ Array(Nothing) │ -└─────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/special_data_types/nothing/) diff --git a/docs/fr/sql-reference/data-types/special-data-types/set.md b/docs/fr/sql-reference/data-types/special-data-types/set.md deleted file mode 100644 index 8f50175bb6b..00000000000 --- a/docs/fr/sql-reference/data-types/special-data-types/set.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 59 -toc_title: "D\xE9finir" ---- - -# Définir {#set} - -Utilisé pour la moitié droite d'un [IN](../../operators/in.md#select-in-operators) expression. - -[Article Original](https://clickhouse.tech/docs/en/data_types/special_data_types/set/) diff --git a/docs/fr/sql-reference/data-types/string.md b/docs/fr/sql-reference/data-types/string.md deleted file mode 100644 index b82e1fe6c69..00000000000 --- a/docs/fr/sql-reference/data-types/string.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: "Cha\xEEne" ---- - -# Chaîne {#string} - -Les chaînes d'une longueur arbitraire. La longueur n'est pas limitée. La valeur peut contenir un ensemble arbitraire d'octets, y compris des octets nuls. -Le type de chaîne remplace les types VARCHAR, BLOB, CLOB et autres provenant d'autres SGBD. - -## Encodage {#encodings} - -ClickHouse n'a pas le concept d'encodages. Les chaînes peuvent contenir un ensemble arbitraire d'octets, qui sont stockés et sortis tels quels. -Si vous avez besoin de stocker des textes, nous vous recommandons d'utiliser L'encodage UTF-8. À tout le moins, si votre terminal utilise UTF-8 (comme recommandé), vous pouvez lire et écrire vos valeurs sans effectuer de conversions. -De même, certaines fonctions pour travailler avec des chaînes ont des variations distinctes qui fonctionnent sous l'hypothèse que la chaîne contient un ensemble d'octets représentant un texte codé en UTF-8. -Par exemple, l' ‘length’ fonction calcule la longueur de la chaîne en octets, tandis que le ‘lengthUTF8’ la fonction calcule la longueur de la chaîne en points de code Unicode, en supposant que la valeur est encodée en UTF-8. 
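À titre d'illustration (esquisse purement indicative, les noms d'alias sont arbitraires), la requête suivante montre la différence entre `length` et `lengthUTF8` sur une chaîne contenant un caractère multi-octets :

``` sql
-- 'München' compte 7 points de code Unicode, mais 8 octets en UTF-8 (le 'ü' occupe 2 octets)
SELECT length('München') AS octets, lengthUTF8('München') AS points_de_code
```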
- -[Article Original](https://clickhouse.tech/docs/en/data_types/string/) diff --git a/docs/fr/sql-reference/data-types/tuple.md b/docs/fr/sql-reference/data-types/tuple.md deleted file mode 100644 index ab9db735181..00000000000 --- a/docs/fr/sql-reference/data-types/tuple.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 53 -toc_title: Tuple (T1, T2,...) ---- - -# Tuple(t1, T2, …) {#tuplet1-t2} - -Un n-uplet d'éléments, chacun ayant une personne [type](index.md#data_types). - -Les Tuples sont utilisés pour le regroupement temporaire de colonnes. Les colonnes peuvent être regroupées lorsqu'une expression IN est utilisée dans une requête et pour spécifier certains paramètres formels des fonctions lambda. Pour plus d'informations, voir les sections [Dans les opérateurs](../../sql-reference/operators/in.md) et [Des fonctions d'ordre supérieur](../../sql-reference/functions/higher-order-functions.md). - -Les Tuples peuvent être le résultat d'une requête. Dans ce cas, pour les formats de texte autres que JSON, les valeurs sont séparées par des virgules entre parenthèses. Dans les formats JSON, les tuples sont sortis sous forme de tableaux (entre crochets). - -## La création d'un Tuple {#creating-a-tuple} - -Vous pouvez utiliser une fonction pour créer un tuple: - -``` sql -tuple(T1, T2, ...) -``` - -Exemple de création d'un tuple: - -``` sql -SELECT tuple(1,'a') AS x, toTypeName(x) -``` - -``` text -┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ -│ (1,'a') │ Tuple(UInt8, String) │ -└─────────┴───────────────────────────┘ -``` - -## Utilisation de Types de données {#working-with-data-types} - -Lors de la création d'un tuple à la volée, ClickHouse détecte automatiquement le type de chaque argument comme le minimum des types qui peuvent stocker la valeur de l'argument. Si l'argument est [NULL](../../sql-reference/syntax.md#null-literal) le type de l'élément tuple est [Nullable](nullable.md). - -Exemple de détection automatique de type de données: - -``` sql -SELECT tuple(1, NULL) AS x, toTypeName(x) -``` - -``` text -┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ -│ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ -└──────────┴─────────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/data_types/tuple/) diff --git a/docs/fr/sql-reference/data-types/uuid.md b/docs/fr/sql-reference/data-types/uuid.md deleted file mode 100644 index 60973a3f855..00000000000 --- a/docs/fr/sql-reference/data-types/uuid.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 46 -toc_title: UUID ---- - -# UUID {#uuid-data-type} - -Un identifiant unique universel (UUID) est un numéro de 16 octets utilisé pour identifier les enregistrements. Pour plus d'informations sur L'UUID, voir [Wikipedia](https://en.wikipedia.org/wiki/Universally_unique_identifier). - -L'exemple de valeur de type UUID est représenté ci-dessous: - -``` text -61f0c404-5cb3-11e7-907b-a6006ad3dba0 -``` - -Si vous ne spécifiez pas la valeur de la colonne UUID lors de l'insertion d'un nouvel enregistrement, la valeur UUID est remplie avec zéro: - -``` text -00000000-0000-0000-0000-000000000000 -``` - -## Comment générer {#how-to-generate} - -Pour générer la valeur UUID, ClickHouse fournit [generateUUIDv4](../../sql-reference/functions/uuid-functions.md) fonction. 
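Par exemple (esquisse indicative, le nom d'alias est arbitraire) :

``` sql
-- generateUUIDv4 renvoie une valeur de type UUID
SELECT generateUUIDv4() AS uuid, toTypeName(uuid) AS type
```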
- -## Exemple D'Utilisation {#usage-example} - -**Exemple 1** - -Cet exemple montre la création d'une table avec la colonne de type UUID et l'insertion d'une valeur dans la table. - -``` sql -CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog -``` - -``` sql -INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' -``` - -``` sql -SELECT * FROM t_uuid -``` - -``` text -┌────────────────────────────────────x─┬─y─────────┐ -│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ -└──────────────────────────────────────┴───────────┘ -``` - -**Exemple 2** - -Dans cet exemple, la valeur de la colonne UUID n'est pas spécifiée lors de l'insertion d'un nouvel enregistrement. - -``` sql -INSERT INTO t_uuid (y) VALUES ('Example 2') -``` - -``` sql -SELECT * FROM t_uuid -``` - -``` text -┌────────────────────────────────────x─┬─y─────────┐ -│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ -│ 00000000-0000-0000-0000-000000000000 │ Example 2 │ -└──────────────────────────────────────┴───────────┘ -``` - -## Restriction {#restrictions} - -Le type de données UUID ne prend en charge que les fonctions qui [Chaîne](string.md) type de données prend également en charge (par exemple, [min](../../sql-reference/aggregate-functions/reference.md#agg_function-min), [Max](../../sql-reference/aggregate-functions/reference.md#agg_function-max), et [compter](../../sql-reference/aggregate-functions/reference.md#agg_function-count)). - -Le type de données UUID n'est pas pris en charge par les opérations arithmétiques (par exemple, [ABS](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs)) ou des fonctions d'agrégation, comme [somme](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) et [avg](../../sql-reference/aggregate-functions/reference.md#agg_function-avg). - -[Article Original](https://clickhouse.tech/docs/en/data_types/uuid/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md deleted file mode 100644 index cc238f02f3a..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: "Dictionnaires hi\xE9rarchiques" ---- - -# Dictionnaires Hiérarchiques {#hierarchical-dictionaries} - -Clickhouse prend en charge les dictionnaires hiérarchiques avec un [touche numérique](external-dicts-dict-structure.md#ext_dict-numeric-key). - -Voici une structure hiérarchique: - -``` text -0 (Common parent) -│ -├── 1 (Russia) -│ │ -│ └── 2 (Moscow) -│ │ -│ └── 3 (Center) -│ -└── 4 (Great Britain) - │ - └── 5 (London) -``` - -Cette hiérarchie peut être exprimée comme la table de dictionnaire suivante. - -| id_région | région_parent | nom_région | -|------------|----------------|--------------------| -| 1 | 0 | Russie | -| 2 | 1 | Moscou | -| 3 | 2 | Center | -| 4 | 0 | La Grande-Bretagne | -| 5 | 4 | Londres | - -Ce tableau contient une colonne `parent_region` qui contient la clé du parent le plus proche de l'élément. - -Clickhouse soutient le [hiérarchique](external-dicts-dict-structure.md#hierarchical-dict-attr) propriété pour [externe dictionnaire](index.md) attribut. Cette propriété vous permet de configurer le dictionnaire hiérarchique comme décrit ci-dessus. 
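À titre d'esquisse (noms de dictionnaire, de table et paramètres de source purement hypothétiques), la même hiérarchie pourrait être déclarée via une requête DDL en marquant l'attribut parent comme `HIERARCHICAL` ; la fonction `dictGetHierarchy` décrite ci-dessous permet ensuite de remonter la chaîne des parents :

``` sql
-- esquisse : la table source `regions` et ses identifiants sont supposés exister
CREATE DICTIONARY regions_dict
(
    region_id UInt64,
    parent_region UInt64 DEFAULT 0 HIERARCHICAL,
    region_name String DEFAULT ''
)
PRIMARY KEY region_id
SOURCE(CLICKHOUSE(host 'localhost' port 9000 user 'default' password '' db 'default' table 'regions'))
LAYOUT(HASHED())
LIFETIME(300)
```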
- -Le [dictGetHierarchy](../../../sql-reference/functions/ext-dict-functions.md#dictgethierarchy) la fonction vous permet d'obtenir la chaîne parent d'un élément. - -Pour notre exemple, la structure du dictionnaire peut être la suivante: - -``` xml - - - - region_id - - - - parent_region - UInt64 - 0 - true - - - - region_name - String - - - - - -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_hierarchical/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md deleted file mode 100644 index 2569329fefd..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ /dev/null @@ -1,407 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: "Stockage des dictionnaires en m\xE9moire" ---- - -# Stockage des dictionnaires en mémoire {#dicts-external-dicts-dict-layout} - -Il existe une variété de façons de stocker les dictionnaires en mémoire. - -Nous vous recommandons [plat](#flat), [haché](#dicts-external_dicts_dict_layout-hashed) et [complex_key_hashed](#complex-key-hashed). qui fournissent la vitesse de traitement optimale. - -La mise en cache n'est pas recommandée en raison de performances potentiellement médiocres et de difficultés à sélectionner les paramètres optimaux. En savoir plus dans la section “[cache](#cache)”. - -Il existe plusieurs façons d'améliorer les performances du dictionnaire: - -- Appelez la fonction pour travailler avec le dictionnaire après `GROUP BY`. -- Marquer les attributs à extraire comme injectifs. Un attribut est appelé injectif si différentes valeurs d'attribut correspondent à différentes clés. Alors, quand `GROUP BY` utilise une fonction qui récupère une valeur d'attribut par la clé, cette fonction est automatiquement retirée de `GROUP BY`. - -ClickHouse génère une exception pour les erreurs avec les dictionnaires. Des exemples d'erreurs: - -- Le dictionnaire accessible n'a pas pu être chargé. -- Erreur de la requête d'une `cached` dictionnaire. - -Vous pouvez afficher la liste des dictionnaires externes et leurs statuts dans le `system.dictionaries` table. - -La configuration ressemble à ceci: - -``` xml - - - ... - - - - - - ... - - -``` - -Correspondant [DDL-requête](../../statements/create.md#create-dictionary-query): - -``` sql -CREATE DICTIONARY (...) -... -LAYOUT(LAYOUT_TYPE(param value)) -- layout settings -... -``` - -## Façons de stocker des dictionnaires en mémoire {#ways-to-store-dictionaries-in-memory} - -- [plat](#flat) -- [haché](#dicts-external_dicts_dict_layout-hashed) -- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) -- [cache](#cache) -- [direct](#direct) -- [range_hashed](#range-hashed) -- [complex_key_hashed](#complex-key-hashed) -- [complex_key_cache](#complex-key-cache) -- [complex_key_direct](#complex-key-direct) -- [ip_trie](#ip-trie) - -### plat {#flat} - -Le dictionnaire est complètement stocké en mémoire sous la forme de tableaux plats. Combien de mémoire le dictionnaire utilise-t-il? Le montant est proportionnel à la taille de la plus grande clé (dans l'espace). - -La clé du dictionnaire a le `UInt64` type et la valeur est limitée à 500 000. Si une clé plus grande est découverte lors de la création du dictionnaire, ClickHouse lève une exception et ne crée pas le dictionnaire. 
- -Tous les types de sources sont pris en charge. Lors de la mise à jour, les données (à partir d'un fichier ou d'une table) sont lues dans leur intégralité. - -Cette méthode fournit les meilleures performances parmi toutes les méthodes disponibles de stockage du dictionnaire. - -Exemple de Configuration: - -``` xml - - - -``` - -ou - -``` sql -LAYOUT(FLAT()) -``` - -### haché {#dicts-external_dicts_dict_layout-hashed} - -Le dictionnaire est entièrement stockée en mémoire sous la forme d'une table de hachage. Le dictionnaire peut contenir n'importe quel nombre d'éléments avec tous les identificateurs Dans la pratique, le nombre de clés peut atteindre des dizaines de millions d'articles. - -Tous les types de sources sont pris en charge. Lors de la mise à jour, les données (à partir d'un fichier ou d'une table) sont lues dans leur intégralité. - -Exemple de Configuration: - -``` xml - - - -``` - -ou - -``` sql -LAYOUT(HASHED()) -``` - -### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} - -Semblable à `hashed`, mais utilise moins de mémoire en faveur de plus D'utilisation du processeur. - -Exemple de Configuration: - -``` xml - - - -``` - -``` sql -LAYOUT(SPARSE_HASHED()) -``` - -### complex_key_hashed {#complex-key-hashed} - -Ce type de stockage est pour une utilisation avec composite [touches](external-dicts-dict-structure.md). Semblable à `hashed`. - -Exemple de Configuration: - -``` xml - - - -``` - -``` sql -LAYOUT(COMPLEX_KEY_HASHED()) -``` - -### range_hashed {#range-hashed} - -Le dictionnaire est stocké en mémoire sous la forme d'une table de hachage avec un tableau ordonné de gammes et leurs valeurs correspondantes. - -Cette méthode de stockage fonctionne de la même manière que hachée et permet d'utiliser des plages de date / heure (Type numérique arbitraire) en plus de la clé. - -Exemple: Le tableau contient des réductions pour chaque annonceur dans le format: - -``` text -+---------|-------------|-------------|------+ -| advertiser id | discount start date | discount end date | amount | -+===============+=====================+===================+========+ -| 123 | 2015-01-01 | 2015-01-15 | 0.15 | -+---------|-------------|-------------|------+ -| 123 | 2015-01-16 | 2015-01-31 | 0.25 | -+---------|-------------|-------------|------+ -| 456 | 2015-01-01 | 2015-01-15 | 0.05 | -+---------|-------------|-------------|------+ -``` - -Pour utiliser un échantillon pour les plages de dates, définissez `range_min` et `range_max` éléments dans le [structure](external-dicts-dict-structure.md). Ces éléments doivent contenir des éléments `name` et`type` (si `type` n'est pas spécifié, le type par défaut sera utilisé-Date). `type` peut être n'importe quel type numérique (Date / DateTime / UInt64 / Int32 / autres). - -Exemple: - -``` xml - - - Id - - - first - Date - - - last - Date - - ... -``` - -ou - -``` sql -CREATE DICTIONARY somedict ( - id UInt64, - first Date, - last Date -) -PRIMARY KEY id -LAYOUT(RANGE_HASHED()) -RANGE(MIN first MAX last) -``` - -Pour travailler avec ces dictionnaires, vous devez passer un argument supplémentaire à l' `dictGetT` fonction, pour laquelle une plage est sélectionnée: - -``` sql -dictGetT('dict_name', 'attr_name', id, date) -``` - -Cette fonction retourne la valeur pour l' `id`s et la plage de dates qui inclut la date passée. - -Détails de l'algorithme: - -- Si l' `id` est introuvable ou une plage n'est pas trouvé pour l' `id` il retourne la valeur par défaut pour le dictionnaire. 
-- S'il y a des plages qui se chevauchent, vous pouvez en utiliser. -- Si le délimiteur est `NULL` ou une date non valide (telle que 1900-01-01 ou 2039-01-01), la plage est laissée ouverte. La gamme peut être ouverte des deux côtés. - -Exemple de Configuration: - -``` xml - - - - ... - - - - - - - - Abcdef - - - StartTimeStamp - UInt64 - - - EndTimeStamp - UInt64 - - - XXXType - String - - - - - - -``` - -ou - -``` sql -CREATE DICTIONARY somedict( - Abcdef UInt64, - StartTimeStamp UInt64, - EndTimeStamp UInt64, - XXXType String DEFAULT '' -) -PRIMARY KEY Abcdef -RANGE(MIN StartTimeStamp MAX EndTimeStamp) -``` - -### cache {#cache} - -Le dictionnaire est stocké dans un cache qui a un nombre fixe de cellules. Ces cellules contiennent des éléments fréquemment utilisés. - -Lors de la recherche d'un dictionnaire, le cache est recherché en premier. Pour chaque bloc de données, toutes les clés qui ne sont pas trouvées dans le cache ou qui sont obsolètes sont demandées à la source en utilisant `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Les données reçues sont ensuite écrites dans le cache. - -Pour les dictionnaires de cache, l'expiration [vie](external-dicts-dict-lifetime.md) des données dans le cache peuvent être définies. Si plus de temps que `lifetime` passé depuis le chargement des données dans une cellule, la valeur de la cellule n'est pas utilisée et elle est demandée à nouveau la prochaine fois qu'elle doit être utilisée. -C'est la moins efficace de toutes les façons de stocker les dictionnaires. La vitesse du cache dépend fortement des paramètres corrects et que le scénario d'utilisation. Un dictionnaire de type de cache fonctionne bien uniquement lorsque les taux de réussite sont suffisamment élevés (recommandé 99% et plus). Vous pouvez afficher le taux de réussite moyen dans le `system.dictionaries` table. - -Pour améliorer les performances du cache, utilisez une sous-requête avec `LIMIT`, et appelez la fonction avec le dictionnaire en externe. - -Soutenu [source](external-dicts-dict-sources.md): MySQL, ClickHouse, exécutable, HTTP. - -Exemple de paramètres: - -``` xml - - - - 1000000000 - - -``` - -ou - -``` sql -LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) -``` - -Définissez une taille de cache suffisamment grande. Vous devez expérimenter pour sélectionner le nombre de cellules: - -1. Définissez une valeur. -2. Exécutez les requêtes jusqu'à ce que le cache soit complètement plein. -3. Évaluer la consommation de mémoire en utilisant le `system.dictionaries` table. -4. Augmentez ou diminuez le nombre de cellules jusqu'à ce que la consommation de mémoire requise soit atteinte. - -!!! warning "Avertissement" - N'utilisez pas ClickHouse comme source, car le traitement des requêtes avec des lectures aléatoires est lent. - -### complex_key_cache {#complex-key-cache} - -Ce type de stockage est pour une utilisation avec composite [touches](external-dicts-dict-structure.md). Semblable à `cache`. - -### direct {#direct} - -Le dictionnaire n'est pas stocké dans la mémoire et va directement à la source, pendant le traitement d'une demande. - -La clé du dictionnaire a le `UInt64` type. - -Tous les types de [source](external-dicts-dict-sources.md), sauf les fichiers locaux, sont pris en charge. 
- -Exemple de Configuration: - -``` xml - - - -``` - -ou - -``` sql -LAYOUT(DIRECT()) -``` - -### complex_key_direct {#complex-key-direct} - -Ce type de stockage est destiné à être utilisé avec des [clés](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) composites. Similaire à `direct` - -### ip_trie {#ip-trie} - -Ce type de stockage permet de mapper des préfixes de réseau (adresses IP) à des métadonnées telles que ASN. - -Exemple: la table contient les préfixes de réseau et leur correspondant en tant que numéro et Code de pays: - -``` text - +-----------|-----|------+ - | prefix | asn | cca2 | - +=================+=======+========+ - | 202.79.32.0/20 | 17501 | NP | - +-----------|-----|------+ - | 2620:0:870::/48 | 3856 | US | - +-----------|-----|------+ - | 2a02:6b8:1::/48 | 13238 | RU | - +-----------|-----|------+ - | 2001:db8::/32 | 65536 | ZZ | - +-----------|-----|------+ -``` - -Lorsque vous utilisez ce type de mise en page, la structure doit avoir une clé composite. - -Exemple: - -``` xml - - - - prefix - String - - - - asn - UInt32 - - - - cca2 - String - ?? - - ... - - - - true - - -``` - -ou - -``` sql -CREATE DICTIONARY somedict ( - prefix String, - asn UInt32, - cca2 String DEFAULT '??' -) -PRIMARY KEY prefix -``` - -La clé ne doit avoir qu'un seul attribut de type chaîne contenant un préfixe IP autorisé. Les autres types ne sont pas encore pris en charge. - -Pour les requêtes, vous devez utiliser les mêmes fonctions (`dictGetT` avec un n-uplet) comme pour les dictionnaires avec des clés composites: - -``` sql -dictGetT('dict_name', 'attr_name', tuple(ip)) -``` - -La fonction prend soit `UInt32` pour IPv4, ou `FixedString(16)` pour IPv6: - -``` sql -dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) -``` - -Les autres types ne sont pas encore pris en charge. La fonction renvoie l'attribut du préfixe correspondant à cette adresse IP. S'il y a chevauchement des préfixes, le plus spécifique est retourné. - -Les données doit complètement s'intégrer dans la RAM. - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_layout/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md deleted file mode 100644 index 8ce78919ff1..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 42 -toc_title: "Mises \xC0 Jour Du Dictionnaire" ---- - -# Mises À Jour Du Dictionnaire {#dictionary-updates} - -ClickHouse met périodiquement à jour les dictionnaires. L'intervalle de mise à jour pour les dictionnaires entièrement téléchargés et l'intervalle d'invalidation pour les dictionnaires `` tag en quelques secondes. - -Les mises à jour du dictionnaire (autres que le chargement pour la première utilisation) ne bloquent pas les requêtes. Lors des mises à jour, l'ancienne version d'un dictionnaire est utilisée. Si une erreur se produit pendant une mise à jour, l'erreur est écrite dans le journal du serveur et les requêtes continuent d'utiliser l'ancienne version des dictionnaires. - -Exemple de paramètres: - -``` xml - - ... - 300 - ... - -``` - -``` sql -CREATE DICTIONARY (...) -... -LIFETIME(300) -... 
-``` - -Paramètre `0` (`LIFETIME(0)`) empêche la mise à jour des dictionnaires. - -Vous pouvez définir un intervalle de temps pour les mises à niveau, et ClickHouse choisira un temps uniformément aléatoire dans cette plage. Ceci est nécessaire pour répartir la charge sur la source du dictionnaire lors de la mise à niveau sur un grand nombre de serveurs. - -Exemple de paramètres: - -``` xml - - ... - - 300 - 360 - - ... - -``` - -ou - -``` sql -LIFETIME(MIN 300 MAX 360) -``` - -Si `0` et `0`, ClickHouse ne recharge pas le dictionnaire par timeout. -Dans ce cas, ClickHouse peut recharger le dictionnaire plus tôt si le fichier de configuration du dictionnaire a été `SYSTEM RELOAD DICTIONARY` la commande a été exécutée. - -Lors de la mise à niveau des dictionnaires, le serveur ClickHouse applique une logique différente selon le type de [source](external-dicts-dict-sources.md): - -Lors de la mise à niveau des dictionnaires, le serveur ClickHouse applique une logique différente selon le type de [source](external-dicts-dict-sources.md): - -- Pour un fichier texte, il vérifie l'heure de la modification. Si l'heure diffère de l'heure enregistrée précédemment, le dictionnaire est mis à jour. -- Pour les tables MyISAM, l'Heure de modification est vérifiée à l'aide d'un `SHOW TABLE STATUS` requête. -- Les dictionnaires d'autres sources sont mis à jour à chaque fois par défaut. - -Pour les sources MySQL (InnoDB), ODBC et ClickHouse, vous pouvez configurer une requête qui mettra à jour les dictionnaires uniquement s'ils ont vraiment changé, plutôt que chaque fois. Pour ce faire, suivez ces étapes: - -- La table de dictionnaire doit avoir un champ qui change toujours lorsque les données source sont mises à jour. -- Les paramètres de la source doivent spécifier une requête qui récupère le champ de modification. Le serveur ClickHouse interprète le résultat de la requête comme une ligne, et si cette ligne a changé par rapport à son état précédent, le dictionnaire est mis à jour. Spécifier la requête dans le `` champ dans les paramètres pour le [source](external-dicts-dict-sources.md). - -Exemple de paramètres: - -``` xml - - ... - - ... - SELECT update_time FROM dictionary_source where id = 1 - - ... - -``` - -ou - -``` sql -... -SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) -... -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md deleted file mode 100644 index 4c608fa7188..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ /dev/null @@ -1,630 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 43 -toc_title: Sources de dictionnaires externes ---- - -# Sources de dictionnaires externes {#dicts-external-dicts-dict-sources} - -Externe dictionnaire peut être connecté à partir de nombreuses sources différentes. - -Si dictionary est configuré à l'aide de xml-file, la configuration ressemble à ceci: - -``` xml - - - ... - - - - - - ... - - ... - -``` - -En cas de [DDL-requête](../../statements/create.md#create-dictionary-query), configuration égale ressemblera à: - -``` sql -CREATE DICTIONARY dict_name (...) -... -SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration -... 
-``` - -La source est configurée dans le `source` section. - -Pour les types de source [Fichier Local](#dicts-external_dicts_dict_sources-local_file), [Fichier exécutable](#dicts-external_dicts_dict_sources-executable), [HTTP(S)](#dicts-external_dicts_dict_sources-http), [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) -les paramètres optionnels sont disponibles: - -``` xml - - - /opt/dictionaries/os.tsv - TabSeparated - - - 0 - - -``` - -ou - -``` sql -SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) -SETTINGS(format_csv_allow_single_quotes = 0) -``` - -Les Types de sources (`source_type`): - -- [Fichier Local](#dicts-external_dicts_dict_sources-local_file) -- [Fichier exécutable](#dicts-external_dicts_dict_sources-executable) -- [HTTP(S)](#dicts-external_dicts_dict_sources-http) -- DBMS - - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#dicts-external_dicts_dict_sources-mysql) - - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - - [Redis](#dicts-external_dicts_dict_sources-redis) - -## Fichier Local {#dicts-external_dicts_dict_sources-local_file} - -Exemple de paramètres: - -``` xml - - - /opt/dictionaries/os.tsv - TabSeparated - - -``` - -ou - -``` sql -SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) -``` - -Définition des champs: - -- `path` – The absolute path to the file. -- `format` – The file format. All the formats described in “[Format](../../../interfaces/formats.md#formats)” sont pris en charge. - -## Fichier Exécutable {#dicts-external_dicts_dict_sources-executable} - -Travailler avec des fichiers exécutables en dépend [comment le dictionnaire est stocké dans la mémoire](external-dicts-dict-layout.md). Si le dictionnaire est stocké en utilisant `cache` et `complex_key_cache`, Clickhouse demande les clés nécessaires en envoyant une requête au STDIN du fichier exécutable. Sinon, ClickHouse démarre le fichier exécutable et traite sa sortie comme des données de dictionnaire. - -Exemple de paramètres: - -``` xml - - - cat /opt/dictionaries/os.tsv - TabSeparated - - -``` - -ou - -``` sql -SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) -``` - -Définition des champs: - -- `command` – The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). -- `format` – The file format. All the formats described in “[Format](../../../interfaces/formats.md#formats)” sont pris en charge. - -## Http(s) {#dicts-external_dicts_dict_sources-http} - -Travailler avec un serveur HTTP (S) dépend de [comment le dictionnaire est stocké dans la mémoire](external-dicts-dict-layout.md). Si le dictionnaire est stocké en utilisant `cache` et `complex_key_cache`, Clickhouse demande les clés nécessaires en envoyant une demande via le `POST` méthode. - -Exemple de paramètres: - -``` xml - - - http://[::1]/os.tsv - TabSeparated - - user - password - - -
- API-KEY - key -
-
-
- -``` - -ou - -``` sql -SOURCE(HTTP( - url 'http://[::1]/os.tsv' - format 'TabSeparated' - credentials(user 'user' password 'password') - headers(header(name 'API-KEY' value 'key')) -)) -``` - -Pour que ClickHouse accède à une ressource HTTPS, vous devez [configurer openSSL](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-openssl) dans la configuration du serveur. - -Définition des champs: - -- `url` – The source URL. -- `format` – The file format. All the formats described in “[Format](../../../interfaces/formats.md#formats)” sont pris en charge. -- `credentials` – Basic HTTP authentication. Optional parameter. - - `user` – Username required for the authentication. - - `password` – Password required for the authentication. -- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. - - `header` – Single HTTP header entry. - - `name` – Identifiant name used for the header send on the request. - - `value` – Value set for a specific identifiant name. - -## ODBC {#dicts-external_dicts_dict_sources-odbc} - -Vous pouvez utiliser cette méthode pour connecter n'importe quelle base de données dotée d'un pilote ODBC. - -Exemple de paramètres: - -``` xml - - - DatabaseName - ShemaName.TableName
- DSN=some_parameters - SQL_QUERY -
- -``` - -ou - -``` sql -SOURCE(ODBC( - db 'DatabaseName' - table 'SchemaName.TableName' - connection_string 'DSN=some_parameters' - invalidate_query 'SQL_QUERY' -)) -``` - -Définition des champs: - -- `db` – Name of the database. Omit it if the database name is set in the `` paramètre. -- `table` – Name of the table and schema if exists. -- `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Mise à jour des dictionnaires](external-dicts-dict-lifetime.md). - -ClickHouse reçoit des symboles de citation D'ODBC-driver et cite tous les paramètres des requêtes au pilote, il est donc nécessaire de définir le nom de la table en conséquence sur le cas du nom de la table dans la base de données. - -Si vous avez des problèmes avec des encodages lors de l'utilisation d'Oracle, consultez le [FAQ](../../../faq/general.md#oracle-odbc-encodings) article. - -### Vulnérabilité connue de la fonctionnalité du dictionnaire ODBC {#known-vulnerability-of-the-odbc-dictionary-functionality} - -!!! attention "Attention" - Lors de la connexion à la base de données via le paramètre de connexion du pilote ODBC `Servername` peut être substitué. Dans ce cas, les valeurs de `USERNAME` et `PASSWORD` de `odbc.ini` sont envoyés au serveur distant et peuvent être compromis. - -**Exemple d'utilisation non sécurisée** - -Configurons unixODBC pour PostgreSQL. Le contenu de `/etc/odbc.ini`: - -``` text -[gregtest] -Driver = /usr/lib/psqlodbca.so -Servername = localhost -PORT = 5432 -DATABASE = test_db -#OPTION = 3 -USERNAME = test -PASSWORD = test -``` - -Si vous faites alors une requête telle que - -``` sql -SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); -``` - -Le pilote ODBC enverra des valeurs de `USERNAME` et `PASSWORD` de `odbc.ini` de `some-server.com`. - -### Exemple de connexion Postgresql {#example-of-connecting-postgresql} - -Ubuntu OS. - -Installation d'unixODBC et du pilote ODBC pour PostgreSQL: - -``` bash -$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql -``` - -Configuration `/etc/odbc.ini` (ou `~/.odbc.ini`): - -``` text - [DEFAULT] - Driver = myconnection - - [myconnection] - Description = PostgreSQL connection to my_db - Driver = PostgreSQL Unicode - Database = my_db - Servername = 127.0.0.1 - UserName = username - Password = password - Port = 5432 - Protocol = 9.3 - ReadOnly = No - RowVersioning = No - ShowSystemTables = No - ConnSettings = -``` - -La configuration du dictionnaire dans ClickHouse: - -``` xml - - - table_name - - - - - DSN=myconnection - postgresql_table
-
- - - 300 - 360 - - - - - - - id - - - some_column - UInt64 - 0 - - -
-
-``` - -ou - -``` sql -CREATE DICTIONARY table_name ( - id UInt64, - some_column UInt64 DEFAULT 0 -) -PRIMARY KEY id -SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) -LAYOUT(HASHED()) -LIFETIME(MIN 300 MAX 360) -``` - -Vous devrez peut-être modifier `odbc.ini` pour spécifier le chemin d'accès complet à la bibliothèque avec le conducteur `DRIVER=/usr/local/lib/psqlodbcw.so`. - -### Exemple de connexion à MS SQL Server {#example-of-connecting-ms-sql-server} - -Ubuntu OS. - -Installation du pilote: : - -``` bash -$ sudo apt-get install tdsodbc freetds-bin sqsh -``` - -Configuration du pilote: - -``` bash - $ cat /etc/freetds/freetds.conf - ... - - [MSSQL] - host = 192.168.56.101 - port = 1433 - tds version = 7.0 - client charset = UTF-8 - - $ cat /etc/odbcinst.ini - ... - - [FreeTDS] - Description = FreeTDS - Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so - Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so - FileUsage = 1 - UsageCount = 5 - - $ cat ~/.odbc.ini - ... - - [MSSQL] - Description = FreeTDS - Driver = FreeTDS - Servername = MSSQL - Database = test - UID = test - PWD = test - Port = 1433 -``` - -Configuration du dictionnaire dans ClickHouse: - -``` xml - - - test - - - dict
- DSN=MSSQL;UID=test;PWD=test -
- - - - 300 - 360 - - - - - - - - - k - - - s - String - - - -
-
-``` - -ou - -``` sql -CREATE DICTIONARY test ( - k UInt64, - s String DEFAULT '' -) -PRIMARY KEY k -SOURCE(ODBC(table 'dict' connection_string 'DSN=MSSQL;UID=test;PWD=test')) -LAYOUT(FLAT()) -LIFETIME(MIN 300 MAX 360) -``` - -## DBMS {#dbms} - -### Mysql {#dicts-external_dicts_dict_sources-mysql} - -Exemple de paramètres: - -``` xml - - - 3306 - clickhouse - qwerty - - example01-1 - 1 - - - example01-2 - 1 - - db_name - table_name
- id=10 - SQL_QUERY -
- -``` - -ou - -``` sql -SOURCE(MYSQL( - port 3306 - user 'clickhouse' - password 'qwerty' - replica(host 'example01-1' priority 1) - replica(host 'example01-2' priority 1) - db 'db_name' - table 'table_name' - where 'id=10' - invalidate_query 'SQL_QUERY' -)) -``` - -Définition des champs: - -- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). - -- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). - -- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). - -- `replica` – Section of replica configurations. There can be multiple sections. - - - `replica/host` – The MySQL host. - - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. - -- `db` – Name of the database. - -- `table` – Name of the table. - -- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause dans MySQL, par exemple, `id > 10 AND id < 20`. Paramètre facultatif. - -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Mise à jour des dictionnaires](external-dicts-dict-lifetime.md). - -MySQL peut être connecté sur un hôte local via des sockets. Pour ce faire, définissez `host` et `socket`. - -Exemple de paramètres: - -``` xml - - - localhost - /path/to/socket/file.sock - clickhouse - qwerty - db_name - table_name
- id=10 - SQL_QUERY -
- -``` - -ou - -``` sql -SOURCE(MYSQL( - host 'localhost' - socket '/path/to/socket/file.sock' - user 'clickhouse' - password 'qwerty' - db 'db_name' - table 'table_name' - where 'id=10' - invalidate_query 'SQL_QUERY' -)) -``` - -### ClickHouse {#dicts-external_dicts_dict_sources-clickhouse} - -Exemple de paramètres: - -``` xml - - - example01-01-1 - 9000 - default - - default - ids
- id=10 -
- -``` - -ou - -``` sql -SOURCE(CLICKHOUSE( - host 'example01-01-1' - port 9000 - user 'default' - password '' - db 'default' - table 'ids' - where 'id=10' -)) -``` - -Définition des champs: - -- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distribué](../../../engines/table-engines/special/distributed.md) table et entrez-le dans les configurations suivantes. -- `port` – The port on the ClickHouse server. -- `user` – Name of the ClickHouse user. -- `password` – Password of the ClickHouse user. -- `db` – Name of the database. -- `table` – Name of the table. -- `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Mise à jour des dictionnaires](external-dicts-dict-lifetime.md). - -### Mongodb {#dicts-external_dicts_dict_sources-mongodb} - -Exemple de paramètres: - -``` xml - - - localhost - 27017 - - - test - dictionary_source - - -``` - -ou - -``` sql -SOURCE(MONGO( - host 'localhost' - port 27017 - user '' - password '' - db 'test' - collection 'dictionary_source' -)) -``` - -Définition des champs: - -- `host` – The MongoDB host. -- `port` – The port on the MongoDB server. -- `user` – Name of the MongoDB user. -- `password` – Password of the MongoDB user. -- `db` – Name of the database. -- `collection` – Name of the collection. - -### Redis {#dicts-external_dicts_dict_sources-redis} - -Exemple de paramètres: - -``` xml - - - localhost - 6379 - simple - 0 - - -``` - -ou - -``` sql -SOURCE(REDIS( - host 'localhost' - port 6379 - storage_type 'simple' - db_index 0 -)) -``` - -Définition des champs: - -- `host` – The Redis host. -- `port` – The port on the Redis server. -- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` est pour les sources simples et pour les sources à clé unique hachées, `hash_map` est pour les sources hachées avec deux clés. Les sources À Distance et les sources de cache à clé complexe ne sont pas prises en charge. Peut être omis, la valeur par défaut est `simple`. -- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_sources/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md deleted file mode 100644 index 1b9215baf06..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ /dev/null @@ -1,175 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: "Cl\xE9 et champs du dictionnaire" ---- - -# Clé et champs du dictionnaire {#dictionary-key-and-fields} - -Le `` la clause décrit la clé du dictionnaire et les champs disponibles pour les requêtes. - -Description XML: - -``` xml - - - - Id - - - - - - - ... - - - -``` - -Les attributs sont décrits dans les éléments: - -- `` — [La colonne de la clé](external-dicts-dict-structure.md#ext_dict_structure-key). -- `` — [Colonne de données](external-dicts-dict-structure.md#ext_dict_structure-attributes). Il peut y avoir un certain nombre d'attributs. - -Requête DDL: - -``` sql -CREATE DICTIONARY dict_name ( - Id UInt64, - -- attributes -) -PRIMARY KEY Id -... 
-``` - -Les attributs sont décrits dans le corps de la requête: - -- `PRIMARY KEY` — [La colonne de la clé](external-dicts-dict-structure.md#ext_dict_structure-key) -- `AttrName AttrType` — [Colonne de données](external-dicts-dict-structure.md#ext_dict_structure-attributes). Il peut y avoir un certain nombre d'attributs. - -## Clé {#ext_dict_structure-key} - -ClickHouse prend en charge les types de clés suivants: - -- Touche numérique. `UInt64`. Défini dans le `` tag ou en utilisant `PRIMARY KEY` mot. -- Clé Composite. Ensemble de valeurs de types différents. Défini dans la balise `` ou `PRIMARY KEY` mot. - -Une structure xml peut contenir `` ou ``. DDL-requête doit contenir unique `PRIMARY KEY`. - -!!! warning "Avertissement" - Vous ne devez pas décrire clé comme un attribut. - -### Touche Numérique {#ext_dict-numeric-key} - -Type: `UInt64`. - -Exemple de Configuration: - -``` xml - - Id - -``` - -Champs de Configuration: - -- `name` – The name of the column with keys. - -Pour DDL-requête: - -``` sql -CREATE DICTIONARY ( - Id UInt64, - ... -) -PRIMARY KEY Id -... -``` - -- `PRIMARY KEY` – The name of the column with keys. - -### Clé Composite {#composite-key} - -La clé peut être un `tuple` de tous les types de champs. Le [disposition](external-dicts-dict-layout.md) dans ce cas, doit être `complex_key_hashed` ou `complex_key_cache`. - -!!! tip "Conseil" - Une clé composite peut être constitué d'un seul élément. Cela permet d'utiliser une chaîne comme clé, par exemple. - -La structure de clé est définie dans l'élément ``. Les principaux champs sont spécifiés dans le même format que le dictionnaire [attribut](external-dicts-dict-structure.md). Exemple: - -``` xml - - - - field1 - String - - - field2 - UInt32 - - ... - -... -``` - -ou - -``` sql -CREATE DICTIONARY ( - field1 String, - field2 String - ... -) -PRIMARY KEY field1, field2 -... -``` - -Pour une requête à l' `dictGet*` fonction, un tuple est passé comme clé. Exemple: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. - -## Attribut {#ext_dict_structure-attributes} - -Exemple de Configuration: - -``` xml - - ... - - Name - ClickHouseDataType - - rand64() - true - true - true - - -``` - -ou - -``` sql -CREATE DICTIONARY somename ( - Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID -) -``` - -Champs de Configuration: - -| Balise | Description | Requis | -|------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------| -| `name` | Nom de la colonne. | Oui | -| `type` | Type de données ClickHouse.
ClickHouse tente de convertir la valeur du dictionnaire vers le type de données spécifié. Par exemple, pour MySQL, le champ peut être `TEXT`, `VARCHAR`, ou `BLOB` dans la table source MySQL, mais il peut être téléchargé comme `String` à ClickHouse.<br/>[Nullable](../../../sql-reference/data-types/nullable.md) n'est pas pris en charge. | Oui | -| `null_value` | Valeur par défaut pour un élément inexistant.<br/>Dans l'exemple, c'est une chaîne vide. Vous ne pouvez pas utiliser `NULL` dans ce domaine. | Oui | -| `expression` | [Expression](../../syntax.md#syntax-expressions) que ClickHouse s'exécute sur la valeur.<br/>L'expression peut être un nom de colonne dans la base de données SQL distante. Ainsi, vous pouvez l'utiliser pour créer un alias pour la colonne à distance.<br/><br/>Valeur par défaut: aucune expression. | Aucun | -| `hierarchical` | Si `true`, l'attribut contient la valeur d'un parent clé de la clé actuelle. Voir [Dictionnaires Hiérarchiques](external-dicts-dict-hierarchical.md).<br/><br/>Valeur par défaut: `false`. | Aucun | -| `injective` | Indicateur qui indique si le `id -> attribute` l'image est [injective](https://en.wikipedia.org/wiki/Injective_function).<br/>Si `true`, ClickHouse peut automatiquement placer après le `GROUP BY` clause les requêtes aux dictionnaires avec injection. Habituellement, il réduit considérablement le montant de ces demandes.<br/><br/>Valeur par défaut: `false`. | Aucun | -| `is_object_id` | Indicateur qui indique si la requête est exécutée pour un document MongoDB par `ObjectID`.<br/>
Valeur par défaut: `false`. | Aucun | - -## Voir Aussi {#see-also} - -- [Fonctions pour travailler avec des dictionnaires externes](../../../sql-reference/functions/ext-dict-functions.md). - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md deleted file mode 100644 index 3bb8884df2f..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: Configuration D'un dictionnaire externe ---- - -# Configuration D'un dictionnaire externe {#dicts-external-dicts-dict} - -Si dictionary est configuré à l'aide d'un fichier xml, than dictionary configuration a la structure suivante: - -``` xml - - dict_name - - - - - - - - - - - - - - - - - -``` - -Correspondant [DDL-requête](../../statements/create.md#create-dictionary-query) a la structure suivante: - -``` sql -CREATE DICTIONARY dict_name -( - ... -- attributes -) -PRIMARY KEY ... -- complex or single key configuration -SOURCE(...) -- Source configuration -LAYOUT(...) -- Memory layout configuration -LIFETIME(...) -- Lifetime of dictionary in memory -``` - -- `name` – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. -- [source](external-dicts-dict-sources.md) — Source of the dictionary. -- [disposition](external-dicts-dict-layout.md) — Dictionary layout in memory. -- [structure](external-dicts-dict-structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. -- [vie](external-dicts-dict-lifetime.md) — Frequency of dictionary updates. - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts.md deleted file mode 100644 index d68b7a7f112..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: "Description G\xE9n\xE9rale" ---- - -# Dictionnaires Externes {#dicts-external-dicts} - -Vous pouvez ajouter vos propres dictionnaires à partir de diverses sources de données. La source de données d'un dictionnaire peut être un texte local ou un fichier exécutable, une ressource HTTP(S) ou un autre SGBD. Pour plus d'informations, voir “[Sources pour les dictionnaires externes](external-dicts-dict-sources.md)”. - -ClickHouse: - -- Stocke entièrement ou partiellement les dictionnaires en RAM. -- Met à jour périodiquement les dictionnaires et charge dynamiquement les valeurs manquantes. En d'autres mots, les dictionnaires peuvent être chargés dynamiquement. -- Permet de créer des dictionnaires externes avec des fichiers xml ou [Les requêtes DDL](../../statements/create.md#create-dictionary-query). - -La configuration des dictionnaires externes peut être située dans un ou plusieurs fichiers xml. 
Le chemin d'accès à la configuration spécifiée dans le [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) paramètre. - -Les dictionnaires peuvent être chargés au démarrage du serveur ou à la première utilisation, en fonction [dictionaries_lazy_load](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load) paramètre. - -Le [dictionnaire](../../../operations/system-tables.md#system_tables-dictionaries) la table système contient des informations sur les dictionnaires configurés sur le serveur. Pour chaque dictionnaire, vous pouvez y trouver: - -- Statut du dictionnaire. -- Paramètres de Configuration. -- Des métriques telles que la quantité de RAM allouée pour le dictionnaire ou un certain nombre de requêtes depuis que le dictionnaire a été chargé avec succès. - -Le fichier de configuration du dictionnaire a le format suivant: - -``` xml - - An optional element with any content. Ignored by the ClickHouse server. - - - /etc/metrika.xml - - - - - - - - -``` - -Vous pouvez [configurer](external-dicts-dict.md) le nombre de dictionnaires dans le même fichier. - -[Requêtes DDL pour les dictionnaires](../../statements/create.md#create-dictionary-query) ne nécessite aucun enregistrement supplémentaire dans la configuration du serveur. Ils permettent de travailler avec des dictionnaires en tant qu'entités de première classe, comme des tables ou des vues. - -!!! attention "Attention" - Vous pouvez convertir les valeurs pour un petit dictionnaire en le décrivant dans un `SELECT` requête (voir la [transformer](../../../sql-reference/functions/other-functions.md) fonction). Cette fonctionnalité n'est pas liée aux dictionnaires externes. - -## Voir Aussi {#ext-dicts-see-also} - -- [Configuration D'un dictionnaire externe](external-dicts-dict.md) -- [Stockage des dictionnaires en mémoire](external-dicts-dict-layout.md) -- [Mises À Jour Du Dictionnaire](external-dicts-dict-lifetime.md) -- [Sources de dictionnaires externes](external-dicts-dict-sources.md) -- [Clé et champs du dictionnaire](external-dicts-dict-structure.md) -- [Fonctions pour travailler avec des dictionnaires externes](../../../sql-reference/functions/ext-dict-functions.md) - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts/) diff --git a/docs/fr/sql-reference/dictionaries/external-dictionaries/index.md b/docs/fr/sql-reference/dictionaries/external-dictionaries/index.md deleted file mode 100644 index 109220205dd..00000000000 --- a/docs/fr/sql-reference/dictionaries/external-dictionaries/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Dictionnaires Externes -toc_priority: 37 ---- - - diff --git a/docs/fr/sql-reference/dictionaries/index.md b/docs/fr/sql-reference/dictionaries/index.md deleted file mode 100644 index 3ec31085cc5..00000000000 --- a/docs/fr/sql-reference/dictionaries/index.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Dictionnaire -toc_priority: 35 -toc_title: Introduction ---- - -# Dictionnaire {#dictionaries} - -Un dictionnaire est une cartographie (`key -> attributes`) qui est pratique pour différents types de listes de référence. 
- -ClickHouse prend en charge des fonctions spéciales pour travailler avec des dictionnaires qui peuvent être utilisés dans les requêtes. Il est plus facile et plus efficace d'utiliser des dictionnaires avec des fonctions que par une `JOIN` avec des tableaux de référence. - -[NULL](../../sql-reference/syntax.md#null-literal) les valeurs ne peuvent pas être stockées dans un dictionnaire. - -Supports ClickHouse: - -- [Construit-dans les dictionnaires](internal-dicts.md#internal_dicts) avec un [ensemble de fonctions](../../sql-reference/functions/ym-dict-functions.md). -- [Plug-in (externe) dictionnaires](external-dictionaries/external-dicts.md#dicts-external-dicts) avec un [ensemble de fonctions](../../sql-reference/functions/ext-dict-functions.md). - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/) diff --git a/docs/fr/sql-reference/dictionaries/internal-dicts.md b/docs/fr/sql-reference/dictionaries/internal-dicts.md deleted file mode 100644 index 607936031a1..00000000000 --- a/docs/fr/sql-reference/dictionaries/internal-dicts.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: Dictionnaires Internes ---- - -# Dictionnaires Internes {#internal_dicts} - -ClickHouse contient une fonction intégrée pour travailler avec une géobase. - -Cela vous permet de: - -- Utilisez L'ID d'une région pour obtenir son nom dans la langue souhaitée. -- Utilisez L'ID d'une région pour obtenir L'ID d'une ville, d'une région, d'un district fédéral, d'un pays ou d'un continent. -- Vérifiez si une région fait partie d'une autre région. -- Obtenez une chaîne de régions parentes. - -Toutes les fonctions prennent en charge “translocality,” la capacité d'utiliser simultanément différentes perspectives sur la propriété de la région. Pour plus d'informations, consultez la section “Functions for working with Yandex.Metrica dictionaries”. - -Les dictionnaires internes sont désactivés dans le package par défaut. -Pour les activer, décommentez les paramètres `path_to_regions_hierarchy_file` et `path_to_regions_names_files` dans le fichier de configuration du serveur. - -La géobase est chargée à partir de fichiers texte. - -Place de la `regions_hierarchy*.txt` les fichiers dans le `path_to_regions_hierarchy_file` répertoire. Ce paramètre de configuration doit contenir le chemin `regions_hierarchy.txt` fichier (la hiérarchie régionale par défaut), et les autres fichiers (`regions_hierarchy_ua.txt`) doit être situé dans le même répertoire. - -Mettre le `regions_names_*.txt` les fichiers dans le `path_to_regions_names_files` répertoire. - -Vous pouvez également créer ces fichiers vous-même. Le format de fichier est le suivant: - -`regions_hierarchy*.txt`: TabSeparated (pas d'en-tête), colonnes: - -- région de l'ID (`UInt32`) -- ID de région parent (`UInt32`) -- type de région (`UInt8`): 1-continent, 3-pays, 4-district fédéral, 5-région, 6-ville; les autres types n'ont pas de valeurs -- population (`UInt32`) — optional column - -`regions_names_*.txt`: TabSeparated (pas d'en-tête), colonnes: - -- région de l'ID (`UInt32`) -- nom de la région (`String`) — Can't contain tabs or line feeds, even escaped ones. - -Un tableau plat est utilisé pour stocker dans la RAM. Pour cette raison, les ID ne devraient pas dépasser un million. - -Les dictionnaires peuvent être mis à jour sans redémarrer le serveur. Cependant, l'ensemble des dictionnaires n'est pas mis à jour. 
-Pour les mises à jour, les temps de modification du fichier sont vérifiés. Si un fichier a été modifié, le dictionnaire est mis à jour. -L'intervalle de vérification des modifications est configuré dans le `builtin_dictionaries_reload_interval` paramètre. -Les mises à jour du dictionnaire (autres que le chargement lors de la première utilisation) ne bloquent pas les requêtes. Lors des mises à jour, les requêtes utilisent les anciennes versions des dictionnaires. Si une erreur se produit pendant une mise à jour, l'erreur est écrite dans le journal du serveur et les requêtes continuent d'utiliser l'ancienne version des dictionnaires. - -Nous vous recommandons de mettre à jour périodiquement les dictionnaires avec la géobase. Lors d'une mise à jour, générez de nouveaux fichiers et écrivez-les dans un emplacement séparé. Lorsque tout est prêt, renommez - les en fichiers utilisés par le serveur. - -Il existe également des fonctions pour travailler avec les identifiants du système d'exploitation et Yandex.Moteurs de recherche Metrica, mais ils ne devraient pas être utilisés. - -[Article Original](https://clickhouse.tech/docs/en/query_language/dicts/internal_dicts/) diff --git a/docs/fr/sql-reference/functions/arithmetic-functions.md b/docs/fr/sql-reference/functions/arithmetic-functions.md deleted file mode 100644 index c35fb104236..00000000000 --- a/docs/fr/sql-reference/functions/arithmetic-functions.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 35 -toc_title: "Arithm\xE9tique" ---- - -# Fonctions Arithmétiques {#arithmetic-functions} - -Pour toutes les fonctions arithmétiques, le type de résultat est calculé comme le plus petit type de nombre dans lequel le résultat correspond, s'il existe un tel type. Le minimum est pris simultanément sur la base du nombre de bits, s'il est signé, et s'il flotte. S'il n'y a pas assez de bits, le type de bits le plus élevé est pris. - -Exemple: - -``` sql -SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) -``` - -``` text -┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ -│ UInt8 │ UInt16 │ UInt32 │ UInt64 │ -└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ -``` - -Les fonctions arithmétiques fonctionnent pour n'importe quelle paire de types de UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32 ou Float64. - -Le débordement est produit de la même manière qu'en C++. - -## plus (A, B), opérateur a + b {#plusa-b-a-b-operator} - -Calcule la somme des nombres. -Vous pouvez également ajouter des nombres entiers avec une date ou la date et l'heure. Dans le cas d'une date, Ajouter un entier signifie ajouter le nombre de jours correspondant. Pour une date avec l'heure, cela signifie ajouter le nombre de secondes correspondant. - -## moins (A, B), opérateur a - b {#minusa-b-a-b-operator} - -Calcule la différence. Le résultat est toujours signé. - -You can also calculate integer numbers from a date or date with time. The idea is the same – see above for ‘plus’. - -## la multiplication(a, b), a \* et b \* de l'opérateur {#multiplya-b-a-b-operator} - -Calcule le produit des nombres. - -## diviser (A, B), opérateur a / b {#dividea-b-a-b-operator} - -Calcule le quotient des nombres. Le type de résultat est toujours un type à virgule flottante. -Il n'est pas de division entière. 
Pour la division entière, utilisez le ‘intDiv’ fonction. -En divisant par zéro vous obtenez ‘inf’, ‘-inf’, ou ‘nan’. - -## intDiv (a, b) {#intdiva-b} - -Calcule le quotient des nombres. Divise en entiers, arrondi vers le bas (par la valeur absolue). -Une exception est levée en divisant par zéro ou en divisant un nombre négatif minimal par moins un. - -## intDivOrZero(a, b) {#intdivorzeroa-b} - -Diffère de ‘intDiv’ en ce sens qu'il renvoie zéro en divisant par zéro ou en divisant un nombre négatif minimal par moins un. - -## opérateur modulo(A, B), A % B {#moduloa-b-a-b-operator} - -Calcule le reste après la division. -Si les arguments sont des nombres à virgule flottante, ils sont pré-convertis en entiers en supprimant la partie décimale. -Le reste est pris dans le même sens qu'en C++. La division tronquée est utilisée pour les nombres négatifs. -Une exception est levée en divisant par zéro ou en divisant un nombre négatif minimal par moins un. - -## moduloOrZero (a, b) {#moduloorzeroa-b} - -Diffère de ‘modulo’ en ce sens qu'il renvoie zéro lorsque le diviseur est nul. - -## annuler (a), - un opérateur {#negatea-a-operator} - -Calcule un nombre avec le signe inverse. Le résultat est toujours signé. - -## abs(un) {#arithm_func-abs} - -Calcule la valeur absolue d'un nombre (un). Autrement dit, si un \< 0, Il renvoie-A. pour les types non signés, il ne fait rien. Pour les types entiers signés, il renvoie un nombre non signé. - -## pgcd(a, b) {#gcda-b} - -Renvoie le plus grand diviseur commun des nombres. -Une exception est levée en divisant par zéro ou en divisant un nombre négatif minimal par moins un. - -## ppcm(a, b) {#lcma-b} - -Renvoie le multiple le moins commun des nombres. -Une exception est levée en divisant par zéro ou en divisant un nombre négatif minimal par moins un. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/arithmetic_functions/) diff --git a/docs/fr/sql-reference/functions/array-functions.md b/docs/fr/sql-reference/functions/array-functions.md deleted file mode 100644 index 40568841372..00000000000 --- a/docs/fr/sql-reference/functions/array-functions.md +++ /dev/null @@ -1,1061 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 46 -toc_title: Travailler avec des tableaux ---- - -# Fonctions pour travailler avec des tableaux {#functions-for-working-with-arrays} - -## vide {#function-empty} - -Retourne 1 pour un tableau vide, ou 0 pour un non-vide. -Le type de résultat est UInt8. -La fonction fonctionne également pour les chaînes. - -## notEmpty {#function-notempty} - -Retourne 0 pour un tableau vide, ou 1 pour un non-vide. -Le type de résultat est UInt8. -La fonction fonctionne également pour les chaînes. - -## longueur {#array_functions-length} - -Retourne le nombre d'éléments dans le tableau. -Le type de résultat est UInt64. -La fonction fonctionne également pour les chaînes. - -## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 {#emptyarrayuint8-emptyarrayuint16-emptyarrayuint32-emptyarrayuint64} - -## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 {#emptyarrayint8-emptyarrayint16-emptyarrayint32-emptyarrayint64} - -## emptyArrayFloat32, emptyArrayFloat64 {#emptyarrayfloat32-emptyarrayfloat64} - -## emptyArrayDate, emptyArrayDateTime {#emptyarraydate-emptyarraydatetime} - -## emptyArrayString {#emptyarraystring} - -Accepte zéro argument et renvoie un tableau vide du type approprié. 
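For instance, a minimal query confirming that these constructors return empty arrays of the stated types:

``` sql
SELECT
    emptyArrayUInt8() AS a,                    -- []
    emptyArrayString() AS b,                   -- []
    toTypeName(emptyArrayUInt8()) AS type_a,   -- Array(UInt8)
    toTypeName(emptyArrayString()) AS type_b   -- Array(String)
```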
- -## emptyArrayToSingle {#emptyarraytosingle} - -Accepte un tableau vide et renvoie un élément de tableau qui est égal à la valeur par défaut. - -## plage (fin), Plage(début, fin \[, étape\]) {#rangeend-rangestart-end-step} - -Retourne un tableau de nombres du début à la fin-1 par étape. -Si l'argument `start` n'est pas spécifié, la valeur par défaut est 0. -Si l'argument `step` n'est pas spécifié, la valeur par défaut est 1. -Il se comporte presque comme pythonic `range`. Mais la différence est que tous les types d'arguments doivent être `UInt` nombre. -Juste au cas où, une exception est levée si des tableaux d'une longueur totale de plus de 100 000 000 d'éléments sont créés dans un bloc de données. - -## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1} - -Crée un tableau à partir des arguments de la fonction. -Les arguments doivent être des constantes et avoir des types qui ont le plus petit type commun. Au moins un argument doit être passé, sinon il n'est pas clair quel type de tableau créer. Qui est, vous ne pouvez pas utiliser cette fonction pour créer un tableau vide (pour ce faire, utilisez la ‘emptyArray\*’ la fonction décrite ci-dessus). -Retourne un ‘Array(T)’ type de résultat, où ‘T’ est le plus petit type commun parmi les arguments passés. - -## arrayConcat {#arrayconcat} - -Combine des tableaux passés comme arguments. - -``` sql -arrayConcat(arrays) -``` - -**Paramètre** - -- `arrays` – Arbitrary number of arguments of [Tableau](../../sql-reference/data-types/array.md) type. - **Exemple** - - - -``` sql -SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res -``` - -``` text -┌─res───────────┐ -│ [1,2,3,4,5,6] │ -└───────────────┘ -``` - -## arrayElement(arr, n), opérateur arr\[n\] {#arrayelementarr-n-operator-arrn} - -Récupérer l'élément avec l'index `n` à partir du tableau `arr`. `n` doit être n'importe quel type entier. -Les index dans un tableau commencent à partir d'un. -Les index négatifs sont pris en charge. Dans ce cas, il sélectionne l'élément correspondant numérotées à partir de la fin. Exemple, `arr[-1]` est le dernier élément du tableau. - -Si l'index est en dehors des limites d'un tableau, il renvoie une valeur (0 pour les nombres, une chaîne vide pour les cordes, etc.), sauf pour le cas avec un tableau non constant et un index constant 0 (dans ce cas, il y aura une erreur `Array indices are 1-based`). - -## a (arr, elem) {#hasarr-elem} - -Vérifie si le ‘arr’ tableau a la ‘elem’ élément. -Retourne 0 si l'élément n'est pas dans le tableau, ou 1 si elle l'est. - -`NULL` est traitée comme une valeur. - -``` sql -SELECT has([1, 2, NULL], NULL) -``` - -``` text -┌─has([1, 2, NULL], NULL)─┐ -│ 1 │ -└─────────────────────────┘ -``` - -## hasAll {#hasall} - -Vérifie si un tableau est un sous-ensemble de l'autre. - -``` sql -hasAll(set, subset) -``` - -**Paramètre** - -- `set` – Array of any type with a set of elements. -- `subset` – Array of any type with elements that should be tested to be a subset of `set`. - -**Les valeurs de retour** - -- `1`, si `set` contient tous les éléments de `subset`. -- `0`, autrement. - -**Propriétés particulières** - -- Un tableau vide est un sous-ensemble d'un tableau quelconque. -- `Null` traitée comme une valeur. -- Ordre des valeurs dans les deux tableaux n'a pas d'importance. - -**Exemple** - -`SELECT hasAll([], [])` retours 1. - -`SELECT hasAll([1, Null], [Null])` retours 1. - -`SELECT hasAll([1.0, 2, 3, 4], [1, 3])` retours 1. - -`SELECT hasAll(['a', 'b'], ['a'])` retours 1. - -`SELECT hasAll([1], ['a'])` renvoie 0. 
- -`SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [3, 5]])` renvoie 0. - -## hasAny {#hasany} - -Vérifie si deux tableaux ont une intersection par certains éléments. - -``` sql -hasAny(array1, array2) -``` - -**Paramètre** - -- `array1` – Array of any type with a set of elements. -- `array2` – Array of any type with a set of elements. - -**Les valeurs de retour** - -- `1`, si `array1` et `array2` avoir un élément similaire au moins. -- `0`, autrement. - -**Propriétés particulières** - -- `Null` traitée comme une valeur. -- Ordre des valeurs dans les deux tableaux n'a pas d'importance. - -**Exemple** - -`SELECT hasAny([1], [])` retourner `0`. - -`SELECT hasAny([Null], [Null, 1])` retourner `1`. - -`SELECT hasAny([-128, 1., 512], [1])` retourner `1`. - -`SELECT hasAny([[1, 2], [3, 4]], ['a', 'c'])` retourner `0`. - -`SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [1, 2]])` retourner `1`. - -## indexOf (arr, x) {#indexofarr-x} - -Renvoie l'index de la première ‘x’ élément (à partir de 1) s'il est dans le tableau, ou 0 s'il ne l'est pas. - -Exemple: - -``` sql -SELECT indexOf([1, 3, NULL, NULL], NULL) -``` - -``` text -┌─indexOf([1, 3, NULL, NULL], NULL)─┐ -│ 3 │ -└───────────────────────────────────┘ -``` - -Ensemble d'éléments de `NULL` sont traités comme des valeurs normales. - -## countEqual (arr, x) {#countequalarr-x} - -Renvoie le nombre d'éléments dans le tableau égal à X. équivalent à arrayCount (elem - \> elem = x, arr). - -`NULL` les éléments sont traités comme des valeurs distinctes. - -Exemple: - -``` sql -SELECT countEqual([1, 2, NULL, NULL], NULL) -``` - -``` text -┌─countEqual([1, 2, NULL, NULL], NULL)─┐ -│ 2 │ -└──────────────────────────────────────┘ -``` - -## arrayEnumerate (arr) {#array_functions-arrayenumerate} - -Returns the array \[1, 2, 3, …, length (arr) \] - -Cette fonction est normalement utilisée avec ARRAY JOIN. Il permet de compter quelque chose une seule fois pour chaque tableau après l'application de la jointure de tableau. Exemple: - -``` sql -SELECT - count() AS Reaches, - countIf(num = 1) AS Hits -FROM test.hits -ARRAY JOIN - GoalsReached, - arrayEnumerate(GoalsReached) AS num -WHERE CounterID = 160656 -LIMIT 10 -``` - -``` text -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -``` - -Dans cet exemple, Reaches est le nombre de conversions (les chaînes reçues après l'application de la jointure de tableau), et Hits est le nombre de pages vues (chaînes avant la jointure de tableau). Dans ce cas particulier, vous pouvez obtenir le même résultat dans une voie plus facile: - -``` sql -SELECT - sum(length(GoalsReached)) AS Reaches, - count() AS Hits -FROM test.hits -WHERE (CounterID = 160656) AND notEmpty(GoalsReached) -``` - -``` text -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -``` - -Cette fonction peut également être utilisée dans les fonctions d'ordre supérieur. Par exemple, vous pouvez l'utiliser pour obtenir les indices de tableau pour les éléments qui correspondent à une condition. - -## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} - -Renvoie un tableau de la même taille que le tableau source, indiquant pour chaque élément Quelle est sa position parmi les éléments de même valeur. -Par exemple: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. - -Cette fonction est utile lors de L'utilisation de la jointure de tableau et de l'agrégation d'éléments de tableau. 
-Exemple: - -``` sql -SELECT - Goals.ID AS GoalID, - sum(Sign) AS Reaches, - sumIf(Sign, num = 1) AS Visits -FROM test.visits -ARRAY JOIN - Goals, - arrayEnumerateUniq(Goals.ID) AS num -WHERE CounterID = 160656 -GROUP BY GoalID -ORDER BY Reaches DESC -LIMIT 10 -``` - -``` text -┌──GoalID─┬─Reaches─┬─Visits─┐ -│ 53225 │ 3214 │ 1097 │ -│ 2825062 │ 3188 │ 1097 │ -│ 56600 │ 2803 │ 488 │ -│ 1989037 │ 2401 │ 365 │ -│ 2830064 │ 2396 │ 910 │ -│ 1113562 │ 2372 │ 373 │ -│ 3270895 │ 2262 │ 812 │ -│ 1084657 │ 2262 │ 345 │ -│ 56599 │ 2260 │ 799 │ -│ 3271094 │ 2256 │ 812 │ -└─────────┴─────────┴────────┘ -``` - -Dans cet exemple, chaque ID d'objectif a un calcul du nombre de conversions (chaque élément de la structure de données imbriquées objectifs est un objectif atteint, que nous appelons une conversion) et le nombre de sessions. Sans array JOIN, nous aurions compté le nombre de sessions comme sum(signe). Mais dans ce cas particulier, les lignes ont été multipliées par la structure des objectifs imbriqués, donc pour compter chaque session une fois après cela, nous appliquons une condition à la valeur de arrayEnumerateUniq(Goals.ID) fonction. - -La fonction arrayEnumerateUniq peut prendre plusieurs tableaux de la même taille que les arguments. Dans ce cas, l'unicité est considérée pour les tuples d'éléments dans les mêmes positions dans tous les tableaux. - -``` sql -SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res -``` - -``` text -┌─res───────────┐ -│ [1,2,1,1,2,1] │ -└───────────────┘ -``` - -Ceci est nécessaire lors de L'utilisation de Array JOIN avec une structure de données imbriquée et une agrégation supplémentaire entre plusieurs éléments de cette structure. - -## arrayPopBack {#arraypopback} - -Supprime le dernier élément du tableau. - -``` sql -arrayPopBack(array) -``` - -**Paramètre** - -- `array` – Array. - -**Exemple** - -``` sql -SELECT arrayPopBack([1, 2, 3]) AS res -``` - -``` text -┌─res───┐ -│ [1,2] │ -└───────┘ -``` - -## arrayPopFront {#arraypopfront} - -Supprime le premier élément de la matrice. - -``` sql -arrayPopFront(array) -``` - -**Paramètre** - -- `array` – Array. - -**Exemple** - -``` sql -SELECT arrayPopFront([1, 2, 3]) AS res -``` - -``` text -┌─res───┐ -│ [2,3] │ -└───────┘ -``` - -## arrayPushBack {#arraypushback} - -Ajoute un élément à la fin du tableau. - -``` sql -arrayPushBack(array, single_value) -``` - -**Paramètre** - -- `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type pour le type de données du tableau. Pour plus d'informations sur les types de données dans ClickHouse, voir “[Types de données](../../sql-reference/data-types/index.md#data_types)”. Peut être `NULL`. La fonction ajoute un `NULL` tableau, et le type d'éléments de tableau convertit en `Nullable`. - -**Exemple** - -``` sql -SELECT arrayPushBack(['a'], 'b') AS res -``` - -``` text -┌─res───────┐ -│ ['a','b'] │ -└───────────┘ -``` - -## arrayPushFront {#arraypushfront} - -Ajoute un élément au début du tableau. - -``` sql -arrayPushFront(array, single_value) -``` - -**Paramètre** - -- `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type pour le type de données du tableau. 
Pour plus d'informations sur les types de données dans ClickHouse, voir “[Types de données](../../sql-reference/data-types/index.md#data_types)”. Peut être `NULL`. La fonction ajoute un `NULL` tableau, et le type d'éléments de tableau convertit en `Nullable`. - -**Exemple** - -``` sql -SELECT arrayPushFront(['b'], 'a') AS res -``` - -``` text -┌─res───────┐ -│ ['a','b'] │ -└───────────┘ -``` - -## arrayResize {#arrayresize} - -Les changements de la longueur du tableau. - -``` sql -arrayResize(array, size[, extender]) -``` - -**Paramètre:** - -- `array` — Array. -- `size` — Required length of the array. - - Si `size` est inférieure à la taille d'origine du tableau, le tableau est tronqué à partir de la droite. -- Si `size` est plus grande que la taille initiale du tableau, le tableau est étendu vers la droite avec `extender` valeurs ou valeurs par défaut pour le type de données des éléments du tableau. -- `extender` — Value for extending an array. Can be `NULL`. - -**Valeur renvoyée:** - -Un tableau de longueur `size`. - -**Exemples d'appels** - -``` sql -SELECT arrayResize([1], 3) -``` - -``` text -┌─arrayResize([1], 3)─┐ -│ [1,0,0] │ -└─────────────────────┘ -``` - -``` sql -SELECT arrayResize([1], 3, NULL) -``` - -``` text -┌─arrayResize([1], 3, NULL)─┐ -│ [1,NULL,NULL] │ -└───────────────────────────┘ -``` - -## arraySlice {#arrayslice} - -Retourne une tranche du tableau. - -``` sql -arraySlice(array, offset[, length]) -``` - -**Paramètre** - -- `array` – Array of data. -- `offset` – Indent from the edge of the array. A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the array items begins with 1. -- `length` - La longueur de la nécessaire tranche. Si vous spécifiez une valeur négative, la fonction renvoie un ouvert tranche `[offset, array_length - length)`. Si vous omettez la valeur, la fonction renvoie la tranche `[offset, the_end_of_array]`. - -**Exemple** - -``` sql -SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res -``` - -``` text -┌─res────────┐ -│ [2,NULL,4] │ -└────────────┘ -``` - -Éléments de tableau définis sur `NULL` sont traités comme des valeurs normales. - -## arraySort(\[func,\] arr, …) {#array_functions-sort} - -Trie les éléments de la `arr` tableau dans l'ordre croissant. Si l' `func` fonction est spécifiée, l'ordre de tri est déterminé par le résultat de la `func` fonction appliquée aux éléments du tableau. Si `func` accepte plusieurs arguments, le `arraySort` la fonction est passé plusieurs tableaux que les arguments de `func` correspond à. Des exemples détaillés sont présentés à la fin de `arraySort` Description. - -Exemple de tri de valeurs entières: - -``` sql -SELECT arraySort([1, 3, 3, 0]); -``` - -``` text -┌─arraySort([1, 3, 3, 0])─┐ -│ [0,1,3,3] │ -└─────────────────────────┘ -``` - -Exemple de tri des valeurs de chaîne: - -``` sql -SELECT arraySort(['hello', 'world', '!']); -``` - -``` text -┌─arraySort(['hello', 'world', '!'])─┐ -│ ['!','hello','world'] │ -└────────────────────────────────────┘ -``` - -Considérez l'ordre de tri suivant pour le `NULL`, `NaN` et `Inf` valeur: - -``` sql -SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); -``` - -``` text -┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ -│ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ -└───────────────────────────────────────────────────────────┘ -``` - -- `-Inf` les valeurs sont d'abord dans le tableau. -- `NULL` les valeurs sont les derniers dans le tableau. -- `NaN` les valeurs sont juste avant `NULL`. 
-- `Inf` les valeurs sont juste avant `NaN`. - -Notez que `arraySort` est un [fonction d'ordre supérieur](higher-order-functions.md). Vous pouvez passer d'une fonction lambda comme premier argument. Dans ce cas, l'ordre de classement est déterminé par le résultat de la fonction lambda appliquée aux éléments de la matrice. - -Considérons l'exemple suivant: - -``` sql -SELECT arraySort((x) -> -x, [1, 2, 3]) as res; -``` - -``` text -┌─res─────┐ -│ [3,2,1] │ -└─────────┘ -``` - -For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` fonction trie les touches dans l'ordre croissant, le résultat est \[3, 2, 1\]. Ainsi, l' `(x) –> -x` fonction lambda définit le [l'ordre décroissant](#array_functions-reverse-sort) dans un tri. - -La fonction lambda peut accepter plusieurs arguments. Dans ce cas, vous avez besoin de passer l' `arraySort` fonction plusieurs tableaux de longueur identique à laquelle correspondront les arguments de la fonction lambda. Le tableau résultant sera composé d'éléments du premier tableau d'entrée; les éléments du(des) Tableau (s) d'entrée suivant (s) spécifient les clés de tri. Exemple: - -``` sql -SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; -``` - -``` text -┌─res────────────────┐ -│ ['world', 'hello'] │ -└────────────────────┘ -``` - -Ici, les éléments qui sont passés dans le deuxième tableau (\[2, 1\]) définissent une clé de tri pour l'élément correspondant à partir du tableau source (\[‘hello’, ‘world’\]), qui est, \[‘hello’ –\> 2, ‘world’ –\> 1\]. Since the lambda function doesn't use `x`, les valeurs réelles du tableau source n'affectent pas l'ordre dans le résultat. Si, ‘hello’ sera le deuxième élément du résultat, et ‘world’ sera le premier. - -D'autres exemples sont présentés ci-dessous. - -``` sql -SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; -``` - -``` text -┌─res─────┐ -│ [2,1,0] │ -└─────────┘ -``` - -``` sql -SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; -``` - -``` text -┌─res─────┐ -│ [2,1,0] │ -└─────────┘ -``` - -!!! note "Note" - Pour améliorer l'efficacité du tri, de la [Transformation schwartzienne](https://en.wikipedia.org/wiki/Schwartzian_transform) est utilisée. - -## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} - -Trie les éléments de la `arr` tableau dans l'ordre décroissant. Si l' `func` la fonction est spécifiée, `arr` est trié en fonction du résultat de la `func` fonction appliquée aux éléments du tableau, puis le tableau trié est inversé. Si `func` accepte plusieurs arguments, le `arrayReverseSort` la fonction est passé plusieurs tableaux que les arguments de `func` correspond à. Des exemples détaillés sont présentés à la fin de `arrayReverseSort` Description. 
- -Exemple de tri de valeurs entières: - -``` sql -SELECT arrayReverseSort([1, 3, 3, 0]); -``` - -``` text -┌─arrayReverseSort([1, 3, 3, 0])─┐ -│ [3,3,1,0] │ -└────────────────────────────────┘ -``` - -Exemple de tri des valeurs de chaîne: - -``` sql -SELECT arrayReverseSort(['hello', 'world', '!']); -``` - -``` text -┌─arrayReverseSort(['hello', 'world', '!'])─┐ -│ ['world','hello','!'] │ -└───────────────────────────────────────────┘ -``` - -Considérez l'ordre de tri suivant pour le `NULL`, `NaN` et `Inf` valeur: - -``` sql -SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; -``` - -``` text -┌─res───────────────────────────────────┐ -│ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ -└───────────────────────────────────────┘ -``` - -- `Inf` les valeurs sont d'abord dans le tableau. -- `NULL` les valeurs sont les derniers dans le tableau. -- `NaN` les valeurs sont juste avant `NULL`. -- `-Inf` les valeurs sont juste avant `NaN`. - -Notez que l' `arrayReverseSort` est un [fonction d'ordre supérieur](higher-order-functions.md). Vous pouvez passer d'une fonction lambda comme premier argument. Exemple est montré ci-dessous. - -``` sql -SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; -``` - -``` text -┌─res─────┐ -│ [1,2,3] │ -└─────────┘ -``` - -Le tableau est trié de la façon suivante: - -1. Dans un premier temps, le tableau source (\[1, 2, 3\]) est trié en fonction du résultat de la fonction lambda appliquée aux éléments du tableau. Le résultat est un tableau \[3, 2, 1\]. -2. Tableau qui est obtenu à l'étape précédente, est renversé. Donc, le résultat final est \[1, 2, 3\]. - -La fonction lambda peut accepter plusieurs arguments. Dans ce cas, vous avez besoin de passer l' `arrayReverseSort` fonction plusieurs tableaux de longueur identique à laquelle correspondront les arguments de la fonction lambda. Le tableau résultant sera composé d'éléments du premier tableau d'entrée; les éléments du(des) Tableau (s) d'entrée suivant (s) spécifient les clés de tri. Exemple: - -``` sql -SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; -``` - -``` text -┌─res───────────────┐ -│ ['hello','world'] │ -└───────────────────┘ -``` - -Dans cet exemple, le tableau est trié de la façon suivante: - -1. Au début, le tableau source (\[‘hello’, ‘world’\]) est triée selon le résultat de la fonction lambda appliquée aux éléments de tableaux. Les éléments qui sont passés dans le deuxième tableau (\[2, 1\]), définissent les clés de tri pour les éléments correspondants du tableau source. Le résultat est un tableau \[‘world’, ‘hello’\]. -2. Tableau trié lors de l'étape précédente, est renversé. Donc, le résultat final est \[‘hello’, ‘world’\]. - -D'autres exemples sont présentés ci-dessous. - -``` sql -SELECT arrayReverseSort((x, y) -> y, [4, 3, 5], ['a', 'b', 'c']) AS res; -``` - -``` text -┌─res─────┐ -│ [5,3,4] │ -└─────────┘ -``` - -``` sql -SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; -``` - -``` text -┌─res─────┐ -│ [4,3,5] │ -└─────────┘ -``` - -## arrayUniq(arr, …) {#arrayuniqarr} - -Si un argument est passé, il compte le nombre de différents éléments dans le tableau. -Si plusieurs arguments sont passés, il compte le nombre de tuples différents d'éléments aux positions correspondantes dans plusieurs tableaux. - -Si vous souhaitez obtenir une liste des éléments dans un tableau, vous pouvez utiliser arrayReduce(‘groupUniqArray’, arr). - -## arrayJoin (arr) {#array-functions-join} - -Une fonction spéciale. 
Voir la section [“ArrayJoin function”](array-join.md#functions_arrayjoin). - -## tableaudifférence {#arraydifference} - -Calcule la différence entre les éléments de tableau adjacents. Renvoie un tableau où le premier élément sera 0, le second est la différence entre `a[1] - a[0]`, etc. The type of elements in the resulting array is determined by the type inference rules for subtraction (e.g. `UInt8` - `UInt8` = `Int16`). - -**Syntaxe** - -``` sql -arrayDifference(array) -``` - -**Paramètre** - -- `array` – [Tableau](https://clickhouse.tech/docs/en/data_types/array/). - -**Valeurs renvoyées** - -Renvoie un tableau de différences entre les éléments adjacents. - -Type: [UInt\*](https://clickhouse.tech/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.tech/docs/en/data_types/int_uint/#int-ranges), [Flottant\*](https://clickhouse.tech/docs/en/data_types/float/). - -**Exemple** - -Requête: - -``` sql -SELECT arrayDifference([1, 2, 3, 4]) -``` - -Résultat: - -``` text -┌─arrayDifference([1, 2, 3, 4])─┐ -│ [0,1,1,1] │ -└───────────────────────────────┘ -``` - -Exemple de débordement dû au type de résultat Int64: - -Requête: - -``` sql -SELECT arrayDifference([0, 10000000000000000000]) -``` - -Résultat: - -``` text -┌─arrayDifference([0, 10000000000000000000])─┐ -│ [0,-8446744073709551616] │ -└────────────────────────────────────────────┘ -``` - -## arrayDistinct {#arraydistinct} - -Prend un tableau, retourne un tableau contenant les différents éléments seulement. - -**Syntaxe** - -``` sql -arrayDistinct(array) -``` - -**Paramètre** - -- `array` – [Tableau](https://clickhouse.tech/docs/en/data_types/array/). - -**Valeurs renvoyées** - -Retourne un tableau contenant les éléments distincts. - -**Exemple** - -Requête: - -``` sql -SELECT arrayDistinct([1, 2, 2, 3, 1]) -``` - -Résultat: - -``` text -┌─arrayDistinct([1, 2, 2, 3, 1])─┐ -│ [1,2,3] │ -└────────────────────────────────┘ -``` - -## arrayEnumerateDense(arr) {#array_functions-arrayenumeratedense} - -Renvoie un tableau de la même taille que le tableau source, indiquant où chaque élément apparaît en premier dans le tableau source. - -Exemple: - -``` sql -SELECT arrayEnumerateDense([10, 20, 10, 30]) -``` - -``` text -┌─arrayEnumerateDense([10, 20, 10, 30])─┐ -│ [1,2,1,3] │ -└───────────────────────────────────────┘ -``` - -## arrayIntersect (arr) {#array-functions-arrayintersect} - -Prend plusieurs tableaux, retourne un tableau avec des éléments présents dans tous les tableaux source. L'ordre des éléments dans le tableau résultant est le même que dans le premier tableau. - -Exemple: - -``` sql -SELECT - arrayIntersect([1, 2], [1, 3], [2, 3]) AS no_intersect, - arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect -``` - -``` text -┌─no_intersect─┬─intersect─┐ -│ [] │ [1] │ -└──────────────┴───────────┘ -``` - -## arrayReduce {#arrayreduce} - -Applique une fonction d'agrégation aux éléments du tableau et renvoie son résultat. Le nom de la fonction d'agrégation est passé sous forme de chaîne entre guillemets simples `'max'`, `'sum'`. Lorsque vous utilisez des fonctions d'agrégat paramétriques, le paramètre est indiqué après le nom de la fonction entre parenthèses `'uniqUpTo(6)'`. - -**Syntaxe** - -``` sql -arrayReduce(agg_func, arr1, arr2, ..., arrN) -``` - -**Paramètre** - -- `agg_func` — The name of an aggregate function which should be a constant [chaîne](../../sql-reference/data-types/string.md). 
-- `arr` — Any number of [tableau](../../sql-reference/data-types/array.md) tapez les colonnes comme paramètres de la fonction d'agrégation. - -**Valeur renvoyée** - -**Exemple** - -``` sql -SELECT arrayReduce('max', [1, 2, 3]) -``` - -``` text -┌─arrayReduce('max', [1, 2, 3])─┐ -│ 3 │ -└───────────────────────────────┘ -``` - -Si une fonction d'agrégation prend plusieurs arguments, cette fonction doit être appliqué à plusieurs ensembles de même taille. - -``` sql -SELECT arrayReduce('maxIf', [3, 5], [1, 0]) -``` - -``` text -┌─arrayReduce('maxIf', [3, 5], [1, 0])─┐ -│ 3 │ -└──────────────────────────────────────┘ -``` - -Exemple avec une fonction d'agrégat paramétrique: - -``` sql -SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) -``` - -``` text -┌─arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])─┐ -│ 4 │ -└─────────────────────────────────────────────────────────────┘ -``` - -## arrayReduceInRanges {#arrayreduceinranges} - -Applique une fonction d'agrégation d'éléments de tableau dans des plages et retourne un tableau contenant le résultat correspondant à chaque gamme. La fonction retourne le même résultat que plusieurs `arrayReduce(agg_func, arraySlice(arr1, index, length), ...)`. - -**Syntaxe** - -``` sql -arrayReduceInRanges(agg_func, ranges, arr1, arr2, ..., arrN) -``` - -**Paramètre** - -- `agg_func` — The name of an aggregate function which should be a constant [chaîne](../../sql-reference/data-types/string.md). -- `ranges` — The ranges to aggretate which should be an [tableau](../../sql-reference/data-types/array.md) de [tuple](../../sql-reference/data-types/tuple.md) qui contient l'indice et la longueur de chaque plage. -- `arr` — Any number of [tableau](../../sql-reference/data-types/array.md) tapez les colonnes comme paramètres de la fonction d'agrégation. - -**Valeur renvoyée** - -**Exemple** - -``` sql -SELECT arrayReduceInRanges( - 'sum', - [(1, 5), (2, 3), (3, 4), (4, 4)], - [1000000, 200000, 30000, 4000, 500, 60, 7] -) AS res -``` - -``` text -┌─res─────────────────────────┐ -│ [1234500,234000,34560,4567] │ -└─────────────────────────────┘ -``` - -## arrayReverse(arr) {#arrayreverse} - -Retourne un tableau de la même taille que l'original tableau contenant les éléments dans l'ordre inverse. - -Exemple: - -``` sql -SELECT arrayReverse([1, 2, 3]) -``` - -``` text -┌─arrayReverse([1, 2, 3])─┐ -│ [3,2,1] │ -└─────────────────────────┘ -``` - -## inverse (arr) {#array-functions-reverse} - -Synonyme de [“arrayReverse”](#arrayreverse) - -## arrayFlatten {#arrayflatten} - -Convertit un tableau de tableaux dans un tableau associatif. - -Fonction: - -- S'applique à toute profondeur de tableaux imbriqués. -- Ne change pas les tableaux qui sont déjà plats. - -Le tableau aplati contient tous les éléments de tous les tableaux source. - -**Syntaxe** - -``` sql -flatten(array_of_arrays) -``` - -Alias: `flatten`. - -**Paramètre** - -- `array_of_arrays` — [Tableau](../../sql-reference/data-types/array.md) de tableaux. Exemple, `[[1,2,3], [4,5]]`. - -**Exemple** - -``` sql -SELECT flatten([[[1]], [[2], [3]]]) -``` - -``` text -┌─flatten(array(array([1]), array([2], [3])))─┐ -│ [1,2,3] │ -└─────────────────────────────────────────────┘ -``` - -## arrayCompact {#arraycompact} - -Supprime les éléments en double consécutifs d'un tableau. L'ordre des valeurs de résultat est déterminée par l'ordre dans le tableau source. - -**Syntaxe** - -``` sql -arrayCompact(arr) -``` - -**Paramètre** - -`arr` — The [tableau](../../sql-reference/data-types/array.md) inspecter. 
- -**Valeur renvoyée** - -Le tableau sans doublon. - -Type: `Array`. - -**Exemple** - -Requête: - -``` sql -SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) -``` - -Résultat: - -``` text -┌─arrayCompact([1, 1, nan, nan, 2, 3, 3, 3])─┐ -│ [1,nan,nan,2,3] │ -└────────────────────────────────────────────┘ -``` - -## arrayZip {#arrayzip} - -Combine plusieurs tableaux en un seul tableau. Le tableau résultant contient les éléments correspondants des tableaux source regroupés en tuples dans l'ordre des arguments listés. - -**Syntaxe** - -``` sql -arrayZip(arr1, arr2, ..., arrN) -``` - -**Paramètre** - -- `arrN` — [Tableau](../data-types/array.md). - -La fonction peut prendre n'importe quel nombre de tableaux de différents types. Tous les tableaux doivent être de taille égale. - -**Valeur renvoyée** - -- Tableau avec des éléments des tableaux source regroupés en [tuple](../data-types/tuple.md). Types de données dans le tuple sont les mêmes que les types de l'entrée des tableaux et dans le même ordre que les tableaux sont passés. - -Type: [Tableau](../data-types/array.md). - -**Exemple** - -Requête: - -``` sql -SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]) -``` - -Résultat: - -``` text -┌─arrayZip(['a', 'b', 'c'], [5, 2, 1])─┐ -│ [('a',5),('b',2),('c',1)] │ -└──────────────────────────────────────┘ -``` - -## arrayAUC {#arrayauc} - -Calculer AUC (zone sous la courbe, qui est un concept dans l'apprentissage automatique, voir plus de détails: https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve). - -**Syntaxe** - -``` sql -arrayAUC(arr_scores, arr_labels) -``` - -**Paramètre** -- `arr_scores` — scores prediction model gives. -- `arr_labels` — labels of samples, usually 1 for positive sample and 0 for negtive sample. - -**Valeur renvoyée** -Renvoie la valeur AUC avec le type Float64. - -**Exemple** -Requête: - -``` sql -select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]) -``` - -Résultat: - -``` text -┌─arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐ -│ 0.75 │ -└────────────────────────────────────────---──┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/array_functions/) diff --git a/docs/fr/sql-reference/functions/array-join.md b/docs/fr/sql-reference/functions/array-join.md deleted file mode 100644 index 859e801994d..00000000000 --- a/docs/fr/sql-reference/functions/array-join.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 61 -toc_title: arrayJoin ---- - -# fonction arrayJoin {#functions_arrayjoin} - -C'est un très inhabituelle de la fonction. - -Les fonctions normales ne modifient pas un ensemble de lignes, mais modifient simplement les valeurs de chaque ligne (map). -Les fonctions d'agrégation compriment un ensemble de lignes (plier ou réduire). -Le ‘arrayJoin’ la fonction prend chaque ligne et génère un ensemble de lignes (dépliante). - -Cette fonction prend un tableau comme argument et propage la ligne source à plusieurs lignes pour le nombre d'éléments dans le tableau. -Toutes les valeurs des colonnes sont simplement copiés, sauf les valeurs dans la colonne où cette fonction est appliquée; elle est remplacée par la valeur correspondante de tableau. - -Une requête peut utiliser plusieurs `arrayJoin` fonction. Dans ce cas, la transformation est effectuée plusieurs fois. - -Notez la syntaxe de jointure de tableau dans la requête SELECT, qui offre des possibilités plus larges. 
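As a sketch of that `ARRAY JOIN` clause form (the table `events` and its columns are hypothetical and serve only to illustrate the syntax; the `arrayJoin` function example follows below):

``` sql
-- Hypothetical table: events(user_id UInt64, tags Array(String)).
-- Each source row is expanded into one output row per element of 'tags'.
SELECT user_id, tag
FROM events
ARRAY JOIN tags AS tag
```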
- -Exemple: - -``` sql -SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src -``` - -``` text -┌─dst─┬─\'Hello\'─┬─src─────┐ -│ 1 │ Hello │ [1,2,3] │ -│ 2 │ Hello │ [1,2,3] │ -│ 3 │ Hello │ [1,2,3] │ -└─────┴───────────┴─────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/array_join/) diff --git a/docs/fr/sql-reference/functions/bit-functions.md b/docs/fr/sql-reference/functions/bit-functions.md deleted file mode 100644 index 7b8795815f2..00000000000 --- a/docs/fr/sql-reference/functions/bit-functions.md +++ /dev/null @@ -1,255 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 48 -toc_title: Bit ---- - -# Peu De Fonctions {#bit-functions} - -Les fonctions Bit fonctionnent pour n'importe quelle paire de types de UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32 ou Float64. - -Le type de résultat est un entier avec des bits égaux aux bits maximum de ses arguments. Si au moins l'un des arguments est signé, le résultat est un signé nombre. Si un argument est un nombre à virgule flottante, Il est converti en Int64. - -## bitAnd (a, b) {#bitanda-b} - -## bitOr (a, b) {#bitora-b} - -## bitXor (a, b) {#bitxora-b} - -## bitNot (a) {#bitnota} - -## bitShiftLeft (A, b) {#bitshiftlefta-b} - -## bitShiftRight (A, b) {#bitshiftrighta-b} - -## bitRotateLeft (a, b) {#bitrotatelefta-b} - -## bitRotateRight (a, b) {#bitrotaterighta-b} - -## bitTest {#bittest} - -Prend tout entier et le convertit en [forme binaire](https://en.wikipedia.org/wiki/Binary_number) renvoie la valeur d'un bit à la position spécifiée. Le compte à rebours commence à partir de 0 de la droite vers la gauche. - -**Syntaxe** - -``` sql -SELECT bitTest(number, index) -``` - -**Paramètre** - -- `number` – integer number. -- `index` – position of bit. - -**Valeurs renvoyées** - -Renvoie une valeur de bit à la position spécifiée. - -Type: `UInt8`. - -**Exemple** - -Par exemple, le nombre 43 dans le système numérique de base-2 (binaire) est 101011. - -Requête: - -``` sql -SELECT bitTest(43, 1) -``` - -Résultat: - -``` text -┌─bitTest(43, 1)─┐ -│ 1 │ -└────────────────┘ -``` - -Un autre exemple: - -Requête: - -``` sql -SELECT bitTest(43, 2) -``` - -Résultat: - -``` text -┌─bitTest(43, 2)─┐ -│ 0 │ -└────────────────┘ -``` - -## bitTestAll {#bittestall} - -Renvoie le résultat de [logique de conjonction](https://en.wikipedia.org/wiki/Logical_conjunction) (Et opérateur) de tous les bits à des positions données. Le compte à rebours commence à partir de 0 de la droite vers la gauche. - -La conjonction pour les opérations bit à bit: - -0 AND 0 = 0 - -0 AND 1 = 0 - -1 AND 0 = 0 - -1 AND 1 = 1 - -**Syntaxe** - -``` sql -SELECT bitTestAll(number, index1, index2, index3, index4, ...) -``` - -**Paramètre** - -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) est vrai si et seulement si toutes ses positions sont remplies (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). - -**Valeurs renvoyées** - -Retourne le résultat de la conjonction logique. - -Type: `UInt8`. - -**Exemple** - -Par exemple, le nombre 43 dans le système numérique de base-2 (binaire) est 101011. 
- -Requête: - -``` sql -SELECT bitTestAll(43, 0, 1, 3, 5) -``` - -Résultat: - -``` text -┌─bitTestAll(43, 0, 1, 3, 5)─┐ -│ 1 │ -└────────────────────────────┘ -``` - -Un autre exemple: - -Requête: - -``` sql -SELECT bitTestAll(43, 0, 1, 3, 5, 2) -``` - -Résultat: - -``` text -┌─bitTestAll(43, 0, 1, 3, 5, 2)─┐ -│ 0 │ -└───────────────────────────────┘ -``` - -## bitTestAny {#bittestany} - -Renvoie le résultat de [disjonction logique](https://en.wikipedia.org/wiki/Logical_disjunction) (Ou opérateur) de tous les bits à des positions données. Le compte à rebours commence à partir de 0 de la droite vers la gauche. - -La disjonction pour les opérations binaires: - -0 OR 0 = 0 - -0 OR 1 = 1 - -1 OR 0 = 1 - -1 OR 1 = 1 - -**Syntaxe** - -``` sql -SELECT bitTestAny(number, index1, index2, index3, index4, ...) -``` - -**Paramètre** - -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. - -**Valeurs renvoyées** - -Renvoie le résultat de la disjuction logique. - -Type: `UInt8`. - -**Exemple** - -Par exemple, le nombre 43 dans le système numérique de base-2 (binaire) est 101011. - -Requête: - -``` sql -SELECT bitTestAny(43, 0, 2) -``` - -Résultat: - -``` text -┌─bitTestAny(43, 0, 2)─┐ -│ 1 │ -└──────────────────────┘ -``` - -Un autre exemple: - -Requête: - -``` sql -SELECT bitTestAny(43, 4, 2) -``` - -Résultat: - -``` text -┌─bitTestAny(43, 4, 2)─┐ -│ 0 │ -└──────────────────────┘ -``` - -## bitCount {#bitcount} - -Calcule le nombre de bits mis à un dans la représentation binaire d'un nombre. - -**Syntaxe** - -``` sql -bitCount(x) -``` - -**Paramètre** - -- `x` — [Entier](../../sql-reference/data-types/int-uint.md) ou [virgule flottante](../../sql-reference/data-types/float.md) nombre. La fonction utilise la représentation de la valeur en mémoire. Il permet de financer les nombres à virgule flottante. - -**Valeur renvoyée** - -- Nombre de bits défini sur un dans le numéro d'entrée. - -La fonction ne convertit pas la valeur d'entrée en un type plus grand ([l'extension du signe](https://en.wikipedia.org/wiki/Sign_extension)). Ainsi, par exemple, `bitCount(toUInt8(-1)) = 8`. - -Type: `UInt8`. - -**Exemple** - -Prenez par exemple le numéro 333. Sa représentation binaire: 0000000101001101. - -Requête: - -``` sql -SELECT bitCount(333) -``` - -Résultat: - -``` text -┌─bitCount(333)─┐ -│ 5 │ -└───────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/bit_functions/) diff --git a/docs/fr/sql-reference/functions/bitmap-functions.md b/docs/fr/sql-reference/functions/bitmap-functions.md deleted file mode 100644 index 15cb68ffc52..00000000000 --- a/docs/fr/sql-reference/functions/bitmap-functions.md +++ /dev/null @@ -1,496 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 49 -toc_title: Bitmap ---- - -# Fonctions De Bitmap {#bitmap-functions} - -Les fonctions Bitmap fonctionnent pour le calcul de la valeur de L'objet de deux bitmaps, il s'agit de renvoyer un nouveau bitmap ou une cardinalité tout en utilisant le calcul de la formule, tel que and, or, xor, and not, etc. - -Il existe 2 types de méthodes de construction pour L'objet Bitmap. L'un doit être construit par la fonction d'agrégation groupBitmap avec-State, l'autre doit être construit par L'objet Array. Il est également de convertir L'objet Bitmap en objet tableau. - -RoaringBitmap est enveloppé dans une structure de données pendant le stockage réel des objets Bitmap. 
Lorsque la cardinalité est inférieure ou égale à 32, elle utilise Set objet. Lorsque la cardinalité est supérieure à 32, elle utilise l'objet RoaringBitmap. C'est pourquoi le stockage de faible cardinalité jeu est plus rapide. - -Pour plus d'informations sur RoaringBitmap, voir: [CRoaring](https://github.com/RoaringBitmap/CRoaring). - -## bitmapBuild {#bitmap_functions-bitmapbuild} - -Construire un bitmap à partir d'un tableau entier non signé. - -``` sql -bitmapBuild(array) -``` - -**Paramètre** - -- `array` – unsigned integer array. - -**Exemple** - -``` sql -SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) -``` - -``` text -┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ -│  │ AggregateFunction(groupBitmap, UInt8) │ -└─────┴──────────────────────────────────────────────┘ -``` - -## bitmapToArray {#bitmaptoarray} - -Convertir bitmap en tableau entier. - -``` sql -bitmapToArray(bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` text -┌─res─────────┐ -│ [1,2,3,4,5] │ -└─────────────┘ -``` - -## bitmapSubsetInRange {#bitmap-functions-bitmapsubsetinrange} - -Retourne le sous-ensemble dans la plage spécifiée (n'inclut pas le range_end). - -``` sql -bitmapSubsetInRange(bitmap, range_start, range_end) -``` - -**Paramètre** - -- `bitmap` – [Objet Bitmap](#bitmap_functions-bitmapbuild). -- `range_start` – range start point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `range_end` – range end point(excluded). Type: [UInt32](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res -``` - -``` text -┌─res───────────────┐ -│ [30,31,32,33,100] │ -└───────────────────┘ -``` - -## bitmapSubsetLimit {#bitmapsubsetlimit} - -Crée un sous-ensemble de bitmap avec n éléments pris entre `range_start` et `cardinality_limit`. - -**Syntaxe** - -``` sql -bitmapSubsetLimit(bitmap, range_start, cardinality_limit) -``` - -**Paramètre** - -- `bitmap` – [Objet Bitmap](#bitmap_functions-bitmapbuild). -- `range_start` – The subset starting point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `cardinality_limit` – The subset cardinality upper limit. Type: [UInt32](../../sql-reference/data-types/int-uint.md). - -**Valeur renvoyée** - -Ensemble. - -Type: `Bitmap object`. - -**Exemple** - -Requête: - -``` sql -SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res -``` - -Résultat: - -``` text -┌─res───────────────────────┐ -│ [30,31,32,33,100,200,500] │ -└───────────────────────────┘ -``` - -## bitmapContains {#bitmap_functions-bitmapcontains} - -Vérifie si le bitmap contient un élément. - -``` sql -bitmapContains(haystack, needle) -``` - -**Paramètre** - -- `haystack` – [Objet Bitmap](#bitmap_functions-bitmapbuild) où la fonction recherche. -- `needle` – Value that the function searches. Type: [UInt32](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- 0 — If `haystack` ne contient pas de `needle`. -- 1 — If `haystack` contenir `needle`. - -Type: `UInt8`. 
- -**Exemple** - -``` sql -SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res -``` - -``` text -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapHasAny {#bitmaphasany} - -Vérifie si deux bitmaps ont une intersection par certains éléments. - -``` sql -bitmapHasAny(bitmap1, bitmap2) -``` - -Si vous êtes sûr que `bitmap2` contient strictement un élément, envisagez d'utiliser le [bitmapContains](#bitmap_functions-bitmapcontains) fonction. Cela fonctionne plus efficacement. - -**Paramètre** - -- `bitmap*` – bitmap object. - -**Les valeurs de retour** - -- `1`, si `bitmap1` et `bitmap2` avoir un élément similaire au moins. -- `0`, autrement. - -**Exemple** - -``` sql -SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res -``` - -``` text -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapHasAll {#bitmaphasall} - -Analogue à `hasAll(array, array)` renvoie 1 si le premier bitmap contient tous les éléments du second, 0 sinon. -Si le deuxième argument est un bitmap vide, alors renvoie 1. - -``` sql -bitmapHasAll(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res -``` - -``` text -┌─res─┐ -│ 0 │ -└─────┘ -``` - -## bitmapCardinality {#bitmapcardinality} - -Retrun bitmap cardinalité de type UInt64. - -``` sql -bitmapCardinality(bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` text -┌─res─┐ -│ 5 │ -└─────┘ -``` - -## bitmapMin {#bitmapmin} - -Retrun la plus petite valeur de type UInt64 dans l'ensemble, UINT32_MAX si l'ensemble est vide. - - bitmapMin(bitmap) - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - - ┌─res─┐ - │ 1 │ - └─────┘ - -## bitmapMax {#bitmapmax} - -Retrun la plus grande valeur de type UInt64 dans l'ensemble, 0 si l'ensemble est vide. - - bitmapMax(bitmap) - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - - ┌─res─┐ - │ 5 │ - └─────┘ - -## bitmapTransform {#bitmaptransform} - -Transformer un tableau de valeurs d'une image à l'autre tableau de valeurs, le résultat est une nouvelle image. - - bitmapTransform(bitmap, from_array, to_array) - -**Paramètre** - -- `bitmap` – bitmap object. -- `from_array` – UInt32 array. For idx in range \[0, from_array.size()), if bitmap contains from_array\[idx\], then replace it with to_array\[idx\]. Note that the result depends on array ordering if there are common elements between from_array and to_array. -- `to_array` – UInt32 array, its size shall be the same to from_array. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapTransform(bitmapBuild([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), cast([5,999,2] as Array(UInt32)), cast([2,888,20] as Array(UInt32)))) AS res -``` - - ┌─res───────────────────┐ - │ [1,3,4,6,7,8,9,10,20] │ - └───────────────────────┘ - -## bitmapAnd {#bitmapand} - -Deux bitmap et calcul, le résultat est un nouveau bitmap. - -``` sql -bitmapAnd(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` text -┌─res─┐ -│ [3] │ -└─────┘ -``` - -## bitmapOr {#bitmapor} - -Deux bitmap ou calcul, le résultat est un nouveau bitmap. 
- -``` sql -bitmapOr(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` text -┌─res─────────┐ -│ [1,2,3,4,5] │ -└─────────────┘ -``` - -## bitmapXor {#bitmapxor} - -Deux bitmap xor calcul, le résultat est une nouvelle image. - -``` sql -bitmapXor(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` text -┌─res───────┐ -│ [1,2,4,5] │ -└───────────┘ -``` - -## bitmapetnot {#bitmapandnot} - -Deux Bitmap andnot calcul, le résultat est un nouveau bitmap. - -``` sql -bitmapAndnot(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` text -┌─res───┐ -│ [1,2] │ -└───────┘ -``` - -## bitmapetcardinalité {#bitmapandcardinality} - -Deux bitmap et calcul, retour cardinalité de type UInt64. - -``` sql -bitmapAndCardinality(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` text -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapOrCardinality {#bitmaporcardinality} - -Deux bitmap ou calcul, retour cardinalité de type UInt64. - -``` sql -bitmapOrCardinality(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` text -┌─res─┐ -│ 5 │ -└─────┘ -``` - -## bitmapXorCardinality {#bitmapxorcardinality} - -Deux bitmap XOR calcul, retour cardinalité de type UInt64. - -``` sql -bitmapXorCardinality(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` text -┌─res─┐ -│ 4 │ -└─────┘ -``` - -## bitmapetnotcardinality {#bitmapandnotcardinality} - -Deux bitmap andnot calcul, retour cardinalité de type UInt64. - -``` sql -bitmapAndnotCardinality(bitmap,bitmap) -``` - -**Paramètre** - -- `bitmap` – bitmap object. - -**Exemple** - -``` sql -SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` text -┌─res─┐ -│ 2 │ -└─────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/bitmap_functions/) diff --git a/docs/fr/sql-reference/functions/comparison-functions.md b/docs/fr/sql-reference/functions/comparison-functions.md deleted file mode 100644 index a5008c676fa..00000000000 --- a/docs/fr/sql-reference/functions/comparison-functions.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: Comparaison ---- - -# Fonctions De Comparaison {#comparison-functions} - -Les fonctions de comparaison renvoient toujours 0 ou 1 (Uint8). - -Les types suivants peuvent être comparés: - -- nombre -- cordes et cordes fixes -- date -- dates avec heures - -au sein de chaque groupe, mais pas entre différents groupes. - -Par exemple, vous ne pouvez pas comparer une date avec une chaîne. Vous devez utiliser une fonction pour convertir la chaîne en une date, ou vice versa. - -Les chaînes sont comparées par octets. 
Une courte chaîne est plus petite que toutes les chaînes qui commencent par elle et qui contiennent au moins un caractère de plus. - -## égal, A = B et a = = b opérateur {#function-equals} - -## notEquals, a ! opérateur= b et a \<\> b {#function-notequals} - -## moins, opérateur \< {#function-less} - -## de plus, \> opérateur {#function-greater} - -## lessOrEquals, \< = opérateur {#function-lessorequals} - -## greaterOrEquals, \> = opérateur {#function-greaterorequals} - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/comparison_functions/) diff --git a/docs/fr/sql-reference/functions/conditional-functions.md b/docs/fr/sql-reference/functions/conditional-functions.md deleted file mode 100644 index 3912b49aa6a..00000000000 --- a/docs/fr/sql-reference/functions/conditional-functions.md +++ /dev/null @@ -1,207 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 43 -toc_title: 'Conditionnel ' ---- - -# Fonctions Conditionnelles {#conditional-functions} - -## si {#if} - -Contrôle la ramification conditionnelle. Contrairement à la plupart des systèmes, ClickHouse évalue toujours les deux expressions `then` et `else`. - -**Syntaxe** - -``` sql -SELECT if(cond, then, else) -``` - -Si la condition `cond` renvoie une valeur non nulle, retourne le résultat de l'expression `then` et le résultat de l'expression `else`, si présent, est ignoré. Si l' `cond` est égal à zéro ou `NULL` alors le résultat de la `then` l'expression est ignorée et le résultat de `else` expression, si elle est présente, est renvoyée. - -**Paramètre** - -- `cond` – The condition for evaluation that can be zero or not. The type is UInt8, Nullable(UInt8) or NULL. -- `then` - L'expression à renvoyer si la condition est remplie. -- `else` - L'expression à renvoyer si la condition n'est pas remplie. - -**Valeurs renvoyées** - -La fonction s'exécute `then` et `else` expressions et retourne son résultat, selon que la condition `cond` fini par être zéro ou pas. - -**Exemple** - -Requête: - -``` sql -SELECT if(1, plus(2, 2), plus(2, 6)) -``` - -Résultat: - -``` text -┌─plus(2, 2)─┐ -│ 4 │ -└────────────┘ -``` - -Requête: - -``` sql -SELECT if(0, plus(2, 2), plus(2, 6)) -``` - -Résultat: - -``` text -┌─plus(2, 6)─┐ -│ 8 │ -└────────────┘ -``` - -- `then` et `else` doit avoir le type commun le plus bas. - -**Exemple:** - -Prendre cette `LEFT_RIGHT` table: - -``` sql -SELECT * -FROM LEFT_RIGHT - -┌─left─┬─right─┐ -│ ᴺᵁᴸᴸ │ 4 │ -│ 1 │ 3 │ -│ 2 │ 2 │ -│ 3 │ 1 │ -│ 4 │ ᴺᵁᴸᴸ │ -└──────┴───────┘ -``` - -La requête suivante compare `left` et `right` valeur: - -``` sql -SELECT - left, - right, - if(left < right, 'left is smaller than right', 'right is greater or equal than left') AS is_smaller -FROM LEFT_RIGHT -WHERE isNotNull(left) AND isNotNull(right) - -┌─left─┬─right─┬─is_smaller──────────────────────────┐ -│ 1 │ 3 │ left is smaller than right │ -│ 2 │ 2 │ right is greater or equal than left │ -│ 3 │ 1 │ right is greater or equal than left │ -└──────┴───────┴─────────────────────────────────────┘ -``` - -Note: `NULL` les valeurs ne sont pas utilisés dans cet exemple, vérifier [Valeurs nulles dans les conditions](#null-values-in-conditionals) section. - -## Opérateur Ternaire {#ternary-operator} - -Il fonctionne même comme `if` fonction. - -Syntaxe: `cond ? then : else` - -Retourner `then` si l' `cond` renvoie la valeur vrai (supérieur à zéro), sinon renvoie `else`. 
- -- `cond` doit être de type de `UInt8`, et `then` et `else` doit avoir le type commun le plus bas. - -- `then` et `else` peut être `NULL` - -**Voir aussi** - -- [ifNotFinite](other-functions.md#ifnotfinite). - -## multiIf {#multiif} - -Permet d'écrire le [CASE](../operators/index.md#operator_case) opérateur plus compacte dans la requête. - -Syntaxe: `multiIf(cond_1, then_1, cond_2, then_2, ..., else)` - -**Paramètre:** - -- `cond_N` — The condition for the function to return `then_N`. -- `then_N` — The result of the function when executed. -- `else` — The result of the function if none of the conditions is met. - -La fonction accepte `2N+1` paramètre. - -**Valeurs renvoyées** - -La fonction renvoie l'une des valeurs `then_N` ou `else` selon les conditions `cond_N`. - -**Exemple** - -En utilisant à nouveau `LEFT_RIGHT` table. - -``` sql -SELECT - left, - right, - multiIf(left < right, 'left is smaller', left > right, 'left is greater', left = right, 'Both equal', 'Null value') AS result -FROM LEFT_RIGHT - -┌─left─┬─right─┬─result──────────┐ -│ ᴺᵁᴸᴸ │ 4 │ Null value │ -│ 1 │ 3 │ left is smaller │ -│ 2 │ 2 │ Both equal │ -│ 3 │ 1 │ left is greater │ -│ 4 │ ᴺᵁᴸᴸ │ Null value │ -└──────┴───────┴─────────────────┘ -``` - -## Utilisation Directe Des Résultats Conditionnels {#using-conditional-results-directly} - -Les conditions entraînent toujours `0`, `1` ou `NULL`. Vous pouvez donc utiliser des résultats conditionnels directement comme ceci: - -``` sql -SELECT left < right AS is_small -FROM LEFT_RIGHT - -┌─is_small─┐ -│ ᴺᵁᴸᴸ │ -│ 1 │ -│ 0 │ -│ 0 │ -│ ᴺᵁᴸᴸ │ -└──────────┘ -``` - -## Valeurs nulles dans les conditions {#null-values-in-conditionals} - -Lorsque `NULL` les valeurs sont impliqués dans des conditions, le résultat sera également `NULL`. - -``` sql -SELECT - NULL < 1, - 2 < NULL, - NULL < NULL, - NULL = NULL - -┌─less(NULL, 1)─┬─less(2, NULL)─┬─less(NULL, NULL)─┬─equals(NULL, NULL)─┐ -│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└───────────────┴───────────────┴──────────────────┴────────────────────┘ -``` - -Donc, vous devriez construire vos requêtes avec soin si les types sont `Nullable`. - -L'exemple suivant le démontre en omettant d'ajouter la condition égale à `multiIf`. - -``` sql -SELECT - left, - right, - multiIf(left < right, 'left is smaller', left > right, 'right is smaller', 'Both equal') AS faulty_result -FROM LEFT_RIGHT - -┌─left─┬─right─┬─faulty_result────┐ -│ ᴺᵁᴸᴸ │ 4 │ Both equal │ -│ 1 │ 3 │ left is smaller │ -│ 2 │ 2 │ Both equal │ -│ 3 │ 1 │ right is smaller │ -│ 4 │ ᴺᵁᴸᴸ │ Both equal │ -└──────┴───────┴──────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/conditional_functions/) diff --git a/docs/fr/sql-reference/functions/date-time-functions.md b/docs/fr/sql-reference/functions/date-time-functions.md deleted file mode 100644 index d1c16b42d07..00000000000 --- a/docs/fr/sql-reference/functions/date-time-functions.md +++ /dev/null @@ -1,450 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: Travailler avec les Dates et les heures ---- - -# Fonctions pour travailler avec des Dates et des heures {#functions-for-working-with-dates-and-times} - -Support des fuseaux horaires - -Toutes les fonctions pour travailler avec la date et l'heure qui ont une logique d'utilisation pour le fuseau horaire peut accepter un second fuseau horaire argument. Exemple: Asie / Ekaterinbourg. 
Dans ce cas, ils utilisent le fuseau horaire spécifié au lieu du fuseau horaire local (par défaut). - -``` sql -SELECT - toDateTime('2016-06-15 23:00:00') AS time, - toDate(time) AS date_local, - toDate(time, 'Asia/Yekaterinburg') AS date_yekat, - toString(time, 'US/Samoa') AS time_samoa -``` - -``` text -┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ -└─────────────────────┴────────────┴────────────┴─────────────────────┘ -``` - -Seuls les fuseaux horaires qui diffèrent de L'UTC par un nombre entier d'heures sont pris en charge. - -## toTimeZone {#totimezone} - -Convertir l'heure ou la date et de l'heure au fuseau horaire spécifié. - -## toYear {#toyear} - -Convertit une date ou une date avec l'heure en un numéro UInt16 contenant le numéro d'année (AD). - -## toQuarter {#toquarter} - -Convertit une date ou une date avec l'heure en un numéro UInt8 contenant le numéro de trimestre. - -## toMonth {#tomonth} - -Convertit une date ou une date avec l'heure en un numéro UInt8 contenant le numéro de mois (1-12). - -## toDayOfYear {#todayofyear} - -Convertit une date ou une date avec l'heure en un numéro UInt16 contenant le numéro du jour de l'année (1-366). - -## toDayOfMonth {#todayofmonth} - -Convertit une date ou une date avec le temps à un UInt8 contenant le numéro du jour du mois (1-31). - -## toDayOfWeek {#todayofweek} - -Convertit une date ou une date avec l'heure en un numéro UInt8 contenant le numéro du jour de la semaine (lundi est 1, et dimanche est 7). - -## toHour {#tohour} - -Convertit une date avec l'heure en un nombre UInt8 contenant le numéro de l'heure dans l'Heure de 24 heures (0-23). -This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true – even in Moscow the clocks were twice changed at a different time). - -## toMinute {#tominute} - -Convertit une date avec l'heure en un numéro UInt8 contenant le numéro de la minute de l'heure (0-59). - -## toseconde {#tosecond} - -Convertit une date avec l'heure en un nombre UInt8 contenant le numéro de la seconde dans la minute (0-59). -Les secondes intercalaires ne sont pas comptabilisés. - -## toUnixTimestamp {#to-unix-timestamp} - -Pour L'argument DateTime: convertit la valeur en sa représentation numérique interne (horodatage Unix). -For String argument: analyse datetime from string en fonction du fuseau horaire (second argument optionnel, le fuseau horaire du serveur est utilisé par défaut) et renvoie l'horodatage unix correspondant. -Pour L'argument Date: le comportement n'est pas spécifié. - -**Syntaxe** - -``` sql -toUnixTimestamp(datetime) -toUnixTimestamp(str, [timezone]) -``` - -**Valeur renvoyée** - -- Renvoie l'horodatage unix. - -Type: `UInt32`. - -**Exemple** - -Requête: - -``` sql -SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp -``` - -Résultat: - -``` text -┌─unix_timestamp─┐ -│ 1509836867 │ -└────────────────┘ -``` - -## toStartOfYear {#tostartofyear} - -Arrondit une date ou une date avec l'heure jusqu'au premier jour de l'année. -Renvoie la date. - -## toStartOfISOYear {#tostartofisoyear} - -Arrondit une date ou une date avec l'heure jusqu'au premier jour de L'année ISO. -Renvoie la date. - -## toStartOfQuarter {#tostartofquarter} - -Arrondit une date ou une date avec l'heure jusqu'au premier jour du trimestre. 
-Le premier jour du trimestre, soit le 1er janvier, 1er avril, 1er juillet ou 1er octobre. -Renvoie la date. - -## toStartOfMonth {#tostartofmonth} - -Arrondit une date ou une date avec l'heure jusqu'au premier jour du mois. -Renvoie la date. - -!!! attention "Attention" - Le comportement de l'analyse des dates incorrectes est spécifique à l'implémentation. ClickHouse peut renvoyer la date zéro, lancer une exception ou faire “natural” débordement. - -## toMonday {#tomonday} - -Arrondit une date ou une date avec l'heure au lundi le plus proche. -Renvoie la date. - -## toStartOfWeek (t \[, mode\]) {#tostartofweektmode} - -Arrondit une date ou une date avec l'heure au dimanche ou au lundi le plus proche par mode. -Renvoie la date. -L'argument mode fonctionne exactement comme l'argument mode de toWeek(). Pour la syntaxe à argument unique, une valeur de mode de 0 est utilisée. - -## toStartOfDay {#tostartofday} - -Arrondit une date avec le temps au début de la journée. - -## toStartOfHour {#tostartofhour} - -Arrondit une date avec le temps au début de l " heure. - -## toStartOfMinute {#tostartofminute} - -Arrondit une date avec le temps au début de la minute. - -## toStartOfFiveMinute {#tostartoffiveminute} - -Arrondit à une date avec l'heure de début de l'intervalle de cinq minutes. - -## toStartOfTenMinutes {#tostartoftenminutes} - -Arrondit une date avec le temps au début de l " intervalle de dix minutes. - -## toStartOfFifteenMinutes {#tostartoffifteenminutes} - -Arrondit la date avec le temps jusqu'au début de l'intervalle de quinze minutes. - -## toStartOfInterval(time_or_data, intervalle x Unité \[, time_zone\]) {#tostartofintervaltime-or-data-interval-x-unit-time-zone} - -Ceci est une généralisation d'autres fonctions nommées `toStartOf*`. Exemple, -`toStartOfInterval(t, INTERVAL 1 year)` renvoie la même chose que `toStartOfYear(t)`, -`toStartOfInterval(t, INTERVAL 1 month)` renvoie la même chose que `toStartOfMonth(t)`, -`toStartOfInterval(t, INTERVAL 1 day)` renvoie la même chose que `toStartOfDay(t)`, -`toStartOfInterval(t, INTERVAL 15 minute)` renvoie la même chose que `toStartOfFifteenMinutes(t)` etc. - -## toTime {#totime} - -Convertit une date avec l'heure en une certaine date fixe, tout en préservant l'heure. - -## toRelativeYearNum {#torelativeyearnum} - -Convertit une date avec l'heure ou la date, le numéro de l'année, à partir d'un certain point fixe dans le passé. - -## toRelativeQuarterNum {#torelativequarternum} - -Convertit une date avec l'heure ou la date au numéro du trimestre, à partir d'un certain point fixe dans le passé. - -## toRelativeMonthNum {#torelativemonthnum} - -Convertit une date avec l'heure ou la date au numéro du mois, à partir d'un certain point fixe dans le passé. - -## toRelativeWeekNum {#torelativeweeknum} - -Convertit une date avec l'heure ou la date, le numéro de la semaine, à partir d'un certain point fixe dans le passé. - -## toRelativeDayNum {#torelativedaynum} - -Convertit une date avec l'heure ou la date au numéro du jour, à partir d'un certain point fixe dans le passé. - -## toRelativeHourNum {#torelativehournum} - -Convertit une date avec l'heure ou la date au nombre de l'heure, à partir d'un certain point fixe dans le passé. - -## toRelativeMinuteNum {#torelativeminutenum} - -Convertit une date avec l'heure ou la date au numéro de la minute, à partir d'un certain point fixe dans le passé. 
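
The rounding helpers above (`toStartOfQuarter`, `toStartOfMonth`, `toMonday`, `toStartOfInterval`, …) return a `Date`, while the `toRelative*Num` family returns a number counted from an implementation-defined fixed point, so only differences of those numbers are meaningful. A minimal illustrative sketch (the comments show expected values for the rounding functions; absolute `toRelative*Num` values are deliberately not shown because the reference point is unspecified):

``` sql
SELECT
    toDate('2016-12-27')  AS d,
    toStartOfQuarter(d)   AS quarter_start,  -- 2016-10-01
    toStartOfMonth(d)     AS month_start,    -- 2016-12-01
    toMonday(d)           AS week_start,     -- 2016-12-26 (2016-12-27 is a Tuesday)
    toRelativeDayNum(d) - toRelativeDayNum(toDate('2016-12-20')) AS days_between  -- 7
```

Differences such as `days_between` stay stable even though the underlying fixed point is not specified.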
- -## toRelativeSecondNum {#torelativesecondnum} - -Convertit une date avec l'heure ou la date au numéro de la seconde, à partir d'un certain point fixe dans le passé. - -## toISOYear {#toisoyear} - -Convertit une date ou une date avec l'heure en un numéro UInt16 contenant le numéro D'année ISO. - -## toISOWeek {#toisoweek} - -Convertit une date ou une date avec l'heure en un numéro UInt8 contenant le numéro de semaine ISO. - -## toWeek (date \[, mode\]) {#toweekdatemode} - -Cette fonction renvoie le numéro de semaine pour date ou datetime. La forme à deux arguments de toWeek() vous permet de spécifier si la semaine commence le dimanche ou le lundi et si la valeur de retour doit être comprise entre 0 et 53 ou entre 1 et 53. Si l'argument mode est omis, le mode par défaut est 0. -`toISOWeek()`est une fonction de compatibilité équivalente à `toWeek(date,3)`. -Le tableau suivant décrit le fonctionnement de l'argument mode. - -| Mode | Premier jour de la semaine | Gamme | Week 1 is the first week … | -|------|----------------------------|-------|----------------------------------| -| 0 | Dimanche | 0-53 | avec un dimanche cette année | -| 1 | Lundi | 0-53 | avec 4 jours ou plus cette année | -| 2 | Dimanche | 1-53 | avec un dimanche cette année | -| 3 | Lundi | 1-53 | avec 4 jours ou plus cette année | -| 4 | Dimanche | 0-53 | avec 4 jours ou plus cette année | -| 5 | Lundi | 0-53 | avec un lundi cette année | -| 6 | Dimanche | 1-53 | avec 4 jours ou plus cette année | -| 7 | Lundi | 1-53 | avec un lundi cette année | -| 8 | Dimanche | 1-53 | contient Janvier 1 | -| 9 | Lundi | 1-53 | contient Janvier 1 | - -Pour les valeurs de mode avec une signification de “with 4 or more days this year,” les semaines sont numérotées selon ISO 8601: 1988: - -- Si la semaine contenant Janvier 1 A 4 jours ou plus dans la nouvelle année, il est Semaine 1. - -- Sinon, c'est la dernière semaine de l'année précédente, et la semaine prochaine est la semaine 1. - -Pour les valeurs de mode avec une signification de “contains January 1”, la semaine contient Janvier 1 est Semaine 1. Peu importe combien de jours dans la nouvelle année la semaine contenait, même si elle contenait seulement un jour. - -``` sql -toWeek(date, [, mode][, Timezone]) -``` - -**Paramètre** - -- `date` – Date or DateTime. -- `mode` – Optional parameter, Range of values is \[0,9\], default is 0. -- `Timezone` – Optional parameter, it behaves like any other conversion function. - -**Exemple** - -``` sql -SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; -``` - -``` text -┌───────date─┬─week0─┬─week1─┬─week9─┐ -│ 2016-12-27 │ 52 │ 52 │ 1 │ -└────────────┴───────┴───────┴───────┘ -``` - -## toYearWeek (date \[, mode\]) {#toyearweekdatemode} - -Retourne l'année et la semaine pour une date. L'année dans le résultat peut être différente de l'année dans l'argument date pour la première et la dernière semaine de l'année. - -L'argument mode fonctionne exactement comme l'argument mode de toWeek(). Pour la syntaxe à argument unique, une valeur de mode de 0 est utilisée. - -`toISOYear()`est une fonction de compatibilité équivalente à `intDiv(toYearWeek(date,3),100)`. 
- -**Exemple** - -``` sql -SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; -``` - -``` text -┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ -│ 2016-12-27 │ 201652 │ 201652 │ 201701 │ -└────────────┴───────────┴───────────┴───────────┘ -``` - -## maintenant {#now} - -Accepte zéro argument et renvoie l'heure actuelle à l'un des moments de l'exécution de la requête. -Cette fonction renvoie une constante, même si la requête a pris beaucoup de temps à compléter. - -## aujourd' {#today} - -Accepte zéro argument et renvoie la date actuelle à l'un des moments de l'exécution de la requête. -Le même que ‘toDate(now())’. - -## hier {#yesterday} - -Accepte zéro argument et renvoie la date d'hier à l'un des moments de l'exécution de la requête. -Le même que ‘today() - 1’. - -## l'horaire de diffusion {#timeslot} - -Arrondit le temps à la demi-heure. -Cette fonction est spécifique à Yandex.Metrica, car une demi-heure est le temps minimum pour diviser une session en deux sessions si une balise de suivi affiche les pages vues consécutives d'un seul utilisateur qui diffèrent dans le temps de strictement plus que ce montant. Cela signifie que les tuples (l'ID de balise, l'ID utilisateur et l'intervalle de temps) peuvent être utilisés pour rechercher les pages vues incluses dans la session correspondante. - -## toYYYYMM {#toyyyymm} - -Convertit une date ou une date avec l'heure en un numéro UInt32 contenant le numéro d'année et de mois (AAAA \* 100 + MM). - -## toYYYYMMDD {#toyyyymmdd} - -Convertit une date ou une date avec l'heure en un numéro UInt32 contenant le numéro d'année et de mois (AAAA \* 10000 + MM \* 100 + JJ). - -## toYYYYMMDDhhmmss {#toyyyymmddhhmmss} - -Convertit une date ou une date avec l'heure en un numéro UInt64 contenant le numéro d'année et de mois (AAAA \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss). - -## addYears, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, addQuarters {#addyears-addmonths-addweeks-adddays-addhours-addminutes-addseconds-addquarters} - -Fonction ajoute une date / DateTime intervalle à une Date / DateTime, puis retourner la Date / DateTime. Exemple: - -``` sql -WITH - toDate('2018-01-01') AS date, - toDateTime('2018-01-01 00:00:00') AS date_time -SELECT - addYears(date, 1) AS add_years_with_date, - addYears(date_time, 1) AS add_years_with_date_time -``` - -``` text -┌─add_years_with_date─┬─add_years_with_date_time─┐ -│ 2019-01-01 │ 2019-01-01 00:00:00 │ -└─────────────────────┴──────────────────────────┘ -``` - -## subtractYears, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractQuarters {#subtractyears-subtractmonths-subtractweeks-subtractdays-subtracthours-subtractminutes-subtractseconds-subtractquarters} - -Fonction soustrayez un intervalle de Date / DateTime à une Date / DateTime, puis renvoyez la Date / DateTime. Exemple: - -``` sql -WITH - toDate('2019-01-01') AS date, - toDateTime('2019-01-01 00:00:00') AS date_time -SELECT - subtractYears(date, 1) AS subtract_years_with_date, - subtractYears(date_time, 1) AS subtract_years_with_date_time -``` - -``` text -┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ -│ 2018-01-01 │ 2018-01-01 00:00:00 │ -└──────────────────────────┴───────────────────────────────┘ -``` - -## dateDiff {#datediff} - -Renvoie la différence entre deux valeurs Date ou DateTime. 
- -**Syntaxe** - -``` sql -dateDiff('unit', startdate, enddate, [timezone]) -``` - -**Paramètre** - -- `unit` — Time unit, in which the returned value is expressed. [Chaîne](../syntax.md#syntax-string-literal). - - Supported values: - - | unit | - | ---- | - |second | - |minute | - |hour | - |day | - |week | - |month | - |quarter | - |year | - -- `startdate` — The first time value to compare. [Date](../../sql-reference/data-types/date.md) ou [DateTime](../../sql-reference/data-types/datetime.md). - -- `enddate` — The second time value to compare. [Date](../../sql-reference/data-types/date.md) ou [DateTime](../../sql-reference/data-types/datetime.md). - -- `timezone` — Optional parameter. If specified, it is applied to both `startdate` et `enddate`. Si non spécifié, fuseaux horaires de l' `startdate` et `enddate` sont utilisés. Si elles ne sont pas identiques, le résultat n'est pas spécifié. - -**Valeur renvoyée** - -Différence entre `startdate` et `enddate` exprimé en `unit`. - -Type: `int`. - -**Exemple** - -Requête: - -``` sql -SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); -``` - -Résultat: - -``` text -┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ -│ 25 │ -└────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -## intervalle de temps (StartTime, Duration, \[, Size\]) {#timeslotsstarttime-duration-size} - -Pour un intervalle de temps commençant à ‘StartTime’ et de poursuivre pour ‘Duration’ secondes, il renvoie un tableau de moments dans le temps, composé de points de cet intervalle arrondis vers le bas à la ‘Size’ en quelques secondes. ‘Size’ est un paramètre optionnel: une constante UInt32, définie sur 1800 par défaut. -Exemple, `timeSlots(toDateTime('2012-01-01 12:20:00'), 600) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`. -Ceci est nécessaire pour rechercher les pages vues dans la session correspondante. - -## formatDateTime(Heure, Format \[, fuseau horaire\]) {#formatdatetime} - -Function formats a Time according given Format string. N.B.: Format is a constant expression, e.g. you can not have multiple formats for single result column. 
- -Modificateurs pris en charge pour le Format: -(“Example” colonne affiche le résultat de formatage pour le temps `2018-01-02 22:33:44`) - -| Modificateur | Description | Exemple | -|--------------|------------------------------------------------------------------------|------------| -| %C | année divisée par 100 et tronquée en entier (00-99) | 20 | -| %d | jour du mois, zero-rembourré (01-31) | 02 | -| %D | Date courte MM / JJ / AA, équivalente à %m / % d / % y | 01/02/18 | -| % e | jour du mois, rembourré dans l'espace ( 1-31) | 2 | -| %F | date courte AAAA-MM-JJ, équivalente à % Y - % m - % d | 2018-01-02 | -| %H | heure en format 24h (00-23) | 22 | -| %I | heure en format 12h (01-12) | 10 | -| %j | les jours de l'année (001-366) | 002 | -| %m | mois en nombre décimal (01-12) | 01 | -| %M | minute (00-59) | 33 | -| %et | caractère de nouvelle ligne (") | | -| %p | Désignation AM ou PM | PM | -| %R | 24 heures HH:MM temps, équivalent à %H: % M | 22:33 | -| %S | deuxième (00-59) | 44 | -| % t | horizontal-caractère de tabulation (') | | -| %T | Format d'heure ISO 8601 (HH:MM:SS), équivalent à %H: % M:%S | 22:33:44 | -| % u | ISO 8601 jour de la semaine comme numéro avec Lundi comme 1 (1-7) | 2 | -| %V | Numéro de semaine ISO 8601 (01-53) | 01 | -| %W | jour de la semaine comme un nombre décimal avec dimanche comme 0 (0-6) | 2 | -| % y | Année, deux derniers chiffres (00-99) | 18 | -| %Y | An | 2018 | -| %% | signe | % | - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) diff --git a/docs/fr/sql-reference/functions/encoding-functions.md b/docs/fr/sql-reference/functions/encoding-functions.md deleted file mode 100644 index 6c99ed4f32e..00000000000 --- a/docs/fr/sql-reference/functions/encoding-functions.md +++ /dev/null @@ -1,175 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 52 -toc_title: Encodage ---- - -# L'Encodage Des Fonctions {#encoding-functions} - -## char {#char} - -Retourne la chaîne avec la longueur que le nombre d'arguments passés et chaque octet a la valeur de l'argument correspondant. Accepte plusieurs arguments de types numériques. Si la valeur de l'argument est hors de portée du type de données UInt8, elle est convertie en UInt8 avec arrondi et débordement possibles. - -**Syntaxe** - -``` sql -char(number_1, [number_2, ..., number_n]); -``` - -**Paramètre** - -- `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../../sql-reference/data-types/int-uint.md), [Flottant](../../sql-reference/data-types/float.md). - -**Valeur renvoyée** - -- une chaîne d'octets. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello -``` - -Résultat: - -``` text -┌─hello─┐ -│ hello │ -└───────┘ -``` - -Vous pouvez construire une chaîne de codage arbitraire en passant les octets correspondants. Voici un exemple pour UTF-8: - -Requête: - -``` sql -SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; -``` - -Résultat: - -``` text -┌─hello──┐ -│ привет │ -└────────┘ -``` - -Requête: - -``` sql -SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; -``` - -Résultat: - -``` text -┌─hello─┐ -│ 你好 │ -└───────┘ -``` - -## Hex {#hex} - -Renvoie une chaîne contenant la représentation hexadécimale de l'argument. 
- -**Syntaxe** - -``` sql -hex(arg) -``` - -La fonction utilise des lettres majuscules `A-F` et ne pas utiliser de préfixes (comme `0x`) ou suffixes (comme `h`). - -Pour les arguments entiers, il imprime des chiffres hexadécimaux (“nibbles”) du plus significatif au moins significatif (big endian ou “human readable” ordre). Il commence par l'octet non nul le plus significatif (les octets de début zéro sont omis) mais imprime toujours les deux chiffres de chaque octet même si le chiffre de début est nul. - -Exemple: - -**Exemple** - -Requête: - -``` sql -SELECT hex(1); -``` - -Résultat: - -``` text -01 -``` - -Les valeurs de type `Date` et `DateTime` sont formatés comme des entiers correspondants (le nombre de jours depuis Epoch pour Date et la valeur de L'horodatage Unix pour DateTime). - -Pour `String` et `FixedString`, tous les octets sont simplement codés en deux nombres hexadécimaux. Zéro octets ne sont pas omis. - -Les valeurs des types virgule flottante et décimale sont codées comme leur représentation en mémoire. Comme nous soutenons l'architecture little endian, ils sont codés dans little endian. Zéro octets de début / fin ne sont pas omis. - -**Paramètre** - -- `arg` — A value to convert to hexadecimal. Types: [Chaîne](../../sql-reference/data-types/string.md), [UInt](../../sql-reference/data-types/int-uint.md), [Flottant](../../sql-reference/data-types/float.md), [Décimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md) ou [DateTime](../../sql-reference/data-types/datetime.md). - -**Valeur renvoyée** - -- Une chaîne avec la représentation hexadécimale de l'argument. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT hex(toFloat32(number)) as hex_presentation FROM numbers(15, 2); -``` - -Résultat: - -``` text -┌─hex_presentation─┐ -│ 00007041 │ -│ 00008041 │ -└──────────────────┘ -``` - -Requête: - -``` sql -SELECT hex(toFloat64(number)) as hex_presentation FROM numbers(15, 2); -``` - -Résultat: - -``` text -┌─hex_presentation─┐ -│ 0000000000002E40 │ -│ 0000000000003040 │ -└──────────────────┘ -``` - -## unhex (str) {#unhexstr} - -Accepte une chaîne contenant un nombre quelconque de chiffres hexadécimaux, et renvoie une chaîne contenant le correspondant octets. Prend en charge les lettres majuscules et minuscules A-F. Le nombre de chiffres hexadécimaux ne doit pas être pair. S'il est impair, le dernier chiffre est interprété comme la moitié la moins significative de l'octet 00-0F. Si la chaîne d'argument contient autre chose que des chiffres hexadécimaux, un résultat défini par l'implémentation est renvoyé (une exception n'est pas levée). -Si vous voulez convertir le résultat en un nombre, vous pouvez utiliser le ‘reverse’ et ‘reinterpretAsType’ fonction. - -## UUIDStringToNum (str) {#uuidstringtonumstr} - -Accepte une chaîne contenant 36 caractères dans le format `123e4567-e89b-12d3-a456-426655440000`, et le renvoie comme un ensemble d'octets dans un FixedString (16). - -## UUIDNumToString (str) {#uuidnumtostringstr} - -Accepte une valeur FixedString (16). Renvoie une chaîne contenant 36 caractères au format texte. - -## bitmaskToList(num) {#bitmasktolistnum} - -Accepte un entier. Renvoie une chaîne contenant la liste des puissances de deux qui totalisent le nombre source lorsqu'il est additionné. Ils sont séparés par des virgules sans espaces au format texte, dans l'ordre croissant. - -## bitmaskToArray(num) {#bitmasktoarraynum} - -Accepte un entier. 
Renvoie un tableau de nombres UInt64 contenant la liste des puissances de deux qui totalisent le nombre source lorsqu'il est additionné. Les numéros dans le tableau sont dans l'ordre croissant. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/encoding_functions/) diff --git a/docs/fr/sql-reference/functions/ext-dict-functions.md b/docs/fr/sql-reference/functions/ext-dict-functions.md deleted file mode 100644 index 1cec307747d..00000000000 --- a/docs/fr/sql-reference/functions/ext-dict-functions.md +++ /dev/null @@ -1,205 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 58 -toc_title: Travailler avec des dictionnaires externes ---- - -# Fonctions pour travailler avec des dictionnaires externes {#ext_dict_functions} - -Pour plus d'informations sur la connexion et la configuration de dictionnaires externes, voir [Dictionnaires externes](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). - -## dictGet {#dictget} - -Récupère une valeur d'un dictionnaire externe. - -``` sql -dictGet('dict_name', 'attr_name', id_expr) -dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) -``` - -**Paramètre** - -- `dict_name` — Name of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `attr_name` — Name of the column of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md) ou [Tuple](../../sql-reference/data-types/tuple.md)- tapez la valeur en fonction de la configuration du dictionnaire. -- `default_value_expr` — Value returned if the dictionary doesn't contain a row with the `id_expr` clé. [Expression](../syntax.md#syntax-expressions) renvoyer la valeur dans le type de données configuré pour `attr_name` attribut. - -**Valeur renvoyée** - -- Si ClickHouse analyse l'attribut avec succès dans le [l'attribut type de données](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes), les fonctions renvoient la valeur du dictionnaire de l'attribut qui correspond à `id_expr`. - -- Si il n'y a pas la clé, correspondant à `id_expr` dans le dictionnaire, puis: - - - `dictGet` returns the content of the `` element specified for the attribute in the dictionary configuration. - - `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter. - -ClickHouse lève une exception si elle ne peut pas analyser la valeur de l'attribut ou si la valeur ne correspond pas au type de données d'attribut. - -**Exemple** - -Créer un fichier texte `ext-dict-text.csv` contenant les éléments suivants: - -``` text -1,1 -2,2 -``` - -La première colonne est `id` la deuxième colonne est `c1`. - -Configurer le dictionnaire externe: - -``` xml - - - ext-dict-test - - - /path-to/ext-dict-test.csv - CSV - - - - - - - - id - - - c1 - UInt32 - - - - 0 - - -``` - -Effectuer la requête: - -``` sql -SELECT - dictGetOrDefault('ext-dict-test', 'c1', number + 1, toUInt32(number * 10)) AS val, - toTypeName(val) AS type -FROM system.numbers -LIMIT 3 -``` - -``` text -┌─val─┬─type───┐ -│ 1 │ UInt32 │ -│ 2 │ UInt32 │ -│ 20 │ UInt32 │ -└─────┴────────┘ -``` - -**Voir Aussi** - -- [Dictionnaires Externes](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) - -## dictHas {#dicthas} - -Vérifie si une clé est présente dans un dictionnaire. 
- -``` sql -dictHas('dict_name', id_expr) -``` - -**Paramètre** - -- `dict_name` — Name of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md)-le type de la valeur. - -**Valeur renvoyée** - -- 0, si il n'y a pas de clé. -- 1, si il y a une clé. - -Type: `UInt8`. - -## dictGetHierarchy {#dictgethierarchy} - -Crée un tableau contenant tous les parents d'une clé dans le [hiérarchique dictionnaire](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md). - -**Syntaxe** - -``` sql -dictGetHierarchy('dict_name', key) -``` - -**Paramètre** - -- `dict_name` — Name of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `key` — Key value. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md)-le type de la valeur. - -**Valeur renvoyée** - -- Les Parents pour la clé. - -Type: [Tableau (UInt64)](../../sql-reference/data-types/array.md). - -## dictisine {#dictisin} - -Vérifie l'ancêtre d'une clé à travers toute la chaîne hiérarchique dans le dictionnaire. - -``` sql -dictIsIn('dict_name', child_id_expr, ancestor_id_expr) -``` - -**Paramètre** - -- `dict_name` — Name of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `child_id_expr` — Key to be checked. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md)-le type de la valeur. -- `ancestor_id_expr` — Alleged ancestor of the `child_id_expr` clé. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md)-le type de la valeur. - -**Valeur renvoyée** - -- 0, si `child_id_expr` n'est pas un enfant de `ancestor_id_expr`. -- 1, si `child_id_expr` est un enfant de `ancestor_id_expr` ou si `child_id_expr` est un `ancestor_id_expr`. - -Type: `UInt8`. - -## D'Autres Fonctions {#ext_dict_functions-other} - -ClickHouse prend en charge des fonctions spécialisées qui convertissent les valeurs d'attribut de dictionnaire en un type de données spécifique, quelle que soit la configuration du dictionnaire. - -Fonction: - -- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` -- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` -- `dictGetFloat32`, `dictGetFloat64` -- `dictGetDate` -- `dictGetDateTime` -- `dictGetUUID` -- `dictGetString` - -Toutes ces fonctions ont le `OrDefault` modification. Exemple, `dictGetDateOrDefault`. - -Syntaxe: - -``` sql -dictGet[Type]('dict_name', 'attr_name', id_expr) -dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) -``` - -**Paramètre** - -- `dict_name` — Name of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `attr_name` — Name of the column of the dictionary. [Chaîne littérale](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) de retour d'un [UInt64](../../sql-reference/data-types/int-uint.md)-le type de la valeur. -- `default_value_expr` — Value which is returned if the dictionary doesn't contain a row with the `id_expr` clé. [Expression](../syntax.md#syntax-expressions) renvoyer une valeur dans le type de données configuré pour `attr_name` attribut. 
- -**Valeur renvoyée** - -- Si ClickHouse analyse l'attribut avec succès dans le [l'attribut type de données](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes), les fonctions renvoient la valeur du dictionnaire de l'attribut qui correspond à `id_expr`. - -- Si il n'est pas demandé `id_expr` dans le dictionnaire,: - - - `dictGet[Type]` returns the content of the `` element specified for the attribute in the dictionary configuration. - - `dictGet[Type]OrDefault` returns the value passed as the `default_value_expr` parameter. - -ClickHouse lève une exception si elle ne peut pas analyser la valeur de l'attribut ou si la valeur ne correspond pas au type de données d'attribut. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/ext_dict_functions/) diff --git a/docs/fr/sql-reference/functions/functions-for-nulls.md b/docs/fr/sql-reference/functions/functions-for-nulls.md deleted file mode 100644 index ef7be728ce7..00000000000 --- a/docs/fr/sql-reference/functions/functions-for-nulls.md +++ /dev/null @@ -1,312 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 63 -toc_title: Travailler avec des arguments nullables ---- - -# Fonctions pour travailler avec des agrégats nullables {#functions-for-working-with-nullable-aggregates} - -## isNull {#isnull} - -Vérifie si l'argument est [NULL](../../sql-reference/syntax.md#null-literal). - -``` sql -isNull(x) -``` - -**Paramètre** - -- `x` — A value with a non-compound data type. - -**Valeur renvoyée** - -- `1` si `x` être `NULL`. -- `0` si `x` n'est pas `NULL`. - -**Exemple** - -Table d'entrée - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` - -Requête - -``` sql -SELECT x FROM t_null WHERE isNull(y) -``` - -``` text -┌─x─┐ -│ 1 │ -└───┘ -``` - -## isNotNull {#isnotnull} - -Vérifie si l'argument est [NULL](../../sql-reference/syntax.md#null-literal). - -``` sql -isNotNull(x) -``` - -**Paramètre:** - -- `x` — A value with a non-compound data type. - -**Valeur renvoyée** - -- `0` si `x` être `NULL`. -- `1` si `x` n'est pas `NULL`. - -**Exemple** - -Table d'entrée - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` - -Requête - -``` sql -SELECT x FROM t_null WHERE isNotNull(y) -``` - -``` text -┌─x─┐ -│ 2 │ -└───┘ -``` - -## fusionner {#coalesce} - -Vérifie de gauche à droite si `NULL` les arguments ont été passés et renvoie le premier non-`NULL` argument. - -``` sql -coalesce(x,...) -``` - -**Paramètre:** - -- N'importe quel nombre de paramètres d'un type non composé. Tous les paramètres doivent être compatibles par type de données. - -**Valeurs renvoyées** - -- Le premier non-`NULL` argument. -- `NULL` si tous les arguments sont `NULL`. - -**Exemple** - -Considérez une liste de contacts qui peuvent spécifier plusieurs façons de contacter un client. - -``` text -┌─name─────┬─mail─┬─phone─────┬──icq─┐ -│ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ -│ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└──────────┴──────┴───────────┴──────┘ -``` - -Le `mail` et `phone` les champs sont de type Chaîne de caractères, mais la `icq` le terrain est `UInt32`, de sorte qu'il doit être converti en `String`. 
- -Obtenir la première méthode de contact pour le client à partir de la liste de contacts: - -``` sql -SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook -``` - -``` text -┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ -│ client 1 │ 123-45-67 │ -│ client 2 │ ᴺᵁᴸᴸ │ -└──────────┴──────────────────────────────────────────────────────┘ -``` - -## ifNull {#ifnull} - -Renvoie une valeur alternative si l'argument principal est `NULL`. - -``` sql -ifNull(x,alt) -``` - -**Paramètre:** - -- `x` — The value to check for `NULL`. -- `alt` — The value that the function returns if `x` être `NULL`. - -**Valeurs renvoyées** - -- Valeur `x`, si `x` n'est pas `NULL`. -- Valeur `alt`, si `x` être `NULL`. - -**Exemple** - -``` sql -SELECT ifNull('a', 'b') -``` - -``` text -┌─ifNull('a', 'b')─┐ -│ a │ -└──────────────────┘ -``` - -``` sql -SELECT ifNull(NULL, 'b') -``` - -``` text -┌─ifNull(NULL, 'b')─┐ -│ b │ -└───────────────────┘ -``` - -## nullIf {#nullif} - -Retourner `NULL` si les arguments sont égaux. - -``` sql -nullIf(x, y) -``` - -**Paramètre:** - -`x`, `y` — Values for comparison. They must be compatible types, or ClickHouse will generate an exception. - -**Valeurs renvoyées** - -- `NULL` si les arguments sont égaux. -- Le `x` valeur, si les arguments ne sont pas égaux. - -**Exemple** - -``` sql -SELECT nullIf(1, 1) -``` - -``` text -┌─nullIf(1, 1)─┐ -│ ᴺᵁᴸᴸ │ -└──────────────┘ -``` - -``` sql -SELECT nullIf(1, 2) -``` - -``` text -┌─nullIf(1, 2)─┐ -│ 1 │ -└──────────────┘ -``` - -## assumeNotNull {#assumenotnull} - -Résultats dans une valeur de type [Nullable](../../sql-reference/data-types/nullable.md) pour un non- `Nullable` si la valeur n'est pas `NULL`. - -``` sql -assumeNotNull(x) -``` - -**Paramètre:** - -- `x` — The original value. - -**Valeurs renvoyées** - -- La valeur d'origine du non-`Nullable` type, si elle n'est pas `NULL`. -- La valeur par défaut pour le non-`Nullable` Tapez si la valeur d'origine était `NULL`. - -**Exemple** - -Envisager l' `t_null` table. - -``` sql -SHOW CREATE TABLE t_null -``` - -``` text -┌─statement─────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ -└───────────────────────────────────────────────────────────────────────────┘ -``` - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` - -Appliquer le `assumeNotNull` la fonction de la `y` colonne. - -``` sql -SELECT assumeNotNull(y) FROM t_null -``` - -``` text -┌─assumeNotNull(y)─┐ -│ 0 │ -│ 3 │ -└──────────────────┘ -``` - -``` sql -SELECT toTypeName(assumeNotNull(y)) FROM t_null -``` - -``` text -┌─toTypeName(assumeNotNull(y))─┐ -│ Int8 │ -│ Int8 │ -└──────────────────────────────┘ -``` - -## toNullable {#tonullable} - -Convertit le type d'argument en `Nullable`. - -``` sql -toNullable(x) -``` - -**Paramètre:** - -- `x` — The value of any non-compound type. - -**Valeur renvoyée** - -- La valeur d'entrée avec un `Nullable` type. 
- -**Exemple** - -``` sql -SELECT toTypeName(10) -``` - -``` text -┌─toTypeName(10)─┐ -│ UInt8 │ -└────────────────┘ -``` - -``` sql -SELECT toTypeName(toNullable(10)) -``` - -``` text -┌─toTypeName(toNullable(10))─┐ -│ Nullable(UInt8) │ -└────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/functions_for_nulls/) diff --git a/docs/fr/sql-reference/functions/geo.md b/docs/fr/sql-reference/functions/geo.md deleted file mode 100644 index a89f03c7216..00000000000 --- a/docs/fr/sql-reference/functions/geo.md +++ /dev/null @@ -1,510 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 62 -toc_title: "Travailler avec des coordonn\xE9es g\xE9ographiques" ---- - -# Fonctions pour travailler avec des coordonnées géographiques {#functions-for-working-with-geographical-coordinates} - -## greatCircleDistance {#greatcircledistance} - -Calculer la distance entre deux points sur la surface de la Terre en utilisant [la formule du grand cercle](https://en.wikipedia.org/wiki/Great-circle_distance). - -``` sql -greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) -``` - -**Les paramètres d'entrée** - -- `lon1Deg` — Longitude of the first point in degrees. Range: `[-180°, 180°]`. -- `lat1Deg` — Latitude of the first point in degrees. Range: `[-90°, 90°]`. -- `lon2Deg` — Longitude of the second point in degrees. Range: `[-180°, 180°]`. -- `lat2Deg` — Latitude of the second point in degrees. Range: `[-90°, 90°]`. - -Les valeurs positives correspondent à la latitude nord et à la longitude Est, et les valeurs négatives à la latitude Sud et à la longitude ouest. - -**Valeur renvoyée** - -La distance entre deux points sur la surface de la Terre, en mètres. - -Génère une exception lorsque les valeurs des paramètres d'entrée se situent en dehors de la plage. - -**Exemple** - -``` sql -SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) -``` - -``` text -┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ -│ 14132374.194975413 │ -└───────────────────────────────────────────────────────────────────┘ -``` - -## pointInEllipses {#pointinellipses} - -Vérifie si le point appartient à au moins une des ellipses. -Coordonnées géométriques sont dans le système de coordonnées Cartésiennes. - -``` sql -pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) -``` - -**Les paramètres d'entrée** - -- `x, y` — Coordinates of a point on the plane. -- `xᵢ, yᵢ` — Coordinates of the center of the `i`-ème points de suspension. -- `aᵢ, bᵢ` — Axes of the `i`- e ellipse en unités de coordonnées x, Y. - -Les paramètres d'entrée doivent être `2+4⋅n`, où `n` est le nombre de points de suspension. - -**Valeurs renvoyées** - -`1` si le point est à l'intérieur d'au moins l'un des ellipses; `0`si elle ne l'est pas. - -**Exemple** - -``` sql -SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) -``` - -``` text -┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ -│ 1 │ -└─────────────────────────────────────────────────┘ -``` - -## pointtinpolygon {#pointinpolygon} - -Vérifie si le point appartient au polygone sur l'avion. - -``` sql -pointInPolygon((x, y), [(a, b), (c, d) ...], ...) -``` - -**Les valeurs d'entrée** - -- `(x, y)` — Coordinates of a point on the plane. Data type — [Tuple](../../sql-reference/data-types/tuple.md) — A tuple of two numbers. -- `[(a, b), (c, d) ...]` — Polygon vertices. Data type — [Tableau](../../sql-reference/data-types/array.md). 
Chaque sommet est représenté par une paire de coordonnées `(a, b)`. Les sommets doivent être spécifiés dans le sens horaire ou antihoraire. Le nombre minimum de sommets est 3. Le polygone doit être constante. -- La fonction prend également en charge les polygones avec des trous (découper des sections). Dans ce cas, ajoutez des polygones qui définissent les sections découpées en utilisant des arguments supplémentaires de la fonction. La fonction ne prend pas en charge les polygones non simplement connectés. - -**Valeurs renvoyées** - -`1` si le point est à l'intérieur du polygone, `0` si elle ne l'est pas. -Si le point est sur la limite du polygone, la fonction peut renvoyer 0 ou 1. - -**Exemple** - -``` sql -SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res -``` - -``` text -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## geohashEncode {#geohashencode} - -Encode la latitude et la longitude en tant que chaîne geohash, voir (http://geohash.org/, https://en.wikipedia.org/wiki/Geohash). - -``` sql -geohashEncode(longitude, latitude, [precision]) -``` - -**Les valeurs d'entrée** - -- longitude longitude partie de la coordonnée que vous souhaitez encoder. Flottant dans la gamme`[-180°, 180°]` -- latitude latitude partie de la coordonnée que vous souhaitez encoder. Flottant dans la gamme `[-90°, 90°]` -- precision-facultatif, longueur de la chaîne codée résultante, par défaut `12`. Entier dans la gamme `[1, 12]`. Toute valeur inférieure à `1` ou supérieure à `12` silencieusement converti à `12`. - -**Valeurs renvoyées** - -- alphanumérique `String` de coordonnées codées (la version modifiée de l'alphabet de codage base32 est utilisée). - -**Exemple** - -``` sql -SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res -``` - -``` text -┌─res──────────┐ -│ ezs42d000000 │ -└──────────────┘ -``` - -## geohashDecode {#geohashdecode} - -Décode toute chaîne codée geohash en longitude et latitude. - -**Les valeurs d'entrée** - -- chaîne codée-chaîne codée geohash. - -**Valeurs renvoyées** - -- (longitude, latitude) - 2-n-uplet de `Float64` les valeurs de longitude et de latitude. - -**Exemple** - -``` sql -SELECT geohashDecode('ezs42') AS res -``` - -``` text -┌─res─────────────────────────────┐ -│ (-5.60302734375,42.60498046875) │ -└─────────────────────────────────┘ -``` - -## geoToH3 {#geotoh3} - -Retourner [H3](https://uber.github.io/h3/#/documentation/overview/introduction) point d'indice `(lon, lat)` avec une résolution spécifiée. - -[H3](https://uber.github.io/h3/#/documentation/overview/introduction) est un système d'indexation géographique où la surface de la Terre divisée en carreaux hexagonaux même. Ce système est hiérarchique, c'est-à-dire que chaque hexagone au niveau supérieur peut être divisé en sept, même mais plus petits, etc. - -Cet indice est principalement utilisé pour les emplacements de bucketing et d'autres manipulations géospatiales. - -**Syntaxe** - -``` sql -geoToH3(lon, lat, resolution) -``` - -**Paramètre** - -- `lon` — Longitude. Type: [Float64](../../sql-reference/data-types/float.md). -- `lat` — Latitude. Type: [Float64](../../sql-reference/data-types/float.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Numéro d'indice hexagonal. -- 0 en cas d'erreur. - -Type: `UInt64`. 
- -**Exemple** - -Requête: - -``` sql -SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index -``` - -Résultat: - -``` text -┌────────────h3Index─┐ -│ 644325524701193974 │ -└────────────────────┘ -``` - -## geohashesInBox {#geohashesinbox} - -Renvoie un tableau de chaînes codées geohash de précision donnée qui tombent à l'intérieur et croisent les limites d'une boîte donnée, essentiellement une grille 2D aplatie en tableau. - -**Les valeurs d'entrée** - -- longitude_min-longitude min, valeur flottante dans la plage `[-180°, 180°]` -- latitude_min-latitude min, valeur flottante dans la plage `[-90°, 90°]` -- longitude_max-longitude maximale, valeur flottante dans la plage `[-180°, 180°]` -- latitude_max-latitude maximale, valeur flottante dans la plage `[-90°, 90°]` -- précision - geohash précision, `UInt8` dans la gamme `[1, 12]` - -Veuillez noter que tous les paramètres de coordonnées doit être du même type: soit `Float32` ou `Float64`. - -**Valeurs renvoyées** - -- gamme de précision de longues chaînes de geohash-boîtes couvrant la zone, vous ne devriez pas compter sur l'ordre des éléments. -- \[\] - tableau vide si *min* les valeurs de *latitude* et *longitude* ne sont pas moins de correspondant *Max* valeur. - -Veuillez noter que la fonction lancera une exception si le tableau résultant a plus de 10'000'000 éléments. - -**Exemple** - -``` sql -SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos -``` - -``` text -┌─thasos──────────────────────────────────────┐ -│ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │ -└─────────────────────────────────────────────┘ -``` - -## h3GetBaseCell {#h3getbasecell} - -Renvoie le numéro de cellule de base de l'index. - -**Syntaxe** - -``` sql -h3GetBaseCell(index) -``` - -**Paramètre** - -- `index` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Numéro de cellule de base hexagonale. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3GetBaseCell(612916788725809151) as basecell -``` - -Résultat: - -``` text -┌─basecell─┐ -│ 12 │ -└──────────┘ -``` - -## h3HexAreaM2 {#h3hexaream2} - -Surface hexagonale Moyenne en mètres carrés à la résolution donnée. - -**Syntaxe** - -``` sql -h3HexAreaM2(resolution) -``` - -**Paramètre** - -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Area in m². Type: [Float64](../../sql-reference/data-types/float.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3HexAreaM2(13) as area -``` - -Résultat: - -``` text -┌─area─┐ -│ 43.9 │ -└──────┘ -``` - -## h3IndexesAreNeighbors {#h3indexesareneighbors} - -Renvoie si les H3Indexes fournis sont voisins ou non. - -**Syntaxe** - -``` sql -h3IndexesAreNeighbors(index1, index2) -``` - -**Paramètre** - -- `index1` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). -- `index2` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Retourner `1` si les index sont voisins, `0` autrement. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3IndexesAreNeighbors(617420388351344639, 617420388352655359) AS n -``` - -Résultat: - -``` text -┌─n─┐ -│ 1 │ -└───┘ -``` - -## h3enfants {#h3tochildren} - -Retourne un tableau avec les index enfants de l'index donné. 
- -**Syntaxe** - -``` sql -h3ToChildren(index, resolution) -``` - -**Paramètre** - -- `index` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Tableau avec les index H3 enfants. Tableau de type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3ToChildren(599405990164561919, 6) AS children -``` - -Résultat: - -``` text -┌─children───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ [603909588852408319,603909588986626047,603909589120843775,603909589255061503,603909589389279231,603909589523496959,603909589657714687] │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -## h3ToParent {#h3toparent} - -Renvoie l'index parent (plus grossier) contenant l'index donné. - -**Syntaxe** - -``` sql -h3ToParent(index, resolution) -``` - -**Paramètre** - -- `index` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Parent H3 index. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3ToParent(599405990164561919, 3) as parent -``` - -Résultat: - -``` text -┌─────────────parent─┐ -│ 590398848891879423 │ -└────────────────────┘ -``` - -## h3ToString {#h3tostring} - -Convertit la représentation H3Index de l'index en représentation de chaîne. - -``` sql -h3ToString(index) -``` - -**Paramètre** - -- `index` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- Représentation en chaîne de l'index H3. Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -Requête: - -``` sql -SELECT h3ToString(617420388352917503) as h3_string -``` - -Résultat: - -``` text -┌─h3_string───────┐ -│ 89184926cdbffff │ -└─────────────────┘ -``` - -## stringToH3 {#stringtoh3} - -Convertit la représentation de chaîne en représentation H3Index (UInt64). - -``` sql -stringToH3(index_str) -``` - -**Paramètre** - -- `index_str` — String representation of the H3 index. Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Valeurs renvoyées** - -- Numéro d'indice hexagonal. Renvoie 0 en cas d'erreur. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT stringToH3('89184926cc3ffff') as index -``` - -Résultat: - -``` text -┌──────────────index─┐ -│ 617420388351344639 │ -└────────────────────┘ -``` - -## h3grésolution {#h3getresolution} - -Retourne la résolution de l'index. - -**Syntaxe** - -``` sql -h3GetResolution(index) -``` - -**Paramètre** - -- `index` — Hexagon index number. Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Valeurs renvoyées** - -- L'indice de la résolution. Gamme: `[0, 15]`. Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
- -**Exemple** - -Requête: - -``` sql -SELECT h3GetResolution(617420388352917503) as res -``` - -Résultat: - -``` text -┌─res─┐ -│ 9 │ -└─────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/geo/) diff --git a/docs/fr/sql-reference/functions/hash-functions.md b/docs/fr/sql-reference/functions/hash-functions.md deleted file mode 100644 index 3b0f92dd4f8..00000000000 --- a/docs/fr/sql-reference/functions/hash-functions.md +++ /dev/null @@ -1,484 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 50 -toc_title: Hachage ---- - -# Les Fonctions De Hachage {#hash-functions} - -Les fonctions de hachage peuvent être utilisées pour le brassage pseudo-aléatoire déterministe des éléments. - -## halfMD5 {#hash-functions-halfmd5} - -[Interpréter](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) tous les paramètres d'entrée sous forme de chaînes et calcule le [MD5](https://en.wikipedia.org/wiki/MD5) la valeur de hachage pour chacun d'eux. Puis combine les hachages, prend les 8 premiers octets du hachage de la chaîne résultante, et les interprète comme `UInt64` dans l'ordre des octets big-endian. - -``` sql -halfMD5(par1, ...) -``` - -La fonction est relativement lente (5 millions de chaînes courtes par seconde par cœur de processeur). -Envisager l'utilisation de la [sipHash64](#hash_functions-siphash64) la fonction la place. - -**Paramètre** - -La fonction prend un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -A [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type -``` - -``` text -┌────────halfMD5hash─┬─type───┐ -│ 186182704141653334 │ UInt64 │ -└────────────────────┴────────┘ -``` - -## MD5 {#hash_functions-md5} - -Calcule le MD5 à partir d'une chaîne et renvoie L'ensemble d'octets résultant en tant que FixedString(16). -Si vous n'avez pas besoin de MD5 en particulier, mais que vous avez besoin d'un hachage cryptographique 128 bits décent, utilisez le ‘sipHash128’ la fonction la place. -Si vous voulez obtenir le même résultat que la sortie de l'utilitaire md5sum, utilisez lower (hex(MD5 (s))). - -## sipHash64 {#hash_functions-siphash64} - -Produit un 64 bits [SipHash](https://131002.net/siphash/) la valeur de hachage. - -``` sql -sipHash64(par1,...) -``` - -C'est une fonction de hachage cryptographique. Il fonctionne au moins trois fois plus vite que le [MD5](#hash_functions-md5) fonction. - -Fonction [interpréter](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) tous les paramètres d'entrée sous forme de chaînes et calcule la valeur de hachage pour chacun d'eux. Puis combine les hachages par l'algorithme suivant: - -1. Après avoir haché tous les paramètres d'entrée, la fonction obtient le tableau de hachages. -2. La fonction prend le premier et le second éléments et calcule un hachage pour le tableau d'entre eux. -3. Ensuite, la fonction prend la valeur de hachage, calculée à l'étape précédente, et le troisième élément du tableau de hachage initial, et calcule un hachage pour le tableau d'entre eux. -4. 
L'étape précédente est répétée pour tous les éléments restants de la période initiale de hachage tableau. - -**Paramètre** - -La fonction prend un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -A [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type -``` - -``` text -┌──────────────SipHash─┬─type───┐ -│ 13726873534472839665 │ UInt64 │ -└──────────────────────┴────────┘ -``` - -## sipHash128 {#hash_functions-siphash128} - -Calcule SipHash à partir d'une chaîne. -Accepte un argument de type chaîne. Renvoie FixedString (16). -Diffère de sipHash64 en ce que l'état de pliage xor final n'est effectué que jusqu'à 128 bits. - -## cityHash64 {#cityhash64} - -Produit un 64 bits [CityHash](https://github.com/google/cityhash) la valeur de hachage. - -``` sql -cityHash64(par1,...) -``` - -Ceci est une fonction de hachage non cryptographique rapide. Il utilise L'algorithme CityHash pour les paramètres de chaîne et la fonction de hachage rapide non cryptographique spécifique à l'implémentation pour les paramètres avec d'autres types de données. La fonction utilise le combinateur CityHash pour obtenir les résultats finaux. - -**Paramètre** - -La fonction prend un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -A [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -Appelez exemple: - -``` sql -SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type -``` - -``` text -┌─────────────CityHash─┬─type───┐ -│ 12072650598913549138 │ UInt64 │ -└──────────────────────┴────────┘ -``` - -L'exemple suivant montre comment calculer la somme de l'ensemble de la table avec précision jusqu'à la ligne de commande: - -``` sql -SELECT groupBitXor(cityHash64(*)) FROM table -``` - -## intHash32 {#inthash32} - -Calcule un code de hachage 32 bits à partir de n'importe quel type d'entier. -C'est une fonction de hachage non cryptographique relativement rapide de qualité moyenne pour les nombres. - -## intHash64 {#inthash64} - -Calcule un code de hachage 64 bits à partir de n'importe quel type d'entier. -Il fonctionne plus vite que intHash32. Qualité moyenne. - -## SHA1 {#sha1} - -## SHA224 {#sha224} - -## SHA256 {#sha256} - -Calcule SHA-1, SHA-224 ou SHA-256 à partir d'une chaîne et renvoie l'ensemble d'octets résultant en tant que FixedString(20), FixedString(28) ou FixedString(32). -La fonction fonctionne assez lentement (SHA-1 traite environ 5 millions de chaînes courtes par seconde par cœur de processeur, tandis que SHA-224 et SHA-256 traitent environ 2,2 millions). -Nous vous recommandons d'utiliser cette fonction uniquement dans les cas où vous avez besoin d'une fonction de hachage spécifique et que vous ne pouvez pas la sélectionner. -Même dans ces cas, nous vous recommandons d'appliquer la fonction hors ligne et de pré-calculer les valeurs lors de leur insertion dans la table, au lieu de l'appliquer dans SELECTS. 
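
A minimal sketch of the pre-computation pattern recommended above for the slow SHA* functions: compute the digest once at INSERT time and store it, instead of hashing in every SELECT. The table and column names here are purely illustrative assumptions, not part of the original documentation:

``` sql
-- Hypothetical table that stores the digest next to the raw value.
CREATE TABLE IF NOT EXISTS hashed_strings
(
    s      String,
    digest FixedString(32)
)
ENGINE = MergeTree
ORDER BY digest;

-- The digest is computed once, at INSERT time ...
INSERT INTO hashed_strings SELECT 'abc' AS s, SHA256(s);

-- ... so later lookups compare the stored bytes instead of re-hashing every row.
SELECT s FROM hashed_strings WHERE digest = SHA256('abc');

-- For a human-readable form, combine with hex():
SELECT lower(hex(SHA256('abc')));
-- ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
```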
- -## URLHash(url \[, N\]) {#urlhashurl-n} - -Une fonction de hachage non cryptographique rapide et de qualité décente pour une chaîne obtenue à partir d'une URL en utilisant un type de normalisation. -`URLHash(s)` – Calculates a hash from a string without one of the trailing symbols `/`,`?` ou `#` à la fin, si elle est présente. -`URLHash(s, N)` – Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols `/`,`?` ou `#` à la fin, si elle est présente. -Les niveaux sont les mêmes que dans URLHierarchy. Cette fonction est spécifique à Yandex.Metrica. - -## farmHash64 {#farmhash64} - -Produit un 64 bits [FarmHash](https://github.com/google/farmhash) la valeur de hachage. - -``` sql -farmHash64(par1, ...) -``` - -La fonction utilise le `Hash64` la méthode de tous les [les méthodes disponibles](https://github.com/google/farmhash/blob/master/src/farmhash.h). - -**Paramètre** - -La fonction prend un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -A [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type -``` - -``` text -┌─────────────FarmHash─┬─type───┐ -│ 17790458267262532859 │ UInt64 │ -└──────────────────────┴────────┘ -``` - -## javaHash {#hash_functions-javahash} - -Calculer [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) à partir d'une chaîne. Cette fonction de hachage n'est ni rapide ni de bonne qualité. La seule raison de l'utiliser est lorsque cet algorithme est déjà utilisé dans un autre système et que vous devez calculer exactement le même résultat. - -**Syntaxe** - -``` sql -SELECT javaHash(''); -``` - -**Valeur renvoyée** - -A `Int32` valeur de hachage du type de données. - -**Exemple** - -Requête: - -``` sql -SELECT javaHash('Hello, world!'); -``` - -Résultat: - -``` text -┌─javaHash('Hello, world!')─┐ -│ -1880044555 │ -└───────────────────────────┘ -``` - -## javaHashUTF16LE {#javahashutf16le} - -Calculer [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) à partir d'une chaîne, en supposant qu'elle contient des octets représentant une chaîne en encodage UTF-16LE. - -**Syntaxe** - -``` sql -javaHashUTF16LE(stringUtf16le) -``` - -**Paramètre** - -- `stringUtf16le` — a string in UTF-16LE encoding. - -**Valeur renvoyée** - -A `Int32` valeur de hachage du type de données. - -**Exemple** - -Requête correcte avec une chaîne codée UTF-16LE. - -Requête: - -``` sql -SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) -``` - -Résultat: - -``` text -┌─javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le'))─┐ -│ 3556498 │ -└──────────────────────────────────────────────────────────────┘ -``` - -## hiveHash {#hash-functions-hivehash} - -Calculer `HiveHash` à partir d'une chaîne. - -``` sql -SELECT hiveHash(''); -``` - -C'est juste [JavaHash](#hash_functions-javahash) avec le bit de signe mis à zéro. Cette fonction est utilisée dans [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) pour les versions antérieures à la version 3.0. Cette fonction de hachage n'est ni rapide ni de bonne qualité. 
La seule raison de l'utiliser est lorsque cet algorithme est déjà utilisé dans un autre système et que vous devez calculer exactement le même résultat. - -**Valeur renvoyée** - -A `Int32` valeur de hachage du type de données. - -Type: `hiveHash`. - -**Exemple** - -Requête: - -``` sql -SELECT hiveHash('Hello, world!'); -``` - -Résultat: - -``` text -┌─hiveHash('Hello, world!')─┐ -│ 267439093 │ -└───────────────────────────┘ -``` - -## metroHash64 {#metrohash64} - -Produit un 64 bits [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) la valeur de hachage. - -``` sql -metroHash64(par1, ...) -``` - -**Paramètre** - -La fonction prend un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -A [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type -``` - -``` text -┌────────────MetroHash─┬─type───┐ -│ 14235658766382344533 │ UInt64 │ -└──────────────────────┴────────┘ -``` - -## jumpConsistentHash {#jumpconsistenthash} - -Calcule JumpConsistentHash forme un UInt64. -Accepte deux arguments: une clé de type UInt64 et le nombre de compartiments. Renvoie Int32. -Pour plus d'informations, voir le lien: [JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) - -## murmurHash2_32, murmurHash2_64 {#murmurhash2-32-murmurhash2-64} - -Produit un [MurmurHash2](https://github.com/aappleby/smhasher) la valeur de hachage. - -``` sql -murmurHash2_32(par1, ...) -murmurHash2_64(par1, ...) -``` - -**Paramètre** - -Les deux fonctions prennent un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -- Le `murmurHash2_32` fonction renvoie la valeur de hachage ayant le [UInt32](../../sql-reference/data-types/int-uint.md) type de données. -- Le `murmurHash2_64` fonction renvoie la valeur de hachage ayant le [UInt64](../../sql-reference/data-types/int-uint.md) type de données. - -**Exemple** - -``` sql -SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type -``` - -``` text -┌──────────MurmurHash2─┬─type───┐ -│ 11832096901709403633 │ UInt64 │ -└──────────────────────┴────────┘ -``` - -## gccMurmurHash {#gccmurmurhash} - -Calcule un 64 bits [MurmurHash2](https://github.com/aappleby/smhasher) valeur de hachage utilisant la même graine de hachage que [gcc](https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191). Il est portable entre Clang et GCC construit. - -**Syntaxe** - -``` sql -gccMurmurHash(par1, ...); -``` - -**Paramètre** - -- `par1, ...` — A variable number of parameters that can be any of the [types de données pris en charge](../../sql-reference/data-types/index.md#data_types). - -**Valeur renvoyée** - -- Valeur de hachage calculée. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
- -**Exemple** - -Requête: - -``` sql -SELECT - gccMurmurHash(1, 2, 3) AS res1, - gccMurmurHash(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))) AS res2 -``` - -Résultat: - -``` text -┌─────────────────res1─┬────────────────res2─┐ -│ 12384823029245979431 │ 1188926775431157506 │ -└──────────────────────┴─────────────────────┘ -``` - -## murmurHash3_32, murmurHash3_64 {#murmurhash3-32-murmurhash3-64} - -Produit un [MurmurHash3](https://github.com/aappleby/smhasher) la valeur de hachage. - -``` sql -murmurHash3_32(par1, ...) -murmurHash3_64(par1, ...) -``` - -**Paramètre** - -Les deux fonctions prennent un nombre variable de paramètres d'entrée. Les paramètres peuvent être tout de la [types de données pris en charge](../../sql-reference/data-types/index.md). - -**Valeur Renvoyée** - -- Le `murmurHash3_32` la fonction retourne un [UInt32](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. -- Le `murmurHash3_64` la fonction retourne un [UInt64](../../sql-reference/data-types/int-uint.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type -``` - -``` text -┌─MurmurHash3─┬─type───┐ -│ 2152717 │ UInt32 │ -└─────────────┴────────┘ -``` - -## murmurHash3_128 {#murmurhash3-128} - -Produit de 128 bits [MurmurHash3](https://github.com/aappleby/smhasher) la valeur de hachage. - -``` sql -murmurHash3_128( expr ) -``` - -**Paramètre** - -- `expr` — [Expression](../syntax.md#syntax-expressions) de retour d'un [Chaîne](../../sql-reference/data-types/string.md)-le type de la valeur. - -**Valeur Renvoyée** - -A [FixedString (16)](../../sql-reference/data-types/fixedstring.md) valeur de hachage du type de données. - -**Exemple** - -``` sql -SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type -``` - -``` text -┌─MurmurHash3──────┬─type────────────┐ -│ 6�1�4"S5KT�~~q │ FixedString(16) │ -└──────────────────┴─────────────────┘ -``` - -## xxHash32, xxHash64 {#hash-functions-xxhash32} - -Calculer `xxHash` à partir d'une chaîne. Il est proposé en deux saveurs, 32 et 64 bits. - -``` sql -SELECT xxHash32(''); - -OR - -SELECT xxHash64(''); -``` - -**Valeur renvoyée** - -A `Uint32` ou `Uint64` valeur de hachage du type de données. - -Type: `xxHash`. - -**Exemple** - -Requête: - -``` sql -SELECT xxHash32('Hello, world!'); -``` - -Résultat: - -``` text -┌─xxHash32('Hello, world!')─┐ -│ 834093149 │ -└───────────────────────────┘ -``` - -**Voir Aussi** - -- [xxHash](http://cyan4973.github.io/xxHash/). - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/hash_functions/) diff --git a/docs/fr/sql-reference/functions/higher-order-functions.md b/docs/fr/sql-reference/functions/higher-order-functions.md deleted file mode 100644 index ac24b67bb97..00000000000 --- a/docs/fr/sql-reference/functions/higher-order-functions.md +++ /dev/null @@ -1,264 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 57 -toc_title: "D'Ordre Sup\xE9rieur" ---- - -# Fonctions d'ordre supérieur {#higher-order-functions} - -## `->` opérateur, fonction lambda (params, expr) {#operator-lambdaparams-expr-function} - -Allows describing a lambda function for passing to a higher-order function. The left side of the arrow has a formal parameter, which is any ID, or multiple formal parameters – any IDs in a tuple. 
The right side of the arrow has an expression that can use these formal parameters, as well as any table columns. - -Exemple: `x -> 2 * x, str -> str != Referer.` - -Les fonctions d'ordre supérieur ne peuvent accepter que les fonctions lambda comme argument fonctionnel. - -Une fonction lambda qui accepte plusieurs arguments peuvent être passés à une fonction d'ordre supérieur. Dans ce cas, la fonction d'ordre supérieur est passé plusieurs tableaux de longueur identique que ces arguments correspondent. - -Pour certaines fonctions, telles que [arrayCount](#higher_order_functions-array-count) ou [arraySum](#higher_order_functions-array-count) le premier argument (la fonction lambda) peut être omis. Dans ce cas, un mappage identique est supposé. - -Une fonction lambda ne peut pas être omise pour les fonctions suivantes: - -- [arrayMap](#higher_order_functions-array-map) -- [arrayFilter](#higher_order_functions-array-filter) -- [arrayFill](#higher_order_functions-array-fill) -- [arrayReverseFill](#higher_order_functions-array-reverse-fill) -- [arraySplit](#higher_order_functions-array-split) -- [arrayReverseSplit](#higher_order_functions-array-reverse-split) -- [arrayFirst](#higher_order_functions-array-first) -- [arrayFirstIndex](#higher_order_functions-array-first-index) - -### arrayMap(func, arr1, …) {#higher_order_functions-array-map} - -Renvoie un tableau obtenu à partir de l'application d'origine `func` fonction à chaque élément dans le `arr` tableau. - -Exemple: - -``` sql -SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; -``` - -``` text -┌─res─────┐ -│ [3,4,5] │ -└─────────┘ -``` - -L'exemple suivant montre comment créer un n-uplet d'éléments de différents tableaux: - -``` sql -SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res -``` - -``` text -┌─res─────────────────┐ -│ [(1,4),(2,5),(3,6)] │ -└─────────────────────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayMap` fonction. - -### arrayFilter(func, arr1, …) {#higher_order_functions-array-filter} - -Renvoie un tableau contenant uniquement les éléments `arr1` pour ce qui `func` retourne autre chose que 0. - -Exemple: - -``` sql -SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res -``` - -``` text -┌─res───────────┐ -│ ['abc World'] │ -└───────────────┘ -``` - -``` sql -SELECT - arrayFilter( - (i, x) -> x LIKE '%World%', - arrayEnumerate(arr), - ['Hello', 'abc World'] AS arr) - AS res -``` - -``` text -┌─res─┐ -│ [2] │ -└─────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayFilter` fonction. - -### arrayFill(func, arr1, …) {#higher_order_functions-array-fill} - -Analyse par le biais de `arr1` du premier élément au dernier élément et remplacer `arr1[i]` par `arr1[i - 1]` si `func` renvoie 0. Le premier élément de `arr1` ne sera pas remplacé. - -Exemple: - -``` sql -SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res -``` - -``` text -┌─res──────────────────────────────┐ -│ [1,1,3,11,12,12,12,5,6,14,14,14] │ -└──────────────────────────────────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayFill` fonction. - -### arrayReverseFill(func, arr1, …) {#higher_order_functions-array-reverse-fill} - -Analyse par le biais de `arr1` du dernier élément au premier élément et remplacer `arr1[i]` par `arr1[i + 1]` si `func` renvoie 0. Le dernier élément de `arr1` ne sera pas remplacé. 
- -Exemple: - -``` sql -SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res -``` - -``` text -┌─res────────────────────────────────┐ -│ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │ -└────────────────────────────────────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayReverseFill` fonction. - -### arraySplit(func, arr1, …) {#higher_order_functions-array-split} - -Split `arr1` en plusieurs tableaux. Lorsque `func` retourne autre chose que 0, la matrice sera de split sur le côté gauche de l'élément. Le tableau ne sera pas partagé avant le premier élément. - -Exemple: - -``` sql -SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res -``` - -``` text -┌─res─────────────┐ -│ [[1,2,3],[4,5]] │ -└─────────────────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arraySplit` fonction. - -### arrayReverseSplit(func, arr1, …) {#higher_order_functions-array-reverse-split} - -Split `arr1` en plusieurs tableaux. Lorsque `func` retourne autre chose que 0, la matrice sera de split sur le côté droit de l'élément. Le tableau ne sera pas divisé après le dernier élément. - -Exemple: - -``` sql -SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res -``` - -``` text -┌─res───────────────┐ -│ [[1],[2,3,4],[5]] │ -└───────────────────┘ -``` - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arraySplit` fonction. - -### arrayCount(\[func,\] arr1, …) {#higher_order_functions-array-count} - -Renvoie le nombre d'éléments dans l'arr tableau pour lequel func renvoie autre chose que 0. Si ‘func’ n'est pas spécifié, il renvoie le nombre d'éléments non nuls dans le tableau. - -### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} - -Renvoie 1 s'il existe au moins un élément ‘arr’ pour ce qui ‘func’ retourne autre chose que 0. Sinon, il renvoie 0. - -### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} - -Renvoie 1 si ‘func’ retourne autre chose que 0 pour tous les éléments de ‘arr’. Sinon, il renvoie 0. - -### arraySum(\[func,\] arr1, …) {#higher-order-functions-array-sum} - -Renvoie la somme de la ‘func’ valeur. Si la fonction est omise, elle retourne la somme des éléments du tableau. - -### arrayFirst(func, arr1, …) {#higher_order_functions-array-first} - -Renvoie le premier élément du ‘arr1’ tableau pour lequel ‘func’ retourne autre chose que 0. - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayFirst` fonction. - -### arrayFirstIndex(func, arr1, …) {#higher_order_functions-array-first-index} - -Renvoie l'index du premier élément de la ‘arr1’ tableau pour lequel ‘func’ retourne autre chose que 0. - -Notez que le premier argument (fonction lambda) ne peut pas être omis dans le `arrayFirstIndex` fonction. - -### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} - -Retourne un tableau des sommes partielles d'éléments dans le tableau source (une somme). Si l' `func` la fonction est spécifiée, les valeurs des éléments du tableau sont convertis par cette fonction avant l'addition. - -Exemple: - -``` sql -SELECT arrayCumSum([1, 1, 1, 1]) AS res -``` - -``` text -┌─res──────────┐ -│ [1, 2, 3, 4] │ -└──────────────┘ -``` - -### arrayCumSumNonNegative (arr) {#arraycumsumnonnegativearr} - -Même que `arrayCumSum`, renvoie un tableau des sommes partielles d'éléments dans le tableau source (une somme). 
Différent `arrayCumSum`, lorsque la valeur renvoyée contient une valeur inférieure à zéro, la valeur est remplacée par zéro et le calcul ultérieur est effectué avec des paramètres zéro. Exemple: - -``` sql -SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res -``` - -``` text -┌─res───────┐ -│ [1,2,0,1] │ -└───────────┘ -``` - -### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} - -Renvoie un tableau à la suite du tri des éléments de `arr1` dans l'ordre croissant. Si l' `func` la fonction est spécifiée, l'ordre de classement est déterminé par le résultat de la fonction `func` appliquée aux éléments du tableau (tableaux) - -Le [Transformation schwartzienne](https://en.wikipedia.org/wiki/Schwartzian_transform) est utilisé pour améliorer l'efficacité du tri. - -Exemple: - -``` sql -SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); -``` - -``` text -┌─res────────────────┐ -│ ['world', 'hello'] │ -└────────────────────┘ -``` - -Pour plus d'informations sur la `arraySort` la méthode, voir l' [Fonctions pour travailler avec des tableaux](array-functions.md#array_functions-sort) section. - -### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} - -Renvoie un tableau à la suite du tri des éléments de `arr1` dans l'ordre décroissant. Si l' `func` la fonction est spécifiée, l'ordre de classement est déterminé par le résultat de la fonction `func` appliquée aux éléments du tableau (tableaux). - -Exemple: - -``` sql -SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; -``` - -``` text -┌─res───────────────┐ -│ ['hello','world'] │ -└───────────────────┘ -``` - -Pour plus d'informations sur la `arrayReverseSort` la méthode, voir l' [Fonctions pour travailler avec des tableaux](array-functions.md#array_functions-reverse-sort) section. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/higher_order_functions/) diff --git a/docs/fr/sql-reference/functions/in-functions.md b/docs/fr/sql-reference/functions/in-functions.md deleted file mode 100644 index ced5ef73e46..00000000000 --- a/docs/fr/sql-reference/functions/in-functions.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 60 -toc_title: "Mise en \u0153uvre de L'op\xE9rateur IN" ---- - -# Fonctions de mise en œuvre de L'opérateur IN {#functions-for-implementing-the-in-operator} - -## in, notin, globalIn, globalNotIn {#in-functions} - -Voir la section [Dans les opérateurs](../operators/in.md#select-in-operators). - -## tuple(x, y, …), operator (x, y, …) {#tuplex-y-operator-x-y} - -Une fonction qui permet de regrouper plusieurs colonnes. -For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function. -Les Tuples sont normalement utilisés comme valeurs intermédiaires pour un argument D'opérateurs IN, ou pour créer une liste de paramètres formels de fonctions lambda. Les Tuples ne peuvent pas être écrits sur une table. - -## tupleElement (tuple, n), opérateur X. N {#tupleelementtuple-n-operator-x-n} - -Une fonction qui permet d'obtenir une colonne à partir d'un tuple. -‘N’ est l'index de colonne, à partir de 1. N doit être une constante. ‘N’ doit être une constante. ‘N’ doit être un entier postif strict ne dépassant pas la taille du tuple. -Il n'y a aucun coût pour exécuter la fonction. 
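
A minimal sketch of the two functions above working together, assuming nothing beyond what the section states: `tuple()` builds the value and `tupleElement()` reads one position out of it, with the index given as a constant starting at 1.

``` sql
-- tuple() groups the three values into a Tuple(UInt8, String, Float64);
-- tupleElement(t, 2) extracts the second element ('hello').
SELECT
    tuple(1, 'hello', 3.5) AS t,
    tupleElement(t, 2) AS second_element;
```
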
- -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/in_functions/) diff --git a/docs/fr/sql-reference/functions/index.md b/docs/fr/sql-reference/functions/index.md deleted file mode 100644 index 6e5333f68f5..00000000000 --- a/docs/fr/sql-reference/functions/index.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Fonction -toc_priority: 32 -toc_title: Introduction ---- - -# Fonction {#functions} - -Il y a au moins\* deux types de fonctions - des fonctions régulières (elles sont simplement appelées “functions”) and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn't depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). - -Dans cette section, nous discutons des fonctions classiques. Pour les fonctions d'agrégation, voir la section “Aggregate functions”. - -\* - Il existe un troisième type de fonction ‘arrayJoin’ la fonction appartient à; les fonctions de table peuvent également être mentionnées séparément.\* - -## Typage Fort {#strong-typing} - -Contrairement à SQL standard, ClickHouse a une forte typage. En d'autres termes, il ne fait pas de conversions implicites entre les types. Chaque fonction fonctionne pour un ensemble spécifique de types. Cela signifie que vous devez parfois utiliser des fonctions de conversion de type. - -## Élimination Des Sous-Expressions Courantes {#common-subexpression-elimination} - -Toutes les expressions d'une requête qui ont le même AST (le même enregistrement ou le même résultat d'analyse syntaxique) sont considérées comme ayant des valeurs identiques. De telles expressions sont concaténées et exécutées une fois. Les sous-requêtes identiques sont également éliminées de cette façon. - -## Types de résultats {#types-of-results} - -Toutes les fonctions renvoient un seul retour comme résultat (pas plusieurs valeurs, et pas des valeurs nulles). Le type de résultat est généralement défini uniquement par les types d'arguments, pas par les valeurs. Les Exceptions sont la fonction tupleElement (l'opérateur A. N) et la fonction toFixedString. - -## Constant {#constants} - -Pour simplifier, certaines fonctions ne peuvent fonctionner qu'avec des constantes pour certains arguments. Par exemple, le bon argument de L'opérateur LIKE doit être une constante. -Presque toutes les fonctions renvoient une constante pour des arguments constants. L'exception est les fonctions qui génèrent des nombres aléatoires. -Le ‘now’ function renvoie des valeurs différentes pour les requêtes qui ont été exécutées à des moments différents, mais le résultat est considéré comme une constante, car la constance n'est importante que dans une seule requête. -Une expression constante est également considérée comme une constante (par exemple, la moitié droite de L'opérateur LIKE peut être construite à partir de plusieurs constantes). - -Les fonctions peuvent être implémentées de différentes manières pour des arguments constants et non constants (un code différent est exécuté). Mais les résultats pour une constante et pour une colonne vraie Ne contenant que la même valeur doivent correspondre les uns aux autres. 
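
A small illustration of the constancy point above, under the assumption that `now()` behaves as described: it is resolved to a single constant within one query, so comparing it with itself always yields 1.

``` sql
-- now() is evaluated once per query and treated as a constant, so the
-- comparison holds for every row of the result.
SELECT now() = now() AS same_instant;
-- Expected result: 1
```
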
- -## Le Traitement NULL {#null-processing} - -Les fonctions ont les comportements suivants: - -- Si au moins l'un des arguments de la fonction est `NULL` le résultat de la fonction est également `NULL`. -- Comportement spécial spécifié individuellement dans la description de chaque fonction. Dans le code source de ClickHouse, ces fonctions ont `UseDefaultImplementationForNulls=false`. - -## Constance {#constancy} - -Functions can't change the values of their arguments – any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query. - -## Erreur De Manipulation {#error-handling} - -Certaines fonctions peuvent lancer une exception si les données ne sont pas valides. Dans ce cas, la requête est annulée et un message d'erreur est retourné au client. Pour le traitement distribué, lorsqu'une exception se produit sur l'un des serveurs, les autres serveurs aussi tenté d'interrompre la requête. - -## Évaluation des Expressions D'Argument {#evaluation-of-argument-expressions} - -Dans presque tous les langages de programmation, l'un des arguments peut pas être évalué pour certains opérateurs. Ce sont généralement les opérateurs `&&`, `||`, et `?:`. -Mais dans ClickHouse, les arguments des fonctions (opérateurs) sont toujours évalués. En effet, des parties entières de colonnes sont évaluées à la fois, au lieu de calculer chaque ligne séparément. - -## Exécution de fonctions pour le traitement de requêtes distribuées {#performing-functions-for-distributed-query-processing} - -Pour le traitement de requête distribué, autant d'étapes de traitement de requête que possible sont effectuées sur des serveurs distants, et le reste des étapes (fusion des résultats intermédiaires et tout ce qui suit) sont effectuées sur le serveur demandeur. - -Cela signifie que les fonctions peuvent être effectuées sur différents serveurs. -Par exemple, dans la requête `SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y),` - -- si un `distributed_table` a au moins deux fragments, les fonctions ‘g’ et ‘h’ sont effectuées sur des serveurs distants, et la fonction ‘f’ est effectuée sur le serveur demandeur. -- si un `distributed_table` a un seul fragment, tous les ‘f’, ‘g’, et ‘h’ les fonctions sont exécutées sur le serveur de ce fragment. - -Le résultat d'une fonction habituellement ne dépendent pas le serveur sur lequel elle est exécutée. Cependant, parfois c'est important. -Par exemple, les fonctions qui fonctionnent avec des dictionnaires utilisent le dictionnaire qui existe sur le serveur sur lequel elles s'exécutent. -Un autre exemple est l' `hostName` fonction, qui renvoie le nom du serveur sur lequel il s'exécute afin de `GROUP BY` par les serveurs dans un `SELECT` requête. - -Si une fonction dans une requête est effectuée sur le demandeur serveur, mais vous devez l'exécuter sur des serveurs distants, vous pouvez l'envelopper dans un ‘any’ fonction d'agrégation ou l'ajouter à une clé dans `GROUP BY`. 
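
A hedged sketch of the `any` trick mentioned in the last paragraph, using a hypothetical `distributed_table`: wrapping `hostName()` in an aggregate pushes its evaluation to the remote shards, and putting it in `GROUP BY` groups rows by the shard that produced them.

``` sql
-- distributed_table is a hypothetical table using the Distributed engine.
-- hostName() inside any() is evaluated on the remote servers, not on the
-- initiating server.
SELECT any(hostName()) FROM distributed_table;

-- Grouping by hostName() shows how many rows each shard contributed.
SELECT hostName() AS shard_host, count() AS row_count
FROM distributed_table
GROUP BY shard_host;
```
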
- -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/) diff --git a/docs/fr/sql-reference/functions/introspection.md b/docs/fr/sql-reference/functions/introspection.md deleted file mode 100644 index 91299217dc7..00000000000 --- a/docs/fr/sql-reference/functions/introspection.md +++ /dev/null @@ -1,310 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 65 -toc_title: Introspection ---- - -# Fonctions D'Introspection {#introspection-functions} - -Vous pouvez utiliser les fonctions décrites dans ce chapitre pour introspecter [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) et [DWARF](https://en.wikipedia.org/wiki/DWARF) pour le profilage de requête. - -!!! warning "Avertissement" - Ces fonctions sont lentes et peuvent imposer des considérations de sécurité. - -Pour le bon fonctionnement des fonctions d'introspection: - -- Installer le `clickhouse-common-static-dbg` paquet. - -- Définir le [allow_introspection_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) réglage sur 1. - - For security reasons introspection functions are disabled by default. - -Clickhouse enregistre les rapports du profileur [trace_log](../../operations/system-tables.md#system_tables-trace_log) système de table. Assurez-vous que la table et le profileur sont correctement configurés. - -## addressToLine {#addresstoline} - -Convertit l'adresse de mémoire virtuelle dans le processus de serveur ClickHouse en nom de fichier et en numéro de ligne dans le code source de ClickHouse. - -Si vous utilisez des paquets clickhouse officiels, vous devez installer le `clickhouse-common-static-dbg` paquet. - -**Syntaxe** - -``` sql -addressToLine(address_of_binary_instruction) -``` - -**Paramètre** - -- `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. - -**Valeur renvoyée** - -- Nom de fichier du code Source et le numéro de ligne dans ce fichier délimité par deux-points. - - For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. - -- Nom d'un binaire, si la fonction n'a pas pu trouver les informations de débogage. - -- Chaîne vide, si l'adresse n'est pas valide. - -Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -Activation des fonctions d'introspection: - -``` sql -SET allow_introspection_functions=1 -``` - -Sélection de la première chaîne de `trace_log` système de table: - -``` sql -SELECT * FROM system.trace_log LIMIT 1 \G -``` - -``` text -Row 1: -────── -event_date: 2019-11-19 -event_time: 2019-11-19 18:57:23 -revision: 54429 -timer_type: Real -thread_number: 48 -query_id: 421b6855-1858-45a5-8f37-f383409d6d72 -trace: [140658411141617,94784174532828,94784076370703,94784076372094,94784076361020,94784175007680,140658411116251,140658403895439] -``` - -Le `trace` champ contient la trace de pile au moment de l'échantillonnage. 
- -Obtenir le nom de fichier du code source et le numéro de ligne pour une seule adresse: - -``` sql -SELECT addressToLine(94784076370703) \G -``` - -``` text -Row 1: -────── -addressToLine(94784076370703): /build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199 -``` - -Application de la fonction à la trace de la pile entière: - -``` sql -SELECT - arrayStringConcat(arrayMap(x -> addressToLine(x), trace), '\n') AS trace_source_code_lines -FROM system.trace_log -LIMIT 1 -\G -``` - -Le [arrayMap](higher-order-functions.md#higher_order_functions-array-map) permet de traiter chaque élément individuel de l' `trace` tableau par la `addressToLine` fonction. Le résultat de ce traitement que vous voyez dans l' `trace_source_code_lines` colonne de sortie. - -``` text -Row 1: -────── -trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so -/usr/lib/debug/usr/bin/clickhouse -/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199 -/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.h:155 -/usr/include/c++/9/bits/atomic_base.h:551 -/usr/lib/debug/usr/bin/clickhouse -/lib/x86_64-linux-gnu/libpthread-2.27.so -/build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:97 -``` - -## adressetosymbol {#addresstosymbol} - -Convertit l'adresse de mémoire virtuelle dans le processus de serveur ClickHouse en symbole à partir des fichiers d'objets ClickHouse. - -**Syntaxe** - -``` sql -addressToSymbol(address_of_binary_instruction) -``` - -**Paramètre** - -- `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. - -**Valeur renvoyée** - -- Symbole des fichiers D'objets ClickHouse. -- Chaîne vide, si l'adresse n'est pas valide. - -Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -Activation des fonctions d'introspection: - -``` sql -SET allow_introspection_functions=1 -``` - -Sélection de la première chaîne de `trace_log` système de table: - -``` sql -SELECT * FROM system.trace_log LIMIT 1 \G -``` - -``` text -Row 1: -────── -event_date: 2019-11-20 -event_time: 2019-11-20 16:57:59 -revision: 54429 -timer_type: Real -thread_number: 48 -query_id: 724028bf-f550-45aa-910d-2af6212b94ac -trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] -``` - -Le `trace` champ contient la trace de pile au moment de l'échantillonnage. - -Obtenir un symbole pour une seule adresse: - -``` sql -SELECT addressToSymbol(94138803686098) \G -``` - -``` text -Row 1: -────── -addressToSymbol(94138803686098): _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE -``` - -Application de la fonction à la trace de la pile entière: - -``` sql -SELECT - arrayStringConcat(arrayMap(x -> addressToSymbol(x), trace), '\n') AS trace_symbols -FROM system.trace_log -LIMIT 1 -\G -``` - -Le [arrayMap](higher-order-functions.md#higher_order_functions-array-map) permet de traiter chaque élément individuel de l' `trace` tableau par la `addressToSymbols` fonction. Le résultat de ce traitement que vous voyez dans l' `trace_symbols` colonne de sortie. 
- -``` text -Row 1: -────── -trace_symbols: _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE -_ZNK2DB10Aggregator21executeWithoutKeyImplERPcmPNS0_28AggregateFunctionInstructionEPNS_5ArenaE -_ZN2DB10Aggregator14executeOnBlockESt6vectorIN3COWINS_7IColumnEE13immutable_ptrIS3_EESaIS6_EEmRNS_22AggregatedDataVariantsERS1_IPKS3_SaISC_EERS1_ISE_SaISE_EERb -_ZN2DB10Aggregator14executeOnBlockERKNS_5BlockERNS_22AggregatedDataVariantsERSt6vectorIPKNS_7IColumnESaIS9_EERS6_ISB_SaISB_EERb -_ZN2DB10Aggregator7executeERKSt10shared_ptrINS_17IBlockInputStreamEERNS_22AggregatedDataVariantsE -_ZN2DB27AggregatingBlockInputStream8readImplEv -_ZN2DB17IBlockInputStream4readEv -_ZN2DB26ExpressionBlockInputStream8readImplEv -_ZN2DB17IBlockInputStream4readEv -_ZN2DB26ExpressionBlockInputStream8readImplEv -_ZN2DB17IBlockInputStream4readEv -_ZN2DB28AsynchronousBlockInputStream9calculateEv -_ZNSt17_Function_handlerIFvvEZN2DB28AsynchronousBlockInputStream4nextEvEUlvE_E9_M_invokeERKSt9_Any_data -_ZN14ThreadPoolImplI20ThreadFromGlobalPoolE6workerESt14_List_iteratorIS0_E -_ZZN20ThreadFromGlobalPoolC4IZN14ThreadPoolImplIS_E12scheduleImplIvEET_St8functionIFvvEEiSt8optionalImEEUlvE1_JEEEOS4_DpOT0_ENKUlvE_clEv -_ZN14ThreadPoolImplISt6threadE6workerESt14_List_iteratorIS0_E -execute_native_thread_routine -start_thread -clone -``` - -## demangle {#demangle} - -Convertit un symbole que vous pouvez obtenir en utilisant le [adressetosymbol](#addresstosymbol) fonction au nom de la fonction c++. - -**Syntaxe** - -``` sql -demangle(symbol) -``` - -**Paramètre** - -- `symbol` ([Chaîne](../../sql-reference/data-types/string.md)) — Symbol from an object file. - -**Valeur renvoyée** - -- Nom de la fonction C++. -- Chaîne vide si un symbole n'est pas valide. - -Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -Activation des fonctions d'introspection: - -``` sql -SET allow_introspection_functions=1 -``` - -Sélection de la première chaîne de `trace_log` système de table: - -``` sql -SELECT * FROM system.trace_log LIMIT 1 \G -``` - -``` text -Row 1: -────── -event_date: 2019-11-20 -event_time: 2019-11-20 16:57:59 -revision: 54429 -timer_type: Real -thread_number: 48 -query_id: 724028bf-f550-45aa-910d-2af6212b94ac -trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] -``` - -Le `trace` champ contient la trace de pile au moment de l'échantillonnage. - -Obtenir un nom de fonction pour une seule adresse: - -``` sql -SELECT demangle(addressToSymbol(94138803686098)) \G -``` - -``` text -Row 1: -────── -demangle(addressToSymbol(94138803686098)): DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const -``` - -Application de la fonction à la trace de la pile entière: - -``` sql -SELECT - arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS trace_functions -FROM system.trace_log -LIMIT 1 -\G -``` - -Le [arrayMap](higher-order-functions.md#higher_order_functions-array-map) permet de traiter chaque élément individuel de l' `trace` tableau par la `demangle` fonction. Le résultat de ce traitement que vous voyez dans l' `trace_functions` colonne de sortie. 
- -``` text -Row 1: -────── -trace_functions: DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const -DB::Aggregator::executeWithoutKeyImpl(char*&, unsigned long, DB::Aggregator::AggregateFunctionInstruction*, DB::Arena*) const -DB::Aggregator::executeOnBlock(std::vector::immutable_ptr, std::allocator::immutable_ptr > >, unsigned long, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) -DB::Aggregator::executeOnBlock(DB::Block const&, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) -DB::Aggregator::execute(std::shared_ptr const&, DB::AggregatedDataVariants&) -DB::AggregatingBlockInputStream::readImpl() -DB::IBlockInputStream::read() -DB::ExpressionBlockInputStream::readImpl() -DB::IBlockInputStream::read() -DB::ExpressionBlockInputStream::readImpl() -DB::IBlockInputStream::read() -DB::AsynchronousBlockInputStream::calculate() -std::_Function_handler::_M_invoke(std::_Any_data const&) -ThreadPoolImpl::worker(std::_List_iterator) -ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const -ThreadPoolImpl::worker(std::_List_iterator) -execute_native_thread_routine -start_thread -clone -``` diff --git a/docs/fr/sql-reference/functions/ip-address-functions.md b/docs/fr/sql-reference/functions/ip-address-functions.md deleted file mode 100644 index 8beb40a534b..00000000000 --- a/docs/fr/sql-reference/functions/ip-address-functions.md +++ /dev/null @@ -1,248 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 55 -toc_title: Travailler avec des adresses IP ---- - -# Fonctions pour travailler avec des adresses IP {#functions-for-working-with-ip-addresses} - -## IPv4NumToString (num) {#ipv4numtostringnum} - -Prend un numéro UInt32. Interprète comme une adresse IPv4 dans big endian. Renvoie une chaîne contenant l'adresse IPv4 correspondante au format A. B. C. d (Nombres séparés par des points sous forme décimale). - -## IPv4StringToNum (s) {#ipv4stringtonums} - -La fonction inverse de IPv4NumToString. Si L'adresse IPv4 a un format non valide, elle renvoie 0. - -## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} - -Similaire à IPv4NumToString, mais en utilisant xxx au lieu du dernier octet. - -Exemple: - -``` sql -SELECT - IPv4NumToStringClassC(ClientIP) AS k, - count() AS c -FROM test.hits -GROUP BY k -ORDER BY c DESC -LIMIT 10 -``` - -``` text -┌─k──────────────┬─────c─┐ -│ 83.149.9.xxx │ 26238 │ -│ 217.118.81.xxx │ 26074 │ -│ 213.87.129.xxx │ 25481 │ -│ 83.149.8.xxx │ 24984 │ -│ 217.118.83.xxx │ 22797 │ -│ 78.25.120.xxx │ 22354 │ -│ 213.87.131.xxx │ 21285 │ -│ 78.25.121.xxx │ 20887 │ -│ 188.162.65.xxx │ 19694 │ -│ 83.149.48.xxx │ 17406 │ -└────────────────┴───────┘ -``` - -Depuis l'utilisation de ‘xxx’ est très inhabituel, cela peut être changé à l'avenir. Nous vous recommandons de ne pas compter sur le format exact de ce fragment. - -### IPv6NumToString (x) {#ipv6numtostringx} - -Accepte une valeur FixedString (16) contenant L'adresse IPv6 au format binaire. Renvoie une chaîne contenant cette adresse au format texte. -Les adresses IPv4 mappées IPv6 sont sorties au format:: ffff: 111.222.33.44. 
Exemple: - -``` sql -SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr -``` - -``` text -┌─addr─────────┐ -│ 2a02:6b8::11 │ -└──────────────┘ -``` - -``` sql -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF') -GROUP BY k -ORDER BY c DESC -LIMIT 10 -``` - -``` text -┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ -│ 2a02:2168:aaa:bbbb::2 │ 24695 │ -│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ -│ 2a02:6b8:0:fff::ff │ 16389 │ -│ 2a01:4f8:111:6666::2 │ 16016 │ -│ 2a02:2168:888:222::1 │ 15896 │ -│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ -│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ -│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ -│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ -│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ -└─────────────────────────────────────────┴───────┘ -``` - -``` sql -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() -GROUP BY k -ORDER BY c DESC -LIMIT 10 -``` - -``` text -┌─IPv6NumToString(ClientIP6)─┬──────c─┐ -│ ::ffff:94.26.111.111 │ 747440 │ -│ ::ffff:37.143.222.4 │ 529483 │ -│ ::ffff:5.166.111.99 │ 317707 │ -│ ::ffff:46.38.11.77 │ 263086 │ -│ ::ffff:79.105.111.111 │ 186611 │ -│ ::ffff:93.92.111.88 │ 176773 │ -│ ::ffff:84.53.111.33 │ 158709 │ -│ ::ffff:217.118.11.22 │ 154004 │ -│ ::ffff:217.118.11.33 │ 148449 │ -│ ::ffff:217.118.11.44 │ 148243 │ -└────────────────────────────┴────────┘ -``` - -## IPv6StringToNum (s) {#ipv6stringtonums} - -La fonction inverse de IPv6NumToString. Si L'adresse IPv6 a un format non valide, elle renvoie une chaîne d'octets null. -HEX peut être en majuscules ou en minuscules. - -## IPv4ToIPv6 (x) {#ipv4toipv6x} - -Prend un `UInt32` nombre. Interprète comme une adresse IPv4 dans [big endian](https://en.wikipedia.org/wiki/Endianness). Retourne un `FixedString(16)` valeur contenant l'adresse IPv6 au format binaire. Exemple: - -``` sql -SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr -``` - -``` text -┌─addr───────────────┐ -│ ::ffff:192.168.0.1 │ -└────────────────────┘ -``` - -## cutIPv6 (x, bytesToCutForIPv6, bytesToCutForIPv4) {#cutipv6x-bytestocutforipv6-bytestocutforipv4} - -Accepte une valeur FixedString (16) contenant L'adresse IPv6 au format binaire. Renvoie une chaîne contenant l'adresse du nombre spécifié d'octets retiré au format texte. Exemple: - -``` sql -WITH - IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, - IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 -SELECT - cutIPv6(ipv6, 2, 0), - cutIPv6(ipv4, 0, 2) -``` - -``` text -┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ -│ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ -└─────────────────────────────────────┴─────────────────────┘ -``` - -## Ipv4cirtorange (ipv4, Cidr), {#ipv4cidrtorangeipv4-cidr} - -Accepte un IPv4 et une valeur UInt8 contenant [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing). Renvoie un tuple avec deux IPv4 contenant la plage inférieure et la plage supérieure du sous-réseau. - -``` sql -SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) -``` - -``` text -┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ -│ ('192.168.0.0','192.168.255.255') │ -└────────────────────────────────────────────┘ -``` - -## Ipv6cirtorange (ipv6, Cidr), {#ipv6cidrtorangeipv6-cidr} - -Accepte un IPv6 et une valeur UInt8 contenant le CIDR. 
Renvoie un tuple avec deux IPv6 contenant la plage inférieure et la plage supérieure du sous-réseau. - -``` sql -SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); -``` - -``` text -┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ -│ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ -└────────────────────────────────────────────────────────────────────────┘ -``` - -## toipv4 (chaîne) {#toipv4string} - -Un alias `IPv4StringToNum()` cela prend une forme de chaîne D'adresse IPv4 et renvoie la valeur de [IPv4](../../sql-reference/data-types/domains/ipv4.md) type, qui est binaire égal à la valeur renvoyée par `IPv4StringToNum()`. - -``` sql -WITH - '171.225.130.45' as IPv4_string -SELECT - toTypeName(IPv4StringToNum(IPv4_string)), - toTypeName(toIPv4(IPv4_string)) -``` - -``` text -┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ -│ UInt32 │ IPv4 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` - -``` sql -WITH - '171.225.130.45' as IPv4_string -SELECT - hex(IPv4StringToNum(IPv4_string)), - hex(toIPv4(IPv4_string)) -``` - -``` text -┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ -│ ABE1822D │ ABE1822D │ -└───────────────────────────────────┴──────────────────────────┘ -``` - -## toipv6 (chaîne) {#toipv6string} - -Un alias `IPv6StringToNum()` cela prend une forme de chaîne D'adresse IPv6 et renvoie la valeur de [IPv6](../../sql-reference/data-types/domains/ipv6.md) type, qui est binaire égal à la valeur renvoyée par `IPv6StringToNum()`. - -``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string -SELECT - toTypeName(IPv6StringToNum(IPv6_string)), - toTypeName(toIPv6(IPv6_string)) -``` - -``` text -┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ -│ FixedString(16) │ IPv6 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` - -``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string -SELECT - hex(IPv6StringToNum(IPv6_string)), - hex(toIPv6(IPv6_string)) -``` - -``` text -┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ -│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ -└───────────────────────────────────┴──────────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) diff --git a/docs/fr/sql-reference/functions/json-functions.md b/docs/fr/sql-reference/functions/json-functions.md deleted file mode 100644 index 5f92c99d0f5..00000000000 --- a/docs/fr/sql-reference/functions/json-functions.md +++ /dev/null @@ -1,297 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 56 -toc_title: Travailler avec JSON ---- - -# Fonctions pour travailler avec JSON {#functions-for-working-with-json} - -Dans Yandex.Metrica, JSON est transmis par les utilisateurs en tant que paramètres de session. Il y a quelques fonctions spéciales pour travailler avec ce JSON. (Bien que dans la plupart des cas, les JSONs soient en outre prétraités et les valeurs résultantes sont placées dans des colonnes séparées dans leur format traité.) Toutes ces fonctions sont basées sur des hypothèses fortes sur ce que le JSON peut être, mais elles essaient de faire le moins possible pour faire le travail. - -Les hypothèses suivantes sont apportées: - -1. Le nom du champ (argument de fonction) doit être une constante. -2. 
Le nom du champ est en quelque sorte codé canoniquement dans JSON. Exemple: `visitParamHas('{"abc":"def"}', 'abc') = 1`, mais `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` -3. Les champs sont recherchés à n'importe quel niveau d'imbrication, sans discrimination. S'il y a plusieurs champs correspondants, la première occurrence est utilisé. -4. Le JSON n'a pas de caractères d'espace en dehors des littéraux de chaîne. - -## visitParamHas(params, nom) {#visitparamhasparams-name} - -Vérifie s'il existe un champ avec ‘name’ nom. - -## visitParamExtractUInt(params, nom) {#visitparamextractuintparams-name} - -Analyse UInt64 à partir de la valeur du champ nommé ‘name’. Si c'est un champ de type chaîne, il tente d'analyser un numéro à partir du début de la chaîne. Si le champ n'existe pas, ou s'il existe mais ne contient pas de nombre, il renvoie 0. - -## visitParamExtractInt(params, name) {#visitparamextractintparams-name} - -Le même que pour Int64. - -## visitParamExtractFloat(params, nom) {#visitparamextractfloatparams-name} - -Le même que pour Float64. - -## visitParamExtractBool(params, nom) {#visitparamextractboolparams-name} - -Analyse d'une valeur vrai/faux. Le résultat est UInt8. - -## visitParamExtractRaw(params, nom) {#visitparamextractrawparams-name} - -Retourne la valeur d'un champ, y compris les séparateurs. - -Exemple: - -``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' -``` - -## visitParamExtractString(params, nom) {#visitparamextractstringparams-name} - -Analyse la chaîne entre guillemets doubles. La valeur est sans échappement. Si l'échappement échoue, il renvoie une chaîne vide. - -Exemple: - -``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' -``` - -Il n'y a actuellement aucun support pour les points de code dans le format `\uXXXX\uYYYY` qui ne proviennent pas du plan multilingue de base (ils sont convertis en CESU-8 au lieu de UTF-8). - -Les fonctions suivantes sont basées sur [simdjson](https://github.com/lemire/simdjson) conçu pour des exigences D'analyse JSON plus complexes. L'hypothèse 2 mentionnée ci-dessus s'applique toujours. - -## isValidJSON (json) {#isvalidjsonjson} - -Vérifie que la chaîne est un json valide. - -Exemple: - -``` sql -SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 -SELECT isValidJSON('not a json') = 0 -``` - -## JSONHas(json\[, indices_or_keys\]…) {#jsonhasjson-indices-or-keys} - -Si la valeur existe dans le document JSON, `1` sera retourné. - -Si la valeur n'existe pas, `0` sera retourné. - -Exemple: - -``` sql -SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 -SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 -``` - -`indices_or_keys` est une liste de zéro ou plusieurs arguments chacun d'entre eux peut être une chaîne ou un entier. - -- String = membre d'objet d'accès par clé. -- Entier positif = accédez au n-ème membre / clé depuis le début. -- Entier négatif = accédez au n-ème membre / clé à partir de la fin. - -Minimum de l'indice de l'élément est 1. Ainsi, l'élément 0 n'existe pas. - -Vous pouvez utiliser des entiers pour accéder à la fois aux tableaux JSON et aux objets JSON. 
- -Ainsi, par exemple: - -``` sql -SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' -SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' -SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' -SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' -SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' -``` - -## JSONLength(json\[, indices_or_keys\]…) {#jsonlengthjson-indices-or-keys} - -Renvoie la longueur D'un tableau JSON ou d'un objet JSON. - -Si la valeur n'existe pas ou a un mauvais type, `0` sera retourné. - -Exemple: - -``` sql -SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 -SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 -``` - -## JSONType(json\[, indices_or_keys\]…) {#jsontypejson-indices-or-keys} - -De retour le type d'une valeur JSON. - -Si la valeur n'existe pas, `Null` sera retourné. - -Exemple: - -``` sql -SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' -SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' -SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' -``` - -## JSONExtractUInt(json\[, indices_or_keys\]…) {#jsonextractuintjson-indices-or-keys} - -## JSONExtractInt(json\[, indices_or_keys\]…) {#jsonextractintjson-indices-or-keys} - -## JSONExtractFloat(json\[, indices_or_keys\]…) {#jsonextractfloatjson-indices-or-keys} - -## JSONExtractBool(json\[, indices_or_keys\]…) {#jsonextractbooljson-indices-or-keys} - -Analyse un JSON et extrait une valeur. Ces fonctions sont similaires à `visitParam` fonction. - -Si la valeur n'existe pas ou a un mauvais type, `0` sera retourné. - -Exemple: - -``` sql -SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 -SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 -SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 -``` - -## JSONExtractString(json\[, indices_or_keys\]…) {#jsonextractstringjson-indices-or-keys} - -Analyse un JSON et extrait une chaîne. Cette fonction est similaire à `visitParamExtractString` fonction. - -Si la valeur n'existe pas ou a un mauvais type, une chaîne vide est retournée. - -La valeur est sans échappement. Si l'échappement échoue, il renvoie une chaîne vide. - -Exemple: - -``` sql -SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' -SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' -SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' -``` - -## JSONExtract(json\[, indices_or_keys…\], Return_type) {#jsonextractjson-indices-or-keys-return-type} - -Analyse un JSON et extrait une valeur du type de données clickhouse donné. - -C'est une généralisation de la précédente `JSONExtract` fonction. -Cela signifie -`JSONExtract(..., 'String')` retourne exactement le même que `JSONExtractString()`, -`JSONExtract(..., 'Float64')` retourne exactement le même que `JSONExtractFloat()`. 
- -Exemple: - -``` sql -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)') = NULL -SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8') = 1 -SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' -SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' -``` - -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} - -Analyse les paires clé-valeur à partir D'un JSON où les valeurs sont du type de données clickhouse donné. - -Exemple: - -``` sql -SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)] -``` - -## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} - -Renvoie une partie de JSON en tant que chaîne non analysée. - -Si la pièce n'existe pas ou a un mauvais type, une chaîne vide est retournée. - -Exemple: - -``` sql -SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' -``` - -## JSONExtractArrayRaw(json\[, indices_or_keys…\]) {#jsonextractarrayrawjson-indices-or-keys} - -Retourne un tableau avec des éléments de tableau JSON, chacun représenté comme une chaîne non analysée. - -Si la pièce n'existe pas ou n'est pas de tableau, un tableau vide sera retournée. - -Exemple: - -``` sql -SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' -``` - -## JSONExtractKeysAndValuesRaw {#json-extract-keys-and-values-raw} - -Extrait les données brutes d'un objet JSON. - -**Syntaxe** - -``` sql -JSONExtractKeysAndValuesRaw(json[, p, a, t, h]) -``` - -**Paramètre** - -- `json` — [Chaîne](../data-types/string.md) avec JSON valide. -- `p, a, t, h` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [chaîne](../data-types/string.md) pour obtenir le champ par la touche ou un [entier](../data-types/int-uint.md) pour obtenir le N-ème champ (indexé à partir de 1, les entiers négatifs comptent à partir de la fin). S'il n'est pas défini, le JSON entier est analysé en tant qu'objet de niveau supérieur. Paramètre facultatif. - -**Valeurs renvoyées** - -- Tableau avec `('key', 'value')` tuple. Les deux membres du tuple sont des chaînes. -- Tableau vide si l'objet demandé n'existe pas, ou entrée JSON n'est pas valide. - -Type: [Tableau](../data-types/array.md)([Tuple](../data-types/tuple.md)([Chaîne](../data-types/string.md), [Chaîne](../data-types/string.md)). 
- -**Exemple** - -Requête: - -``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}') -``` - -Résultat: - -``` text -┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}')─┐ -│ [('a','[-100,200]'),('b','{"c":{"d":"hello","f":"world"}}')] │ -└──────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -Requête: - -``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b') -``` - -Résultat: - -``` text -┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b')─┐ -│ [('c','{"d":"hello","f":"world"}')] │ -└───────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -Requête: - -``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c') -``` - -Résultat: - -``` text -┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c')─┐ -│ [('d','"hello"'),('f','"world"')] │ -└───────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/json_functions/) diff --git a/docs/fr/sql-reference/functions/logical-functions.md b/docs/fr/sql-reference/functions/logical-functions.md deleted file mode 100644 index d01d9e02088..00000000000 --- a/docs/fr/sql-reference/functions/logical-functions.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: Logique ---- - -# Les Fonctions Logiques {#logical-functions} - -Les fonctions logiques acceptent tous les types numériques, mais renvoient un nombre UInt8 égal à 0 ou 1. - -Zéro comme argument est considéré “false,” alors que toute valeur non nulle est considérée comme “true”. - -## et, et opérateur {#and-and-operator} - -## ou, ou opérateur {#or-or-operator} - -## pas, pas opérateur {#not-not-operator} - -## xor {#xor} - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/logical_functions/) diff --git a/docs/fr/sql-reference/functions/machine-learning-functions.md b/docs/fr/sql-reference/functions/machine-learning-functions.md deleted file mode 100644 index 2212e0caa5a..00000000000 --- a/docs/fr/sql-reference/functions/machine-learning-functions.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 64 -toc_title: Fonctions D'Apprentissage Automatique ---- - -# Fonctions D'Apprentissage Automatique {#machine-learning-functions} - -## evalMLMethod (prédiction) {#machine_learning_methods-evalmlmethod} - -Prédiction utilisant des modèles de régression ajustés utilise `evalMLMethod` fonction. Voir le lien dans la `linearRegression`. - -### Régression Linéaire Stochastique {#stochastic-linear-regression} - -Le [stochasticLinearRegression](../../sql-reference/aggregate-functions/reference.md#agg_functions-stochasticlinearregression) la fonction d'agrégat implémente une méthode de descente de gradient stochastique utilisant un modèle linéaire et une fonction de perte MSE. Utiliser `evalMLMethod` prédire sur de nouvelles données. 
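-
-Esquisse indicative du schéma habituel (les tables `train_data`, `test_data`, `your_model` et les colonnes `target`, `param1`, `param2` sont purement hypothétiques) : agréger un état de modèle avec `stochasticLinearRegressionState`, puis le passer à `evalMLMethod` pour prédire :
-
-``` sql
--- apprentissage : l'état du modèle est stocké dans une table (noms hypothétiques)
-CREATE TABLE your_model ENGINE = Memory AS
-SELECT stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) AS state
-FROM train_data;
-
--- prédiction sur de nouvelles données avec evalMLMethod
-WITH (SELECT state FROM your_model) AS model
-SELECT evalMLMethod(model, param1, param2) AS prediction
-FROM test_data;
-```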
- -### Régression Logistique Stochastique {#stochastic-logistic-regression} - -Le [stochasticLogisticRegression](../../sql-reference/aggregate-functions/reference.md#agg_functions-stochasticlogisticregression) la fonction d'agrégation implémente la méthode de descente de gradient stochastique pour le problème de classification binaire. Utiliser `evalMLMethod` prédire sur de nouvelles données. diff --git a/docs/fr/sql-reference/functions/math-functions.md b/docs/fr/sql-reference/functions/math-functions.md deleted file mode 100644 index f5dff150caa..00000000000 --- a/docs/fr/sql-reference/functions/math-functions.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: "Math\xE9matique" ---- - -# Fonctions Mathématiques {#mathematical-functions} - -Toutes les fonctions renvoient un nombre Float64. La précision du résultat est proche de la précision maximale possible, mais le résultat peut ne pas coïncider avec le nombre représentable de la machine le plus proche du nombre réel correspondant. - -## e() {#e} - -Renvoie un nombre Float64 proche du nombre E. - -## pi() {#pi} - -Returns a Float64 number that is close to the number π. - -## exp (x) {#expx} - -Accepte un argument numérique et renvoie un Float64 nombre proche de l'exposant de l'argument. - -## log(x), ln (x) {#logx-lnx} - -Accepte un argument numérique et renvoie un nombre Float64 proche du logarithme naturel de l'argument. - -## exp2 (x) {#exp2x} - -Accepte un argument numérique et renvoie un nombre Float64 proche de 2 à la puissance de X. - -## log2 (x) {#log2x} - -Accepte un argument numérique et renvoie un Float64 nombre proximité du logarithme binaire de l'argument. - -## exp10 (x) {#exp10x} - -Accepte un argument numérique et renvoie un nombre Float64 proche de 10 à la puissance de X. - -## log10 (x) {#log10x} - -Accepte un argument numérique et renvoie un nombre Float64 proche du logarithme décimal de l'argument. - -## sqrt (x) {#sqrtx} - -Accepte un argument numérique et renvoie un Float64 nombre proche de la racine carrée de l'argument. - -## cbrt (x) {#cbrtx} - -Accepte un argument numérique et renvoie un Float64 nombre proche de la racine cubique de l'argument. - -## erf (x) {#erfx} - -Si ‘x’ est non négatif, alors `erf(x / σ√2)` est la probabilité qu'une variable aléatoire ayant une distribution normale avec un écart type ‘σ’ prend la valeur qui est séparée de la valeur attendue par plus de ‘x’. - -Exemple (règle de trois sigma): - -``` sql -SELECT erf(3 / sqrt(2)) -``` - -``` text -┌─erf(divide(3, sqrt(2)))─┐ -│ 0.9973002039367398 │ -└─────────────────────────┘ -``` - -## erfc (x) {#erfcx} - -Accepte un argument numérique et renvoie un nombre Float64 proche de 1-erf (x), mais sans perte de précision pour ‘x’ valeur. - -## lgamma (x) {#lgammax} - -Le logarithme de la fonction gamma. - -## tgamma (x) {#tgammax} - -La fonction Gamma. - -## sin (x) {#sinx} - -Sine. - -## cos (x) {#cosx} - -Cosinus. - -## tan (x) {#tanx} - -Tangente. - -## asin (x) {#asinx} - -Le sinus d'arc. - -## acos (x) {#acosx} - -Le cosinus de l'arc. - -## atan (x) {#atanx} - -L'arc tangente. - -## pow(x, y), la puissance(x, y) {#powx-y-powerx-y} - -Prend deux arguments numériques x et Y. renvoie un nombre Float64 proche de x à la puissance de Y. - -## intExp2 {#intexp2} - -Accepte un argument numérique et renvoie un nombre UInt64 proche de 2 à la puissance de X. 
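-
-Exemple indicatif pour illustrer la différence de type de retour entre `exp2` (Float64) et `intExp2` (UInt64) :
-
-``` sql
-SELECT toTypeName(exp2(10)), toTypeName(intExp2(10)), intExp2(10)
-```
-
-Cette requête devrait renvoyer `Float64`, `UInt64` et `1024`.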
- -## intExp10 {#intexp10} - -Accepte un argument numérique et renvoie un nombre UInt64 proche de 10 à la puissance de X. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/math_functions/) diff --git a/docs/fr/sql-reference/functions/other-functions.md b/docs/fr/sql-reference/functions/other-functions.md deleted file mode 100644 index e5c6abedd75..00000000000 --- a/docs/fr/sql-reference/functions/other-functions.md +++ /dev/null @@ -1,1205 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 66 -toc_title: Autre ---- - -# D'Autres Fonctions {#other-functions} - -## hôte() {#hostname} - -Renvoie une chaîne avec le nom de l'hôte sur lequel cette fonction a été exécutée. Pour le traitement distribué, c'est le nom du serveur distant, si la fonction est exécutée sur un serveur distant. - -## getMacro {#getmacro} - -Obtient une valeur nommée à partir [macro](../../operations/server-configuration-parameters/settings.md#macros) la section de la configuration du serveur. - -**Syntaxe** - -``` sql -getMacro(name); -``` - -**Paramètre** - -- `name` — Name to retrieve from the `macros` section. [Chaîne](../../sql-reference/data-types/string.md#string). - -**Valeur renvoyée** - -- Valeur de la macro spécifiée. - -Type: [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -Exemple `macros` section dans le fichier de configuration du serveur: - -``` xml - - Value - -``` - -Requête: - -``` sql -SELECT getMacro('test'); -``` - -Résultat: - -``` text -┌─getMacro('test')─┐ -│ Value │ -└──────────────────┘ -``` - -Une méthode alternative pour obtenir la même valeur: - -``` sql -SELECT * FROM system.macros -WHERE macro = 'test'; -``` - -``` text -┌─macro─┬─substitution─┐ -│ test │ Value │ -└───────┴──────────────┘ -``` - -## FQDN {#fqdn} - -Retourne le nom de domaine pleinement qualifié. - -**Syntaxe** - -``` sql -fqdn(); -``` - -Cette fonction est insensible à la casse. - -**Valeur renvoyée** - -- Chaîne avec le nom de domaine complet. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT FQDN(); -``` - -Résultat: - -``` text -┌─FQDN()──────────────────────────┐ -│ clickhouse.ru-central1.internal │ -└─────────────────────────────────┘ -``` - -## basename {#basename} - -Extrait la partie finale d'une chaîne après la dernière barre oblique ou barre oblique inverse. Cette fonction est souvent utilisée pour extraire le nom de fichier d'un chemin. - -``` sql -basename( expr ) -``` - -**Paramètre** - -- `expr` — Expression resulting in a [Chaîne](../../sql-reference/data-types/string.md) type de valeur. Tous les antislashs doivent être échappés dans la valeur résultante. - -**Valeur Renvoyée** - -Une chaîne de caractères qui contient: - -- La partie finale d'une chaîne après la dernière barre oblique ou barre oblique inverse. - - If the input string contains a path ending with slash or backslash, for example, `/` or `c:\`, the function returns an empty string. - -- La chaîne d'origine s'il n'y a pas de barres obliques ou de barres obliques inverses. 
- -**Exemple** - -``` sql -SELECT 'some/long/path/to/file' AS a, basename(a) -``` - -``` text -┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ -│ some\long\path\to\file │ file │ -└────────────────────────┴────────────────────────────────────────┘ -``` - -``` sql -SELECT 'some\\long\\path\\to\\file' AS a, basename(a) -``` - -``` text -┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ -│ some\long\path\to\file │ file │ -└────────────────────────┴────────────────────────────────────────┘ -``` - -``` sql -SELECT 'some-file-name' AS a, basename(a) -``` - -``` text -┌─a──────────────┬─basename('some-file-name')─┐ -│ some-file-name │ some-file-name │ -└────────────────┴────────────────────────────┘ -``` - -## visibleWidth (x) {#visiblewidthx} - -Calcule la largeur approximative lors de la sortie des valeurs vers la console au format texte (séparé par des tabulations). -Cette fonction est utilisée par le système pour implémenter de jolis formats. - -`NULL` est représenté comme une chaîne correspondant à `NULL` dans `Pretty` format. - -``` sql -SELECT visibleWidth(NULL) -``` - -``` text -┌─visibleWidth(NULL)─┐ -│ 4 │ -└────────────────────┘ -``` - -## toTypeName (x) {#totypenamex} - -Renvoie une chaîne contenant le nom du type de l'argument passé. - -Si `NULL` est passé à la fonction en entrée, puis il renvoie le `Nullable(Nothing)` type, ce qui correspond à un interne `NULL` représentation à ClickHouse. - -## la taille de bloc() {#function-blocksize} - -Récupère la taille du bloc. -Dans ClickHouse, les requêtes sont toujours exécutées sur des blocs (ensembles de parties de colonne). Cette fonction permet d'obtenir la taille du bloc pour lequel vous l'avez appelé. - -## matérialiser (x) {#materializex} - -Transforme une constante dans une colonne contenant une seule valeur. -Dans ClickHouse, les colonnes complètes et les constantes sont représentées différemment en mémoire. Les fonctions fonctionnent différemment pour les arguments constants et les arguments normaux (un code différent est exécuté), bien que le résultat soit presque toujours le même. Cette fonction sert à déboguer ce comportement. - -## ignore(…) {#ignore} - -Accepte tous les arguments, y compris `NULL`. Renvoie toujours 0. -Cependant, l'argument est toujours évalué. Cela peut être utilisé pour les benchmarks. - -## sommeil(secondes) {#sleepseconds} - -Dormir ‘seconds’ secondes sur chaque bloc de données. Vous pouvez spécifier un nombre entier ou un nombre à virgule flottante. - -## sleepEachRow (secondes) {#sleepeachrowseconds} - -Dormir ‘seconds’ secondes sur chaque ligne. Vous pouvez spécifier un nombre entier ou un nombre à virgule flottante. - -## currentDatabase() {#currentdatabase} - -Retourne le nom de la base de données actuelle. -Vous pouvez utiliser cette fonction dans les paramètres du moteur de table dans une requête CREATE TABLE où vous devez spécifier la base de données. - -## currentUser() {#other-function-currentuser} - -Renvoie la connexion de l'utilisateur actuel. La connexion de l'utilisateur, cette requête initiée, sera renvoyée en cas de requête distibuted. - -``` sql -SELECT currentUser(); -``` - -Alias: `user()`, `USER()`. - -**Valeurs renvoyées** - -- Connexion de l'utilisateur actuel. -- Connexion de l'utilisateur qui a lancé la requête en cas de requête distribuée. - -Type: `String`. 
- -**Exemple** - -Requête: - -``` sql -SELECT currentUser(); -``` - -Résultat: - -``` text -┌─currentUser()─┐ -│ default │ -└───────────────┘ -``` - -## isConstant {#is-constant} - -Vérifie si l'argument est une expression constante. - -A constant expression means an expression whose resulting value is known at the query analysis (i.e. before execution). For example, expressions over [littéral](../syntax.md#literals) sont des expressions constantes. - -La fonction est destinée au développement, au débogage et à la démonstration. - -**Syntaxe** - -``` sql -isConstant(x) -``` - -**Paramètre** - -- `x` — Expression to check. - -**Valeurs renvoyées** - -- `1` — `x` est constante. -- `0` — `x` est non constante. - -Type: [UInt8](../data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT isConstant(x + 1) FROM (SELECT 43 AS x) -``` - -Résultat: - -``` text -┌─isConstant(plus(x, 1))─┐ -│ 1 │ -└────────────────────────┘ -``` - -Requête: - -``` sql -WITH 3.14 AS pi SELECT isConstant(cos(pi)) -``` - -Résultat: - -``` text -┌─isConstant(cos(pi))─┐ -│ 1 │ -└─────────────────────┘ -``` - -Requête: - -``` sql -SELECT isConstant(number) FROM numbers(1) -``` - -Résultat: - -``` text -┌─isConstant(number)─┐ -│ 0 │ -└────────────────────┘ -``` - -## isFinite (x) {#isfinitex} - -Accepte Float32 et Float64 et renvoie UInt8 égal à 1 si l'argument n'est pas infini et pas un NaN, sinon 0. - -## isInfinite (x) {#isinfinitex} - -Accepte Float32 et Float64 et renvoie UInt8 égal à 1 si l'argument est infini, sinon 0. Notez que 0 est retourné pour un NaN. - -## ifNotFinite {#ifnotfinite} - -Vérifie si la valeur à virgule flottante est finie. - -**Syntaxe** - - ifNotFinite(x,y) - -**Paramètre** - -- `x` — Value to be checked for infinity. Type: [Flottant\*](../../sql-reference/data-types/float.md). -- `y` — Fallback value. Type: [Flottant\*](../../sql-reference/data-types/float.md). - -**Valeur renvoyée** - -- `x` si `x` est finie. -- `y` si `x` n'est pas finie. - -**Exemple** - -Requête: - - SELECT 1/0 as infimum, ifNotFinite(infimum,42) - -Résultat: - - ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ - │ inf │ 42 │ - └─────────┴───────────────────────────────┘ - -Vous pouvez obtenir un résultat similaire en utilisant [opérateur ternaire](conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. - -## isNaN (x) {#isnanx} - -Accepte Float32 et Float64 et renvoie UInt8 égal à 1 si l'argument est un NaN, sinon 0. - -## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) {#hascolumnintablehostname-username-password-database-table-column} - -Accepte les chaînes constantes: nom de la base de données, nom de la table et nom de la colonne. Renvoie une expression constante UInt8 égale à 1 s'il y a une colonne, sinon 0. Si le paramètre hostname est défini, le test s'exécutera sur un serveur distant. -La fonction renvoie une exception si la table n'existe pas. -Pour les éléments imbriqués structure des données, la fonction vérifie l'existence d'une colonne. Pour la structure de données imbriquée elle-même, la fonction renvoie 0. - -## bar {#function-bar} - -Permet de construire un diagramme unicode-art. - -`bar(x, min, max, width)` dessine une bande avec une largeur proportionnelle à `(x - min)` et égale à `width` les caractères lors de la `x = max`. - -Paramètre: - -- `x` — Size to display. -- `min, max` — Integer constants. The value must fit in `Int64`. -- `width` — Constant, positive integer, can be fractional. 
- -La bande dessinée avec précision à un huitième d'un symbole. - -Exemple: - -``` sql -SELECT - toHour(EventTime) AS h, - count() AS c, - bar(c, 0, 600000, 20) AS bar -FROM test.hits -GROUP BY h -ORDER BY h ASC -``` - -``` text -┌──h─┬──────c─┬─bar────────────────┐ -│ 0 │ 292907 │ █████████▋ │ -│ 1 │ 180563 │ ██████ │ -│ 2 │ 114861 │ ███▋ │ -│ 3 │ 85069 │ ██▋ │ -│ 4 │ 68543 │ ██▎ │ -│ 5 │ 78116 │ ██▌ │ -│ 6 │ 113474 │ ███▋ │ -│ 7 │ 170678 │ █████▋ │ -│ 8 │ 278380 │ █████████▎ │ -│ 9 │ 391053 │ █████████████ │ -│ 10 │ 457681 │ ███████████████▎ │ -│ 11 │ 493667 │ ████████████████▍ │ -│ 12 │ 509641 │ ████████████████▊ │ -│ 13 │ 522947 │ █████████████████▍ │ -│ 14 │ 539954 │ █████████████████▊ │ -│ 15 │ 528460 │ █████████████████▌ │ -│ 16 │ 539201 │ █████████████████▊ │ -│ 17 │ 523539 │ █████████████████▍ │ -│ 18 │ 506467 │ ████████████████▊ │ -│ 19 │ 520915 │ █████████████████▎ │ -│ 20 │ 521665 │ █████████████████▍ │ -│ 21 │ 542078 │ ██████████████████ │ -│ 22 │ 493642 │ ████████████████▍ │ -│ 23 │ 400397 │ █████████████▎ │ -└────┴────────┴────────────────────┘ -``` - -## transformer {#transform} - -Transforme une valeur en fonction explicitement définis cartographie de certains éléments à l'autre. -Il existe deux variantes de cette fonction: - -### de transformation(x, array_from, array_to, par défaut) {#transformx-array-from-array-to-default} - -`x` – What to transform. - -`array_from` – Constant array of values for converting. - -`array_to` – Constant array of values to convert the values in ‘from’ de. - -`default` – Which value to use if ‘x’ n'est pas égale à une des valeurs de ‘from’. - -`array_from` et `array_to` – Arrays of the same size. - -Type: - -`transform(T, Array(T), Array(U), U) -> U` - -`T` et `U` peuvent être des types numériques, chaîne ou Date ou DateTime. -Lorsque la même lettre est indiquée (T ou U), pour les types numériques, il se peut qu'il ne s'agisse pas de types correspondants, mais de types ayant un type commun. -Par exemple, le premier argument peut avoir le type Int64, tandis que le second a le type Array(UInt16). - -Si l' ‘x’ la valeur est égale à l'un des éléments dans la ‘array_from’ tableau, elle renvoie l'élément existant (qui est numéroté de même) de la ‘array_to’ tableau. Sinon, elle renvoie ‘default’. S'il y a plusieurs éléments correspondants dans ‘array_from’ il renvoie l'un des matches. - -Exemple: - -``` sql -SELECT - transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, - count() AS c -FROM test.hits -WHERE SearchEngineID != 0 -GROUP BY title -ORDER BY c DESC -``` - -``` text -┌─title─────┬──────c─┐ -│ Yandex │ 498635 │ -│ Google │ 229872 │ -│ Other │ 104472 │ -└───────────┴────────┘ -``` - -### de transformation(x, array_from, array_to) {#transformx-array-from-array-to} - -Diffère de la première variation en ce que le ‘default’ l'argument est omis. -Si l' ‘x’ la valeur est égale à l'un des éléments dans la ‘array_from’ tableau, elle renvoie l'élément correspondant (qui est numéroté de même) de la ‘array_to’ tableau. Sinon, elle renvoie ‘x’. 
- -Type: - -`transform(T, Array(T), Array(T)) -> T` - -Exemple: - -``` sql -SELECT - transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, - count() AS c -FROM test.hits -GROUP BY domain(Referer) -ORDER BY count() DESC -LIMIT 10 -``` - -``` text -┌─s──────────────┬───────c─┐ -│ │ 2906259 │ -│ www.yandex │ 867767 │ -│ ███████.ru │ 313599 │ -│ mail.yandex.ru │ 107147 │ -│ ██████.ru │ 100355 │ -│ █████████.ru │ 65040 │ -│ news.yandex.ru │ 64515 │ -│ ██████.net │ 59141 │ -│ example.com │ 57316 │ -└────────────────┴─────────┘ -``` - -## formatReadableSize (x) {#formatreadablesizex} - -Accepte la taille (nombre d'octets). Renvoie une taille arrondie avec un suffixe (KiB, MiB, etc.) comme une chaîne de caractères. - -Exemple: - -``` sql -SELECT - arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, - formatReadableSize(filesize_bytes) AS filesize -``` - -``` text -┌─filesize_bytes─┬─filesize───┐ -│ 1 │ 1.00 B │ -│ 1024 │ 1.00 KiB │ -│ 1048576 │ 1.00 MiB │ -│ 192851925 │ 183.92 MiB │ -└────────────────┴────────────┘ -``` - -## moins (a, b) {#leasta-b} - -Renvoie la plus petite valeur de a et b. - -## la plus grande(a, b) {#greatesta-b} - -Renvoie la plus grande valeur de a et B. - -## le temps de disponibilité() {#uptime} - -Renvoie la disponibilité du serveur en quelques secondes. - -## version() {#version} - -Renvoie la version du serveur sous forme de chaîne. - -## fuseau() {#timezone} - -Retourne le fuseau horaire du serveur. - -## blockNumber {#blocknumber} - -Renvoie le numéro de séquence du bloc de données où se trouve la ligne. - -## rowNumberInBlock {#function-rownumberinblock} - -Renvoie le numéro de séquence de la ligne dans le bloc de données. Différents blocs de données sont toujours recalculés. - -## rowNumberInAllBlocks() {#rownumberinallblocks} - -Renvoie le numéro de séquence de la ligne dans le bloc de données. Cette fonction ne prend en compte que les blocs de données affectés. - -## voisin {#neighbor} - -La fonction de fenêtre qui donne accès à une ligne à un décalage spécifié qui vient avant ou après la ligne actuelle d'une colonne donnée. - -**Syntaxe** - -``` sql -neighbor(column, offset[, default_value]) -``` - -Le résultat de la fonction dépend du touché des blocs de données et l'ordre des données dans le bloc. -Si vous créez une sous-requête avec ORDER BY et appelez la fonction depuis l'extérieur de la sous-requête, vous pouvez obtenir le résultat attendu. - -**Paramètre** - -- `column` — A column name or scalar expression. -- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../sql-reference/data-types/int-uint.md). -- `default_value` — Optional. The value to be returned if offset goes beyond the scope of the block. Type of data blocks affected. - -**Valeurs renvoyées** - -- De la valeur pour `column` dans `offset` distance de la ligne actuelle si `offset` la valeur n'est pas en dehors des limites du bloc. -- La valeur par défaut pour `column` si `offset` la valeur est en dehors des limites du bloc. Si `default_value` est donné, alors il sera utilisé. - -Type: type de blocs de données affectés ou type de valeur par défaut. 
- -**Exemple** - -Requête: - -``` sql -SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; -``` - -Résultat: - -``` text -┌─number─┬─neighbor(number, 2)─┐ -│ 0 │ 2 │ -│ 1 │ 3 │ -│ 2 │ 4 │ -│ 3 │ 5 │ -│ 4 │ 6 │ -│ 5 │ 7 │ -│ 6 │ 8 │ -│ 7 │ 9 │ -│ 8 │ 0 │ -│ 9 │ 0 │ -└────────┴─────────────────────┘ -``` - -Requête: - -``` sql -SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; -``` - -Résultat: - -``` text -┌─number─┬─neighbor(number, 2, 999)─┐ -│ 0 │ 2 │ -│ 1 │ 3 │ -│ 2 │ 4 │ -│ 3 │ 5 │ -│ 4 │ 6 │ -│ 5 │ 7 │ -│ 6 │ 8 │ -│ 7 │ 9 │ -│ 8 │ 999 │ -│ 9 │ 999 │ -└────────┴──────────────────────────┘ -``` - -Cette fonction peut être utilisée pour calculer une année à valeur métrique: - -Requête: - -``` sql -WITH toDate('2018-01-01') AS start_date -SELECT - toStartOfMonth(start_date + (number * 32)) AS month, - toInt32(month) % 100 AS money, - neighbor(money, -12) AS prev_year, - round(prev_year / money, 2) AS year_over_year -FROM numbers(16) -``` - -Résultat: - -``` text -┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ -│ 2018-01-01 │ 32 │ 0 │ 0 │ -│ 2018-02-01 │ 63 │ 0 │ 0 │ -│ 2018-03-01 │ 91 │ 0 │ 0 │ -│ 2018-04-01 │ 22 │ 0 │ 0 │ -│ 2018-05-01 │ 52 │ 0 │ 0 │ -│ 2018-06-01 │ 83 │ 0 │ 0 │ -│ 2018-07-01 │ 13 │ 0 │ 0 │ -│ 2018-08-01 │ 44 │ 0 │ 0 │ -│ 2018-09-01 │ 75 │ 0 │ 0 │ -│ 2018-10-01 │ 5 │ 0 │ 0 │ -│ 2018-11-01 │ 36 │ 0 │ 0 │ -│ 2018-12-01 │ 66 │ 0 │ 0 │ -│ 2019-01-01 │ 97 │ 32 │ 0.33 │ -│ 2019-02-01 │ 28 │ 63 │ 2.25 │ -│ 2019-03-01 │ 56 │ 91 │ 1.62 │ -│ 2019-04-01 │ 87 │ 22 │ 0.25 │ -└────────────┴───────┴───────────┴────────────────┘ -``` - -## runningDifference(x) {#other_functions-runningdifference} - -Calculates the difference between successive row values ​​in the data block. -Renvoie 0 pour la première ligne et la différence par rapport à la rangée précédente pour chaque nouvelle ligne. - -Le résultat de la fonction dépend du touché des blocs de données et l'ordre des données dans le bloc. -Si vous créez une sous-requête avec ORDER BY et appelez la fonction depuis l'extérieur de la sous-requête, vous pouvez obtenir le résultat attendu. - -Exemple: - -``` sql -SELECT - EventID, - EventTime, - runningDifference(EventTime) AS delta -FROM -( - SELECT - EventID, - EventTime - FROM events - WHERE EventDate = '2016-11-24' - ORDER BY EventTime ASC - LIMIT 5 -) -``` - -``` text -┌─EventID─┬───────────EventTime─┬─delta─┐ -│ 1106 │ 2016-11-24 00:00:04 │ 0 │ -│ 1107 │ 2016-11-24 00:00:05 │ 1 │ -│ 1108 │ 2016-11-24 00:00:05 │ 0 │ -│ 1109 │ 2016-11-24 00:00:09 │ 4 │ -│ 1110 │ 2016-11-24 00:00:10 │ 1 │ -└─────────┴─────────────────────┴───────┘ -``` - -Veuillez noter que la taille du bloc affecte le résultat. Avec chaque nouveau bloc, le `runningDifference` l'état est réinitialisé. - -``` sql -SELECT - number, - runningDifference(number + 1) AS diff -FROM numbers(100000) -WHERE diff != 1 -``` - -``` text -┌─number─┬─diff─┐ -│ 0 │ 0 │ -└────────┴──────┘ -┌─number─┬─diff─┐ -│ 65536 │ 0 │ -└────────┴──────┘ -``` - -``` sql -set max_block_size=100000 -- default value is 65536! 
- -SELECT - number, - runningDifference(number + 1) AS diff -FROM numbers(100000) -WHERE diff != 1 -``` - -``` text -┌─number─┬─diff─┐ -│ 0 │ 0 │ -└────────┴──────┘ -``` - -## runningDifferenceStartingWithFirstvalue {#runningdifferencestartingwithfirstvalue} - -De même que pour [runningDifference](./other-functions.md#other_functions-runningdifference) la différence est la valeur de la première ligne, est retourné à la valeur de la première ligne, et chaque rangée suivante renvoie la différence de la rangée précédente. - -## MACNumToString (num) {#macnumtostringnum} - -Accepte un numéro UInt64. Interprète comme une adresse MAC dans big endian. Renvoie une chaîne contenant l'adresse MAC correspondante au format AA:BB:CC: DD:EE: FF (Nombres séparés par deux points sous forme hexadécimale). - -## MACStringToNum (s) {#macstringtonums} - -La fonction inverse de MACNumToString. Si l'adresse MAC a un format non valide, elle renvoie 0. - -## MACStringToOUI (s) {#macstringtoouis} - -Accepte une adresse MAC au format AA:BB:CC: DD:EE: FF (Nombres séparés par deux points sous forme hexadécimale). Renvoie les trois premiers octets sous la forme D'un nombre UInt64. Si l'adresse MAC a un format non valide, elle renvoie 0. - -## getSizeOfEnumType {#getsizeofenumtype} - -Retourne le nombre de champs dans [Enum](../../sql-reference/data-types/enum.md). - -``` sql -getSizeOfEnumType(value) -``` - -**Paramètre:** - -- `value` — Value of type `Enum`. - -**Valeurs renvoyées** - -- Le nombre de champs avec `Enum` les valeurs d'entrée. -- Une exception est levée si le type n'est pas `Enum`. - -**Exemple** - -``` sql -SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x -``` - -``` text -┌─x─┐ -│ 2 │ -└───┘ -``` - -## blockSerializedSize {#blockserializedsize} - -Retourne la taille sur le disque (sans tenir compte de la compression). - -``` sql -blockSerializedSize(value[, value[, ...]]) -``` - -**Paramètre:** - -- `value` — Any value. - -**Valeurs renvoyées** - -- Le nombre d'octets qui seront écrites sur le disque pour le bloc de valeurs (sans compression). - -**Exemple** - -``` sql -SELECT blockSerializedSize(maxState(1)) as x -``` - -``` text -┌─x─┐ -│ 2 │ -└───┘ -``` - -## toColumnTypeName {#tocolumntypename} - -Renvoie le nom de la classe qui représente le type de données de la colonne dans la RAM. - -``` sql -toColumnTypeName(value) -``` - -**Paramètre:** - -- `value` — Any type of value. - -**Valeurs renvoyées** - -- Une chaîne avec le nom de la classe utilisée pour représenter `value` type de données dans la mémoire RAM. - -**Exemple de la différence entre`toTypeName ' and ' toColumnTypeName`** - -``` sql -SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) -``` - -``` text -┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ DateTime │ -└─────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) -``` - -``` text -┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ Const(UInt32) │ -└───────────────────────────────────────────────────────────┘ -``` - -L'exemple montre que le `DateTime` type de données est stocké dans la mémoire comme `Const(UInt32)`. - -## dumpColumnStructure {#dumpcolumnstructure} - -Affiche une description détaillée des structures de données en RAM - -``` sql -dumpColumnStructure(value) -``` - -**Paramètre:** - -- `value` — Any type of value. 
- -**Valeurs renvoyées** - -- Une chaîne décrivant la structure utilisée pour représenter `value` type de données dans la mémoire RAM. - -**Exemple** - -``` sql -SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) -``` - -``` text -┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ DateTime, Const(size = 1, UInt32(size = 1)) │ -└──────────────────────────────────────────────────────────────┘ -``` - -## defaultValueOfArgumentType {#defaultvalueofargumenttype} - -Affiche la valeur par défaut du type de données. - -Ne pas inclure des valeurs par défaut pour les colonnes personnalisées définies par l'utilisateur. - -``` sql -defaultValueOfArgumentType(expression) -``` - -**Paramètre:** - -- `expression` — Arbitrary type of value or an expression that results in a value of an arbitrary type. - -**Valeurs renvoyées** - -- `0` pour les nombres. -- Chaîne vide pour les chaînes. -- `ᴺᵁᴸᴸ` pour [Nullable](../../sql-reference/data-types/nullable.md). - -**Exemple** - -``` sql -SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) -``` - -``` text -┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ -│ 0 │ -└─────────────────────────────────────────────┘ -``` - -``` sql -SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) -``` - -``` text -┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ -│ ᴺᵁᴸᴸ │ -└───────────────────────────────────────────────────────┘ -``` - -## reproduire {#other-functions-replicate} - -Crée un tableau avec une seule valeur. - -Utilisé pour la mise en œuvre interne de [arrayJoin](array-join.md#functions_arrayjoin). - -``` sql -SELECT replicate(x, arr); -``` - -**Paramètre:** - -- `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. -- `x` — The value that the resulting array will be filled with. - -**Valeur renvoyée** - -Un tableau rempli de la valeur `x`. - -Type: `Array`. - -**Exemple** - -Requête: - -``` sql -SELECT replicate(1, ['a', 'b', 'c']) -``` - -Résultat: - -``` text -┌─replicate(1, ['a', 'b', 'c'])─┐ -│ [1,1,1] │ -└───────────────────────────────┘ -``` - -## filesystemAvailable {#filesystemavailable} - -Renvoie la quantité d'espace restant sur le système de fichiers où se trouvent les fichiers des bases de données. Il est toujours plus petit que l'espace libre total ([filesystemFree](#filesystemfree)) parce qu'un peu d'espace est réservé au système D'exploitation. - -**Syntaxe** - -``` sql -filesystemAvailable() -``` - -**Valeur renvoyée** - -- La quantité d'espace restant disponible en octets. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeName(filesystemAvailable()) AS "Type"; -``` - -Résultat: - -``` text -┌─Available space─┬─Type───┐ -│ 30.75 GiB │ UInt64 │ -└─────────────────┴────────┘ -``` - -## filesystemFree {#filesystemfree} - -Retourne montant total de l'espace libre sur le système de fichiers où les fichiers des bases de données. Voir aussi `filesystemAvailable` - -**Syntaxe** - -``` sql -filesystemFree() -``` - -**Valeur renvoyée** - -- Quantité d'espace libre en octets. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
- -**Exemple** - -Requête: - -``` sql -SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; -``` - -Résultat: - -``` text -┌─Free space─┬─Type───┐ -│ 32.39 GiB │ UInt64 │ -└────────────┴────────┘ -``` - -## filesystemCapacity {#filesystemcapacity} - -Renvoie la capacité du système de fichiers en octets. Pour l'évaluation, la [chemin](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) le répertoire de données doit être configuré. - -**Syntaxe** - -``` sql -filesystemCapacity() -``` - -**Valeur renvoyée** - -- Informations de capacité du système de fichiers en octets. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -**Exemple** - -Requête: - -``` sql -SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" -``` - -Résultat: - -``` text -┌─Capacity──┬─Type───┐ -│ 39.32 GiB │ UInt64 │ -└───────────┴────────┘ -``` - -## finalizeAggregation {#function-finalizeaggregation} - -Prend de l'état de la fonction d'agrégation. Renvoie le résultat de l'agrégation (état finalisé). - -## runningAccumulate {#function-runningaccumulate} - -Prend les membres de la fonction d'agrégation et renvoie une colonne avec des valeurs, sont le résultat de l'accumulation de ces états pour un ensemble de bloc de lignes, de la première à la ligne actuelle. -Par exemple, prend l'état de la fonction d'agrégat (exemple runningAccumulate(uniqState(UserID))), et pour chaque ligne de bloc, retourne le résultat de la fonction d'agrégat lors de la fusion des états de toutes les lignes précédentes et de la ligne actuelle. -Ainsi, le résultat de la fonction dépend de la partition des données aux blocs et de l'ordre des données dans le bloc. - -## joinGet {#joinget} - -La fonction vous permet d'extraire les données de la table de la même manière qu'à partir d'un [dictionnaire](../../sql-reference/dictionaries/index.md). - -Obtient les données de [Rejoindre](../../engines/table-engines/special/join.md#creating-a-table) tables utilisant la clé de jointure spécifiée. - -Ne prend en charge que les tables créées avec `ENGINE = Join(ANY, LEFT, )` déclaration. - -**Syntaxe** - -``` sql -joinGet(join_storage_table_name, `value_column`, join_keys) -``` - -**Paramètre** - -- `join_storage_table_name` — an [identificateur](../syntax.md#syntax-identifiers) indique l'endroit où la recherche est effectuée. L'identificateur est recherché dans la base de données par défaut (voir paramètre `default_database` dans le fichier de config). Pour remplacer la base de données par défaut, utilisez `USE db_name` ou spécifiez la base de données et la table via le séparateur `db_name.db_table` voir l'exemple. -- `value_column` — name of the column of the table that contains required data. -- `join_keys` — list of keys. - -**Valeur renvoyée** - -Retourne la liste des valeurs correspond à la liste des clés. - -Si certain n'existe pas dans la table source alors `0` ou `null` seront renvoyés basé sur [join_use_nulls](../../operations/settings/settings.md#join_use_nulls) paramètre. - -Plus d'infos sur `join_use_nulls` dans [Opération de jointure](../../engines/table-engines/special/join.md). 
- -**Exemple** - -Table d'entrée: - -``` sql -CREATE DATABASE db_test -CREATE TABLE db_test.id_val(`id` UInt32, `val` UInt32) ENGINE = Join(ANY, LEFT, id) SETTINGS join_use_nulls = 1 -INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) -``` - -``` text -┌─id─┬─val─┐ -│ 4 │ 13 │ -│ 2 │ 12 │ -│ 1 │ 11 │ -└────┴─────┘ -``` - -Requête: - -``` sql -SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 -``` - -Résultat: - -``` text -┌─joinGet(db_test.id_val, 'val', toUInt32(number))─┐ -│ 0 │ -│ 11 │ -│ 12 │ -│ 0 │ -└──────────────────────────────────────────────────┘ -``` - -## modelEvaluate(model_name, …) {#function-modelevaluate} - -Évaluer le modèle externe. -Accepte un nom de modèle et le modèle de l'argumentation. Renvoie Float64. - -## throwIf (x \[, custom_message\]) {#throwifx-custom-message} - -Lever une exception si l'argument est non nul. -custom_message - est un paramètre optionnel: une chaîne constante, fournit un message d'erreur - -``` sql -SELECT throwIf(number = 3, 'Too many') FROM numbers(10); -``` - -``` text -↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): -Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. -``` - -## identité {#identity} - -Renvoie la même valeur qui a été utilisée comme argument. Utilisé pour le débogage et les tests, permet d'annuler l'utilisation de l'index et d'obtenir les performances de requête d'une analyse complète. Lorsque la requête est analysée pour une utilisation possible de l'index, l'analyseur ne regarde pas à l'intérieur `identity` fonction. - -**Syntaxe** - -``` sql -identity(x) -``` - -**Exemple** - -Requête: - -``` sql -SELECT identity(42) -``` - -Résultat: - -``` text -┌─identity(42)─┐ -│ 42 │ -└──────────────┘ -``` - -## randomPrintableASCII {#randomascii} - -Génère une chaîne avec un ensemble aléatoire de [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) caractères imprimables. - -**Syntaxe** - -``` sql -randomPrintableASCII(length) -``` - -**Paramètre** - -- `length` — Resulting string length. Positive integer. - - If you pass `length < 0`, behavior of the function is undefined. - -**Valeur renvoyée** - -- Chaîne avec un ensemble aléatoire de [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) caractères imprimables. 
- -Type: [Chaîne](../../sql-reference/data-types/string.md) - -**Exemple** - -``` sql -SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 -``` - -``` text -┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ -│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ -│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ -│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │ -└────────┴────────────────────────────────┴──────────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/other_functions/) diff --git a/docs/fr/sql-reference/functions/random-functions.md b/docs/fr/sql-reference/functions/random-functions.md deleted file mode 100644 index 3c4e15507bb..00000000000 --- a/docs/fr/sql-reference/functions/random-functions.md +++ /dev/null @@ -1,65 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 51 -toc_title: "La G\xE9n\xE9ration De Nombres Pseudo-Al\xE9atoires" ---- - -# Fonctions pour générer des nombres Pseudo-aléatoires {#functions-for-generating-pseudo-random-numbers} - -Des générateurs Non cryptographiques de nombres pseudo-aléatoires sont utilisés. - -Toutes les fonctions acceptent zéro argument ou un argument. -Si un argument est passé, il peut être de n'importe quel type, et sa valeur n'est utilisée pour rien. -Le seul but de cet argument est d'empêcher l'élimination des sous-expressions courantes, de sorte que deux instances différentes de la même fonction renvoient des colonnes différentes avec des nombres aléatoires différents. - -## Rand {#rand} - -Renvoie un nombre UInt32 pseudo-aléatoire, réparti uniformément entre tous les nombres de type UInt32. -Utilise un générateur congruentiel linéaire. - -## rand64 {#rand64} - -Renvoie un nombre UInt64 pseudo-aléatoire, réparti uniformément entre tous les nombres de type UInt64. -Utilise un générateur congruentiel linéaire. - -## randConstant {#randconstant} - -Produit une colonne constante avec une valeur aléatoire. - -**Syntaxe** - -``` sql -randConstant([x]) -``` - -**Paramètre** - -- `x` — [Expression](../syntax.md#syntax-expressions) résultant de la [types de données pris en charge](../data-types/index.md#data_types). La valeur résultante est ignorée, mais l'expression elle-même si elle est utilisée pour contourner [élimination des sous-expressions courantes](index.md#common-subexpression-elimination) si la fonction est appelée plusieurs fois dans une seule requête. Paramètre facultatif. - -**Valeur renvoyée** - -- Nombre Pseudo-aléatoire. - -Type: [UInt32](../data-types/int-uint.md). 
- -**Exemple** - -Requête: - -``` sql -SELECT rand(), rand(1), rand(number), randConstant(), randConstant(1), randConstant(number) -FROM numbers(3) -``` - -Résultat: - -``` text -┌─────rand()─┬────rand(1)─┬─rand(number)─┬─randConstant()─┬─randConstant(1)─┬─randConstant(number)─┐ -│ 3047369878 │ 4132449925 │ 4044508545 │ 2740811946 │ 4229401477 │ 1924032898 │ -│ 2938880146 │ 1267722397 │ 4154983056 │ 2740811946 │ 4229401477 │ 1924032898 │ -│ 956619638 │ 4238287282 │ 1104342490 │ 2740811946 │ 4229401477 │ 1924032898 │ -└────────────┴────────────┴──────────────┴────────────────┴─────────────────┴──────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/random_functions/) diff --git a/docs/fr/sql-reference/functions/rounding-functions.md b/docs/fr/sql-reference/functions/rounding-functions.md deleted file mode 100644 index f99e6358026..00000000000 --- a/docs/fr/sql-reference/functions/rounding-functions.md +++ /dev/null @@ -1,190 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: Arrondi ---- - -# Fonctions D'Arrondi {#rounding-functions} - -## floor(x\[, N\]) {#floorx-n} - -Renvoie le plus grand nombre rond inférieur ou égal à `x`. Un nombre rond est un multiple de 1 / 10N, ou le nombre le plus proche du type de données approprié si 1 / 10N n'est pas exact. -‘N’ est une constante entière, paramètre facultatif. Par défaut, il est zéro, ce qui signifie arrondir à un entier. -‘N’ peut être négative. - -Exemple: `floor(123.45, 1) = 123.4, floor(123.45, -1) = 120.` - -`x` est n'importe quel type numérique. Le résultat est un nombre du même type. -Pour les arguments entiers, il est logique d'arrondir avec un négatif `N` valeur (pour non négatif `N`, la fonction ne fait rien). -Si l'arrondi provoque un débordement (par exemple, floor(-128, -1)), un résultat spécifique à l'implémentation est renvoyé. - -## ceil(x\[, n\]), plafond (x\[, n\]) {#ceilx-n-ceilingx-n} - -Renvoie le plus petit nombre rond supérieur ou égal à `x`. Dans tous les autres sens, il est le même que le `floor` fonction (voir ci-dessus). - -## trunc(x \[, N\]), truncate(x \[, N\]) {#truncx-n-truncatex-n} - -Renvoie le nombre rond avec la plus grande valeur absolue qui a une valeur absolue inférieure ou égale à `x`‘s. In every other way, it is the same as the ’floor’ fonction (voir ci-dessus). - -## round(x\[, N\]) {#rounding_functions-round} - -Arrondit une valeur à un nombre spécifié de décimales. - -La fonction renvoie le nombre plus proche de l'ordre spécifié. Dans le cas où un nombre donné a une distance égale aux nombres environnants, la fonction utilise l'arrondi de banquier pour les types de nombres flottants et arrondit à partir de zéro pour les autres types de nombres. - -``` sql -round(expression [, decimal_places]) -``` - -**Paramètre:** - -- `expression` — A number to be rounded. Can be any [expression](../syntax.md#syntax-expressions) retour du numérique [type de données](../../sql-reference/data-types/index.md#data_types). -- `decimal-places` — An integer value. - - Si `decimal-places > 0` alors la fonction arrondit la valeur à droite du point décimal. - - Si `decimal-places < 0` alors la fonction arrondit la valeur à gauche de la virgule décimale. - - Si `decimal-places = 0` alors la fonction arrondit la valeur à l'entier. Dans ce cas, l'argument peut être omis. - -**Valeur renvoyée:** - -Le nombre arrondi du même type que le nombre d'entrée. 
- -### Exemple {#examples} - -**Exemple d'utilisation** - -``` sql -SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 -``` - -``` text -┌───x─┬─round(divide(number, 2))─┐ -│ 0 │ 0 │ -│ 0.5 │ 0 │ -│ 1 │ 1 │ -└─────┴──────────────────────────┘ -``` - -**Des exemples de l'arrondissement** - -Le résultat est arrondi au plus proche. - -``` text -round(3.2, 0) = 3 -round(4.1267, 2) = 4.13 -round(22,-1) = 20 -round(467,-2) = 500 -round(-467,-2) = -500 -``` - -Le Banquier arrondit. - -``` text -round(3.5) = 4 -round(4.5) = 4 -round(3.55, 1) = 3.6 -round(3.65, 1) = 3.6 -``` - -**Voir Aussi** - -- [roundBankers](#roundbankers) - -## roundBankers {#roundbankers} - -Arrondit un nombre à une position décimale spécifiée. - -- Si le nombre est arrondi à mi-chemin entre deux nombres, la fonction utilise l'arrondi. - - Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. - - It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. - -- Dans d'autres cas, la fonction arrondit les nombres à l'entier le plus proche. - -À l'aide de l'arrondi, vous pouvez réduire l'effet qu'arrondir les nombres sur les résultats d'additionner ou de soustraire ces chiffres. - -Par exemple, les nombres de somme 1.5, 2.5, 3.5, 4.5 avec des arrondis différents: - -- Pas d'arrondi: 1.5 + 2.5 + 3.5 + 4.5 = 12. -- Arrondi du banquier: 2 + 2 + 4 + 4 = 12. -- Arrondi à l'entier le plus proche: 2 + 3 + 4 + 5 = 14. - -**Syntaxe** - -``` sql -roundBankers(expression [, decimal_places]) -``` - -**Paramètre** - -- `expression` — A number to be rounded. Can be any [expression](../syntax.md#syntax-expressions) retour du numérique [type de données](../../sql-reference/data-types/index.md#data_types). -- `decimal-places` — Decimal places. An integer number. - - `decimal-places > 0` — The function rounds the number to the given position right of the decimal point. Example: `roundBankers(3.55, 1) = 3.6`. - - `decimal-places < 0` — The function rounds the number to the given position left of the decimal point. Example: `roundBankers(24.55, -1) = 20`. - - `decimal-places = 0` — The function rounds the number to an integer. In this case the argument can be omitted. Example: `roundBankers(2.5) = 2`. - -**Valeur renvoyée** - -Valeur arrondie par la méthode d'arrondi du banquier. - -### Exemple {#examples-1} - -**Exemple d'utilisation** - -Requête: - -``` sql - SELECT number / 2 AS x, roundBankers(x, 0) AS b fROM system.numbers limit 10 -``` - -Résultat: - -``` text -┌───x─┬─b─┐ -│ 0 │ 0 │ -│ 0.5 │ 0 │ -│ 1 │ 1 │ -│ 1.5 │ 2 │ -│ 2 │ 2 │ -│ 2.5 │ 2 │ -│ 3 │ 3 │ -│ 3.5 │ 4 │ -│ 4 │ 4 │ -│ 4.5 │ 4 │ -└─────┴───┘ -``` - -**Exemples d'arrondi bancaire** - -``` text -roundBankers(0.4) = 0 -roundBankers(-3.5) = -4 -roundBankers(4.5) = 4 -roundBankers(3.55, 1) = 3.6 -roundBankers(3.65, 1) = 3.6 -roundBankers(10.35, 1) = 10.4 -roundBankers(10.755, 2) = 11,76 -``` - -**Voir Aussi** - -- [rond](#rounding_functions-round) - -## roundToExp2 (num) {#roundtoexp2num} - -Accepte un certain nombre. Si le nombre est inférieur à un, elle renvoie 0. 
Sinon, il arrondit le nombre au degré le plus proche (entier non négatif) de deux. - -## roundDuration (num) {#rounddurationnum} - -Accepte un certain nombre. Si le nombre est inférieur à un, elle renvoie 0. Sinon, il arrondit le nombre vers le bas pour les nombres de l'ensemble: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. Cette fonction est spécifique à Yandex.Metrica et utilisé pour la mise en œuvre du rapport sur la durée de la session. - -## roundAge (num) {#roundagenum} - -Accepte un certain nombre. Si le nombre est inférieur à 18, il renvoie 0. Sinon, il arrondit le nombre à un nombre de l'ensemble: 18, 25, 35, 45, 55. Cette fonction est spécifique à Yandex.Metrica et utilisé pour la mise en œuvre du rapport sur l'âge des utilisateurs. - -## roundDown(num, arr) {#rounddownnum-arr} - -Accepte un nombre et l'arrondit à un élément dans le tableau spécifié. Si la valeur est inférieure à la plus basse, la plus basse lié est retourné. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/rounding_functions/) diff --git a/docs/fr/sql-reference/functions/splitting-merging-functions.md b/docs/fr/sql-reference/functions/splitting-merging-functions.md deleted file mode 100644 index a1260e918b0..00000000000 --- a/docs/fr/sql-reference/functions/splitting-merging-functions.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 47 -toc_title: "Fractionnement et fusion de cha\xEEnes et de tableaux" ---- - -# Fonctions pour diviser et fusionner des chaînes et des tableaux {#functions-for-splitting-and-merging-strings-and-arrays} - -## splitByChar (séparateur, s) {#splitbycharseparator-s} - -Divise une chaîne en sous-chaînes séparées par un caractère spécifique. Il utilise une chaîne constante `separator` qui composé d'un seul caractère. -Retourne un tableau de certaines chaînes. Les sous-chaînes vides peuvent être sélectionnées si le séparateur se produit au début ou à la fin de la chaîne, ou s'il existe plusieurs séparateurs consécutifs. - -**Syntaxe** - -``` sql -splitByChar(, ) -``` - -**Paramètre** - -- `separator` — The separator which should contain exactly one character. [Chaîne](../../sql-reference/data-types/string.md). -- `s` — The string to split. [Chaîne](../../sql-reference/data-types/string.md). - -**Valeur renvoyée(s)** - -Retourne un tableau de certaines chaînes. Des sous-chaînes vides peuvent être sélectionnées lorsque: - -- Un séparateur se produit au début ou à la fin de la chaîne; -- Il existe plusieurs séparateurs consécutifs; -- La chaîne d'origine `s` est vide. - -Type: [Tableau](../../sql-reference/data-types/array.md) de [Chaîne](../../sql-reference/data-types/string.md). - -**Exemple** - -``` sql -SELECT splitByChar(',', '1,2,3,abcde') -``` - -``` text -┌─splitByChar(',', '1,2,3,abcde')─┐ -│ ['1','2','3','abcde'] │ -└─────────────────────────────────┘ -``` - -## splitByString(séparateur, s) {#splitbystringseparator-s} - -Divise une chaîne en sous-chaînes séparées par une chaîne. Il utilise une chaîne constante `separator` de plusieurs caractères comme séparateur. Si la chaîne `separator` est vide, il va diviser la chaîne `s` dans un tableau de caractères uniques. - -**Syntaxe** - -``` sql -splitByString(, ) -``` - -**Paramètre** - -- `separator` — The separator. [Chaîne](../../sql-reference/data-types/string.md). -- `s` — The string to split. [Chaîne](../../sql-reference/data-types/string.md). 
- -**Valeur renvoyée(s)** - -Retourne un tableau de certaines chaînes. Des sous-chaînes vides peuvent être sélectionnées lorsque: - -Type: [Tableau](../../sql-reference/data-types/array.md) de [Chaîne](../../sql-reference/data-types/string.md). - -- Un séparateur non vide se produit au début ou à la fin de la chaîne; -- Il existe plusieurs séparateurs consécutifs non vides; -- La chaîne d'origine `s` est vide tandis que le séparateur n'est pas vide. - -**Exemple** - -``` sql -SELECT splitByString(', ', '1, 2 3, 4,5, abcde') -``` - -``` text -┌─splitByString(', ', '1, 2 3, 4,5, abcde')─┐ -│ ['1','2 3','4,5','abcde'] │ -└───────────────────────────────────────────┘ -``` - -``` sql -SELECT splitByString('', 'abcde') -``` - -``` text -┌─splitByString('', 'abcde')─┐ -│ ['a','b','c','d','e'] │ -└────────────────────────────┘ -``` - -## arrayStringConcat(arr \[, séparateur\]) {#arraystringconcatarr-separator} - -Concatène les chaînes répertoriées dans le tableau avec le séparateur."séparateur" est un paramètre facultatif: une chaîne constante, définie à une chaîne vide par défaut. -Retourne une chaîne de caractères. - -## alphaTokens (s) {#alphatokenss} - -Sélectionne des sous-chaînes d'octets consécutifs dans les plages A-z et A-Z. retourne un tableau de sous-chaînes. - -**Exemple** - -``` sql -SELECT alphaTokens('abca1abc') -``` - -``` text -┌─alphaTokens('abca1abc')─┐ -│ ['abca','abc'] │ -└─────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/splitting_merging_functions/) diff --git a/docs/fr/sql-reference/functions/string-functions.md b/docs/fr/sql-reference/functions/string-functions.md deleted file mode 100644 index 1482952426c..00000000000 --- a/docs/fr/sql-reference/functions/string-functions.md +++ /dev/null @@ -1,489 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: "Travailler avec des cha\xEEnes" ---- - -# Fonctions pour travailler avec des chaînes {#functions-for-working-with-strings} - -## vide {#empty} - -Renvoie 1 pour une chaîne vide ou 0 pour une chaîne non vide. -Le type de résultat est UInt8. -Une chaîne est considérée comme non vide si elle contient au moins un octet, même s'il s'agit d'un espace ou d'un octet nul. -La fonction fonctionne également pour les tableaux. - -## notEmpty {#notempty} - -Renvoie 0 pour une chaîne vide ou 1 pour une chaîne non vide. -Le type de résultat est UInt8. -La fonction fonctionne également pour les tableaux. - -## longueur {#length} - -Renvoie la longueur d'une chaîne en octets (pas en caractères, et pas en points de code). -Le type de résultat est UInt64. -La fonction fonctionne également pour les tableaux. - -## lengthUTF8 {#lengthutf8} - -Renvoie la longueur d'une chaîne en points de code Unicode (pas en caractères), en supposant que la chaîne contient un ensemble d'octets qui composent le texte codé en UTF-8. Si cette hypothèse n'est pas remplie, elle renvoie un résultat (elle ne lance pas d'exception). -Le type de résultat est UInt64. - -## char_length, CHAR_LENGTH {#char-length} - -Renvoie la longueur d'une chaîne en points de code Unicode (pas en caractères), en supposant que la chaîne contient un ensemble d'octets qui composent le texte codé en UTF-8. Si cette hypothèse n'est pas remplie, elle renvoie un résultat (elle ne lance pas d'exception). -Le type de résultat est UInt64. 
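-
-Exemple indicatif montrant la différence entre `length` (octets) et `lengthUTF8` / `char_length` (points de code Unicode) sur une chaîne UTF-8 :
-
-``` sql
-SELECT length('été') AS octets, lengthUTF8('été') AS points_de_code, char_length('été') AS char_len
-```
-
-Pour cette chaîne, la requête devrait renvoyer respectivement `5`, `3` et `3`.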
- -## character_length, CHARACTER_LENGTH {#character-length} - -Renvoie la longueur d'une chaîne en points de code Unicode (pas en caractères), en supposant que la chaîne contient un ensemble d'octets qui composent le texte codé en UTF-8. Si cette hypothèse n'est pas remplie, elle renvoie un résultat (elle ne lance pas d'exception). -Le type de résultat est UInt64. - -## plus bas, lcase {#lower} - -Convertit les symboles latins ASCII dans une chaîne en minuscules. - -## supérieur, ucase {#upper} - -Convertit les symboles latins ASCII dans une chaîne en majuscules. - -## lowerUTF8 {#lowerutf8} - -Convertit une chaîne en minuscules, en supposant que la chaîne de caractères contient un ensemble d'octets qui composent un texte UTF-8. -Il ne détecte pas la langue. Donc, pour le turc, le résultat pourrait ne pas être exactement correct. -Si la longueur de la séquence d'octets UTF-8 est différente pour les majuscules et les minuscules d'un point de code, le résultat peut être incorrect pour ce point de code. -Si la chaîne contient un ensemble d'octets qui N'est pas UTF-8, le comportement n'est pas défini. - -## upperUTF8 {#upperutf8} - -Convertit une chaîne en majuscules, en supposant que la chaîne de caractères contient un ensemble d'octets qui composent un texte UTF-8. -Il ne détecte pas la langue. Donc, pour le turc, le résultat pourrait ne pas être exactement correct. -Si la longueur de la séquence d'octets UTF-8 est différente pour les majuscules et les minuscules d'un point de code, le résultat peut être incorrect pour ce point de code. -Si la chaîne contient un ensemble d'octets qui N'est pas UTF-8, le comportement n'est pas défini. - -## isValidUTF8 {#isvalidutf8} - -Renvoie 1, si l'ensemble d'octets est codé en UTF-8 valide, sinon 0. - -## toValidUTF8 {#tovalidutf8} - -Remplace les caractères UTF-8 non valides par `�` (U+FFFD) caractère. Tous les caractères non valides s'exécutant dans une rangée sont réduits en un seul caractère de remplacement. - -``` sql -toValidUTF8( input_string ) -``` - -Paramètre: - -- input_string — Any set of bytes represented as the [Chaîne](../../sql-reference/data-types/string.md) type de données objet. - -Valeur renvoyée: chaîne UTF-8 valide. - -**Exemple** - -``` sql -SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') -``` - -``` text -┌─toValidUTF8('a����b')─┐ -│ a�b │ -└───────────────────────┘ -``` - -## répéter {#repeat} - -Répète une corde autant de fois que spécifié et concatène les valeurs répliquées comme une seule chaîne. - -**Syntaxe** - -``` sql -repeat(s, n) -``` - -**Paramètre** - -- `s` — The string to repeat. [Chaîne](../../sql-reference/data-types/string.md). -- `n` — The number of times to repeat the string. [UInt](../../sql-reference/data-types/int-uint.md). - -**Valeur renvoyée** - -La chaîne unique, qui contient la chaîne `s` répéter `n` temps. Si `n` \< 1, la fonction renvoie une chaîne vide. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT repeat('abc', 10) -``` - -Résultat: - -``` text -┌─repeat('abc', 10)──────────────┐ -│ abcabcabcabcabcabcabcabcabcabc │ -└────────────────────────────────┘ -``` - -## inverser {#reverse} - -Inverse la chaîne (comme une séquence d'octets). - -## reverseUTF8 {#reverseutf8} - -Inverse une séquence de points de code Unicode, en supposant que la chaîne contient un ensemble d'octets représentant un texte UTF-8. Sinon, il fait autre chose (il ne lance pas d'exception). - -## format(pattern, s0, s1, …) {#format} - -Formatage du motif constant avec la chaîne listée dans les arguments. 
`pattern` est un modèle de format Python simplifié. Chaîne de Format contient “replacement fields” entouré par des accolades `{}`. Tout ce qui n'est pas contenu dans les accolades est considéré comme du texte littéral, qui est copié inchangé dans la sortie. Si vous devez inclure un caractère d'Accolade dans le texte littéral, il peut être échappé en doublant: `{{ '{{' }}` et `{{ '}}' }}`. Les noms de champs peuvent être des nombres (à partir de zéro) ou vides (ils sont alors traités comme des nombres de conséquence). - -``` sql -SELECT format('{1} {0} {1}', 'World', 'Hello') -``` - -``` text -┌─format('{1} {0} {1}', 'World', 'Hello')─┐ -│ Hello World Hello │ -└─────────────────────────────────────────┘ -``` - -``` sql -SELECT format('{} {}', 'Hello', 'World') -``` - -``` text -┌─format('{} {}', 'Hello', 'World')─┐ -│ Hello World │ -└───────────────────────────────────┘ -``` - -## concat {#concat} - -Concatène les chaînes répertoriées dans les arguments, sans séparateur. - -**Syntaxe** - -``` sql -concat(s1, s2, ...) -``` - -**Paramètre** - -Valeurs de type String ou FixedString. - -**Valeurs renvoyées** - -Renvoie la chaîne qui résulte de la concaténation des arguments. - -Si l'une des valeurs d'argument est `NULL`, `concat` retourner `NULL`. - -**Exemple** - -Requête: - -``` sql -SELECT concat('Hello, ', 'World!') -``` - -Résultat: - -``` text -┌─concat('Hello, ', 'World!')─┐ -│ Hello, World! │ -└─────────────────────────────┘ -``` - -## concatAssumeInjective {#concatassumeinjective} - -Même que [concat](#concat) la différence est que vous devez vous assurer que `concat(s1, s2, ...) → sn` est injectif, il sera utilisé pour l'optimisation du groupe par. - -La fonction est nommée “injective” si elle renvoie toujours un résultat différent pour différentes valeurs d'arguments. En d'autres termes: des arguments différents ne donnent jamais un résultat identique. - -**Syntaxe** - -``` sql -concatAssumeInjective(s1, s2, ...) -``` - -**Paramètre** - -Valeurs de type String ou FixedString. - -**Valeurs renvoyées** - -Renvoie la chaîne qui résulte de la concaténation des arguments. - -Si l'une des valeurs d'argument est `NULL`, `concatAssumeInjective` retourner `NULL`. - -**Exemple** - -Table d'entrée: - -``` sql -CREATE TABLE key_val(`key1` String, `key2` String, `value` UInt32) ENGINE = TinyLog; -INSERT INTO key_val VALUES ('Hello, ','World',1), ('Hello, ','World',2), ('Hello, ','World!',3), ('Hello',', World!',2); -SELECT * from key_val; -``` - -``` text -┌─key1────┬─key2─────┬─value─┐ -│ Hello, │ World │ 1 │ -│ Hello, │ World │ 2 │ -│ Hello, │ World! │ 3 │ -│ Hello │ , World! │ 2 │ -└─────────┴──────────┴───────┘ -``` - -Requête: - -``` sql -SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY concatAssumeInjective(key1, key2) -``` - -Résultat: - -``` text -┌─concat(key1, key2)─┬─sum(value)─┐ -│ Hello, World! │ 3 │ -│ Hello, World! │ 2 │ -│ Hello, World │ 3 │ -└────────────────────┴────────────┘ -``` - -## substring(s, offset, longueur), mid(s, offset, longueur), substr(s, offset, longueur) {#substring} - -Renvoie une sous-chaîne commençant par l'octet du ‘offset’ index ‘length’ octets de long. L'indexation des caractères commence à partir d'un (comme dans SQL standard). Le ‘offset’ et ‘length’ les arguments doivent être des constantes. - -## substringUTF8(s, offset, longueur) {#substringutf8} - -Le même que ‘substring’, mais pour les points de code Unicode. Fonctionne sous l'hypothèse que la chaîne contient un ensemble d'octets représentant un texte codé en UTF-8. 
Si cette hypothèse n'est pas remplie, elle renvoie un résultat (elle ne lance pas d'exception). - -## appendTrailingCharIfAbsent (s, c) {#appendtrailingcharifabsent} - -Si l' ‘s’ la chaîne n'est pas vide et ne contient pas ‘c’ personnage à la fin, il ajoute le ‘c’ personnage à la fin. - -## convertCharset(s, à partir de, à) {#convertcharset} - -Retourne une chaîne de caractères ‘s’ qui a été converti à partir de l'encodage dans ‘from’ pour l'encodage dans ‘to’. - -## base64Encode(s) {#base64encode} - -Encodage ‘s’ chaîne dans base64 - -## base64Decode(s) {#base64decode} - -Décoder la chaîne codée en base64 ‘s’ dans la chaîne d'origine. En cas d'échec, une exception est levée. - -## tryBase64Decode(s) {#trybase64decode} - -Semblable à base64Decode, mais en cas d'erreur, une chaîne vide serait renvoyé. - -## endsWith (s, suffixe) {#endswith} - -Renvoie s'il faut se terminer par le suffixe spécifié. Retourne 1 si la chaîne se termine par le suffixe spécifié, sinon elle renvoie 0. - -## startsWith (STR, préfixe) {#startswith} - -Retourne 1 si la chaîne commence par le préfixe spécifié, sinon elle renvoie 0. - -``` sql -SELECT startsWith('Spider-Man', 'Spi'); -``` - -**Valeurs renvoyées** - -- 1, si la chaîne commence par le préfixe spécifié. -- 0, si la chaîne ne commence pas par le préfixe spécifié. - -**Exemple** - -Requête: - -``` sql -SELECT startsWith('Hello, world!', 'He'); -``` - -Résultat: - -``` text -┌─startsWith('Hello, world!', 'He')─┐ -│ 1 │ -└───────────────────────────────────┘ -``` - -## coupe {#trim} - -Supprime tous les caractères spécifiés du début ou de la fin d'une chaîne. -Par défaut supprime toutes les occurrences consécutives d'espaces communs (caractère ASCII 32) des deux extrémités d'une chaîne. - -**Syntaxe** - -``` sql -trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) -``` - -**Paramètre** - -- `trim_character` — specified characters for trim. [Chaîne](../../sql-reference/data-types/string.md). -- `input_string` — string for trim. [Chaîne](../../sql-reference/data-types/string.md). - -**Valeur renvoyée** - -Une chaîne sans caractères de début et (ou) de fin spécifiés. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT trim(BOTH ' ()' FROM '( Hello, world! )') -``` - -Résultat: - -``` text -┌─trim(BOTH ' ()' FROM '( Hello, world! )')─┐ -│ Hello, world! │ -└───────────────────────────────────────────────┘ -``` - -## trimLeft {#trimleft} - -Supprime toutes les occurrences consécutives d'espaces communs (caractère ASCII 32) depuis le début d'une chaîne. Il ne supprime pas d'autres types de caractères d'espaces (tabulation, espace sans pause, etc.). - -**Syntaxe** - -``` sql -trimLeft(input_string) -``` - -Alias: `ltrim(input_string)`. - -**Paramètre** - -- `input_string` — string to trim. [Chaîne](../../sql-reference/data-types/string.md). - -**Valeur renvoyée** - -Une chaîne sans ouvrir les espaces communs. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT trimLeft(' Hello, world! ') -``` - -Résultat: - -``` text -┌─trimLeft(' Hello, world! ')─┐ -│ Hello, world! │ -└─────────────────────────────────────┘ -``` - -## trimRight {#trimright} - -Supprime toutes les occurrences consécutives d'espaces communs (caractère ASCII 32) de la fin d'une chaîne. Il ne supprime pas d'autres types de caractères d'espaces (tabulation, espace sans pause, etc.). - -**Syntaxe** - -``` sql -trimRight(input_string) -``` - -Alias: `rtrim(input_string)`. - -**Paramètre** - -- `input_string` — string to trim. 
[Chaîne](../../sql-reference/data-types/string.md). - -**Valeur renvoyée** - -Une chaîne sans espaces communs de fin. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT trimRight(' Hello, world! ') -``` - -Résultat: - -``` text -┌─trimRight(' Hello, world! ')─┐ -│ Hello, world! │ -└──────────────────────────────────────┘ -``` - -## trimBoth {#trimboth} - -Supprime toutes les occurrences consécutives d'espaces communs (caractère ASCII 32) des deux extrémités d'une chaîne. Il ne supprime pas d'autres types de caractères d'espaces (tabulation, espace sans pause, etc.). - -**Syntaxe** - -``` sql -trimBoth(input_string) -``` - -Alias: `trim(input_string)`. - -**Paramètre** - -- `input_string` — string to trim. [Chaîne](../../sql-reference/data-types/string.md). - -**Valeur renvoyée** - -Une chaîne sans espaces communs de début et de fin. - -Type: `String`. - -**Exemple** - -Requête: - -``` sql -SELECT trimBoth(' Hello, world! ') -``` - -Résultat: - -``` text -┌─trimBoth(' Hello, world! ')─┐ -│ Hello, world! │ -└─────────────────────────────────────┘ -``` - -## CRC32 (s) {#crc32} - -Renvoie la somme de contrôle CRC32 d'une chaîne, en utilisant le polynôme CRC-32-IEEE 802.3 et la valeur initiale `0xffffffff` (zlib mise en œuvre). - -Le type de résultat est UInt32. - -## CRC32IEEE (s) {#crc32ieee} - -Renvoie la somme de contrôle CRC32 d'une chaîne, en utilisant le polynôme CRC-32-IEEE 802.3. - -Le type de résultat est UInt32. - -## CRC64 (s) {#crc64} - -Renvoie la somme de contrôle CRC64 d'une chaîne, en utilisant le polynôme CRC-64-ECMA. - -Le type de résultat est UInt64. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/string_functions/) diff --git a/docs/fr/sql-reference/functions/string-replace-functions.md b/docs/fr/sql-reference/functions/string-replace-functions.md deleted file mode 100644 index 5389a2bc927..00000000000 --- a/docs/fr/sql-reference/functions/string-replace-functions.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 42 -toc_title: "Pour remplacer dans les cha\xEEnes" ---- - -# Fonctions de recherche et de remplacement dans les chaînes {#functions-for-searching-and-replacing-in-strings} - -## replaceOne(botte de foin, modèle, remplacement) {#replaceonehaystack-pattern-replacement} - -Remplace la première occurrence, si elle existe, ‘pattern’ sous-chaîne dans ‘haystack’ avec l' ‘replacement’ substring. -Ci-après, ‘pattern’ et ‘replacement’ doivent être constantes. - -## replaceAll(botte de foin, motif, remplacement), Remplacer(botte de foin, motif, remplacement) {#replaceallhaystack-pattern-replacement-replacehaystack-pattern-replacement} - -Remplace toutes les occurrences du ‘pattern’ sous-chaîne dans ‘haystack’ avec l' ‘replacement’ substring. - -## replaceRegexpOne(botte de foin, modèle, remplacement) {#replaceregexponehaystack-pattern-replacement} - -Remplacement en utilisant le ‘pattern’ expression régulière. Une expression régulière re2. -Remplace seulement la première occurrence, si elle existe. -Un motif peut être spécifié comme ‘replacement’. Ce modèle peut inclure des substitutions `\0-\9`. -Substitution `\0` inclut l'expression régulière entière. Substitution `\1-\9` correspond au sous-modèle numbers.To utilisez le `\` caractère dans un modèle, échappez-le en utilisant `\`. -Aussi garder à l'esprit qu'un littéral de chaîne nécessite une évasion. - -Exemple 1. 
Conversion de la date au format américain: - -``` sql -SELECT DISTINCT - EventDate, - replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res -FROM test.hits -LIMIT 7 -FORMAT TabSeparated -``` - -``` text -2014-03-17 03/17/2014 -2014-03-18 03/18/2014 -2014-03-19 03/19/2014 -2014-03-20 03/20/2014 -2014-03-21 03/21/2014 -2014-03-22 03/22/2014 -2014-03-23 03/23/2014 -``` - -Exemple 2. Copier une chaîne dix fois: - -``` sql -SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res -``` - -``` text -┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -## replaceRegexpAll(botte de foin, modèle, remplacement) {#replaceregexpallhaystack-pattern-replacement} - -Cela fait la même chose, mais remplace toutes les occurrences. Exemple: - -``` sql -SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res -``` - -``` text -┌─res────────────────────────┐ -│ HHeelllloo,, WWoorrlldd!! │ -└────────────────────────────┘ -``` - -Par exception, si une expression régulière travaillé sur un vide sous-chaîne, le remplacement n'est pas effectué plus d'une fois. -Exemple: - -``` sql -SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res -``` - -``` text -┌─res─────────────────┐ -│ here: Hello, World! │ -└─────────────────────┘ -``` - -## regexpQuoteMeta (s) {#regexpquotemetas} - -La fonction ajoute une barre oblique inverse avant certains caractères prédéfinis dans la chaîne. -Les personnages prédéfinis: ‘0’, ‘\\’, ‘\|’, ‘(’, ‘)’, ‘^’, ‘$’, ‘.’, ‘\[’, '\]', ‘?’, '\*‘,’+‘,’{‘,’:‘,’-'. -Cette implémentation diffère légèrement de re2:: RE2:: QuoteMeta. Il échappe à zéro octet comme \\0 au lieu de 00 et il échappe uniquement les caractères requis. -Pour plus d'informations, voir le lien: [RE2](https://github.com/google/re2/blob/master/re2/re2.cc#L473) - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/string_replace_functions/) diff --git a/docs/fr/sql-reference/functions/string-search-functions.md b/docs/fr/sql-reference/functions/string-search-functions.md deleted file mode 100644 index 20217edd32c..00000000000 --- a/docs/fr/sql-reference/functions/string-search-functions.md +++ /dev/null @@ -1,379 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: "Pour Rechercher Des Cha\xEEnes" ---- - -# Fonctions de recherche de chaînes {#functions-for-searching-strings} - -La recherche est sensible à la casse par défaut dans toutes ces fonctions. Il existe des variantes pour la recherche insensible à la casse. - -## position(botte de foin, aiguille), localiser( botte de foin, aiguille) {#position} - -Renvoie la position (en octets) de la sous-chaîne trouvée dans la chaîne, à partir de 1. - -Fonctionne sous l'hypothèse que la chaîne de caractères contient un ensemble d'octets représentant un octet texte codé. Si cette hypothèse n'est pas remplie et qu'un caractère ne peut pas être représenté à l'aide d'un seul octet, la fonction ne lance pas d'exception et renvoie un résultat inattendu. 
Si le caractère peut être représenté en utilisant deux octets, il utilisera deux octets et ainsi de suite. - -Pour une recherche insensible à la casse, utilisez la fonction [positioncaseinsensible](#positioncaseinsensitive). - -**Syntaxe** - -``` sql -position(haystack, needle) -``` - -Alias: `locate(haystack, needle)`. - -**Paramètre** - -- `haystack` — string, in which substring will to be searched. [Chaîne](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [Chaîne](../syntax.md#syntax-string-literal). - -**Valeurs renvoyées** - -- Position de départ en octets (à partir de 1), si la sous-chaîne a été trouvée. -- 0, si la sous-chaîne n'a pas été trouvé. - -Type: `Integer`. - -**Exemple** - -Phrase “Hello, world!” contient un ensemble d'octets représentant un octet texte codé. La fonction renvoie un résultat attendu: - -Requête: - -``` sql -SELECT position('Hello, world!', '!') -``` - -Résultat: - -``` text -┌─position('Hello, world!', '!')─┐ -│ 13 │ -└────────────────────────────────┘ -``` - -La même phrase en russe contient des caractères qui ne peuvent pas être représentés en utilisant un seul octet. La fonction renvoie un résultat inattendu (utilisation [positionUTF8](#positionutf8) fonction pour le texte codé sur plusieurs octets): - -Requête: - -``` sql -SELECT position('Привет, мир!', '!') -``` - -Résultat: - -``` text -┌─position('Привет, мир!', '!')─┐ -│ 21 │ -└───────────────────────────────┘ -``` - -## positioncaseinsensible {#positioncaseinsensitive} - -Le même que [position](#position) renvoie la position (en octets) de la sous-chaîne trouvée dans la chaîne, à partir de 1. Utilisez la fonction pour une recherche insensible à la casse. - -Fonctionne sous l'hypothèse que la chaîne de caractères contient un ensemble d'octets représentant un octet texte codé. Si cette hypothèse n'est pas remplie et qu'un caractère ne peut pas être représenté à l'aide d'un seul octet, la fonction ne lance pas d'exception et renvoie un résultat inattendu. Si le caractère peut être représenté en utilisant deux octets, il utilisera deux octets et ainsi de suite. - -**Syntaxe** - -``` sql -positionCaseInsensitive(haystack, needle) -``` - -**Paramètre** - -- `haystack` — string, in which substring will to be searched. [Chaîne](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [Chaîne](../syntax.md#syntax-string-literal). - -**Valeurs renvoyées** - -- Position de départ en octets (à partir de 1), si la sous-chaîne a été trouvée. -- 0, si la sous-chaîne n'a pas été trouvé. - -Type: `Integer`. - -**Exemple** - -Requête: - -``` sql -SELECT positionCaseInsensitive('Hello, world!', 'hello') -``` - -Résultat: - -``` text -┌─positionCaseInsensitive('Hello, world!', 'hello')─┐ -│ 1 │ -└───────────────────────────────────────────────────┘ -``` - -## positionUTF8 {#positionutf8} - -Renvoie la position (en points Unicode) de la sous-chaîne trouvée dans la chaîne, à partir de 1. - -Fonctionne sous l'hypothèse que la chaîne contient un ensemble d'octets représentant un texte codé en UTF-8. Si cette hypothèse n'est pas remplie, la fonction ne lance pas d'exception et renvoie un résultat inattendu. Si le caractère peut être représenté en utilisant deux points Unicode, il en utilisera deux et ainsi de suite. - -Pour une recherche insensible à la casse, utilisez la fonction [positionCaseInsensitiveUTF8](#positioncaseinsensitiveutf8). 
- -**Syntaxe** - -``` sql -positionUTF8(haystack, needle) -``` - -**Paramètre** - -- `haystack` — string, in which substring will to be searched. [Chaîne](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [Chaîne](../syntax.md#syntax-string-literal). - -**Valeurs renvoyées** - -- Position de départ dans les points Unicode (à partir de 1), si la sous-chaîne a été trouvée. -- 0, si la sous-chaîne n'a pas été trouvé. - -Type: `Integer`. - -**Exemple** - -Phrase “Hello, world!” en russe contient un ensemble de points Unicode représentant un texte codé à un seul point. La fonction renvoie un résultat attendu: - -Requête: - -``` sql -SELECT positionUTF8('Привет, мир!', '!') -``` - -Résultat: - -``` text -┌─positionUTF8('Привет, мир!', '!')─┐ -│ 12 │ -└───────────────────────────────────┘ -``` - -Phrase “Salut, étudiante!” où le caractère `é` peut être représenté en utilisant un point (`U+00E9`) ou deux points (`U+0065U+0301`) la fonction peut être retournée un résultat inattendu: - -Requête pour la lettre `é` qui est représenté un point Unicode `U+00E9`: - -``` sql -SELECT positionUTF8('Salut, étudiante!', '!') -``` - -Résultat: - -``` text -┌─positionUTF8('Salut, étudiante!', '!')─┐ -│ 17 │ -└────────────────────────────────────────┘ -``` - -Requête pour la lettre `é` qui est représenté deux points Unicode `U+0065U+0301`: - -``` sql -SELECT positionUTF8('Salut, étudiante!', '!') -``` - -Résultat: - -``` text -┌─positionUTF8('Salut, étudiante!', '!')─┐ -│ 18 │ -└────────────────────────────────────────┘ -``` - -## positionCaseInsensitiveUTF8 {#positioncaseinsensitiveutf8} - -Le même que [positionUTF8](#positionutf8) mais est sensible à la casse. Renvoie la position (en points Unicode) de la sous-chaîne trouvée dans la chaîne, à partir de 1. - -Fonctionne sous l'hypothèse que la chaîne contient un ensemble d'octets représentant un texte codé en UTF-8. Si cette hypothèse n'est pas remplie, la fonction ne lance pas d'exception et renvoie un résultat inattendu. Si le caractère peut être représenté en utilisant deux points Unicode, il en utilisera deux et ainsi de suite. - -**Syntaxe** - -``` sql -positionCaseInsensitiveUTF8(haystack, needle) -``` - -**Paramètre** - -- `haystack` — string, in which substring will to be searched. [Chaîne](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [Chaîne](../syntax.md#syntax-string-literal). - -**Valeur renvoyée** - -- Position de départ dans les points Unicode (à partir de 1), si la sous-chaîne a été trouvée. -- 0, si la sous-chaîne n'a pas été trouvé. - -Type: `Integer`. - -**Exemple** - -Requête: - -``` sql -SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') -``` - -Résultat: - -``` text -┌─positionCaseInsensitiveUTF8('Привет, мир!', 'Мир')─┐ -│ 9 │ -└────────────────────────────────────────────────────┘ -``` - -## multirecherchallpositions {#multisearchallpositions} - -Le même que [position](string-search-functions.md#position) mais les retours `Array` des positions (en octets) des sous-chaînes correspondantes trouvées dans la chaîne. Les Positions sont indexées à partir de 1. - -La recherche est effectuée sur des séquences d'octets sans tenir compte de l'encodage et du classement des chaînes. - -- Pour une recherche ASCII insensible à la casse, utilisez la fonction `multiSearchAllPositionsCaseInsensitive`. -- Pour la recherche en UTF-8, Utilisez la fonction [multiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). 
-- Pour la recherche UTF-8 insensible à la casse, utilisez la fonction multiSearchAllPositionsCaseInsensitiveutf8. - -**Syntaxe** - -``` sql -multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) -``` - -**Paramètre** - -- `haystack` — string, in which substring will to be searched. [Chaîne](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [Chaîne](../syntax.md#syntax-string-literal). - -**Valeurs renvoyées** - -- Tableau de positions de départ en octets (à partir de 1), si la sous-chaîne correspondante a été trouvée et 0 si elle n'est pas trouvée. - -**Exemple** - -Requête: - -``` sql -SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) -``` - -Résultat: - -``` text -┌─multiSearchAllPositions('Hello, World!', ['hello', '!', 'world'])─┐ -│ [0,13,0] │ -└───────────────────────────────────────────────────────────────────┘ -``` - -## multiSearchAllPositionsUTF8 {#multiSearchAllPositionsUTF8} - -Voir `multiSearchAllPositions`. - -## multiSearchFirstPosition(botte de foin, \[aiguille1, aiguille2, …, needleet\]) {#multisearchfirstposition} - -Le même que `position` mais renvoie le décalage le plus à gauche de la chaîne `haystack` et qui correspond à certains des aiguilles. - -Pour une recherche insensible à la casse ou/et au format UTF-8, utilisez les fonctions `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. - -## multiSearchFirstIndex(botte de foin, \[aiguille1, aiguille2, …, needleet\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} - -Renvoie l'index `i` (à partir de 1) de l'aiguille trouvée la plus à gaucheje dans la chaîne `haystack` et 0 sinon. - -Pour une recherche insensible à la casse ou/et au format UTF-8, utilisez les fonctions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. - -## multiSearchAny(botte de foin, \[aiguille1, aiguille2, …, needleet\]) {#function-multisearchany} - -Renvoie 1, si au moins une aiguille de chaîneje correspond à la chaîne `haystack` et 0 sinon. - -Pour une recherche insensible à la casse ou/et au format UTF-8, utilisez les fonctions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. - -!!! note "Note" - Dans tous les `multiSearch*` fonctions le nombre d'aiguilles doit être d'au moins 28 en raison de la spécification de mise en œuvre. - -## match (botte de foin, motif) {#matchhaystack-pattern} - -Vérifie si la chaîne correspond au `pattern` expression régulière. Un `re2` expression régulière. Le [syntaxe](https://github.com/google/re2/wiki/Syntax) de la `re2` les expressions régulières sont plus limitées que la syntaxe des expressions régulières Perl. - -Renvoie 0 si elle ne correspond pas, ou 1 si elle correspond. - -Notez que le symbole antislash (`\`) est utilisé pour s'échapper dans l'expression régulière. Le même symbole est utilisé pour échapper dans les littéraux de chaîne. Donc, pour échapper au symbole dans une expression régulière, vous devez écrire deux barres obliques inverses ( \\ ) dans un littéral de chaîne. - -L'expression régulière travaille à la chaîne, comme si c'est un ensemble d'octets. L'expression régulière ne peut pas contenir d'octets nuls. -Pour que les modèles recherchent des sous-chaînes dans une chaîne, il est préférable D'utiliser LIKE ou ‘position’ depuis ils travaillent beaucoup plus vite. 
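Because the escaping rule for `match` described above is easy to get wrong, here is a minimal illustrative sketch (not from the original page; the expected results are given only as comments):

``` sql
SELECT
    match('Hello, world!', 'world')   AS substring_match,  -- expected 1: the pattern is found in the string
    match('Hello, world!', '^\\w+,')  AS escaped_pattern,  -- expected 1: \w must be written as \\w in the string literal
    match('Hello, world!', '^\\d+$')  AS no_match          -- expected 0
```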
- -## multiMatchAny(botte de foin, \[motif1, modèle2, …, patternet\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} - -Le même que `match` mais renvoie 0 si aucune des expressions régulières sont appariés et 1 si l'un des modèles les matchs. Il utilise [hyperscan](https://github.com/intel/hyperscan) bibliothèque. Pour que les modèles recherchent des sous-chaînes dans une chaîne, il est préférable d'utiliser `multiSearchAny` comme cela fonctionne beaucoup plus vite. - -!!! note "Note" - La longueur de l'un des `haystack` la chaîne doit être inférieure à 232 octets sinon l'exception est levée. Cette restriction a lieu en raison de l'API hyperscan. - -## multiMatchAnyIndex(botte de foin, \[motif1, modèle2, …, patternet\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} - -Le même que `multiMatchAny` mais retourne un index qui correspond à la botte de foin. - -## multiMatchAllIndices(botte de foin, \[motif1, modèle2, …, patternet\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} - -Le même que `multiMatchAny`, mais renvoie le tableau de tous les indices qui correspondent à la botte de foin dans n'importe quel ordre. - -## multiFuzzyMatchAny(botte de foin, distance, \[motif1, modèle2, …, patternet\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} - -Le même que `multiMatchAny`, mais renvoie 1 si un motif correspond à la botte de foin dans une constante [distance d'édition](https://en.wikipedia.org/wiki/Edit_distance). Cette fonction est également en mode expérimental et peut être extrêmement lente. Pour plus d'informations, voir [documentation hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). - -## multiFuzzyMatchAnyIndex(botte de foin, distance, \[motif1, modèle2, …, patternet\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} - -Le même que `multiFuzzyMatchAny`, mais renvoie tout index qui correspond à la botte de foin à une distance d'édition constante. - -## multiFuzzyMatchAllIndices(botte de foin, distance, \[motif1, modèle2, …, patternet\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} - -Le même que `multiFuzzyMatchAny`, mais renvoie le tableau de tous les indices dans n'importe quel ordre qui correspond à la botte de foin à une distance d'édition constante. - -!!! note "Note" - `multiFuzzyMatch*` les fonctions ne prennent pas en charge les expressions régulières UTF-8, et ces expressions sont traitées comme des octets en raison de la restriction hyperscan. - -!!! note "Note" - Pour désactiver toutes les fonctions qui utilisent hyperscan, utilisez le réglage `SET allow_hyperscan = 0;`. - -## extrait(botte de foin, motif) {#extracthaystack-pattern} - -Extraits d'un fragment d'une chaîne à l'aide d'une expression régulière. Si ‘haystack’ ne correspond pas à l' ‘pattern’ regex, une chaîne vide est renvoyée. Si l'expression rationnelle ne contient pas de sous-modèles, elle prend le fragment qui correspond à l'expression rationnelle entière. Sinon, il prend le fragment qui correspond au premier sous-masque. - -## extractAll(botte de foin, motif) {#extractallhaystack-pattern} - -Extrait tous les fragments d'une chaîne à l'aide d'une expression régulière. Si ‘haystack’ ne correspond pas à l' ‘pattern’ regex, une chaîne vide est renvoyée. Renvoie un tableau de chaînes composé de toutes les correspondances à l'expression rationnelle. 
En général, le comportement est le même que le ‘extract’ fonction (il prend le premier sous-masque, ou l'expression entière s'il n'y a pas de sous-masque). - -## comme (botte de foin, motif), botte de foin comme opérateur de motif {#function-like} - -Vérifie si une chaîne correspond à une expression régulière simple. -L'expression régulière peut contenir les métasymboles `%` et `_`. - -`%` indique n'importe quelle quantité d'octets (y compris zéro caractère). - -`_` indique un octet. - -Utilisez la barre oblique inverse (`\`) pour échapper aux métasymboles. Voir la note sur l'échappement dans la description du ‘match’ fonction. - -Pour les expressions régulières comme `%needle%`, le code est plus optimale et fonctionne aussi vite que le `position` fonction. -Pour d'autres expressions régulières, le code est le même que pour la ‘match’ fonction. - -## notLike (botte de foin, motif), botte de foin pas comme opérateur de motif {#function-notlike} - -La même chose que ‘like’ mais négative. - -## ngramDistance(botte de foin, aiguille) {#ngramdistancehaystack-needle} - -Calcule la distance de 4 grammes entre `haystack` et `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 – the closer to zero, the more strings are similar to each other. If the constant `needle` ou `haystack` est plus de 32Kb, jette une exception. Si une partie de la non-constante `haystack` ou `needle` les chaînes sont plus que 32Kb, la distance est toujours un. - -Pour une recherche insensible à la casse ou/et au format UTF-8, utilisez les fonctions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. - -## ngramSearch(botte de foin, aiguille) {#ngramsearchhaystack-needle} - -Même que `ngramDistance` mais calcule la différence non symétrique entre `needle` et `haystack` – the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grammes. Le plus proche d'un, le plus probable `needle` est dans le `haystack`. Peut être utile pour la recherche de chaîne floue. - -Pour une recherche insensible à la casse ou/et au format UTF-8, utilisez les fonctions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. - -!!! note "Note" - For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. 
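Since `ngramDistance` and `ngramSearch` have no example in this section, here is a hypothetical sketch; the exact floating-point results depend on the hashing details mentioned in the note above, so only the qualitative behaviour is indicated:

``` sql
SELECT
    ngramDistance('ClickHouse', 'ClickHouse') AS identical,  -- expected 0: identical strings
    ngramDistance('ClickHouse', 'ClickHose')  AS similar,    -- expected to be close to 0
    ngramDistance('ClickHouse', 'PostgreSQL') AS different   -- expected to be close to 1
```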
- -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/fr/sql-reference/functions/type-conversion-functions.md b/docs/fr/sql-reference/functions/type-conversion-functions.md deleted file mode 100644 index c17b24c69dc..00000000000 --- a/docs/fr/sql-reference/functions/type-conversion-functions.md +++ /dev/null @@ -1,534 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: La Conversion De Type ---- - -# Fonctions De Conversion De Type {#type-conversion-functions} - -## Problèmes courants des Conversions numériques {#numeric-conversion-issues} - -Lorsque vous convertissez une valeur d'un type de données à un autre, vous devez vous rappeler que dans le cas courant, il s'agit d'une opération dangereuse qui peut entraîner une perte de données. Une perte de données peut se produire si vous essayez d'ajuster la valeur d'un type de données plus grand à un type de données plus petit, ou si vous convertissez des valeurs entre différents types de données. - -ClickHouse a le [même comportement que les programmes C++ ](https://en.cppreference.com/w/cpp/language/implicit_conversion). - -## toInt (8/16/32/64) {#toint8163264} - -Convertit une valeur d'entrée en [Int](../../sql-reference/data-types/int-uint.md) type de données. Cette fonction comprend: - -- `toInt8(expr)` — Results in the `Int8` type de données. -- `toInt16(expr)` — Results in the `Int16` type de données. -- `toInt32(expr)` — Results in the `Int32` type de données. -- `toInt64(expr)` — Results in the `Int64` type de données. - -**Paramètre** - -- `expr` — [Expression](../syntax.md#syntax-expressions) renvoyer un nombre ou une chaîne avec la représentation décimale d'un nombre. Les représentations binaires, octales et hexadécimales des nombres ne sont pas prises en charge. Les zéros principaux sont dépouillés. - -**Valeur renvoyée** - -Valeur entière dans le `Int8`, `Int16`, `Int32`, ou `Int64` type de données. - -Fonctions d'utilisation [l'arrondi vers zéro](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), ce qui signifie qu'ils tronquent des chiffres fractionnaires de nombres. - -Le comportement des fonctions pour le [NaN et Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments est indéfini. Rappelez-vous sur [problèmes de conversion numérique](#numeric-conversion-issues), lorsque vous utilisez les fonctions. - -**Exemple** - -``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) -``` - -``` text -┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ -│ -9223372036854775808 │ 32 │ 16 │ 8 │ -└──────────────────────┴─────────────┴───────────────┴─────────────┘ -``` - -## toInt (8/16/32/64)OrZero {#toint8163264orzero} - -Il prend un argument de type String et essaie de l'analyser en Int (8 \| 16 \| 32 \| 64). En cas d'échec, renvoie 0. - -**Exemple** - -``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') -``` - -``` text -┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ -│ 123123 │ 0 │ -└─────────────────────────┴───────────────────────────┘ -``` - -## toInt (8/16/32/64)OrNull {#toint8163264ornull} - -Il prend un argument de type String et essaie de l'analyser en Int (8 \| 16 \| 32 \| 64). En cas d'échec, renvoie NULL. 
- -**Exemple** - -``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') -``` - -``` text -┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ -│ 123123 │ ᴺᵁᴸᴸ │ -└─────────────────────────┴───────────────────────────┘ -``` - -## toUInt (8/16/32/64) {#touint8163264} - -Convertit une valeur d'entrée en [UInt](../../sql-reference/data-types/int-uint.md) type de données. Cette fonction comprend: - -- `toUInt8(expr)` — Results in the `UInt8` type de données. -- `toUInt16(expr)` — Results in the `UInt16` type de données. -- `toUInt32(expr)` — Results in the `UInt32` type de données. -- `toUInt64(expr)` — Results in the `UInt64` type de données. - -**Paramètre** - -- `expr` — [Expression](../syntax.md#syntax-expressions) renvoyer un nombre ou une chaîne avec la représentation décimale d'un nombre. Les représentations binaires, octales et hexadécimales des nombres ne sont pas prises en charge. Les zéros principaux sont dépouillés. - -**Valeur renvoyée** - -Valeur entière dans le `UInt8`, `UInt16`, `UInt32`, ou `UInt64` type de données. - -Fonctions d'utilisation [l'arrondi vers zéro](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), ce qui signifie qu'ils tronquent des chiffres fractionnaires de nombres. - -Le comportement des fonctions pour les agruments négatifs et pour le [NaN et Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments est indéfini. Si vous passez une chaîne avec un nombre négatif, par exemple `'-32'`, ClickHouse soulève une exception. Rappelez-vous sur [problèmes de conversion numérique](#numeric-conversion-issues), lorsque vous utilisez les fonctions. - -**Exemple** - -``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) -``` - -``` text -┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ -│ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ -└─────────────────────┴───────────────┴────────────────┴──────────────┘ -``` - -## toUInt (8/16/32/64)OrZero {#touint8163264orzero} - -## toUInt (8/16/32/64)OrNull {#touint8163264ornull} - -## toFloat (32/64) {#tofloat3264} - -## toFloat (32/64)OrZero {#tofloat3264orzero} - -## toFloat (32/64) OrNull {#tofloat3264ornull} - -## jour {#todate} - -## toDateOrZero {#todateorzero} - -## toDateOrNull {#todateornull} - -## toDateTime {#todatetime} - -## toDateTimeOrZero {#todatetimeorzero} - -## toDateTimeOrNull {#todatetimeornull} - -## toDecimal (32/64/128) {#todecimal3264128} - -Convertir `value` à l' [Décimal](../../sql-reference/data-types/decimal.md) type de données avec précision de `S`. Le `value` peut être un nombre ou une chaîne. Le `S` (l'échelle) paramètre spécifie le nombre de décimales. - -- `toDecimal32(value, S)` -- `toDecimal64(value, S)` -- `toDecimal128(value, S)` - -## toDecimal (32/64/128) OrNull {#todecimal3264128ornull} - -Convertit une chaîne d'entrée en [Nullable (Décimal (P, S))](../../sql-reference/data-types/decimal.md) valeur de type de données. Cette famille de fonctions comprennent: - -- `toDecimal32OrNull(expr, S)` — Results in `Nullable(Decimal32(S))` type de données. -- `toDecimal64OrNull(expr, S)` — Results in `Nullable(Decimal64(S))` type de données. -- `toDecimal128OrNull(expr, S)` — Results in `Nullable(Decimal128(S))` type de données. - -Ces fonctions devraient être utilisées à la place de `toDecimal*()` fonctions, si vous préférez obtenir un `NULL` la valeur au lieu d'une exception dans le cas d'une valeur d'entrée erreur d'analyse. 
- -**Paramètre** - -- `expr` — [Expression](../syntax.md#syntax-expressions), retourne une valeur dans l' [Chaîne](../../sql-reference/data-types/string.md) type de données. ClickHouse attend la représentation textuelle du nombre décimal. Exemple, `'1.111'`. -- `S` — Scale, the number of decimal places in the resulting value. - -**Valeur renvoyée** - -Une valeur dans l' `Nullable(Decimal(P,S))` type de données. La valeur contient: - -- Numéro `S` décimales, si ClickHouse interprète la chaîne d'entrée comme un nombre. -- `NULL` si ClickHouse ne peut pas interpréter la chaîne d'entrée comme un nombre ou si le nombre d'entrée contient plus de `S` décimale. - -**Exemple** - -``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) -``` - -``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) -``` - -``` text -┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ -│ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ -└──────┴────────────────────────────────────────────────────┘ -``` - -## toDecimal (32/64/128)OrZero {#todecimal3264128orzero} - -Convertit une valeur d'entrée en [Decimal(P,S)](../../sql-reference/data-types/decimal.md) type de données. Cette famille de fonctions comprennent: - -- `toDecimal32OrZero( expr, S)` — Results in `Decimal32(S)` type de données. -- `toDecimal64OrZero( expr, S)` — Results in `Decimal64(S)` type de données. -- `toDecimal128OrZero( expr, S)` — Results in `Decimal128(S)` type de données. - -Ces fonctions devraient être utilisées à la place de `toDecimal*()` fonctions, si vous préférez obtenir un `0` la valeur au lieu d'une exception dans le cas d'une valeur d'entrée erreur d'analyse. - -**Paramètre** - -- `expr` — [Expression](../syntax.md#syntax-expressions), retourne une valeur dans l' [Chaîne](../../sql-reference/data-types/string.md) type de données. ClickHouse attend la représentation textuelle du nombre décimal. Exemple, `'1.111'`. -- `S` — Scale, the number of decimal places in the resulting value. - -**Valeur renvoyée** - -Une valeur dans l' `Nullable(Decimal(P,S))` type de données. La valeur contient: - -- Numéro `S` décimales, si ClickHouse interprète la chaîne d'entrée comme un nombre. -- 0 avec `S` décimales, si ClickHouse ne peut pas interpréter la chaîne d'entrée comme un nombre ou si le nombre d'entrée contient plus de `S` décimale. - -**Exemple** - -``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) -``` - -``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) -``` - -``` text -┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ -│ 0.00 │ Decimal(9, 2) │ -└──────┴────────────────────────────────────────────────────┘ -``` - -## toString {#tostring} - -Fonctions de conversion entre des nombres, des chaînes (mais pas des chaînes fixes), des dates et des dates avec des heures. -Toutes ces fonctions acceptent un argument. - -Lors de la conversion vers ou à partir d'une chaîne, la valeur est formatée ou analysée en utilisant les mêmes règles que pour le format TabSeparated (et presque tous les autres formats de texte). 
Si la chaîne ne peut pas être analysée, une exception est levée et la demande est annulée. - -Lors de la conversion de dates en nombres ou vice versa, la date correspond au nombre de jours depuis le début de L'époque Unix. -Lors de la conversion de dates avec des heures en nombres ou vice versa, la date avec l'heure correspond au nombre de secondes depuis le début de L'époque Unix. - -Les formats date et date-avec-heure pour les fonctions toDate/toDateTime sont définis comme suit: - -``` text -YYYY-MM-DD -YYYY-MM-DD hh:mm:ss -``` - -À titre d'exception, si vous convertissez des types numériques UInt32, Int32, UInt64 ou Int64 à Date, et si le nombre est supérieur ou égal à 65536, le nombre est interprété comme un horodatage Unix (et non comme le nombre de jours) et est arrondi à la date. Cela permet de prendre en charge l'occurrence commune de l'écriture ‘toDate(unix_timestamp)’, qui autrement serait une erreur et nécessiterait d'écrire le plus lourd ‘toDate(toDateTime(unix_timestamp))’. - -La Conversion entre une date et une date avec l'heure est effectuée de manière naturelle: en ajoutant une heure nulle ou en supprimant l'heure. - -La Conversion entre types numériques utilise les mêmes règles que les affectations entre différents types numériques en C++. - -De plus, la fonction ToString de L'argument DateTime peut prendre un deuxième argument de chaîne contenant le nom du fuseau horaire. Exemple: `Asia/Yekaterinburg` Dans ce cas, l'heure est formatée en fonction du fuseau horaire spécifié. - -``` sql -SELECT - now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat -``` - -``` text -┌───────────now_local─┬─now_yekat───────────┐ -│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ -└─────────────────────┴─────────────────────┘ -``` - -Voir aussi l' `toUnixTimestamp` fonction. - -## toFixedString (s, N) {#tofixedstrings-n} - -Convertit un argument de type String en un type FixedString (N) (une chaîne de longueur fixe N). N doit être une constante. -Si la chaîne a moins d'octets que N, elle est complétée avec des octets null à droite. Si la chaîne a plus d'octets que N, une exception est levée. - -## toStringCutToZero(s) {#tostringcuttozeros} - -Accepte un argument String ou FixedString. Renvoie la chaîne avec le contenu tronqué au premier octet zéro trouvé. - -Exemple: - -``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut -``` - -``` text -┌─s─────────────┬─s_cut─┐ -│ foo\0\0\0\0\0 │ foo │ -└───────────────┴───────┘ -``` - -``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut -``` - -``` text -┌─s──────────┬─s_cut─┐ -│ foo\0bar\0 │ foo │ -└────────────┴───────┘ -``` - -## reinterpretAsUInt (8/16/32/64) {#reinterpretasuint8163264} - -## reinterpretAsInt (8/16/32/64) {#reinterpretasint8163264} - -## reinterpretAsFloat (32/64) {#reinterpretasfloat3264} - -## réinterprétasdate {#reinterpretasdate} - -## reinterpretAsDateTime {#reinterpretasdatetime} - -Ces fonctions acceptent une chaîne et interprètent les octets placés au début de la chaîne comme un nombre dans l'ordre de l'hôte (little endian). Si la chaîne n'est pas assez longue, les fonctions fonctionnent comme si la chaîne était remplie avec le nombre nécessaire d'octets nuls. Si la chaîne est plus longue que nécessaire, les octets supplémentaires sont ignorés. Une date est interprétée comme le nombre de jours depuis le début de l'Époque Unix, et une date avec le temps, est interprété comme le nombre de secondes écoulées depuis le début de l'Époque Unix. 
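A minimal sketch of the little-endian reinterpretation described above (the values in the comments are what one would expect for ASCII input; they are illustrative and not taken from the original page):

``` sql
SELECT
    reinterpretAsUInt8('a')   AS one_byte,  -- expected 97, the byte value of 'a'
    reinterpretAsUInt16('ab') AS two_bytes  -- expected 25185 = 97 + 98 * 256 (little endian)
```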
- -## reinterpretAsString {#type_conversion_functions-reinterpretAsString} - -Cette fonction accepte un nombre ou une date ou une date avec l'heure, et renvoie une chaîne contenant des octets représentant la valeur correspondante dans l'ordre de l'hôte (little endian). Les octets nuls sont supprimés de la fin. Par exemple, une valeur de type uint32 de 255 est une chaîne longue d'un octet. - -## reinterpretAsFixedString {#reinterpretasfixedstring} - -Cette fonction accepte un nombre ou une date ou une date avec l'heure, et renvoie une chaîne fixe contenant des octets représentant la valeur correspondante dans l'ordre de l'hôte (little endian). Les octets nuls sont supprimés de la fin. Par exemple, une valeur de type uint32 de 255 est une chaîne fixe longue d'un octet. - -## CAST (x, T) {#type_conversion_function-cast} - -Convertir ‘x’ à l' ‘t’ type de données. La syntaxe CAST (X comme t) est également prise en charge. - -Exemple: - -``` sql -SELECT - '2016-06-15 23:00:00' AS timestamp, - CAST(timestamp AS DateTime) AS datetime, - CAST(timestamp AS Date) AS date, - CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string -``` - -``` text -┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ -└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ -``` - -La Conversion en FixedString (N) ne fonctionne que pour les arguments de type String ou FixedString (N). - -Type conversion en [Nullable](../../sql-reference/data-types/nullable.md) et le dos est pris en charge. Exemple: - -``` sql -SELECT toTypeName(x) FROM t_null -``` - -``` text -┌─toTypeName(x)─┐ -│ Int8 │ -│ Int8 │ -└───────────────┘ -``` - -``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null -``` - -``` text -┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ -│ Nullable(UInt16) │ -│ Nullable(UInt16) │ -└─────────────────────────────────────────┘ -``` - -## toInterval (année / trimestre / Mois / Semaine / Jour / Heure / Minute / Seconde) {#function-tointerval} - -Convertit un argument de type Number en [Intervalle](../../sql-reference/data-types/special-data-types/interval.md) type de données. - -**Syntaxe** - -``` sql -toIntervalSecond(number) -toIntervalMinute(number) -toIntervalHour(number) -toIntervalDay(number) -toIntervalWeek(number) -toIntervalMonth(number) -toIntervalQuarter(number) -toIntervalYear(number) -``` - -**Paramètre** - -- `number` — Duration of interval. Positive integer number. - -**Valeurs renvoyées** - -- La valeur de `Interval` type de données. - -**Exemple** - -``` sql -WITH - toDate('2019-01-01') AS date, - INTERVAL 1 WEEK AS interval_week, - toIntervalWeek(1) AS interval_to_week -SELECT - date + interval_week, - date + interval_to_week -``` - -``` text -┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ -│ 2019-01-08 │ 2019-01-08 │ -└───────────────────────────┴──────────────────────────────┘ -``` - -## parseDateTimeBestEffort {#parsedatetimebesteffort} - -Convertit une date et une heure dans le [Chaîne](../../sql-reference/data-types/string.md) la représentation de [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) type de données. 
- -La fonction d'analyse [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 1123 - 5.2.14 RFC-822 date et heure Spécification](https://tools.ietf.org/html/rfc1123#page-55), ClickHouse et d'autres formats de date et d'heure. - -**Syntaxe** - -``` sql -parseDateTimeBestEffort(time_string [, time_zone]); -``` - -**Paramètre** - -- `time_string` — String containing a date and time to convert. [Chaîne](../../sql-reference/data-types/string.md). -- `time_zone` — Time zone. The function parses `time_string` selon le fuseau horaire. [Chaîne](../../sql-reference/data-types/string.md). - -**Formats non standard pris en charge** - -- Une chaîne contenant 9..10 chiffres [le timestamp unix](https://en.wikipedia.org/wiki/Unix_time). -- Une chaîne avec une date et une heure composant: `YYYYMMDDhhmmss`, `DD/MM/YYYY hh:mm:ss`, `DD-MM-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. -- Une chaîne avec une date, mais pas de composant de temps: `YYYY`, `YYYYMM`, `YYYY*MM`, `DD/MM/YYYY`, `DD-MM-YY` etc. -- Une chaîne avec un jour et une heure: `DD`, `DD hh`, `DD hh:mm`. Dans ce cas `YYYY-MM` sont substitués comme suit `2000-01`. -- Une chaîne qui inclut la date et l'heure ainsi que des informations de décalage de fuseau horaire: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. Exemple, `2020-12-12 17:36:00 -5:00`. - -Pour tous les formats avec séparateur, la fonction analyse les noms de mois exprimés par leur nom complet ou par les trois premières lettres d'un nom de mois. Exemple: `24/DEC/18`, `24-Dec-18`, `01-September-2018`. - -**Valeur renvoyée** - -- `time_string` converti à l' `DateTime` type de données. - -**Exemple** - -Requête: - -``` sql -SELECT parseDateTimeBestEffort('12/12/2020 12:12:57') -AS parseDateTimeBestEffort; -``` - -Résultat: - -``` text -┌─parseDateTimeBestEffort─┐ -│ 2020-12-12 12:12:57 │ -└─────────────────────────┘ -``` - -Requête: - -``` sql -SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort -``` - -Résultat: - -``` text -┌─parseDateTimeBestEffort─┐ -│ 2018-08-18 10:22:16 │ -└─────────────────────────┘ -``` - -Requête: - -``` sql -SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort -``` - -Résultat: - -``` text -┌─parseDateTimeBestEffort─┐ -│ 2015-07-07 12:04:41 │ -└─────────────────────────┘ -``` - -Requête: - -``` sql -SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort -``` - -Résultat: - -``` text -┌─parseDateTimeBestEffort─┐ -│ 2018-12-12 10:12:12 │ -└─────────────────────────┘ -``` - -Requête: - -``` sql -SELECT parseDateTimeBestEffort('10 20:19') -``` - -Résultat: - -``` text -┌─parseDateTimeBestEffort('10 20:19')─┐ -│ 2000-01-10 20:19:00 │ -└─────────────────────────────────────┘ -``` - -**Voir Aussi** - -- \[Annonce ISO 8601 par @xkcd\](https://xkcd.com/1179/) -- [RFC 1123](https://tools.ietf.org/html/rfc1123) -- [jour](#todate) -- [toDateTime](#todatetime) - -## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} - -De même que pour [parseDateTimeBestEffort](#parsedatetimebesteffort) sauf qu'il renvoie null lorsqu'il rencontre un format de date qui ne peut pas être traité. - -## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} - -De même que pour [parseDateTimeBestEffort](#parsedatetimebesteffort) sauf qu'il renvoie une date zéro ou une date zéro lorsqu'il rencontre un format de date qui ne peut pas être traité. 
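The difference between the two fallback variants can be sketched as follows (a hypothetical query; the rendering of the zero value depends on the server time zone):

``` sql
SELECT
    parseDateTimeBestEffortOrNull('not-a-date') AS or_null,  -- expected NULL: the input cannot be parsed
    parseDateTimeBestEffortOrZero('not-a-date') AS or_zero   -- expected the zero DateTime, typically 1970-01-01 00:00:00
```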
- -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/type_conversion_functions/) diff --git a/docs/fr/sql-reference/functions/url-functions.md b/docs/fr/sql-reference/functions/url-functions.md deleted file mode 100644 index 2bb2203a10b..00000000000 --- a/docs/fr/sql-reference/functions/url-functions.md +++ /dev/null @@ -1,209 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 54 -toc_title: Travailler avec des URL ---- - -# Fonctions pour travailler avec des URL {#functions-for-working-with-urls} - -Toutes ces fonctions ne suivent pas la RFC. Ils sont simplifiés au maximum pour améliorer les performances. - -## Fonctions qui extraient des parties d'une URL {#functions-that-extract-parts-of-a-url} - -Si la partie pertinente n'est pas présente dans une URL, une chaîne vide est renvoyée. - -### protocole {#protocol} - -Extrait le protocole d'une URL. - -Examples of typical returned values: http, https, ftp, mailto, tel, magnet… - -### domaine {#domain} - -Extrait le nom d'hôte d'une URL. - -``` sql -domain(url) -``` - -**Paramètre** - -- `url` — URL. Type: [Chaîne](../../sql-reference/data-types/string.md). - -L'URL peut être spécifiée avec ou sans schéma. Exemple: - -``` text -svn+ssh://some.svn-hosting.com:80/repo/trunk -some.svn-hosting.com:80/repo/trunk -https://yandex.com/time/ -``` - -Pour ces exemples, le `domain` la fonction renvoie les résultats suivants: - -``` text -some.svn-hosting.com -some.svn-hosting.com -yandex.com -``` - -**Valeurs renvoyées** - -- Nom d'hôte. Si ClickHouse peut analyser la chaîne d'entrée en tant QU'URL. -- Chaîne vide. Si ClickHouse ne peut pas analyser la chaîne d'entrée en tant QU'URL. - -Type: `String`. - -**Exemple** - -``` sql -SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') -``` - -``` text -┌─domain('svn+ssh://some.svn-hosting.com:80/repo/trunk')─┐ -│ some.svn-hosting.com │ -└────────────────────────────────────────────────────────┘ -``` - -### domainWithoutWWW {#domainwithoutwww} - -Renvoie le domaine et ne supprime pas plus d'un ‘www.’ dès le début de celui-ci, si présent. - -### topLevelDomain {#topleveldomain} - -Extrait le domaine de premier niveau d'une URL. - -``` sql -topLevelDomain(url) -``` - -**Paramètre** - -- `url` — URL. Type: [Chaîne](../../sql-reference/data-types/string.md). - -L'URL peut être spécifiée avec ou sans schéma. Exemple: - -``` text -svn+ssh://some.svn-hosting.com:80/repo/trunk -some.svn-hosting.com:80/repo/trunk -https://yandex.com/time/ -``` - -**Valeurs renvoyées** - -- Nom de domaine. Si ClickHouse peut analyser la chaîne d'entrée en tant QU'URL. -- Chaîne vide. Si ClickHouse ne peut pas analyser la chaîne d'entrée en tant QU'URL. - -Type: `String`. - -**Exemple** - -``` sql -SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') -``` - -``` text -┌─topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')─┐ -│ com │ -└────────────────────────────────────────────────────────────────────┘ -``` - -### firstSignificantSubdomain {#firstsignificantsubdomain} - -Renvoie la “first significant subdomain”. C'est un concept non standard spécifique à Yandex.Metrica. Le premier sous-domaine significatif est un domaine de deuxième niveau s'il est ‘com’, ‘net’, ‘org’, ou ‘co’. Sinon, il est un domaine de troisième niveau. Exemple, `firstSignificantSubdomain (‘https://news.yandex.ru/’) = ‘yandex’, firstSignificantSubdomain (‘https://news.yandex.com.tr/’) = ‘yandex’`. 
La liste des “insignificant” les domaines de deuxième niveau et d'autres détails de mise en œuvre peuvent changer à l'avenir. - -### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain} - -Renvoie la partie du domaine qui inclut les sous-domaines de premier niveau “first significant subdomain” (voir l'explication ci-dessus). - -Exemple, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. - -### chemin {#path} - -Retourne le chemin d'accès. Exemple: `/top/news.html` Le chemin n'inclut pas la chaîne de requête. - -### pathFull {#pathfull} - -La même chose que ci-dessus, mais y compris la chaîne de requête et le fragment. Exemple: / top / nouvelles.le html?page = 2 # commentaires - -### queryString {#querystring} - -Retourne la chaîne de requête. Exemple: page = 1 & lr=213. query-string n'inclut pas le point d'interrogation initial, ainsi que # et tout ce qui suit #. - -### fragment {#fragment} - -Renvoie l'identificateur de fragment. fragment n'inclut pas le symbole de hachage initial. - -### queryStringAndFragment {#querystringandfragment} - -Renvoie la chaîne de requête et l'Identificateur de fragment. Exemple: page = 1 # 29390. - -### extractURLParameter (URL, nom) {#extracturlparameterurl-name} - -Renvoie la valeur de la ‘name’ paramètre dans l'URL, le cas échéant. Sinon, une chaîne vide. S'il y a beaucoup de paramètres avec ce nom, il renvoie la première occurrence. Cette fonction fonctionne en supposant que le nom du paramètre est codé dans L'URL exactement de la même manière que dans l'argument passé. - -### extractURLParameters (URL) {#extracturlparametersurl} - -Renvoie un tableau de chaînes name = value correspondant aux paramètres D'URL. Les valeurs ne sont en aucun cas décodées. - -### extractURLParameterNames (URL) {#extracturlparameternamesurl} - -Retourne un tableau de chaînes de noms correspondant aux noms des paramètres d'URL. Les valeurs ne sont en aucun cas décodées. - -### URLHierarchy (URL) {#urlhierarchyurl} - -Retourne un tableau contenant L'URL, tronquée à la fin par les symboles /,? dans le chemin et la chaîne de requête. Les caractères séparateurs consécutifs sont comptés comme un. La coupe est faite dans la position après tous les caractères de séparation consécutifs. - -### URLPathHierarchy (URL) {#urlpathhierarchyurl} - -La même chose que ci-dessus, mais sans le protocole et l'hôte dans le résultat. Le / les élément (racine) n'est pas inclus. Exemple: la fonction est utilisée pour implémenter l'arborescence des rapports de L'URL dans Yandex. Métrique. - -``` text -URLPathHierarchy('https://example.com/browse/CONV-6788') = -[ - '/browse/', - '/browse/CONV-6788' -] -``` - -### decodeURLComponent (URL) {#decodeurlcomponenturl} - -Renvoie L'URL décodée. -Exemple: - -``` sql -SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; -``` - -``` text -┌─DecodedURL─────────────────────────────┐ -│ http://127.0.0.1:8123/?query=SELECT 1; │ -└────────────────────────────────────────┘ -``` - -## Fonctions qui suppriment une partie D'une URL {#functions-that-remove-part-of-a-url} - -Si L'URL n'a rien de similaire, L'URL reste inchangée. - -### cutWWW {#cutwww} - -Supprime pas plus d'une ‘www.’ depuis le début du domaine de L'URL, s'il est présent. - -### cutQueryString {#cutquerystring} - -Supprime la chaîne de requête. Le point d'interrogation est également supprimé. - -### cutFragment {#cutfragment} - -Supprime l'identificateur de fragment. Le signe est également supprimé. 
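A minimal sketch of the URL-trimming functions above, using a hypothetical URL; the results in the comments simply follow the descriptions given in this section:

``` sql
WITH 'http://www.example.com/path?page=1#comments' AS url
SELECT
    cutWWW(url)      AS without_www,      -- expected 'http://example.com/path?page=1#comments'
    cutFragment(url) AS without_fragment  -- expected 'http://www.example.com/path?page=1'
```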
- -### couperystringandfragment {#cutquerystringandfragment} - -Supprime la chaîne de requête et l'Identificateur de fragment. Le point d'interrogation et le signe numérique sont également supprimés. - -### cutURLParameter (URL, nom) {#cuturlparameterurl-name} - -Supprime le ‘name’ Paramètre URL, si présent. Cette fonction fonctionne en supposant que le nom du paramètre est codé dans L'URL exactement de la même manière que dans l'argument passé. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/url_functions/) diff --git a/docs/fr/sql-reference/functions/uuid-functions.md b/docs/fr/sql-reference/functions/uuid-functions.md deleted file mode 100644 index 9f9eb67d3e9..00000000000 --- a/docs/fr/sql-reference/functions/uuid-functions.md +++ /dev/null @@ -1,122 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 53 -toc_title: Travailler avec UUID ---- - -# Fonctions pour travailler avec UUID {#functions-for-working-with-uuid} - -Les fonctions pour travailler avec UUID sont listées ci-dessous. - -## generateUUIDv4 {#uuid-function-generate} - -Génère le [UUID](../../sql-reference/data-types/uuid.md) de [la version 4](https://tools.ietf.org/html/rfc4122#section-4.4). - -``` sql -generateUUIDv4() -``` - -**Valeur renvoyée** - -La valeur de type UUID. - -**Exemple d'utilisation** - -Cet exemple montre la création d'une table avec la colonne de type UUID et l'insertion d'une valeur dans la table. - -``` sql -CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog - -INSERT INTO t_uuid SELECT generateUUIDv4() - -SELECT * FROM t_uuid -``` - -``` text -┌────────────────────────────────────x─┐ -│ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ -└──────────────────────────────────────┘ -``` - -## toUUID (x) {#touuid-x} - -Convertit la valeur de type de chaîne en type UUID. - -``` sql -toUUID(String) -``` - -**Valeur renvoyée** - -La valeur de type UUID. - -**Exemple d'utilisation** - -``` sql -SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid -``` - -``` text -┌─────────────────────────────────uuid─┐ -│ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ -└──────────────────────────────────────┘ -``` - -## UUIDStringToNum {#uuidstringtonum} - -Accepte une chaîne contenant 36 caractères dans le format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, et le renvoie comme un ensemble d'octets dans un [FixedString (16)](../../sql-reference/data-types/fixedstring.md). - -``` sql -UUIDStringToNum(String) -``` - -**Valeur renvoyée** - -FixedString (16) - -**Exemples d'utilisation** - -``` sql -SELECT - '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, - UUIDStringToNum(uuid) AS bytes -``` - -``` text -┌─uuid─────────────────────────────────┬─bytes────────────┐ -│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ -└──────────────────────────────────────┴──────────────────┘ -``` - -## UUIDNumToString {#uuidnumtostring} - -Accepte un [FixedString (16)](../../sql-reference/data-types/fixedstring.md) valeur, et renvoie une chaîne contenant 36 caractères au format texte. - -``` sql -UUIDNumToString(FixedString(16)) -``` - -**Valeur renvoyée** - -Chaîne. 
- -**Exemple d'utilisation** - -``` sql -SELECT - 'a/<@];!~p{jTj={)' AS bytes, - UUIDNumToString(toFixedString(bytes, 16)) AS uuid -``` - -``` text -┌─bytes────────────┬─uuid─────────────────────────────────┐ -│ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ -└──────────────────┴──────────────────────────────────────┘ -``` - -## Voir Aussi {#see-also} - -- [dictGetUUID](ext-dict-functions.md#ext_dict_functions-other) - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/uuid_function/) diff --git a/docs/fr/sql-reference/functions/ym-dict-functions.md b/docs/fr/sql-reference/functions/ym-dict-functions.md deleted file mode 100644 index f1e4461e24a..00000000000 --- a/docs/fr/sql-reference/functions/ym-dict-functions.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 59 -toc_title: Travailler avec Yandex.Dictionnaires Metrica ---- - -# Fonctions pour travailler avec Yandex.Dictionnaires Metrica {#functions-for-working-with-yandex-metrica-dictionaries} - -Pour que les fonctions ci-dessous fonctionnent, la configuration du serveur doit spécifier les chemins et les adresses pour obtenir tous les Yandex.Dictionnaires Metrica. Les dictionnaires sont chargés au premier appel de l'une de ces fonctions. Si les listes de référence ne peuvent pas être chargées, une exception est levée. - -Pour plus d'informations sur la création de listes de références, consultez la section “Dictionaries”. - -## Plusieurs Geobases {#multiple-geobases} - -ClickHouse soutient le travail avec plusieurs géobases alternatives (hiérarchies régionales) simultanément, afin de soutenir diverses perspectives sur les pays auxquels appartiennent certaines régions. - -Le ‘clickhouse-server’ config spécifie le fichier avec l'échelon régional::`/opt/geo/regions_hierarchy.txt` - -Outre ce fichier, il recherche également les fichiers à proximité qui ont le symbole _ et tout suffixe ajouté au nom (avant l'extension de fichier). -Par exemple, il trouvera également le fichier `/opt/geo/regions_hierarchy_ua.txt` si présente. - -`ua` est appelée la clé du dictionnaire. Pour un dictionnaire sans suffixe, la clé est une chaîne vide. - -Tous les dictionnaires sont rechargés dans l'exécution (une fois toutes les secondes, comme défini dans le paramètre de configuration builtin_dictionaries_reload_interval, ou une fois par heure par défaut). Cependant, la liste des dictionnaires disponibles est définie une fois, lorsque le serveur démarre. - -All functions for working with regions have an optional argument at the end – the dictionary key. It is referred to as the geobase. -Exemple: - -``` sql -regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt -``` - -### regionToCity (id \[, geobase\]) {#regiontocityid-geobase} - -Accepts a UInt32 number – the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0. - -### regionToArea (id \[, geobase\]) {#regiontoareaid-geobase} - -Convertit une région en une zone (tapez 5 dans la géobase). Dans tous les autres cas, cette fonction est la même que ‘regionToCity’. 
- -``` sql -SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) -FROM system.numbers -LIMIT 15 -``` - -``` text -┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ -│ │ -│ Moscow and Moscow region │ -│ St. Petersburg and Leningrad region │ -│ Belgorod region │ -│ Ivanovsk region │ -│ Kaluga region │ -│ Kostroma region │ -│ Kursk region │ -│ Lipetsk region │ -│ Orlov region │ -│ Ryazan region │ -│ Smolensk region │ -│ Tambov region │ -│ Tver region │ -│ Tula region │ -└──────────────────────────────────────────────────────┘ -``` - -### regionToDistrict(id \[, geobase\]) {#regiontodistrictid-geobase} - -Convertit une région en district fédéral (type 4 dans la géobase). Dans tous les autres cas, cette fonction est la même que ‘regionToCity’. - -``` sql -SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) -FROM system.numbers -LIMIT 15 -``` - -``` text -┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ -│ │ -│ Central federal district │ -│ Northwest federal district │ -│ South federal district │ -│ North Caucases federal district │ -│ Privolga federal district │ -│ Ural federal district │ -│ Siberian federal district │ -│ Far East federal district │ -│ Scotland │ -│ Faroe Islands │ -│ Flemish region │ -│ Brussels capital region │ -│ Wallonia │ -│ Federation of Bosnia and Herzegovina │ -└──────────────────────────────────────────────────────────┘ -``` - -### regionToCountry (id \[, geobase\]) {#regiontocountryid-geobase} - -Convertit une région en un pays. Dans tous les autres cas, cette fonction est la même que ‘regionToCity’. -Exemple: `regionToCountry(toUInt32(213)) = 225` convertit Moscou (213) en Russie (225). - -### regionToContinent(id \[, géobase\]) {#regiontocontinentid-geobase} - -Convertit une région en continent. Dans tous les autres cas, cette fonction est la même que ‘regionToCity’. -Exemple: `regionToContinent(toUInt32(213)) = 10001` convertit Moscou (213) en Eurasie (10001). - -### regionToTopContinent (#regiontotopcontinent) {#regiontotopcontinent-regiontotopcontinent} - -Trouve le continent le plus élevé dans la hiérarchie de la région. - -**Syntaxe** - -``` sql -regionToTopContinent(id[, geobase]); -``` - -**Paramètre** - -- `id` — Region ID from the Yandex geobase. [UInt32](../../sql-reference/data-types/int-uint.md). -- `geobase` — Dictionary key. See [Plusieurs Geobases](#multiple-geobases). [Chaîne](../../sql-reference/data-types/string.md). Facultatif. - -**Valeur renvoyée** - -- Identifiant du continent de haut niveau (ce dernier lorsque vous grimpez dans la hiérarchie des régions). -- 0, si il n'y a aucun. - -Type: `UInt32`. - -### regionToPopulation (id \[, geobase\]) {#regiontopopulationid-geobase} - -Obtient la population d'une région. -La population peut être enregistrée dans des fichiers avec la géobase. Voir la section “External dictionaries”. -Si la population n'est pas enregistrée pour la région, elle renvoie 0. -Dans la géobase Yandex, la population peut être enregistrée pour les régions enfants, mais pas pour les régions parentes. - -### regionIn(lhs, rhs \[, géobase\]) {#regioninlhs-rhs-geobase} - -Vérifie si un ‘lhs’ région appartient à une ‘rhs’ région. Renvoie un nombre UInt8 égal à 1 s'il appartient, Ou 0 s'il n'appartient pas. -The relationship is reflexive – any region also belongs to itself. - -### regionHierarchy (id \[, geobase\]) {#regionhierarchyid-geobase} - -Accepts a UInt32 number – the region ID from the Yandex geobase. 
Returns an array of region IDs consisting of the passed region and all parents along the chain. -Exemple: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`. - -### regionToName(id \[, lang\]) {#regiontonameid-lang} - -Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language ‘ru’ is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned. - -`ua` et `uk` les deux signifient ukrainien. - -[Article Original](https://clickhouse.tech/docs/en/query_language/functions/ym_dict_functions/) diff --git a/docs/fr/sql-reference/index.md b/docs/fr/sql-reference/index.md deleted file mode 100644 index 04e44892c05..00000000000 --- a/docs/fr/sql-reference/index.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "R\xE9f\xE9rence SQL" -toc_hidden: true -toc_priority: 28 -toc_title: "cach\xE9s" ---- - -# Référence SQL {#sql-reference} - -ClickHouse prend en charge les types de requêtes suivants: - -- [SELECT](statements/select/index.md) -- [INSERT INTO](statements/insert-into.md) -- [CREATE](statements/create.md) -- [ALTER](statements/alter.md#query_language_queries_alter) -- [Autres types de requêtes](statements/misc.md) - -[Article Original](https://clickhouse.tech/docs/en/sql-reference/) diff --git a/docs/fr/sql-reference/operators/in.md b/docs/fr/sql-reference/operators/in.md deleted file mode 100644 index d87fe41a04f..00000000000 --- a/docs/fr/sql-reference/operators/in.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -### Dans les opérateurs {#select-in-operators} - -Le `IN`, `NOT IN`, `GLOBAL IN`, et `GLOBAL NOT IN` les opérateurs sont traitées séparément, car leur fonctionnalité est assez riche. - -Le côté gauche de l'opérateur, soit une seule colonne ou un tuple. - -Exemple: - -``` sql -SELECT UserID IN (123, 456) FROM ... -SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... -``` - -Si le côté gauche est une colonne unique qui est dans l'index, et le côté droit est un ensemble de constantes, le système utilise l'index pour le traitement de la requête. - -Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), puis utiliser une sous-requête. - -Le côté droit de l'opérateur peut être un ensemble d'expressions constantes, un ensemble de tuples avec des expressions constantes (illustrées dans les exemples ci-dessus), ou le nom d'une table de base de données ou une sous-requête SELECT entre parenthèses. - -Si le côté droit de l'opérateur est le nom d'une table (par exemple, `UserID IN users`), ceci est équivalent à la sous-requête `UserID IN (SELECT * FROM users)`. Utilisez ceci lorsque vous travaillez avec des données externes envoyées avec la requête. Par exemple, la requête peut être envoyée avec un ensemble d'ID utilisateur chargés dans le ‘users’ table temporaire, qui doit être filtrée. 
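A minimal sketch of the temporary-table pattern described above. The table names are illustrative: `users` holds the identifiers to filter by and `hits` stands in for any table being filtered.

``` sql
-- load the identifiers into a temporary table, then filter through IN
CREATE TEMPORARY TABLE users (UserID UInt64);
INSERT INTO users VALUES (123), (456);

-- the table-name form and the explicit subquery form are equivalent
SELECT count() FROM hits WHERE UserID IN users;
SELECT count() FROM hits WHERE UserID IN (SELECT * FROM users);
```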
- -Si le côté droit de l'opérateur est un nom de table qui a le moteur Set (un ensemble de données préparé qui est toujours en RAM), l'ensemble de données ne sera pas créé à nouveau pour chaque requête. - -La sous-requête peut spécifier plusieurs colonnes pour filtrer les tuples. -Exemple: - -``` sql -SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... -``` - -Les colonnes à gauche et à droite de l'opérateur doit avoir le même type. - -L'opérateur IN et la sous-requête peuvent se produire dans n'importe quelle partie de la requête, y compris dans les fonctions d'agrégation et les fonctions lambda. -Exemple: - -``` sql -SELECT - EventDate, - avg(UserID IN - ( - SELECT UserID - FROM test.hits - WHERE EventDate = toDate('2014-03-17') - )) AS ratio -FROM test.hits -GROUP BY EventDate -ORDER BY EventDate ASC -``` - -``` text -┌──EventDate─┬────ratio─┐ -│ 2014-03-17 │ 1 │ -│ 2014-03-18 │ 0.807696 │ -│ 2014-03-19 │ 0.755406 │ -│ 2014-03-20 │ 0.723218 │ -│ 2014-03-21 │ 0.697021 │ -│ 2014-03-22 │ 0.647851 │ -│ 2014-03-23 │ 0.648416 │ -└────────────┴──────────┘ -``` - -Pour chaque jour après le 17 mars, comptez le pourcentage de pages vues par les utilisateurs qui ont visité le site le 17 mars. -Une sous-requête dans la clause est toujours exécuter une seule fois sur un seul serveur. Il n'y a pas de sous-requêtes dépendantes. - -## Le Traitement NULL {#null-processing-1} - -Pendant le traitement de la demande, l'opérateur n'assume que le résultat d'une opération avec [NULL](../syntax.md#null-literal) est toujours égale à `0` indépendamment de savoir si `NULL` est sur le côté droit ou gauche de l'opérateur. `NULL` les valeurs ne sont incluses dans aucun jeu de données, ne correspondent pas entre elles et ne peuvent pas être comparées. - -Voici un exemple avec le `t_null` table: - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` - -L'exécution de la requête `SELECT x FROM t_null WHERE y IN (NULL,3)` vous donne le résultat suivant: - -``` text -┌─x─┐ -│ 2 │ -└───┘ -``` - -Vous pouvez voir que la ligne dans laquelle `y = NULL` est jeté hors de résultats de la requête. C'est parce que ClickHouse ne peut pas décider si `NULL` est inclus dans le `(NULL,3)` ensemble, les retours `0` comme le résultat de l'opération, et `SELECT` exclut cette ligne de la sortie finale. - -``` sql -SELECT y IN (NULL, 3) -FROM t_null -``` - -``` text -┌─in(y, tuple(NULL, 3))─┐ -│ 0 │ -│ 1 │ -└───────────────────────┘ -``` - -## Sous-Requêtes Distribuées {#select-distributed-subqueries} - -Il y a deux options pour IN-S avec des sous-requêtes (similaires aux jointures): normal `IN` / `JOIN` et `GLOBAL IN` / `GLOBAL JOIN`. Ils diffèrent dans la façon dont ils sont exécutés pour le traitement des requêtes distribuées. - -!!! attention "Attention" - Rappelez-vous que les algorithmes décrits ci-dessous peuvent travailler différemment en fonction de la [paramètre](../../operations/settings/settings.md) `distributed_product_mode` paramètre. - -Lors de l'utilisation de l'IN régulier, la requête est envoyée à des serveurs distants, et chacun d'eux exécute les sous-requêtes dans le `IN` ou `JOIN` clause. - -Lors de l'utilisation de `GLOBAL IN` / `GLOBAL JOINs`, d'abord toutes les sous-requêtes sont exécutées pour `GLOBAL IN` / `GLOBAL JOINs`, et les résultats sont recueillis dans des tableaux temporaires. Ensuite, les tables temporaires sont envoyés à chaque serveur distant, où les requêtes sont exécutées à l'aide temporaire de données. - -Pour une requête non distribuée, utilisez `IN` / `JOIN`. 
- -Soyez prudent lorsque vous utilisez des sous-requêtes dans le `IN` / `JOIN` clauses pour le traitement des requêtes distribuées. - -Regardons quelques exemples. Supposons que chaque serveur du cluster a un **local_table**. Chaque serveur dispose également d'une **table distributed_table** table avec le **Distribué** type, qui regarde tous les serveurs du cluster. - -Pour une requête à l' **table distributed_table**, la requête sera envoyée à tous les serveurs distants et exécutée sur eux en utilisant le **local_table**. - -Par exemple, la requête - -``` sql -SELECT uniq(UserID) FROM distributed_table -``` - -sera envoyé à tous les serveurs distants - -``` sql -SELECT uniq(UserID) FROM local_table -``` - -et l'exécuter sur chacun d'eux en parallèle, jusqu'à ce qu'il atteigne le stade où les résultats intermédiaires peuvent être combinés. Ensuite, les résultats intermédiaires seront retournés au demandeur de serveur et de fusion, et le résultat final sera envoyé au client. - -Examinons maintenant une requête avec IN: - -``` sql -SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) -``` - -- Calcul de l'intersection des audiences de deux sites. - -Cette requête sera envoyée à tous les serveurs distants - -``` sql -SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) -``` - -En d'autres termes, l'ensemble de données de la clause IN sera collecté sur chaque serveur indépendamment, uniquement à travers les données stockées localement sur chacun des serveurs. - -Cela fonctionnera correctement et de manière optimale si vous êtes prêt pour ce cas et que vous avez réparti les données entre les serveurs de cluster de telle sorte que les données d'un seul ID utilisateur résident entièrement sur un seul serveur. Dans ce cas, toutes les données nécessaires seront disponibles localement sur chaque serveur. Sinon, le résultat sera erroné. Nous nous référons à cette variation de la requête que “local IN”. - -Pour corriger le fonctionnement de la requête lorsque les données sont réparties aléatoirement sur les serveurs de cluster, vous pouvez spécifier **table distributed_table** à l'intérieur d'une sous-requête. La requête ressemblerait à ceci: - -``` sql -SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) -``` - -Cette requête sera envoyée à tous les serveurs distants - -``` sql -SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) -``` - -La sous-requête commencera à s'exécuter sur chaque serveur distant. Étant donné que la sous-requête utilise une table distribuée, la sous-requête qui se trouve sur chaque serveur distant sera renvoyée à chaque serveur distant comme - -``` sql -SELECT UserID FROM local_table WHERE CounterID = 34 -``` - -Par exemple, si vous avez un cluster de 100 SERVEURS, l'exécution de la requête entière nécessitera 10 000 requêtes élémentaires, ce qui est généralement considéré comme inacceptable. - -Dans de tels cas, vous devez toujours utiliser GLOBAL IN au lieu de IN. 
Voyons comment cela fonctionne pour la requête - -``` sql -SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) -``` - -Le serveur demandeur exécutera la sous requête - -``` sql -SELECT UserID FROM distributed_table WHERE CounterID = 34 -``` - -et le résultat sera mis dans une table temporaire en RAM. Ensuite, la demande sera envoyée à chaque serveur distant - -``` sql -SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 -``` - -et la table temporaire `_data1` sera envoyé à chaque serveur distant avec la requête (le nom de la table temporaire est défini par l'implémentation). - -Ceci est plus optimal que d'utiliser la normale dans. Cependant, gardez les points suivants à l'esprit: - -1. Lors de la création d'une table temporaire, les données ne sont pas uniques. Pour réduire le volume de données transmises sur le réseau, spécifiez DISTINCT dans la sous-requête. (Vous n'avez pas besoin de le faire pour un IN normal.) -2. La table temporaire sera envoyé à tous les serveurs distants. La Transmission ne tient pas compte de la topologie du réseau. Par exemple, si 10 serveurs distants résident dans un centre de données très distant par rapport au serveur demandeur, les données seront envoyées 10 fois sur le canal au centre de données distant. Essayez d'éviter les grands ensembles de données lorsque vous utilisez GLOBAL IN. -3. Lors de la transmission de données à des serveurs distants, les restrictions sur la bande passante réseau ne sont pas configurables. Vous pourriez surcharger le réseau. -4. Essayez de distribuer les données entre les serveurs afin que vous n'ayez pas besoin D'utiliser GLOBAL IN sur une base régulière. -5. Si vous devez utiliser GLOBAL in souvent, planifiez l'emplacement du cluster ClickHouse de sorte qu'un seul groupe de répliques ne réside pas dans plus d'un centre de données avec un réseau rapide entre eux, de sorte qu'une requête puisse être traitée entièrement dans un seul centre de données. - -Il est également judicieux de spécifier une table locale dans le `GLOBAL IN` clause, dans le cas où cette table locale est uniquement disponible sur le serveur demandeur et que vous souhaitez utiliser les données de celui-ci sur des serveurs distants. diff --git a/docs/fr/sql-reference/operators/index.md b/docs/fr/sql-reference/operators/index.md deleted file mode 100644 index 1635c7eece3..00000000000 --- a/docs/fr/sql-reference/operators/index.md +++ /dev/null @@ -1,277 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: "Op\xE9rateur" ---- - -# Opérateur {#operators} - -ClickHouse transforme les opérateurs en leurs fonctions correspondantes à l'étape d'analyse des requêtes en fonction de leur priorité, de leur priorité et de leur associativité. - -## Des Opérateurs D'Accès {#access-operators} - -`a[N]` – Access to an element of an array. The `arrayElement(a, N)` fonction. - -`a.N` – Access to a tuple element. The `tupleElement(a, N)` fonction. - -## Opérateur De Négation Numérique {#numeric-negation-operator} - -`-a` – The `negate (a)` fonction. - -## Opérateurs de Multiplication et de Division {#multiplication-and-division-operators} - -`a * b` – The `multiply (a, b)` fonction. - -`a / b` – The `divide(a, b)` fonction. - -`a % b` – The `modulo(a, b)` fonction. 
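A quick illustrative check that these operators are only syntax for the listed functions; each pair returns the same value.

``` sql
SELECT 10 % 3, modulo(10, 3);   -- 1, 1
SELECT 10 / 4, divide(10, 4);   -- 2.5, 2.5 (the `/` operator always performs floating-point division)
SELECT -5,     negate(5);       -- -5, -5
```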
- -## Opérateurs d'Addition et de soustraction {#addition-and-subtraction-operators} - -`a + b` – The `plus(a, b)` fonction. - -`a - b` – The `minus(a, b)` fonction. - -## Opérateurs De Comparaison {#comparison-operators} - -`a = b` – The `equals(a, b)` fonction. - -`a == b` – The `equals(a, b)` fonction. - -`a != b` – The `notEquals(a, b)` fonction. - -`a <> b` – The `notEquals(a, b)` fonction. - -`a <= b` – The `lessOrEquals(a, b)` fonction. - -`a >= b` – The `greaterOrEquals(a, b)` fonction. - -`a < b` – The `less(a, b)` fonction. - -`a > b` – The `greater(a, b)` fonction. - -`a LIKE s` – The `like(a, b)` fonction. - -`a NOT LIKE s` – The `notLike(a, b)` fonction. - -`a BETWEEN b AND c` – The same as `a >= b AND a <= c`. - -`a NOT BETWEEN b AND c` – The same as `a < b OR a > c`. - -## Opérateurs pour travailler avec des ensembles de données {#operators-for-working-with-data-sets} - -*Voir [Dans les opérateurs](in.md).* - -`a IN ...` – The `in(a, b)` fonction. - -`a NOT IN ...` – The `notIn(a, b)` fonction. - -`a GLOBAL IN ...` – The `globalIn(a, b)` fonction. - -`a GLOBAL NOT IN ...` – The `globalNotIn(a, b)` fonction. - -## Opérateurs pour travailler avec des Dates et des heures {#operators-datetime} - -### EXTRACT {#operator-extract} - -``` sql -EXTRACT(part FROM date); -``` - -Extraire des parties d'une date donnée. Par exemple, vous pouvez récupérer un mois à partir d'une date donnée, ou d'une seconde à partir d'un moment. - -Le `part` paramètre spécifie la partie de la date à récupérer. Les valeurs suivantes sont disponibles: - -- `DAY` — The day of the month. Possible values: 1–31. -- `MONTH` — The number of a month. Possible values: 1–12. -- `YEAR` — The year. -- `SECOND` — The second. Possible values: 0–59. -- `MINUTE` — The minute. Possible values: 0–59. -- `HOUR` — The hour. Possible values: 0–23. - -Le `part` le paramètre est insensible à la casse. - -Le `date` paramètre spécifie la date ou l'heure à traiter. Soit [Date](../../sql-reference/data-types/date.md) ou [DateTime](../../sql-reference/data-types/datetime.md) le type est pris en charge. - -Exemple: - -``` sql -SELECT EXTRACT(DAY FROM toDate('2017-06-15')); -SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); -SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); -``` - -Dans l'exemple suivant, nous créons un tableau et de les insérer dans une valeur avec le `DateTime` type. - -``` sql -CREATE TABLE test.Orders -( - OrderId UInt64, - OrderName String, - OrderDate DateTime -) -ENGINE = Log; -``` - -``` sql -INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); -``` - -``` sql -SELECT - toYear(OrderDate) AS OrderYear, - toMonth(OrderDate) AS OrderMonth, - toDayOfMonth(OrderDate) AS OrderDay, - toHour(OrderDate) AS OrderHour, - toMinute(OrderDate) AS OrderMinute, - toSecond(OrderDate) AS OrderSecond -FROM test.Orders; -``` - -``` text -┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ -│ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ -└───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ -``` - -Vous pouvez voir plus d'exemples de [test](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00619_extract.sql). - -### INTERVAL {#operator-interval} - -Crée un [Intervalle](../../sql-reference/data-types/special-data-types/interval.md)- valeur de type qui doit être utilisée dans les opérations arithmétiques avec [Date](../../sql-reference/data-types/date.md) et [DateTime](../../sql-reference/data-types/datetime.md)-type de valeurs. 
- -Types d'intervalles: -- `SECOND` -- `MINUTE` -- `HOUR` -- `DAY` -- `WEEK` -- `MONTH` -- `QUARTER` -- `YEAR` - -!!! warning "Avertissement" - Les intervalles avec différents types ne peuvent pas être combinés. Vous ne pouvez pas utiliser des expressions comme `INTERVAL 4 DAY 1 HOUR`. Spécifiez des intervalles en unités inférieures ou égales à la plus petite unité de l'intervalle, par exemple, `INTERVAL 25 HOUR`. Vous pouvez utiliser les opérations consécutives, comme dans l'exemple ci-dessous. - -Exemple: - -``` sql -SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR -``` - -``` text -┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ -│ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ -└─────────────────────┴────────────────────────────────────────────────────────┘ -``` - -**Voir Aussi** - -- [Intervalle](../../sql-reference/data-types/special-data-types/interval.md) type de données -- [toInterval](../../sql-reference/functions/type-conversion-functions.md#function-tointerval) type fonctions de conversion - -## Opérateur De Négation Logique {#logical-negation-operator} - -`NOT a` – The `not(a)` fonction. - -## Logique ET de l'Opérateur {#logical-and-operator} - -`a AND b` – The`and(a, b)` fonction. - -## Logique ou opérateur {#logical-or-operator} - -`a OR b` – The `or(a, b)` fonction. - -## Opérateur Conditionnel {#conditional-operator} - -`a ? b : c` – The `if(a, b, c)` fonction. - -Note: - -L'opérateur conditionnel calcule les valeurs de b et c, puis vérifie si la condition a est remplie, puis renvoie la valeur correspondante. Si `b` ou `C` est un [arrayJoin()](../../sql-reference/functions/array-join.md#functions_arrayjoin) fonction, chaque ligne sera répliquée indépendamment de la “a” condition. - -## Expression Conditionnelle {#operator_case} - -``` sql -CASE [x] - WHEN a THEN b - [WHEN ... THEN ...] - [ELSE c] -END -``` - -Si `x` est spécifié, alors `transform(x, [a, ...], [b, ...], c)` function is used. Otherwise – `multiIf(a, b, ..., c)`. - -Si il n'y a pas de `ELSE c` dans l'expression, la valeur par défaut est `NULL`. - -Le `transform` la fonction ne fonctionne pas avec `NULL`. - -## Opérateur De Concaténation {#concatenation-operator} - -`s1 || s2` – The `concat(s1, s2) function.` - -## Opérateur De Création Lambda {#lambda-creation-operator} - -`x -> expr` – The `lambda(x, expr) function.` - -Les opérateurs suivants n'ont pas de priorité puisqu'ils sont des parenthèses: - -## Opérateur De Création De Tableau {#array-creation-operator} - -`[x1, ...]` – The `array(x1, ...) function.` - -## Opérateur De Création De Tuple {#tuple-creation-operator} - -`(x1, x2, ...)` – The `tuple(x2, x2, ...) function.` - -## Associativité {#associativity} - -Tous les opérateurs binaires ont associativité gauche. Exemple, `1 + 2 + 3` est transformé à `plus(plus(1, 2), 3)`. -Parfois, cela ne fonctionne pas de la façon que vous attendez. Exemple, `SELECT 4 > 2 > 3` résultat sera 0. - -Pour l'efficacité, le `and` et `or` les fonctions acceptent n'importe quel nombre d'arguments. Les chaînes de `AND` et `OR` les opérateurs se sont transformés en un seul appel de ces fonctions. - -## La vérification de `NULL` {#checking-for-null} - -Clickhouse soutient le `IS NULL` et `IS NOT NULL` opérateur. - -### IS NULL {#operator-is-null} - -- Pour [Nullable](../../sql-reference/data-types/nullable.md) type de valeurs, l' `IS NULL` opérateur retourne: - - `1` si la valeur est `NULL`. - - `0` autrement. 
-- Pour les autres valeurs, la `IS NULL` l'opérateur renvoie toujours `0`. - - - -``` sql -SELECT x+100 FROM t_null WHERE y IS NULL -``` - -``` text -┌─plus(x, 100)─┐ -│ 101 │ -└──────────────┘ -``` - -### IS NOT NULL {#is-not-null} - -- Pour [Nullable](../../sql-reference/data-types/nullable.md) type de valeurs, l' `IS NOT NULL` opérateur retourne: - - `0` si la valeur est `NULL`. - - `1` autrement. -- Pour les autres valeurs, la `IS NOT NULL` l'opérateur renvoie toujours `1`. - - - -``` sql -SELECT * FROM t_null WHERE y IS NOT NULL -``` - -``` text -┌─x─┬─y─┐ -│ 2 │ 3 │ -└───┴───┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/operators/) diff --git a/docs/fr/sql-reference/statements/alter.md b/docs/fr/sql-reference/statements/alter.md deleted file mode 100644 index 64fe21046a3..00000000000 --- a/docs/fr/sql-reference/statements/alter.md +++ /dev/null @@ -1,602 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 36 -toc_title: ALTER ---- - -## ALTER {#query_language_queries_alter} - -Le `ALTER` la requête est prise en charge uniquement pour `*MergeTree` des tables, ainsi que `Merge`et`Distributed`. La requête a plusieurs variantes. - -### Manipulations De Colonne {#column-manipulations} - -Modification de la structure de la table. - -``` sql -ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... -``` - -Dans la requête, spécifiez une liste d'une ou plusieurs actions séparées par des virgules. -Chaque action est une opération sur une colonne. - -Les actions suivantes sont prises en charge: - -- [ADD COLUMN](#alter_add-column) — Adds a new column to the table. -- [DROP COLUMN](#alter_drop-column) — Deletes the column. -- [CLEAR COLUMN](#alter_clear-column) — Resets column values. -- [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column. -- [MODIFY COLUMN](#alter_modify-column) — Changes column's type, default expression and TTL. - -Ces actions sont décrites en détail ci-dessous. - -#### ADD COLUMN {#alter_add-column} - -``` sql -ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after] -``` - -Ajoute une nouvelle colonne à la table spécifiée `name`, `type`, [`codec`](create.md#codecs) et `default_expr` (voir la section [Expressions par défaut](create.md#create-default-values)). - -Si l' `IF NOT EXISTS` la clause est incluse, la requête ne retournera pas d'erreur si la colonne existe déjà. Si vous spécifiez `AFTER name_after` (le nom d'une autre colonne), la colonne est ajoutée après celle spécifiée dans la liste des colonnes de la table. Sinon, la colonne est ajoutée à la fin de la table. Notez qu'il n'existe aucun moyen d'ajouter une colonne au début d'un tableau. Pour une chaîne d'actions, `name_after` peut être le nom d'une colonne est ajoutée dans l'une des actions précédentes. - -L'ajout d'une colonne modifie simplement la structure de la table, sans effectuer d'actions avec des données. Les données n'apparaissent pas sur le disque après la `ALTER`. Si les données sont manquantes pour une colonne lors de la lecture de la table, elles sont remplies avec des valeurs par défaut (en exécutant l'expression par défaut s'il y en a une, ou en utilisant des zéros ou des chaînes vides). La colonne apparaît sur le disque après la fusion des parties de données (voir [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)). 
- -Cette approche nous permet de compléter le `ALTER` requête instantanément, sans augmenter le volume de données anciennes. - -Exemple: - -``` sql -ALTER TABLE visits ADD COLUMN browser String AFTER user_id -``` - -#### DROP COLUMN {#alter_drop-column} - -``` sql -DROP COLUMN [IF EXISTS] name -``` - -Supprime la colonne avec le nom `name`. Si l' `IF EXISTS` la clause est spécifiée, la requête ne retournera pas d'erreur si la colonne n'existe pas. - -Supprime les données du système de fichiers. Comme cela supprime des fichiers entiers, la requête est terminée presque instantanément. - -Exemple: - -``` sql -ALTER TABLE visits DROP COLUMN browser -``` - -#### CLEAR COLUMN {#alter_clear-column} - -``` sql -CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name -``` - -Réinitialise toutes les données dans une colonne pour une partition spécifiée. En savoir plus sur la définition du nom de la partition dans la section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -Si l' `IF EXISTS` la clause est spécifiée, la requête ne retournera pas d'erreur si la colonne n'existe pas. - -Exemple: - -``` sql -ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() -``` - -#### COMMENT COLUMN {#alter_comment-column} - -``` sql -COMMENT COLUMN [IF EXISTS] name 'comment' -``` - -Ajoute un commentaire à la colonne. Si l' `IF EXISTS` la clause est spécifiée, la requête ne retournera pas d'erreur si la colonne n'existe pas. - -Chaque colonne peut avoir un commentaire. Si un commentaire existe déjà pour la colonne, un nouveau commentaire remplace le précédent commentaire. - -Les commentaires sont stockés dans le `comment_expression` colonne renvoyée par le [DESCRIBE TABLE](misc.md#misc-describe-table) requête. - -Exemple: - -``` sql -ALTER TABLE visits COMMENT COLUMN browser 'The table shows the browser used for accessing the site.' -``` - -#### MODIFY COLUMN {#alter_modify-column} - -``` sql -MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] -``` - -Cette requête modifie le `name` les propriétés de la colonne: - -- Type - -- Expression par défaut - -- TTL - - For examples of columns TTL modifying, see [Column TTL](../engines/table_engines/mergetree_family/mergetree.md#mergetree-column-ttl). - -Si l' `IF EXISTS` la clause est spécifiée, la requête ne retournera pas d'erreur si la colonne n'existe pas. - -Lors de la modification du type, les valeurs sont converties comme si [toType](../../sql-reference/functions/type-conversion-functions.md) les fonctions ont été appliquées. Si seule l'expression par défaut est modifiée, la requête ne fait rien de complexe et est terminée presque instantanément. - -Exemple: - -``` sql -ALTER TABLE visits MODIFY COLUMN browser Array(String) -``` - -Changing the column type is the only complex action – it changes the contents of files with data. For large tables, this may take a long time. - -Il y a plusieurs étapes de traitement: - -- Préparation de (nouveaux) fichiers temporaires avec des données modifiées. -- Renommer les anciens fichiers. -- Renommer les (nouveaux) fichiers temporaires en anciens noms. -- Suppression des anciens fichiers. - -Seule la première étape prend du temps. Si il y a un échec à ce stade, les données ne sont pas modifiées. -En cas d'échec au cours d'une des étapes successives, les données peuvent être restaurées manuellement. L'exception est si les anciens fichiers ont été supprimés du système de fichiers mais que les données des nouveaux fichiers n'ont pas été écrites sur le disque et ont été perdues. 
- -Le `ALTER` la requête de modification des colonnes est répliquée. Les instructions sont enregistrées dans ZooKeeper, puis chaque réplique les applique. Tout `ALTER` les requêtes sont exécutées dans le même ordre. La requête attend que les actions appropriées soient terminées sur les autres répliques. Cependant, une requête pour modifier des colonnes dans une table répliquée peut être interrompue, et toutes les actions seront effectuées de manière asynchrone. - -#### Modifier les limites de la requête {#alter-query-limitations} - -Le `ALTER` query vous permet de créer et de supprimer des éléments distincts (colonnes) dans des structures de données imbriquées, mais pas des structures de données imbriquées entières. Pour ajouter une structure de données imbriquée, vous pouvez ajouter des colonnes avec un nom comme `name.nested_name` et le type `Array(T)`. Une structure de données imbriquée est équivalente à plusieurs colonnes de tableau avec un nom qui a le même préfixe avant le point. - -Il n'y a pas de support pour supprimer des colonnes dans la clé primaire ou la clé d'échantillonnage (colonnes qui sont utilisées dans le `ENGINE` expression). La modification du type des colonnes incluses dans la clé primaire n'est possible que si cette modification n'entraîne pas la modification des données (par exemple, vous êtes autorisé à ajouter des valeurs à une énumération ou à modifier un type de `DateTime` de `UInt32`). - -Si l' `ALTER` la requête n'est pas suffisante pour apporter les modifications de table dont vous avez besoin, vous pouvez créer une nouvelle table, y copier les données en utilisant le [INSERT SELECT](insert-into.md#insert_query_insert-select) requête, puis changer les tables en utilisant le [RENAME](misc.md#misc_operations-rename) requête et supprimer l'ancienne table. Vous pouvez utiliser l' [clickhouse-copieur](../../operations/utilities/clickhouse-copier.md) comme une alternative à la `INSERT SELECT` requête. - -Le `ALTER` query bloque toutes les lectures et écritures pour la table. En d'autres termes, si une longue `SELECT` est en cours d'exécution au moment de la `ALTER` requête, la `ALTER` la requête va attendre qu'elle se termine. Dans le même temps, toutes les nouvelles requêtes à la même table attendre que ce `ALTER` est en cours d'exécution. - -Pour les tables qui ne stockent pas les données elles-mêmes (telles que `Merge` et `Distributed`), `ALTER` change simplement la structure de la table, et ne change pas la structure des tables subordonnées. Par exemple, lors de L'exécution de ALTER pour un `Distributed` table, vous devrez également exécuter `ALTER` pour les tables sur tous les serveurs distants. - -### Manipulations avec des Expressions clés {#manipulations-with-key-expressions} - -La commande suivante est prise en charge: - -``` sql -MODIFY ORDER BY new_expression -``` - -Cela ne fonctionne que pour les tables du [`MergeTree`](../../engines/table-engines/mergetree-family/mergetree.md) de la famille (y compris les -[répliqué](../../engines/table-engines/mergetree-family/replication.md) table). La commande change l' -[clé de tri](../../engines/table-engines/mergetree-family/mergetree.md) de la table -de `new_expression` (une expression ou un tuple d'expressions). Clé primaire reste le même. - -La commande est légère en ce sens qu'elle ne modifie que les métadonnées. 
Pour conserver la propriété cette partie de données -les lignes sont ordonnées par l'expression de clé de tri vous ne pouvez pas ajouter d'expressions contenant des colonnes existantes -à la clé de tri (seules les colonnes ajoutées par `ADD COLUMN` commande dans le même `ALTER` requête). - -### Manipulations avec des Indices de saut de données {#manipulations-with-data-skipping-indices} - -Cela ne fonctionne que pour les tables du [`*MergeTree`](../../engines/table-engines/mergetree-family/mergetree.md) de la famille (y compris les -[répliqué](../../engines/table-engines/mergetree-family/replication.md) table). Les opérations suivantes -sont disponibles: - -- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value AFTER name [AFTER name2]` - Ajoute la description de l'index aux métadonnées des tables. - -- `ALTER TABLE [db].name DROP INDEX name` - Supprime la description de l'index des métadonnées des tables et supprime les fichiers d'index du disque. - -Ces commandes sont légères dans le sens où elles ne modifient que les métadonnées ou suppriment des fichiers. -En outre, ils sont répliqués (synchronisation des métadonnées des indices via ZooKeeper). - -### Manipulations avec contraintes {#manipulations-with-constraints} - -En voir plus sur [contraintes](create.md#constraints) - -Les contraintes peuvent être ajoutées ou supprimées à l'aide de la syntaxe suivante: - -``` sql -ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; -ALTER TABLE [db].name DROP CONSTRAINT constraint_name; -``` - -Les requêtes ajouteront ou supprimeront des métadonnées sur les contraintes de la table afin qu'elles soient traitées immédiatement. - -Contrainte de vérifier *ne sera pas exécuté* sur les données existantes si elle a été ajoutée. - -Toutes les modifications sur les tables répliquées sont diffusées sur ZooKeeper et seront donc appliquées sur d'autres répliques. - -### Manipulations avec des Partitions et des pièces {#alter_manipulations-with-partitions} - -Les opérations suivantes avec [partition](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) sont disponibles: - -- [DETACH PARTITION](#alter_detach-partition) – Moves a partition to the `detached` répertoire et de l'oublier. -- [DROP PARTITION](#alter_drop-partition) – Deletes a partition. -- [ATTACH PART\|PARTITION](#alter_attach-partition) – Adds a part or partition from the `detached` répertoire à la table. -- [ATTACH PARTITION FROM](#alter_attach-partition-from) – Copies the data partition from one table to another and adds. -- [REPLACE PARTITION](#alter_replace-partition) - Copie la partition de données d'une table à l'autre et la remplace. -- [MOVE PARTITION TO TABLE](#alter_move_to_table-partition)(#alter_move_to_table-partition) - déplace la partition de données d'une table à l'autre. -- [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) - Rétablit la valeur d'une colonne spécifiée dans une partition. -- [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) - Réinitialise l'index secondaire spécifié dans une partition. -- [FREEZE PARTITION](#alter_freeze-partition) – Creates a backup of a partition. -- [FETCH PARTITION](#alter_fetch-partition) – Downloads a partition from another server. -- [MOVE PARTITION\|PART](#alter_move-partition) – Move partition/data part to another disk or volume. 
- - - -#### DETACH PARTITION {#alter_detach-partition} - -``` sql -ALTER TABLE table_name DETACH PARTITION partition_expr -``` - -Déplace toutes les données de la partition spécifiée vers `detached` répertoire. Le serveur oublie la partition de données détachée comme si elle n'existait pas. Le serveur ne connaîtra pas ces données tant que vous n'aurez pas [ATTACH](#alter_attach-partition) requête. - -Exemple: - -``` sql -ALTER TABLE visits DETACH PARTITION 201901 -``` - -Lisez à propos de la définition de l'expression de partition dans une section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -Une fois la requête exécutée, vous pouvez faire ce que vous voulez avec les données du `detached` directory — delete it from the file system, or just leave it. - -This query is replicated – it moves the data to the `detached` répertoire sur toutes les répliques. Notez que vous ne pouvez exécuter cette requête que sur un réplica leader. Pour savoir si une réplique est un leader, effectuez le `SELECT` requête à l' [système.réplique](../../operations/system-tables.md#system_tables-replicas) table. Alternativement, il est plus facile de faire une `DETACH` requête sur toutes les répliques - toutes les répliques lancent une exception, à l'exception de la réplique leader. - -#### DROP PARTITION {#alter_drop-partition} - -``` sql -ALTER TABLE table_name DROP PARTITION partition_expr -``` - -Supprime la partition spécifiée de la table. Cette requête marque la partition comme inactive et supprime complètement les données, environ en 10 minutes. - -Lisez à propos de la définition de l'expression de partition dans une section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -The query is replicated – it deletes data on all replicas. - -#### DROP DETACHED PARTITION\|PART {#alter_drop-detached} - -``` sql -ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr -``` - -Supprime la partie spécifiée ou toutes les parties de la partition spécifiée de `detached`. -En savoir plus sur la définition de l'expression de partition dans une section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -#### ATTACH PARTITION\|PART {#alter_attach-partition} - -``` sql -ALTER TABLE table_name ATTACH PARTITION|PART partition_expr -``` - -Ajoute des données à la table à partir du `detached` répertoire. Il est possible d'ajouter des données dans une partition entière ou pour une partie distincte. Exemple: - -``` sql -ALTER TABLE visits ATTACH PARTITION 201901; -ALTER TABLE visits ATTACH PART 201901_2_2_0; -``` - -En savoir plus sur la définition de l'expression de partition dans une section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -Cette requête est répliquée. L'initiateur de réplica vérifie s'il y a des données dans le `detached` répertoire. Si des données existent, la requête vérifie son intégrité. Si tout est correct, la requête ajoute les données à la table. Tous les autres réplicas téléchargent les données de l'initiateur de réplica. - -Ainsi, vous pouvez mettre des données à la `detached` répertoire sur une réplique, et utilisez le `ALTER ... ATTACH` requête pour l'ajouter à la table sur tous les réplicas. - -#### ATTACH PARTITION FROM {#alter_attach-partition-from} - -``` sql -ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 -``` - -Cette requête copie la partition de données du `table1` de `table2` ajoute des données de gratuit dans la `table2`. 
Notez que les données ne seront pas supprimées de `table1`. - -Pour que la requête s'exécute correctement, les conditions suivantes doivent être remplies: - -- Les deux tables doivent avoir la même structure. -- Les deux tables doivent avoir la même clé de partition. - -#### REPLACE PARTITION {#alter_replace-partition} - -``` sql -ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 -``` - -Cette requête copie la partition de données du `table1` de `table2` et remplace la partition existante dans le `table2`. Notez que les données ne seront pas supprimées de `table1`. - -Pour que la requête s'exécute correctement, les conditions suivantes doivent être remplies: - -- Les deux tables doivent avoir la même structure. -- Les deux tables doivent avoir la même clé de partition. - -#### MOVE PARTITION TO TABLE {#alter_move_to_table-partition} - -``` sql -ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest -``` - -Cette requête déplace la partition de données du `table_source` de `table_dest` avec la suppression des données de `table_source`. - -Pour que la requête s'exécute correctement, les conditions suivantes doivent être remplies: - -- Les deux tables doivent avoir la même structure. -- Les deux tables doivent avoir la même clé de partition. -- Les deux tables doivent appartenir à la même famille de moteurs. (répliqué ou non répliqué) -- Les deux tables doivent avoir la même stratégie de stockage. - -#### CLEAR COLUMN IN PARTITION {#alter_clear-column-partition} - -``` sql -ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr -``` - -Réinitialise toutes les valeurs de la colonne spécifiée dans une partition. Si l' `DEFAULT` la clause a été déterminée lors de la création d'une table, cette requête définit la valeur de la colonne à une valeur par défaut spécifiée. - -Exemple: - -``` sql -ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 -``` - -#### FREEZE PARTITION {#alter_freeze-partition} - -``` sql -ALTER TABLE table_name FREEZE [PARTITION partition_expr] -``` - -Cette requête crée une sauvegarde locale d'une partition spécifiée. Si l' `PARTITION` la clause est omise, la requête crée la sauvegarde de toutes les partitions à la fois. - -!!! note "Note" - L'ensemble du processus de sauvegarde est effectuée sans arrêter le serveur. - -Notez que pour les tables de style ancien, vous pouvez spécifier le préfixe du nom de la partition (par exemple, ‘2019’)- ensuite, la requête crée la sauvegarde pour toutes les partitions correspondantes. Lisez à propos de la définition de l'expression de partition dans une section [Comment spécifier l'expression de partition](#alter-how-to-specify-part-expr). - -Au moment de l'exécution, pour un instantané de données, la requête crée des liens rigides vers des données de table. Les liens sont placés dans le répertoire `/var/lib/clickhouse/shadow/N/...`, où: - -- `/var/lib/clickhouse/` est le répertoire de travail clickhouse spécifié dans la configuration. -- `N` est le numéro incrémental de la sauvegarde. - -!!! note "Note" - Si vous utilisez [un ensemble de disques pour le stockage des données dans une table](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), le `shadow/N` le répertoire apparaît sur chaque disque, stockant les parties de données correspondant `PARTITION` expression. - -La même structure de répertoires est créée à l'intérieur de la sauvegarde qu'à l'intérieur `/var/lib/clickhouse/`. 
La requête effectue ‘chmod’ pour tous les fichiers, interdisant d'écrire en eux. - -Après avoir créé la sauvegarde, vous pouvez copier les données depuis `/var/lib/clickhouse/shadow/` sur le serveur distant, puis supprimez-le du serveur local. Notez que l' `ALTER t FREEZE PARTITION` la requête n'est pas répliqué. Il crée une sauvegarde locale uniquement sur le serveur local. - -La requête crée une sauvegarde presque instantanément (mais elle attend d'abord que les requêtes en cours à la table correspondante se terminent). - -`ALTER TABLE t FREEZE PARTITION` copie uniquement les données, pas les métadonnées de la table. Faire une sauvegarde des métadonnées de la table, copiez le fichier `/var/lib/clickhouse/metadata/database/table.sql` - -Pour restaurer des données à partir d'une sauvegarde, procédez comme suit: - -1. Créer la table si elle n'existe pas. Pour afficher la requête, utilisez la .fichier sql (remplacer `ATTACH` avec `CREATE`). -2. Copier les données de la `data/database/table/` répertoire à l'intérieur de la sauvegarde `/var/lib/clickhouse/data/database/table/detached/` répertoire. -3. Exécuter `ALTER TABLE t ATTACH PARTITION` les requêtes pour ajouter les données à une table. - -La restauration à partir d'une sauvegarde ne nécessite pas l'arrêt du serveur. - -Pour plus d'informations sur les sauvegardes et la restauration [La Sauvegarde Des Données](../../operations/backup.md) section. - -#### CLEAR INDEX IN PARTITION {#alter_clear-index-partition} - -``` sql -ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr -``` - -La requête fonctionne de manière similaire à `CLEAR COLUMN` mais il remet un index au lieu d'une colonne de données. - -#### FETCH PARTITION {#alter_fetch-partition} - -``` sql -ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' -``` - -Télécharge une partition depuis un autre serveur. Cette requête ne fonctionne que pour les tables répliquées. - -La requête effectue les opérations suivantes: - -1. Télécharge la partition à partir du fragment spécifié. Dans ‘path-in-zookeeper’ vous devez spécifier un chemin vers le fragment dans ZooKeeper. -2. Ensuite, la requête met les données téléchargées dans le `detached` répertoire de la `table_name` table. L'utilisation de la [ATTACH PARTITION\|PART](#alter_attach-partition) requête pour ajouter les données à la table. - -Exemple: - -``` sql -ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; -ALTER TABLE users ATTACH PARTITION 201902; -``` - -Notez que: - -- Le `ALTER ... FETCH PARTITION` la requête n'est pas répliqué. Il place la partition à la `detached` répertoire sur le serveur local. -- Le `ALTER TABLE ... ATTACH` la requête est répliquée. Il ajoute les données à toutes les répliques. Les données sont ajoutées à l'une des répliques `detached` répertoire, et aux autres-des répliques voisines. - -Avant le téléchargement, le système vérifie si la partition existe et si la structure de la table correspond. La réplique la plus appropriée est sélectionnée automatiquement parmi les répliques saines. - -Bien que la requête soit appelée `ALTER TABLE`, il ne modifie pas la structure de la table et ne modifie pas immédiatement les données disponibles dans la table. - -#### MOVE PARTITION\|PART {#alter_move-partition} - -Déplace des partitions ou des parties de données vers un autre volume ou disque pour `MergeTree`-tables de moteur. 
Voir [Utilisation de plusieurs périphériques de bloc pour le stockage de données](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes). - -``` sql -ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name' -``` - -Le `ALTER TABLE t MOVE` requête: - -- Non répliqué, car différentes répliques peuvent avoir des stratégies de stockage différentes. -- Renvoie une erreur si le disque ou le volume n'est pas configuré. Query renvoie également une erreur si les conditions de déplacement des données, spécifiées dans la stratégie de stockage, ne peuvent pas être appliquées. -- Peut renvoyer une erreur dans le cas, lorsque les données à déplacer sont déjà déplacées par un processus en arrière-plan, simultané `ALTER TABLE t MOVE` requête ou à la suite de la fusion de données d'arrière-plan. Un utilisateur ne doit effectuer aucune action supplémentaire dans ce cas. - -Exemple: - -``` sql -ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow' -ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' -``` - -#### Comment définir L'Expression de la Partition {#alter-how-to-specify-part-expr} - -Vous pouvez spécifier l'expression de partition dans `ALTER ... PARTITION` requêtes de différentes manières: - -- Comme une valeur de l' `partition` la colonne de la `system.parts` table. Exemple, `ALTER TABLE visits DETACH PARTITION 201901`. -- Comme expression de la colonne de la table. Les constantes et les expressions constantes sont prises en charge. Exemple, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. -- À l'aide de l'ID de partition. Partition ID est un identifiant de chaîne de la partition (lisible par l'homme, si possible) qui est utilisé comme noms de partitions dans le système de fichiers et dans ZooKeeper. L'ID de partition doit être spécifié dans `PARTITION ID` clause, entre guillemets simples. Exemple, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- Dans le [ALTER ATTACH PART](#alter_attach-partition) et [DROP DETACHED PART](#alter_drop-detached) requête, pour spécifier le nom d'une partie, utilisez le littéral de chaîne avec une valeur de `name` la colonne de la [système.detached_parts](../../operations/system-tables.md#system_tables-detached_parts) table. Exemple, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. - -L'utilisation de guillemets lors de la spécification de la partition dépend du type d'expression de partition. Par exemple, pour la `String` type, vous devez spécifier son nom entre guillemets (`'`). Pour l' `Date` et `Int*` types aucune citation n'est nécessaire. - -Pour les tables de style ancien, vous pouvez spécifier la partition sous forme de nombre `201901` ou une chaîne de caractères `'201901'`. La syntaxe des tables new-style est plus stricte avec les types (similaire à l'analyseur pour le format D'entrée des valeurs). - -Toutes les règles ci-dessus sont aussi valables pour la [OPTIMIZE](misc.md#misc_operations-optimize) requête. Si vous devez spécifier la seule partition lors de l'optimisation d'une table non partitionnée, définissez l'expression `PARTITION tuple()`. Exemple: - -``` sql -OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; -``` - -Les exemples de `ALTER ... 
PARTITION` les requêtes sont démontrées dans les tests [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_local.sql) et [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). - -### Manipulations avec Table TTL {#manipulations-with-table-ttl} - -Vous pouvez modifier [tableau TTL](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) avec une demande du formulaire suivant: - -``` sql -ALTER TABLE table-name MODIFY TTL ttl-expression -``` - -### Synchronicité des requêtes ALTER {#synchronicity-of-alter-queries} - -Pour les tables non réplicables, tous `ALTER` les requêtes sont exécutées simultanément. Pour les tables réplicables, la requête ajoute simplement des instructions pour les actions appropriées à `ZooKeeper` et les actions elles-mêmes sont effectuées dès que possible. Cependant, la requête peut attendre que ces actions soient terminées sur tous les réplicas. - -Pour `ALTER ... ATTACH|DETACH|DROP` les requêtes, vous pouvez utiliser le `replication_alter_partitions_sync` configuration pour configurer l'attente. -Valeurs possibles: `0` – do not wait; `1` – only wait for own execution (default); `2` – wait for all. - -### Mutation {#alter-mutations} - -Les Mutations sont une variante ALTER query qui permet de modifier ou de supprimer des lignes dans une table. Contrairement à la norme `UPDATE` et `DELETE` les requêtes qui sont destinées aux changements de données de point, les mutations sont destinées aux opérations lourdes qui modifient beaucoup de lignes dans une table. Pris en charge pour le `MergeTree` famille de moteurs de table, y compris les moteurs avec support de réplication. - -Les tables existantes sont prêtes pour les mutations telles quelles (aucune conversion nécessaire), mais après l'application de la première mutation à une table, son format de métadonnées devient incompatible avec les versions précédentes du serveur et il devient impossible de revenir à une version précédente. - -Commandes actuellement disponibles: - -``` sql -ALTER TABLE [db.]table DELETE WHERE filter_expr -``` - -Le `filter_expr` doit être de type `UInt8`. La requête supprime les lignes de la table pour lesquelles cette expression prend une valeur différente de zéro. - -``` sql -ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr -``` - -Le `filter_expr` doit être de type `UInt8`. Cette requête met à jour les valeurs des colonnes spécifiées en les valeurs des expressions correspondantes dans les lignes pour lesquelles `filter_expr` prend une valeur non nulle. Les valeurs sont converties en type de colonne à l'aide `CAST` opérateur. La mise à jour des colonnes utilisées dans le calcul de la clé primaire ou de la clé de partition n'est pas prise en charge. - -``` sql -ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name -``` - -La requête reconstruit l'index secondaire `name` dans la partition `partition_name`. - -Une requête peut contenir plusieurs commandes séparées par des virgules. - -Pour les tables \* MergeTree, les mutations s'exécutent en réécrivant des parties de données entières. 
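As a hedged illustration of the mutation commands above (the table, columns, and filter values are hypothetical, not taken from this document), each of the following statements rewrites every data part that contains matching rows:

``` sql
-- Delete rows for one user; rewrites all parts containing such rows.
ALTER TABLE hits DELETE WHERE UserID = 12345;

-- Update two columns in old rows; values are CAST to the column types.
ALTER TABLE hits UPDATE Price = Price * 0.9, Banner = 'archived' WHERE EventDate < '2019-01-01';
```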
Il n'y a pas d'atomicité-les pièces sont substituées aux pièces mutées dès qu'elles sont prêtes et un `SELECT` la requête qui a commencé à s'exécuter pendant une mutation verra les données des parties qui ont déjà été mutées ainsi que les données des parties qui n'ont pas encore été mutées. - -Les Mutations sont totalement ordonnées par leur ordre de création et sont appliquées à chaque partie dans cet ordre. Les Mutations sont également partiellement ordonnées avec des insertions - les données insérées dans la table avant la soumission de la mutation seront mutées et les données insérées après ne seront pas mutées. Notez que les mutations ne bloquent en aucune façon les INSERTs. - -Une requête de mutation retourne immédiatement après l'ajout de l'entrée de mutation (dans le cas de tables répliquées à ZooKeeper, pour les tables non compliquées - au système de fichiers). La mutation elle-même s'exécute de manière asynchrone en utilisant les paramètres du profil système. Pour suivre l'avancement des mutations vous pouvez utiliser la [`system.mutations`](../../operations/system-tables.md#system_tables-mutations) table. Une mutation qui a été soumise avec succès continuera à s'exécuter même si les serveurs ClickHouse sont redémarrés. Il n'y a aucun moyen de faire reculer la mutation une fois qu'elle est soumise, mais si la mutation est bloquée pour une raison quelconque, elle peut être annulée avec le [`KILL MUTATION`](misc.md#kill-mutation) requête. - -Les entrées pour les mutations finies ne sont pas supprimées immédiatement (le nombre d'entrées conservées est déterminé par `finished_mutations_to_keep` le moteur de stockage de paramètre). Les anciennes entrées de mutation sont supprimées. - -## ALTER USER {#alter-user-statement} - -Changements clickhouse comptes d'utilisateurs. - -### Syntaxe {#alter-user-syntax} - -``` sql -ALTER USER [IF EXISTS] name [ON CLUSTER cluster_name] - [RENAME TO new_name] - [IDENTIFIED [WITH {PLAINTEXT_PASSWORD|SHA256_PASSWORD|DOUBLE_SHA1_PASSWORD}] BY {'password'|'hash'}] - [[ADD|DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] - [DEFAULT ROLE role [,...] | ALL | ALL EXCEPT role [,...] ] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] -``` - -### Description {#alter-user-dscr} - -Utiliser `ALTER USER` vous devez avoir le [ALTER USER](grant.md#grant-access-management) privilège. - -### Exemple {#alter-user-examples} - -Définir les rôles accordés par défaut: - -``` sql -ALTER USER user DEFAULT ROLE role1, role2 -``` - -Si les rôles ne sont pas précédemment accordés à un utilisateur, ClickHouse lève une exception. - -Définissez tous les rôles accordés à défaut: - -``` sql -ALTER USER user DEFAULT ROLE ALL -``` - -Si un rôle seront accordés à un utilisateur dans l'avenir, il deviendra automatiquement par défaut. - -Définissez tous les rôles accordés sur default excepting `role1` et `role2`: - -``` sql -ALTER USER user DEFAULT ROLE ALL EXCEPT role1, role2 -``` - -## ALTER ROLE {#alter-role-statement} - -Les changements de rôles. - -### Syntaxe {#alter-role-syntax} - -``` sql -ALTER ROLE [IF EXISTS] name [ON CLUSTER cluster_name] - [RENAME TO new_name] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] -``` - -## ALTER ROW POLICY {#alter-row-policy-statement} - -Modifie la stratégie de ligne. 
- -### Syntaxe {#alter-row-policy-syntax} - -``` sql -ALTER [ROW] POLICY [IF EXISTS] name [ON CLUSTER cluster_name] ON [database.]table - [RENAME TO new_name] - [AS {PERMISSIVE | RESTRICTIVE}] - [FOR SELECT] - [USING {condition | NONE}][,...] - [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] -``` - -## ALTER QUOTA {#alter-quota-statement} - -Les changements de quotas. - -### Syntaxe {#alter-quota-syntax} - -``` sql -ALTER QUOTA [IF EXISTS] name [ON CLUSTER cluster_name] - [RENAME TO new_name] - [KEYED BY {'none' | 'user name' | 'ip address' | 'client key' | 'client key or user name' | 'client key or ip address'}] - [FOR [RANDOMIZED] INTERVAL number {SECOND | MINUTE | HOUR | DAY | WEEK | MONTH | QUARTER | YEAR} - {MAX { {QUERIES | ERRORS | RESULT ROWS | RESULT BYTES | READ ROWS | READ BYTES | EXECUTION TIME} = number } [,...] | - NO LIMITS | TRACKING ONLY} [,...]] - [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] -``` - -## ALTER SETTINGS PROFILE {#alter-settings-profile-statement} - -Les changements de quotas. - -### Syntaxe {#alter-settings-profile-syntax} - -``` sql -ALTER SETTINGS PROFILE [IF EXISTS] name [ON CLUSTER cluster_name] - [RENAME TO new_name] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | INHERIT 'profile_name'] [,...] -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/alter/) diff --git a/docs/fr/sql-reference/statements/create.md b/docs/fr/sql-reference/statements/create.md deleted file mode 100644 index e7c8040ee6e..00000000000 --- a/docs/fr/sql-reference/statements/create.md +++ /dev/null @@ -1,502 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 35 -toc_title: CREATE ---- - -# Créer des requêtes {#create-queries} - -## CREATE DATABASE {#query-language-create-database} - -Crée la base de données. - -``` sql -CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] -``` - -### Clause {#clauses} - -- `IF NOT EXISTS` - Si l' `db_name` la base de données existe déjà, alors ClickHouse ne crée pas de nouvelle base de données et: - - - Ne lance pas d'exception si la clause est spécifiée. - - Lève une exception si la clause n'est pas spécifiée. - -- `ON CLUSTER` - Clickhouse crée le `db_name` base de données sur tous les serveurs d'un cluster spécifié. - -- `ENGINE` - - - [MySQL](../../engines/database-engines/mysql.md) - Vous permet de récupérer des données à partir du serveur MySQL distant. - Par défaut, ClickHouse utilise son propre [moteur de base de données](../../engines/database-engines/index.md). - -## CREATE TABLE {#create-table-query} - -Le `CREATE TABLE` la requête peut avoir plusieurs formes. - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [compression_codec] [TTL expr2], - ... -) ENGINE = engine -``` - -Crée une table nommée ‘name’ dans le ‘db’ base de données ou la base de données actuelle si ‘db’ n'est pas définie, avec la structure spécifiée entre parenthèses et l' ‘engine’ moteur. -La structure de la table est une liste de descriptions de colonnes. Si les index sont pris en charge par le moteur, ils sont indiqués comme paramètres pour le moteur de table. - -Une description de colonne est `name type` dans le cas le plus simple. Exemple: `RegionID UInt32`. 
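To make the basic form above concrete, here is a minimal sketch; the database, table, column names, and sorting key are illustrative assumptions rather than anything defined in this document:

``` sql
CREATE TABLE IF NOT EXISTS db.visits
(
    RegionID  UInt32,
    EventDate Date,
    URL       String
) ENGINE = MergeTree()
ORDER BY (RegionID, EventDate)
```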
-Des Expressions peuvent également être définies pour les valeurs par défaut (voir ci-dessous). - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] -``` - -Crée une table avec la même structure qu'une autre table. Vous pouvez spécifier un moteur différent pour la table. Si le moteur n'est pas spécifié, le même moteur sera utilisé que pour la `db2.name2` table. - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() -``` - -Crée une table avec la structure et les données renvoyées par [fonction de table](../table-functions/index.md#table-functions). - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... -``` - -Crée une table avec une structure comme le résultat de l' `SELECT` une requête avec les ‘engine’ moteur, et le remplit avec des données de SELECT. - -Dans tous les cas, si `IF NOT EXISTS` est spécifié, la requête ne renvoie pas une erreur si la table existe déjà. Dans ce cas, la requête ne font rien. - -Il peut y avoir d'autres clauses après le `ENGINE` la clause dans la requête. Voir la documentation détaillée sur la façon de créer des tables dans les descriptions de [moteurs de table](../../engines/table-engines/index.md#table_engines). - -### Les Valeurs Par Défaut {#create-default-values} - -La description de colonne peut spécifier une expression pour une valeur par défaut, de l'une des manières suivantes:`DEFAULT expr`, `MATERIALIZED expr`, `ALIAS expr`. -Exemple: `URLDomain String DEFAULT domain(URL)`. - -Si une expression pour la valeur par défaut n'est pas définie, les valeurs par défaut seront définies sur zéros pour les nombres, chaînes vides pour les chaînes, tableaux vides pour les tableaux et `1970-01-01` pour les dates ou zero unix timestamp pour les dates avec le temps. Les valeurs NULL ne sont pas prises en charge. - -Si l'expression par défaut est définie, le type de colonne est facultatif. S'il n'y a pas de type explicitement défini, le type d'expression par défaut est utilisé. Exemple: `EventDate DEFAULT toDate(EventTime)` – the ‘Date’ type sera utilisé pour la ‘EventDate’ colonne. - -Si le type de données et l'expression par défaut sont définis explicitement, cette expression sera convertie au type spécifié à l'aide des fonctions de conversion de type. Exemple: `Hits UInt32 DEFAULT 0` signifie la même chose que `Hits UInt32 DEFAULT toUInt32(0)`. - -Default expressions may be defined as an arbitrary expression from table constants and columns. When creating and changing the table structure, it checks that expressions don't contain loops. For INSERT, it checks that expressions are resolvable – that all columns they can be calculated from have been passed. - -`DEFAULT expr` - -Valeur par défaut normale. Si la requête INSERT ne spécifie pas la colonne correspondante, elle sera remplie en calculant l'expression correspondante. - -`MATERIALIZED expr` - -Expression matérialisée. Une telle colonne ne peut pas être spécifiée pour INSERT, car elle est toujours calculée. -Pour un INSERT sans Liste de colonnes, ces colonnes ne sont pas prises en compte. -De plus, cette colonne n'est pas substituée lors de l'utilisation d'un astérisque dans une requête SELECT. C'est pour préserver l'invariant que le dump obtenu en utilisant `SELECT *` peut être inséré dans la table en utilisant INSERT sans spécifier la liste des colonnes. - -`ALIAS expr` - -Synonyme. Une telle colonne n'est pas du tout stockée dans la table. 
-Ses valeurs ne peuvent pas être insérées dans une table et elles ne sont pas substituées lors de l'utilisation d'un astérisque dans une requête SELECT. -Il peut être utilisé dans SELECTs si l'alias est développé pendant l'analyse des requêtes. - -Lorsque vous utilisez la requête ALTER pour ajouter de nouvelles colonnes, les anciennes données de ces colonnes ne sont pas écrites. Au lieu de cela, lors de la lecture d'anciennes données qui n'ont pas de valeurs pour les nouvelles colonnes, les expressions sont calculées à la volée par défaut. Cependant, si l'exécution des expressions nécessite différentes colonnes qui ne sont pas indiquées dans la requête, ces colonnes seront en outre lues, mais uniquement pour les blocs de données qui en ont besoin. - -Si vous ajoutez une nouvelle colonne à une table mais modifiez ultérieurement son expression par défaut, les valeurs utilisées pour les anciennes données changeront (pour les données où les valeurs n'ont pas été stockées sur le disque). Notez que lors de l'exécution de fusions d'arrière-plan, les données des colonnes manquantes dans l'une des parties de fusion sont écrites dans la partie fusionnée. - -Il n'est pas possible de définir des valeurs par défaut pour les éléments dans les structures de données. - -### Contraintes {#constraints} - -Avec les descriptions de colonnes des contraintes peuvent être définies: - -``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], - ... - CONSTRAINT constraint_name_1 CHECK boolean_expr_1, - ... -) ENGINE = engine -``` - -`boolean_expr_1` pourrait par n'importe quelle expression booléenne. Si les contraintes sont définies pour la table, chacun d'eux sera vérifiée pour chaque ligne `INSERT` query. If any constraint is not satisfied — server will raise an exception with constraint name and checking expression. - -L'ajout d'une grande quantité de contraintes peut affecter négativement les performances de big `INSERT` requête. - -### Expression TTL {#ttl-expression} - -Définit la durée de stockage des valeurs. Peut être spécifié uniquement pour les tables mergetree-family. Pour la description détaillée, voir [TTL pour les colonnes et les tableaux](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). - -### Codecs De Compression De Colonne {#codecs} - -Par défaut, ClickHouse applique le `lz4` méthode de compression. Pour `MergeTree`- famille de moteurs Vous pouvez modifier la méthode de compression par défaut dans le [compression](../../operations/server-configuration-parameters/settings.md#server-settings-compression) section d'une configuration de serveur. Vous pouvez également définir la méthode de compression pour chaque colonne `CREATE TABLE` requête. - -``` sql -CREATE TABLE codec_example -( - dt Date CODEC(ZSTD), - ts DateTime CODEC(LZ4HC), - float_value Float32 CODEC(NONE), - double_value Float64 CODEC(LZ4HC(9)) - value Float32 CODEC(Delta, ZSTD) -) -ENGINE = -... -``` - -Si un codec est spécifié, le codec par défaut ne s'applique pas. Les Codecs peuvent être combinés dans un pipeline, par exemple, `CODEC(Delta, ZSTD)`. Pour sélectionner la meilleure combinaison de codecs pour votre projet, passez des benchmarks similaires à ceux décrits dans Altinity [Nouveaux encodages pour améliorer L'efficacité du ClickHouse](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. - -!!! 
warning "Avertissement" - Vous ne pouvez pas décompresser les fichiers de base de données ClickHouse avec des utilitaires externes tels que `lz4`. Au lieu de cela, utilisez le spécial [clickhouse-compresseur](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utilitaire. - -La Compression est prise en charge pour les moteurs de tableau suivants: - -- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) famille. Prend en charge les codecs de compression de colonne et la sélection de la méthode de compression par défaut par [compression](../../operations/server-configuration-parameters/settings.md#server-settings-compression) paramètre. -- [Journal](../../engines/table-engines/log-family/index.md) famille. Utilise le `lz4` méthode de compression par défaut et prend en charge les codecs de compression de colonne. -- [Définir](../../engines/table-engines/special/set.md). Uniquement pris en charge la compression par défaut. -- [Rejoindre](../../engines/table-engines/special/join.md). Uniquement pris en charge la compression par défaut. - -ClickHouse prend en charge les codecs à usage commun et les codecs spécialisés. - -#### Codecs Spécialisés {#create-query-specialized-codecs} - -Ces codecs sont conçus pour rendre la compression plus efficace en utilisant des fonctionnalités spécifiques des données. Certains de ces codecs ne compressent pas les données eux-mêmes. Au lieu de cela, ils préparent les données pour un codec à usage commun, qui les compresse mieux que sans cette préparation. - -Spécialisé codecs: - -- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` sont utilisés pour stocker des valeurs delta, donc `delta_bytes` est la taille maximale des valeurs brutes. Possible `delta_bytes` valeurs: 1, 2, 4, 8. La valeur par défaut pour `delta_bytes` être `sizeof(type)` si égale à 1, 2, 4 ou 8. Dans tous les autres cas, c'est 1. -- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: Une Base De Données De Séries Chronologiques Rapide, Évolutive Et En Mémoire](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: Une Base De Données De Séries Chronologiques Rapide, Évolutive Et En Mémoire](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` et `DateTime`). À chaque étape de son algorithme, le codec prend un bloc de 64 valeurs, les place dans une matrice de 64x64 bits, le transpose, recadre les bits de valeurs inutilisés et renvoie le reste sous forme de séquence. 
Les bits inutilisés sont les bits, qui ne diffèrent pas entre les valeurs maximum et minimum dans la partie de données entière pour laquelle la compression est utilisée. - -`DoubleDelta` et `Gorilla` les codecs sont utilisés dans Gorilla TSDB comme composants de son algorithme de compression. L'approche Gorilla est efficace dans les scénarios où il y a une séquence de valeurs qui changent lentement avec leurs horodatages. Les horodatages sont effectivement compressés par le `DoubleDelta` codec, et les valeurs sont effectivement comprimé par le `Gorilla` codec. Par exemple, pour obtenir une table stockée efficacement, vous pouvez la créer dans la configuration suivante: - -``` sql -CREATE TABLE codec_example -( - timestamp DateTime CODEC(DoubleDelta), - slow_values Float32 CODEC(Gorilla) -) -ENGINE = MergeTree() -``` - -#### Codecs À Usage Général {#create-query-general-purpose-codecs} - -Codec: - -- `NONE` — No compression. -- `LZ4` — Lossless [algorithme de compression de données](https://github.com/lz4/lz4) utilisé par défaut. Applique la compression rapide LZ4. -- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` s'applique le niveau par défaut. Niveaux possibles: \[1, 12\]. Plage de niveau recommandée: \[4, 9\]. -- `ZSTD[(level)]` — [Algorithme de compression ZSTD](https://en.wikipedia.org/wiki/Zstandard) avec configurables `level`. Niveaux possibles: \[1, 22\]. Valeur par défaut: 1. - -Des niveaux de compression élevés sont utiles pour les scénarios asymétriques, comme compresser une fois, décompresser à plusieurs reprises. Des niveaux plus élevés signifient une meilleure compression et une utilisation plus élevée du processeur. - -## Les Tables Temporaires {#temporary-tables} - -Clickhouse prend en charge les tables temporaires qui ont les caractéristiques suivantes: - -- Les tables temporaires disparaissent à la fin de la session, y compris si la connexion est perdue. -- Une table temporaire utilise uniquement le moteur de mémoire. -- La base de données ne peut pas être spécifiée pour une table temporaire. Il est créé en dehors des bases de données. -- Impossible de créer une table temporaire avec une requête DDL distribuée sur tous les serveurs de cluster (en utilisant `ON CLUSTER`): ce tableau n'existe que dans la session en cours. -- Si une table temporaire a le même nom qu'une autre et qu'une requête spécifie le nom de la table sans spécifier la base de données, la table temporaire sera utilisée. -- Pour le traitement des requêtes distribuées, les tables temporaires utilisées dans une requête sont transmises à des serveurs distants. - -Pour créer une table temporaire, utilisez la syntaxe suivante: - -``` sql -CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) -``` - -Dans la plupart des cas, les tables temporaires ne sont pas créées manuellement, mais lors de l'utilisation de données externes pour une requête ou pour `(GLOBAL) IN`. Pour plus d'informations, consultez les sections appropriées - -Il est possible d'utiliser des tables avec [Moteur = mémoire](../../engines/table-engines/special/memory.md) au lieu de tables temporaires. - -## Requêtes DDL distribuées (sur la clause CLUSTER) {#distributed-ddl-queries-on-cluster-clause} - -Le `CREATE`, `DROP`, `ALTER`, et `RENAME` les requêtes prennent en charge l'exécution distribuée sur un cluster. 
-Par exemple, la requête suivante crée la `all_hits` `Distributed` tableau sur chaque ordinateur hôte `cluster`: - -``` sql -CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits) -``` - -Pour exécuter ces requêtes correctement, chaque hôte doit avoir la même définition de cluster (pour simplifier la synchronisation des configs, vous pouvez utiliser des substitutions de ZooKeeper). Ils doivent également se connecter aux serveurs ZooKeeper. -La version locale de la requête sera finalement implémentée sur chaque hôte du cluster, même si certains hôtes ne sont actuellement pas disponibles. L'ordre d'exécution des requêtes au sein d'un seul hôte est garanti. - -## CREATE VIEW {#create-view} - -``` sql -CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... -``` - -Crée une vue. Il existe deux types de vues: normale et matérialisée. - -Les vues normales ne stockent aucune donnée, mais effectuent simplement une lecture à partir d'une autre table. En d'autres termes, une vue normale n'est rien de plus qu'une requête enregistrée. Lors de la lecture à partir d'une vue, cette requête enregistrée est utilisée comme sous-requête dans la clause FROM. - -Par exemple, supposons que vous avez créé une vue: - -``` sql -CREATE VIEW view AS SELECT ... -``` - -et écrit une requête: - -``` sql -SELECT a, b, c FROM view -``` - -Cette requête est entièrement équivalente à l'utilisation de la sous requête: - -``` sql -SELECT a, b, c FROM (SELECT ...) -``` - -Les vues matérialisées stockent les données transformées par la requête SELECT correspondante. - -Lors de la création d'une vue matérialisée sans `TO [db].[table]`, you must specify ENGINE – the table engine for storing data. - -Lors de la création d'une vue matérialisée avec `TO [db].[table]` vous ne devez pas utiliser `POPULATE`. - -Une vue matérialisée est agencée comme suit: lors de l'insertion de données dans la table spécifiée dans SELECT, une partie des données insérées est convertie par cette requête SELECT, et le résultat est inséré dans la vue. - -Si vous spécifiez POPULATE, les données de table existantes sont insérées dans la vue lors de sa création, comme si `CREATE TABLE ... AS SELECT ...` . Sinon, la requête ne contient que les données insérées dans la table après la création de la vue. Nous ne recommandons pas D'utiliser POPULATE, car les données insérées dans la table lors de la création de la vue ne seront pas insérées dedans. - -A `SELECT` la requête peut contenir `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Note that the corresponding conversions are performed independently on each block of inserted data. For example, if `GROUP BY` est définie, les données sont agrégées lors de l'insertion, mais uniquement dans un seul paquet de données insérées. Les données ne seront pas agrégées davantage. L'exception concerne l'utilisation d'un moteur qui effectue indépendamment l'agrégation de données, par exemple `SummingMergeTree`. - -L'exécution de `ALTER` les requêtes sur les vues matérialisées n'ont pas été complètement développées, elles pourraient donc être gênantes. Si la vue matérialisée utilise la construction `TO [db.]name` vous pouvez `DETACH` la vue, exécutez `ALTER` pour la table cible, puis `ATTACH` précédemment détaché (`DETACH`) vue. - -Les vues ressemblent aux tables normales. Par exemple, ils sont répertoriés dans le résultat de la `SHOW TABLES` requête. 
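As a hedged sketch of the `TO [db].[table]` form described above (all table and view names are hypothetical), a materialized view can forward aggregated inserts into a separately created target table:

``` sql
-- The target table must exist before the view is created.
CREATE TABLE hits_daily (EventDate Date, views UInt64)
ENGINE = SummingMergeTree() ORDER BY EventDate;

CREATE MATERIALIZED VIEW hits_daily_mv TO hits_daily
AS SELECT EventDate, count() AS views
FROM hits
GROUP BY EventDate;
```

Because the `GROUP BY` is applied only within each block of inserted data, an aggregating target engine such as `SummingMergeTree` is chosen here so that partial results are combined during merges, as noted above.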
- -Il n'y a pas de requête séparée pour supprimer des vues. Pour supprimer une vue, utilisez `DROP TABLE`. - -## CREATE DICTIONARY {#create-dictionary-query} - -``` sql -CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] -( - key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], - key2 type2 [DEFAULT|EXPRESSION expr2] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], - attr1 type2 [DEFAULT|EXPRESSION expr3], - attr2 type2 [DEFAULT|EXPRESSION expr4] -) -PRIMARY KEY key1, key2 -SOURCE(SOURCE_NAME([param1 value1 ... paramN valueN])) -LAYOUT(LAYOUT_NAME([param_name param_value])) -LIFETIME({MIN min_val MAX max_val | max_val}) -``` - -Crée [externe dictionnaire](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) avec le [structure](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [disposition](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) et [vie](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). - -Structure de dictionnaire externe se compose d'attributs. Les attributs du dictionnaire sont spécifiés de la même manière que les colonnes du tableau. La seule propriété d'attribut requise est son type, toutes les autres propriétés peuvent avoir des valeurs par défaut. - -Selon le dictionnaire [disposition](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) un ou plusieurs attributs peuvent être spécifiés comme les clés de dictionnaire. - -Pour plus d'informations, voir [Dictionnaires Externes](../dictionaries/external-dictionaries/external-dicts.md) section. - -## CREATE USER {#create-user-statement} - -Crée un [compte d'utilisateur](../../operations/access-rights.md#user-account-management). - -### Syntaxe {#create-user-syntax} - -``` sql -CREATE USER [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] - [IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH}] BY {'password'|'hash'}] - [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] - [DEFAULT ROLE role [,...]] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] -``` - -#### Identification {#identification} - -Il existe de multiples façons d'identification d'un utilisateur: - -- `IDENTIFIED WITH no_password` -- `IDENTIFIED WITH plaintext_password BY 'qwerty'` -- `IDENTIFIED WITH sha256_password BY 'qwerty'` ou `IDENTIFIED BY 'password'` -- `IDENTIFIED WITH sha256_hash BY 'hash'` -- `IDENTIFIED WITH double_sha1_password BY 'qwerty'` -- `IDENTIFIED WITH double_sha1_hash BY 'hash'` - -#### L'Utilisateur De L'Hôte {#user-host} - -L'hôte utilisateur est un hôte à partir duquel une connexion au serveur ClickHouse peut être établie. Hôte peut être spécifié dans le `HOST` section de requête par les moyens suivants: - -- `HOST IP 'ip_address_or_subnetwork'` — User can connect to ClickHouse server only from the specified IP address or a [sous-réseau](https://en.wikipedia.org/wiki/Subnetwork). Exemple: `HOST IP '192.168.0.0/16'`, `HOST IP '2001:DB8::/32'`. Pour une utilisation en production, spécifiez uniquement `HOST IP` (adresses IP et leurs masques), depuis l'utilisation `host` et `host_regexp` peut causer une latence supplémentaire. 
-- `HOST ANY` — User can connect from any location. This is default option. -- `HOST LOCAL` — User can connect only locally. -- `HOST NAME 'fqdn'` — User host can be specified as FQDN. For example, `HOST NAME 'mysite.com'`. -- `HOST NAME REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) expressions régulières lors de la spécification des hôtes utilisateur. Exemple, `HOST NAME REGEXP '.*\.mysite\.com'`. -- `HOST LIKE 'template'` — Allows you use the [LIKE](../functions/string-search-functions.md#function-like) opérateur de filtre de l'utilisateur hôtes. Exemple, `HOST LIKE '%'` est équivalent à `HOST ANY`, `HOST LIKE '%.mysite.com'` filtre tous les hôtes dans le `mysite.com` domaine. - -Une autre façon de spécifier l'hôte est d'utiliser `@` syntaxe avec le nom d'utilisateur. Exemple: - -- `CREATE USER mira@'127.0.0.1'` — Equivalent to the `HOST IP` syntaxe. -- `CREATE USER mira@'localhost'` — Equivalent to the `HOST LOCAL` syntaxe. -- `CREATE USER mira@'192.168.%.%'` — Equivalent to the `HOST LIKE` syntaxe. - -!!! info "Avertissement" - Clickhouse traite `user_name@'address'` comme un nom d'utilisateur dans son ensemble. Donc, techniquement, vous pouvez créer plusieurs utilisateurs avec `user_name` et différentes constructions après `@`. Nous ne recommandons pas de le faire. - -### Exemple {#create-user-examples} - -Créer le compte d'utilisateur `mira` protégé par le mot de passe `qwerty`: - -``` sql -CREATE USER mira HOST IP '127.0.0.1' IDENTIFIED WITH sha256_password BY 'qwerty' -``` - -`mira` devrait démarrer l'application client sur l'hôte où le serveur ClickHouse s'exécute. - -Créer le compte d'utilisateur `john`, attribuez-lui des rôles et définissez ces rôles par défaut: - -``` sql -CREATE USER john DEFAULT ROLE role1, role2 -``` - -Créer le compte d'utilisateur `john` et faire tous ses futurs rôles par défaut: - -``` sql -ALTER USER user DEFAULT ROLE ALL -``` - -Quand un rôle sera attribué à `john` dans l'avenir, il deviendra automatiquement par défaut. - -Créer le compte d'utilisateur `john` et faire tous ses futurs rôles par défaut sauf `role1` et `role2`: - -``` sql -ALTER USER john DEFAULT ROLE ALL EXCEPT role1, role2 -``` - -## CREATE ROLE {#create-role-statement} - -Crée un [rôle](../../operations/access-rights.md#role-management). - -### Syntaxe {#create-role-syntax} - -``` sql -CREATE ROLE [IF NOT EXISTS | OR REPLACE] name - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] -``` - -### Description {#create-role-description} - -Rôle est un ensemble de [privilège](grant.md#grant-privileges). Un utilisateur reçoit un rôle obtient tous les privilèges de ce rôle. - -Un utilisateur peut être affecté à plusieurs rôles. Les utilisateurs peuvent appliquer leurs rôles accordés dans des combinaisons arbitraires par le [SET ROLE](misc.md#set-role-statement) déclaration. La finale de la portée des privilèges est un ensemble combiné de tous les privilèges de tous les rôles. Si un utilisateur a des privilèges accordés directement à son compte d'utilisateur, ils sont également combinés avec les privilèges accordés par les rôles. - -L'utilisateur peut avoir des rôles par défaut qui s'appliquent à la connexion de l'utilisateur. Pour définir les rôles par défaut, utilisez [SET DEFAULT ROLE](misc.md#set-default-role-statement) - déclaration ou de la [ALTER USER](alter.md#alter-user-statement) déclaration. - -Pour révoquer un rôle, utilisez [REVOKE](revoke.md) déclaration. 
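For example, a hedged sketch with hypothetical role and user names: set a granted role as the default for a user, then revoke it later:

``` sql
SET DEFAULT ROLE accountant TO mira;
REVOKE accountant FROM mira;
```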
- -Pour supprimer le rôle, utilisez [DROP ROLE](misc.md#drop-role-statement) déclaration. Le rôle supprimé est automatiquement révoqué de tous les utilisateurs et rôles auxquels il a été accordé. - -### Exemple {#create-role-examples} - -``` sql -CREATE ROLE accountant; -GRANT SELECT ON db.* TO accountant; -``` - -Cette séquence de requêtes crée le rôle `accountant` cela a le privilège de lire les données du `accounting` la base de données. - -Octroi du rôle à l'utilisateur `mira`: - -``` sql -GRANT accountant TO mira; -``` - -Une fois le rôle accordé, l'utilisateur peut l'utiliser et effectuer les requêtes autorisées. Exemple: - -``` sql -SET ROLE accountant; -SELECT * FROM db.*; -``` - -## CREATE ROW POLICY {#create-row-policy-statement} - -Crée un [filtre pour les lignes](../../operations/access-rights.md#row-policy-management) qu'un utilisateur peut lire à partir d'une table. - -### Syntaxe {#create-row-policy-syntax} - -``` sql -CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name [ON CLUSTER cluster_name] ON [db.]table - [AS {PERMISSIVE | RESTRICTIVE}] - [FOR SELECT] - [USING condition] - [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] -``` - -#### Section AS {#create-row-policy-as} - -À l'aide de cette section, vous pouvez créer des stratégies permissives ou restrictives. - -La stratégie Permissive accorde l'accès aux lignes. Les stratégies permissives qui s'appliquent à la même table sont combinées ensemble en utilisant le booléen `OR` opérateur. Les stratégies sont permissives par défaut. - -La politique Restrictive limite l'accès à la ligne. Les politiques restrictives qui s'appliquent à la même table sont combinées en utilisant le booléen `AND` opérateur. - -Les stratégies restrictives s'appliquent aux lignes qui ont passé les filtres permissifs. Si vous définissez des stratégies restrictives mais aucune politique permissive, l'utilisateur ne peut obtenir aucune ligne de la table. - -#### La Section DE {#create-row-policy-to} - -Dans la section `TO` vous pouvez donner une liste mixte de rôles et d'utilisateurs, par exemple, `CREATE ROW POLICY ... TO accountant, john@localhost`. - -Mot `ALL` signifie Tous les utilisateurs de ClickHouse, y compris l'utilisateur actuel. Mot `ALL EXCEPT` autoriser à exclure certains utilisateurs de la liste tous les utilisateurs, par exemple `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` - -### Exemple {#examples} - -- `CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO accountant, john@localhost` -- `CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO ALL EXCEPT mira` - -## CREATE QUOTA {#create-quota-statement} - -Crée un [quota](../../operations/access-rights.md#quotas-management) qui peut être attribué à un utilisateur ou un rôle. - -### Syntaxe {#create-quota-syntax} - -``` sql -CREATE QUOTA [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] - [KEYED BY {'none' | 'user name' | 'ip address' | 'client key' | 'client key or user name' | 'client key or ip address'}] - [FOR [RANDOMIZED] INTERVAL number {SECOND | MINUTE | HOUR | DAY | WEEK | MONTH | QUARTER | YEAR} - {MAX { {QUERIES | ERRORS | RESULT ROWS | RESULT BYTES | READ ROWS | READ BYTES | EXECUTION TIME} = number } [,...] | - NO LIMITS | TRACKING ONLY} [,...]] - [TO {role [,...] 
| ALL | ALL EXCEPT role [,...]}] -``` - -### Exemple {#create-quota-example} - -Limiter le nombre maximum de requêtes pour l'utilisateur actuel avec 123 requêtes en 15 mois contrainte: - -``` sql -CREATE QUOTA qA FOR INTERVAL 15 MONTH MAX QUERIES 123 TO CURRENT_USER -``` - -## CREATE SETTINGS PROFILE {#create-settings-profile-statement} - -Crée un [les paramètres de profil](../../operations/access-rights.md#settings-profiles-management) qui peut être attribué à un utilisateur ou un rôle. - -### Syntaxe {#create-settings-profile-syntax} - -``` sql -CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | INHERIT 'profile_name'] [,...] -``` - -# Exemple {#create-settings-profile-syntax} - -Créer l' `max_memory_usage_profile` paramètres du profil avec valeur et contraintes pour `max_memory_usage` paramètre. L'affecter à `robin`: - -``` sql -CREATE SETTINGS PROFILE max_memory_usage_profile SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/create/) diff --git a/docs/fr/sql-reference/statements/grant.md b/docs/fr/sql-reference/statements/grant.md deleted file mode 100644 index 143c9a36e33..00000000000 --- a/docs/fr/sql-reference/statements/grant.md +++ /dev/null @@ -1,476 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: GRANT ---- - -# GRANT {#grant} - -- Accorder [privilège](#grant-privileges) pour ClickHouse comptes d'utilisateurs ou des rôles. -- Affecte des rôles à des comptes d'utilisateurs ou à d'autres rôles. - -Pour révoquer les privilèges, utilisez [REVOKE](revoke.md) déclaration. Vous pouvez également classer les privilèges accordés par le [SHOW GRANTS](show.md#show-grants-statement) déclaration. - -## Accorder La Syntaxe Des Privilèges {#grant-privigele-syntax} - -``` sql -GRANT [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.table|db.*|*.*|table|*} TO {user | role | CURRENT_USER} [,...] [WITH GRANT OPTION] -``` - -- `privilege` — Type of privilege. -- `role` — ClickHouse user role. -- `user` — ClickHouse user account. - -Le `WITH GRANT OPTION` clause de subventions `user` ou `role` avec l'autorisation de réaliser des `GRANT` requête. Les utilisateurs peuvent accorder des privilèges de la même portée qu'ils ont et moins. - -## Attribution De La Syntaxe Du Rôle {#assign-role-syntax} - -``` sql -GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_USER} [,...] [WITH ADMIN OPTION] -``` - -- `role` — ClickHouse user role. -- `user` — ClickHouse user account. - -Le `WITH ADMIN OPTION` clause de jeux [ADMIN OPTION](#admin-option-privilege) privilège pour `user` ou `role`. - -## Utilisation {#grant-usage} - -Utiliser `GRANT` votre compte doit avoir le `GRANT OPTION` privilège. Vous ne pouvez accorder des privilèges que dans le cadre de vos privilèges de Compte. - -Par exemple, l'administrateur a accordé des privilèges `john` compte par la requête: - -``` sql -GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION -``` - -Cela signifie que `john` a la permission d'effectuer: - -- `SELECT x,y FROM db.table`. -- `SELECT x FROM db.table`. -- `SELECT y FROM db.table`. - -`john` ne pouvez pas effectuer de `SELECT z FROM db.table`. Le `SELECT * FROM db.table` aussi n'est pas disponible. 
En traitant cette requête, ClickHouse ne renvoie aucune donnée, même `x` et `y`. La seule exception est si une table contient uniquement `x` et `y` colonnes, dans ce cas ClickHouse renvoie toutes les données. - -Également `john` a l' `GRANT OPTION` privilège, de sorte qu'il peut accorder à d'autres utilisateurs avec des privilèges de la même ou de la plus petite portée. - -Spécification des privilèges vous pouvez utiliser asterisk (`*`) au lieu d'une table ou d'un nom de base de données. Par exemple, l' `GRANT SELECT ON db.* TO john` requête permet `john` pour effectuer la `SELECT` requête sur toutes les tables dans `db` la base de données. En outre, vous pouvez omettre le nom de la base de données. Dans ce cas, des privilèges sont accordés pour la base de données actuelle, par exemple: `GRANT SELECT ON * TO john` accorde le privilège sur toutes les tables dans la base de données actuelle, `GRANT SELECT ON mytable TO john` accorde le privilège sur le `mytable` table dans la base de données actuelle. - -L'accès à la `system` la base de données est toujours autorisée (puisque cette base de données est utilisée pour traiter les requêtes). - -Vous pouvez accorder plusieurs privilèges à plusieurs comptes dans une requête. Requête `GRANT SELECT, INSERT ON *.* TO john, robin` permet de comptes `john` et `robin` pour effectuer la `INSERT` et `SELECT` des requêtes sur toutes les tables de toutes les bases de données sur le serveur. - -## Privilège {#grant-privileges} - -Privilège est une autorisation pour effectuer un type spécifique de requêtes. - -Les privilèges ont une structure hiérarchique. Un ensemble de requêtes autorisées dépend de la portée des privilèges. - -Hiérarchie des privilèges: - -- [SELECT](#grant-select) -- [INSERT](#grant-insert) -- [ALTER](#grant-alter) - - `ALTER TABLE` - - `ALTER UPDATE` - - `ALTER DELETE` - - `ALTER COLUMN` - - `ALTER ADD COLUMN` - - `ALTER DROP COLUMN` - - `ALTER MODIFY COLUMN` - - `ALTER COMMENT COLUMN` - - `ALTER CLEAR COLUMN` - - `ALTER RENAME COLUMN` - - `ALTER INDEX` - - `ALTER ORDER BY` - - `ALTER ADD INDEX` - - `ALTER DROP INDEX` - - `ALTER MATERIALIZE INDEX` - - `ALTER CLEAR INDEX` - - `ALTER CONSTRAINT` - - `ALTER ADD CONSTRAINT` - - `ALTER DROP CONSTRAINT` - - `ALTER TTL` - - `ALTER MATERIALIZE TTL` - - `ALTER SETTINGS` - - `ALTER MOVE PARTITION` - - `ALTER FETCH PARTITION` - - `ALTER FREEZE PARTITION` - - `ALTER VIEW` - - `ALTER VIEW REFRESH` - - `ALTER VIEW MODIFY QUERY` -- [CREATE](#grant-create) - - `CREATE DATABASE` - - `CREATE TABLE` - - `CREATE VIEW` - - `CREATE DICTIONARY` - - `CREATE TEMPORARY TABLE` -- [DROP](#grant-drop) - - `DROP DATABASE` - - `DROP TABLE` - - `DROP VIEW` - - `DROP DICTIONARY` -- [TRUNCATE](#grant-truncate) -- [OPTIMIZE](#grant-optimize) -- [SHOW](#grant-show) - - `SHOW DATABASES` - - `SHOW TABLES` - - `SHOW COLUMNS` - - `SHOW DICTIONARIES` -- [KILL QUERY](#grant-kill-query) -- [ACCESS MANAGEMENT](#grant-access-management) - - `CREATE USER` - - `ALTER USER` - - `DROP USER` - - `CREATE ROLE` - - `ALTER ROLE` - - `DROP ROLE` - - `CREATE ROW POLICY` - - `ALTER ROW POLICY` - - `DROP ROW POLICY` - - `CREATE QUOTA` - - `ALTER QUOTA` - - `DROP QUOTA` - - `CREATE SETTINGS PROFILE` - - `ALTER SETTINGS PROFILE` - - `DROP SETTINGS PROFILE` - - `SHOW ACCESS` - - `SHOW_USERS` - - `SHOW_ROLES` - - `SHOW_ROW_POLICIES` - - `SHOW_QUOTAS` - - `SHOW_SETTINGS_PROFILES` - - `ROLE ADMIN` -- [SYSTEM](#grant-system) - - `SYSTEM SHUTDOWN` - - `SYSTEM DROP CACHE` - - `SYSTEM DROP DNS CACHE` - - `SYSTEM DROP MARK CACHE` - - `SYSTEM DROP 
UNCOMPRESSED CACHE` - - `SYSTEM RELOAD` - - `SYSTEM RELOAD CONFIG` - - `SYSTEM RELOAD DICTIONARY` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES` - - `SYSTEM MERGES` - - `SYSTEM TTL MERGES` - - `SYSTEM FETCHES` - - `SYSTEM MOVES` - - `SYSTEM SENDS` - - `SYSTEM DISTRIBUTED SENDS` - - `SYSTEM REPLICATED SENDS` - - `SYSTEM REPLICATION QUEUES` - - `SYSTEM SYNC REPLICA` - - `SYSTEM RESTART REPLICA` - - `SYSTEM FLUSH` - - `SYSTEM FLUSH DISTRIBUTED` - - `SYSTEM FLUSH LOGS` -- [INTROSPECTION](#grant-introspection) - - `addressToLine` - - `addressToSymbol` - - `demangle` -- [SOURCES](#grant-sources) - - `FILE` - - `URL` - - `REMOTE` - - `YSQL` - - `ODBC` - - `JDBC` - - `HDFS` - - `S3` -- [dictGet](#grant-dictget) - -Exemples de la façon dont cette hiérarchie est traitée: - -- Le `ALTER` privilège comprend tous les autres `ALTER*` privilège. -- `ALTER CONSTRAINT` comprendre `ALTER ADD CONSTRAINT` et `ALTER DROP CONSTRAINT` privilège. - -Les privilèges sont appliqués à différents niveaux. Connaissant un niveau suggère la syntaxe disponible pour le privilège. - -Les niveaux (du plus faible au plus élevé): - -- `COLUMN` — Privilege can be granted for column, table, database, or globally. -- `TABLE` — Privilege can be granted for table, database, or globally. -- `VIEW` — Privilege can be granted for view, database, or globally. -- `DICTIONARY` — Privilege can be granted for dictionary, database, or globally. -- `DATABASE` — Privilege can be granted for database or globally. -- `GLOBAL` — Privilege can be granted only globally. -- `GROUP` — Groups privileges of different levels. When `GROUP`- le privilège de niveau est accordé, seuls les privilèges du groupe sont accordés qui correspondent à la syntaxe utilisée. - -Exemples de syntaxe: - -- `GRANT SELECT(x) ON db.table TO user` -- `GRANT SELECT ON db.* TO user` - -Exemples de syntaxe refusée: - -- `GRANT CREATE USER(x) ON db.table TO user` -- `GRANT CREATE USER ON db.* TO user` - -Le privilège spécial [ALL](#grant-all) accorde tous les privilèges à un compte d'utilisateur ou à un rôle. - -Par défaut, un compte d'utilisateur ou un rôle a pas de privilèges. - -Si un utilisateur ou un rôle ont pas de privilèges qu'il s'affiche comme [NONE](#grant-none) privilège. - -Certaines requêtes par leur implémentation nécessitent un ensemble de privilèges. Par exemple, pour effectuer la [RENAME](misc.md#misc_operations-rename) requête vous avez besoin des privilèges suivants: `SELECT`, `CREATE TABLE`, `INSERT` et `DROP TABLE`. - -### SELECT {#grant-select} - -Permet d'effectuer des [SELECT](select/index.md) requête. - -Le niveau de privilège: `COLUMN`. - -**Description** - -L'utilisateur accordé avec ce privilège peut effectuer `SELECT` requêtes sur une liste spécifiée de colonnes dans la table et la base de données spécifiées. Si l'utilisateur inclut d'autres colonnes, une requête ne renvoie aucune donnée. - -Considérez le privilège suivant: - -``` sql -GRANT SELECT(x,y) ON db.table TO john -``` - -Ce privilège permet à `john` pour effectuer toute `SELECT` requête qui implique des données du `x` et/ou `y` les colonnes en `db.table`. Exemple, `SELECT x FROM db.table`. `john` ne pouvez pas effectuer de `SELECT z FROM db.table`. Le `SELECT * FROM db.table` aussi n'est pas disponible. En traitant cette requête, ClickHouse ne renvoie aucune donnée, même `x` et `y`. La seule exception est si une table contient uniquement `x` et `y` colonnes, dans ce cas ClickHouse renvoie toutes les données. 
- -### INSERT {#grant-insert} - -Permet d'effectuer des [INSERT](insert-into.md) requête. - -Le niveau de privilège: `COLUMN`. - -**Description** - -L'utilisateur accordé avec ce privilège peut effectuer `INSERT` requêtes sur une liste spécifiée de colonnes dans la table et la base de données spécifiées. Si l'utilisateur inclut d'autres colonnes, une requête n'insère aucune donnée. - -**Exemple** - -``` sql -GRANT INSERT(x,y) ON db.table TO john -``` - -Le privilège accordé permet `john` pour insérer des données à l' `x` et/ou `y` les colonnes en `db.table`. - -### ALTER {#grant-alter} - -Permet d'effectuer des [ALTER](alter.md) requêtes correspondant à la hiérarchie de privilèges suivante: - -- `ALTER`. Niveau: `COLUMN`. - - `ALTER TABLE`. Niveau: `GROUP` - - `ALTER UPDATE`. Niveau: `COLUMN`. Alias: `UPDATE` - - `ALTER DELETE`. Niveau: `COLUMN`. Alias: `DELETE` - - `ALTER COLUMN`. Niveau: `GROUP` - - `ALTER ADD COLUMN`. Niveau: `COLUMN`. Alias: `ADD COLUMN` - - `ALTER DROP COLUMN`. Niveau: `COLUMN`. Alias: `DROP COLUMN` - - `ALTER MODIFY COLUMN`. Niveau: `COLUMN`. Alias: `MODIFY COLUMN` - - `ALTER COMMENT COLUMN`. Niveau: `COLUMN`. Alias: `COMMENT COLUMN` - - `ALTER CLEAR COLUMN`. Niveau: `COLUMN`. Alias: `CLEAR COLUMN` - - `ALTER RENAME COLUMN`. Niveau: `COLUMN`. Alias: `RENAME COLUMN` - - `ALTER INDEX`. Niveau: `GROUP`. Alias: `INDEX` - - `ALTER ORDER BY`. Niveau: `TABLE`. Alias: `ALTER MODIFY ORDER BY`, `MODIFY ORDER BY` - - `ALTER ADD INDEX`. Niveau: `TABLE`. Alias: `ADD INDEX` - - `ALTER DROP INDEX`. Niveau: `TABLE`. Alias: `DROP INDEX` - - `ALTER MATERIALIZE INDEX`. Niveau: `TABLE`. Alias: `MATERIALIZE INDEX` - - `ALTER CLEAR INDEX`. Niveau: `TABLE`. Alias: `CLEAR INDEX` - - `ALTER CONSTRAINT`. Niveau: `GROUP`. Alias: `CONSTRAINT` - - `ALTER ADD CONSTRAINT`. Niveau: `TABLE`. Alias: `ADD CONSTRAINT` - - `ALTER DROP CONSTRAINT`. Niveau: `TABLE`. Alias: `DROP CONSTRAINT` - - `ALTER TTL`. Niveau: `TABLE`. Alias: `ALTER MODIFY TTL`, `MODIFY TTL` - - `ALTER MATERIALIZE TTL`. Niveau: `TABLE`. Alias: `MATERIALIZE TTL` - - `ALTER SETTINGS`. Niveau: `TABLE`. Alias: `ALTER SETTING`, `ALTER MODIFY SETTING`, `MODIFY SETTING` - - `ALTER MOVE PARTITION`. Niveau: `TABLE`. Alias: `ALTER MOVE PART`, `MOVE PARTITION`, `MOVE PART` - - `ALTER FETCH PARTITION`. Niveau: `TABLE`. Alias: `FETCH PARTITION` - - `ALTER FREEZE PARTITION`. Niveau: `TABLE`. Alias: `FREEZE PARTITION` - - `ALTER VIEW` Niveau: `GROUP` - - `ALTER VIEW REFRESH`. Niveau: `VIEW`. Alias: `ALTER LIVE VIEW REFRESH`, `REFRESH VIEW` - - `ALTER VIEW MODIFY QUERY`. Niveau: `VIEW`. Alias: `ALTER TABLE MODIFY QUERY` - -Exemples de la façon dont cette hiérarchie est traitée: - -- Le `ALTER` privilège comprend tous les autres `ALTER*` privilège. -- `ALTER CONSTRAINT` comprendre `ALTER ADD CONSTRAINT` et `ALTER DROP CONSTRAINT` privilège. - -**Note** - -- Le `MODIFY SETTING` privilège permet de modifier les paramètres du moteur de table. In n'affecte pas les paramètres ou les paramètres de configuration du serveur. -- Le `ATTACH` opération a besoin de la [CREATE](#grant-create) privilège. -- Le `DETACH` opération a besoin de la [DROP](#grant-drop) privilège. -- Pour arrêter la mutation par le [KILL MUTATION](misc.md#kill-mutation) requête, vous devez avoir un privilège pour commencer cette mutation. Par exemple, si vous voulez arrêter l' `ALTER UPDATE` requête, vous avez besoin du `ALTER UPDATE`, `ALTER TABLE`, ou `ALTER` privilège. 
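As a hedged sketch of how this hierarchy can be applied (the user and table names are illustrative), granting only the narrow `ALTER UPDATE` privilege on a single table lets the user start, and therefore also kill, `UPDATE` mutations on that table without receiving the broader `ALTER` privilege:

``` sql
GRANT ALTER UPDATE ON db.table TO john
```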
- -### CREATE {#grant-create} - -Permet d'effectuer des [CREATE](create.md) et [ATTACH](misc.md#attach) DDL-requêtes correspondant à la hiérarchie de privilèges suivante: - -- `CREATE`. Niveau: `GROUP` - - `CREATE DATABASE`. Niveau: `DATABASE` - - `CREATE TABLE`. Niveau: `TABLE` - - `CREATE VIEW`. Niveau: `VIEW` - - `CREATE DICTIONARY`. Niveau: `DICTIONARY` - - `CREATE TEMPORARY TABLE`. Niveau: `GLOBAL` - -**Note** - -- Pour supprimer la table créée, l'utilisateur doit [DROP](#grant-drop). - -### DROP {#grant-drop} - -Permet d'effectuer des [DROP](misc.md#drop) et [DETACH](misc.md#detach) requêtes correspondant à la hiérarchie de privilèges suivante: - -- `DROP`. Niveau: - - `DROP DATABASE`. Niveau: `DATABASE` - - `DROP TABLE`. Niveau: `TABLE` - - `DROP VIEW`. Niveau: `VIEW` - - `DROP DICTIONARY`. Niveau: `DICTIONARY` - -### TRUNCATE {#grant-truncate} - -Permet d'effectuer des [TRUNCATE](misc.md#truncate-statement) requête. - -Le niveau de privilège: `TABLE`. - -### OPTIMIZE {#grant-optimize} - -Permet d'effectuer les [OPTIMIZE TABLE](misc.md#misc_operations-optimize) requête. - -Le niveau de privilège: `TABLE`. - -### SHOW {#grant-show} - -Permet d'effectuer des `SHOW`, `DESCRIBE`, `USE`, et `EXISTS` requêtes, correspondant à la hiérarchie suivante des privilèges: - -- `SHOW`. Niveau: `GROUP` - - `SHOW DATABASES`. Niveau: `DATABASE`. Permet d'exécuter des `SHOW DATABASES`, `SHOW CREATE DATABASE`, `USE ` requête. - - `SHOW TABLES`. Niveau: `TABLE`. Permet d'exécuter des `SHOW TABLES`, `EXISTS `, `CHECK
` requête. - - `SHOW COLUMNS`. Niveau: `COLUMN`. Permet d'exécuter des `SHOW CREATE TABLE`, `DESCRIBE` requête. - - `SHOW DICTIONARIES`. Niveau: `DICTIONARY`. Permet d'exécuter des `SHOW DICTIONARIES`, `SHOW CREATE DICTIONARY`, `EXISTS ` requête. - -**Note** - -Un utilisateur a le `SHOW` privilège s'il a un autre privilège concernant la table, le dictionnaire ou la base de données spécifiés. - -### KILL QUERY {#grant-kill-query} - -Permet d'effectuer les [KILL](misc.md#kill-query-statement) requêtes correspondant à la hiérarchie de privilèges suivante: - -Le niveau de privilège: `GLOBAL`. - -**Note** - -`KILL QUERY` privilège permet à un utilisateur de tuer les requêtes des autres utilisateurs. - -### ACCESS MANAGEMENT {#grant-access-management} - -Permet à un utilisateur d'effectuer des requêtes qui gèrent les utilisateurs, les rôles et les stratégies de ligne. - -- `ACCESS MANAGEMENT`. Niveau: `GROUP` - - `CREATE USER`. Niveau: `GLOBAL` - - `ALTER USER`. Niveau: `GLOBAL` - - `DROP USER`. Niveau: `GLOBAL` - - `CREATE ROLE`. Niveau: `GLOBAL` - - `ALTER ROLE`. Niveau: `GLOBAL` - - `DROP ROLE`. Niveau: `GLOBAL` - - `ROLE ADMIN`. Niveau: `GLOBAL` - - `CREATE ROW POLICY`. Niveau: `GLOBAL`. Alias: `CREATE POLICY` - - `ALTER ROW POLICY`. Niveau: `GLOBAL`. Alias: `ALTER POLICY` - - `DROP ROW POLICY`. Niveau: `GLOBAL`. Alias: `DROP POLICY` - - `CREATE QUOTA`. Niveau: `GLOBAL` - - `ALTER QUOTA`. Niveau: `GLOBAL` - - `DROP QUOTA`. Niveau: `GLOBAL` - - `CREATE SETTINGS PROFILE`. Niveau: `GLOBAL`. Alias: `CREATE PROFILE` - - `ALTER SETTINGS PROFILE`. Niveau: `GLOBAL`. Alias: `ALTER PROFILE` - - `DROP SETTINGS PROFILE`. Niveau: `GLOBAL`. Alias: `DROP PROFILE` - - `SHOW ACCESS`. Niveau: `GROUP` - - `SHOW_USERS`. Niveau: `GLOBAL`. Alias: `SHOW CREATE USER` - - `SHOW_ROLES`. Niveau: `GLOBAL`. Alias: `SHOW CREATE ROLE` - - `SHOW_ROW_POLICIES`. Niveau: `GLOBAL`. Alias: `SHOW POLICIES`, `SHOW CREATE ROW POLICY`, `SHOW CREATE POLICY` - - `SHOW_QUOTAS`. Niveau: `GLOBAL`. Alias: `SHOW CREATE QUOTA` - - `SHOW_SETTINGS_PROFILES`. Niveau: `GLOBAL`. Alias: `SHOW PROFILES`, `SHOW CREATE SETTINGS PROFILE`, `SHOW CREATE PROFILE` - -Le `ROLE ADMIN` le privilège permet à un utilisateur d'accorder et de révoquer tous les rôles, y compris ceux qui ne lui sont pas accordés avec l'option admin. - -### SYSTEM {#grant-system} - -Permet à un utilisateur d'effectuer la [SYSTEM](system.md) requêtes correspondant à la hiérarchie de privilèges suivante. - -- `SYSTEM`. Niveau: `GROUP` - - `SYSTEM SHUTDOWN`. Niveau: `GLOBAL`. Alias: `SYSTEM KILL`, `SHUTDOWN` - - `SYSTEM DROP CACHE`. Alias: `DROP CACHE` - - `SYSTEM DROP DNS CACHE`. Niveau: `GLOBAL`. Alias: `SYSTEM DROP DNS`, `DROP DNS CACHE`, `DROP DNS` - - `SYSTEM DROP MARK CACHE`. Niveau: `GLOBAL`. Alias: `SYSTEM DROP MARK`, `DROP MARK CACHE`, `DROP MARKS` - - `SYSTEM DROP UNCOMPRESSED CACHE`. Niveau: `GLOBAL`. Alias: `SYSTEM DROP UNCOMPRESSED`, `DROP UNCOMPRESSED CACHE`, `DROP UNCOMPRESSED` - - `SYSTEM RELOAD`. Niveau: `GROUP` - - `SYSTEM RELOAD CONFIG`. Niveau: `GLOBAL`. Alias: `RELOAD CONFIG` - - `SYSTEM RELOAD DICTIONARY`. Niveau: `GLOBAL`. Alias: `SYSTEM RELOAD DICTIONARIES`, `RELOAD DICTIONARY`, `RELOAD DICTIONARIES` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Niveau: `GLOBAL`. Alias: R`ELOAD EMBEDDED DICTIONARIES` - - `SYSTEM MERGES`. Niveau: `TABLE`. Alias: `SYSTEM STOP MERGES`, `SYSTEM START MERGES`, `STOP MERGES`, `START MERGES` - - `SYSTEM TTL MERGES`. Niveau: `TABLE`. 
Alias: `SYSTEM STOP TTL MERGES`, `SYSTEM START TTL MERGES`, `STOP TTL MERGES`, `START TTL MERGES` - - `SYSTEM FETCHES`. Niveau: `TABLE`. Alias: `SYSTEM STOP FETCHES`, `SYSTEM START FETCHES`, `STOP FETCHES`, `START FETCHES` - - `SYSTEM MOVES`. Niveau: `TABLE`. Alias: `SYSTEM STOP MOVES`, `SYSTEM START MOVES`, `STOP MOVES`, `START MOVES` - - `SYSTEM SENDS`. Niveau: `GROUP`. Alias: `SYSTEM STOP SENDS`, `SYSTEM START SENDS`, `STOP SENDS`, `START SENDS` - - `SYSTEM DISTRIBUTED SENDS`. Niveau: `TABLE`. Alias: `SYSTEM STOP DISTRIBUTED SENDS`, `SYSTEM START DISTRIBUTED SENDS`, `STOP DISTRIBUTED SENDS`, `START DISTRIBUTED SENDS` - - `SYSTEM REPLICATED SENDS`. Niveau: `TABLE`. Alias: `SYSTEM STOP REPLICATED SENDS`, `SYSTEM START REPLICATED SENDS`, `STOP REPLICATED SENDS`, `START REPLICATED SENDS` - - `SYSTEM REPLICATION QUEUES`. Niveau: `TABLE`. Alias: `SYSTEM STOP REPLICATION QUEUES`, `SYSTEM START REPLICATION QUEUES`, `STOP REPLICATION QUEUES`, `START REPLICATION QUEUES` - - `SYSTEM SYNC REPLICA`. Niveau: `TABLE`. Alias: `SYNC REPLICA` - - `SYSTEM RESTART REPLICA`. Niveau: `TABLE`. Alias: `RESTART REPLICA` - - `SYSTEM FLUSH`. Niveau: `GROUP` - - `SYSTEM FLUSH DISTRIBUTED`. Niveau: `TABLE`. Alias: `FLUSH DISTRIBUTED` - - `SYSTEM FLUSH LOGS`. Niveau: `GLOBAL`. Alias: `FLUSH LOGS` - -Le `SYSTEM RELOAD EMBEDDED DICTIONARIES` privilège implicitement accordé par le `SYSTEM RELOAD DICTIONARY ON *.*` privilège. - -### INTROSPECTION {#grant-introspection} - -Permet l'utilisation de [introspection](../../operations/optimizing-performance/sampling-query-profiler.md) fonction. - -- `INTROSPECTION`. Niveau: `GROUP`. Alias: `INTROSPECTION FUNCTIONS` - - `addressToLine`. Niveau: `GLOBAL` - - `addressToSymbol`. Niveau: `GLOBAL` - - `demangle`. Niveau: `GLOBAL` - -### SOURCES {#grant-sources} - -Permet d'utiliser des sources de données externes. S'applique à [moteurs de table](../../engines/table-engines/index.md) et [les fonctions de table](../table-functions/index.md#table-functions). - -- `SOURCES`. Niveau: `GROUP` - - `FILE`. Niveau: `GLOBAL` - - `URL`. Niveau: `GLOBAL` - - `REMOTE`. Niveau: `GLOBAL` - - `YSQL`. Niveau: `GLOBAL` - - `ODBC`. Niveau: `GLOBAL` - - `JDBC`. Niveau: `GLOBAL` - - `HDFS`. Niveau: `GLOBAL` - - `S3`. Niveau: `GLOBAL` - -Le `SOURCES` privilège permet l'utilisation de toutes les sources. Vous pouvez également accorder un privilège pour chaque source individuellement. Pour utiliser les sources, vous avez besoin de privilèges supplémentaires. - -Exemple: - -- Pour créer une table avec [Moteur de table MySQL](../../engines/table-engines/integrations/mysql.md), vous avez besoin `CREATE TABLE (ON db.table_name)` et `MYSQL` privilège. -- L'utilisation de la [fonction de table mysql](../table-functions/mysql.md), vous avez besoin `CREATE TEMPORARY TABLE` et `MYSQL` privilège. - -### dictGet {#grant-dictget} - -- `dictGet`. Alias: `dictHas`, `dictGetHierarchy`, `dictIsIn` - -Permet à un utilisateur d'exécuter [dictGet](../functions/ext-dict-functions.md#dictget), [dictHas](../functions/ext-dict-functions.md#dicthas), [dictGetHierarchy](../functions/ext-dict-functions.md#dictgethierarchy), [dictisine](../functions/ext-dict-functions.md#dictisin) fonction. - -Niveau de privilège: `DICTIONARY`. - -**Exemple** - -- `GRANT dictGet ON mydb.mydictionary TO john` -- `GRANT dictGet ON mydictionary TO john` - -### ALL {#grant-all} - -Les subventions de tous les privilèges sur l'entité réglementée à un compte d'utilisateur ou un rôle. - -### NONE {#grant-none} - -N'accorde pas de privilèges. 
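En complément des sections `ALL` et `NONE` ci-dessus, une esquisse purement illustrative (le compte `john` et la base `db` sont des noms hypothétiques):

``` sql
-- Accorde tous les privilèges sur toutes les tables de la base db au compte john.
-- NONE, à l'inverse, n'accorderait aucun privilège.
GRANT ALL ON db.* TO john;
```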
- -### ADMIN OPTION {#admin-option-privilege} - -Le `ADMIN OPTION` le privilège permet à un utilisateur d'accorder son rôle à un autre utilisateur. - -[Article Original](https://clickhouse.tech/docs/en/query_language/grant/) diff --git a/docs/fr/sql-reference/statements/index.md b/docs/fr/sql-reference/statements/index.md deleted file mode 100644 index f08d64cee39..00000000000 --- a/docs/fr/sql-reference/statements/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "D\xE9claration" -toc_priority: 31 ---- - - diff --git a/docs/fr/sql-reference/statements/insert-into.md b/docs/fr/sql-reference/statements/insert-into.md deleted file mode 100644 index 987594bae65..00000000000 --- a/docs/fr/sql-reference/statements/insert-into.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 34 -toc_title: INSERT INTO ---- - -## INSERT {#insert} - -L'ajout de données. - -Format de requête de base: - -``` sql -INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... -``` - -La requête peut spécifier une liste de colonnes à insérer `[(c1, c2, c3)]`. Dans ce cas, le reste des colonnes sont remplis avec: - -- Les valeurs calculées à partir `DEFAULT` expressions spécifiées dans la définition de la table. -- Zéros et chaînes vides, si `DEFAULT` les expressions ne sont pas définies. - -Si [strict_insert_defaults=1](../../operations/settings/settings.md), les colonnes qui n'ont pas `DEFAULT` défini doit être répertorié dans la requête. - -Les données peuvent être transmises à L'INSERT dans n'importe quel [format](../../interfaces/formats.md#formats) soutenu par ClickHouse. Le format doit être spécifié explicitement dans la requête: - -``` sql -INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set -``` - -For example, the following query format is identical to the basic version of INSERT … VALUES: - -``` sql -INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... -``` - -ClickHouse supprime tous les espaces et un saut de ligne (s'il y en a un) avant les données. Lors de la formation d'une requête, nous recommandons de placer les données sur une nouvelle ligne après les opérateurs de requête (ceci est important si les données commencent par des espaces). - -Exemple: - -``` sql -INSERT INTO t FORMAT TabSeparated -11 Hello, world! -22 Qwerty -``` - -Vous pouvez insérer des données séparément de la requête à l'aide du client de ligne de commande ou de L'interface HTTP. Pour plus d'informations, consultez la section “[Interface](../../interfaces/index.md#interfaces)”. - -### Contraintes {#constraints} - -Si la table a [contraintes](create.md#constraints), their expressions will be checked for each row of inserted data. If any of those constraints is not satisfied — server will raise an exception containing constraint name and expression, the query will be stopped. - -### Insertion des résultats de `SELECT` {#insert_query_insert-select} - -``` sql -INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... -``` - -Les colonnes sont mappées en fonction de leur position dans la clause SELECT. Cependant, leurs noms dans L'expression SELECT et la table pour INSERT peuvent différer. Si nécessaire, la coulée de type est effectuée. 
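Esquisse minimale à titre d'illustration (les tables `visits` et `visits_summary` ainsi que leurs colonnes sont hypothétiques); les colonnes sont mappées par position, pas par nom:

``` sql
INSERT INTO visits_summary (day, site, hits)
SELECT toDate(event_time), domain(url), count()  -- 1re expression -> day, 2e -> site, 3e -> hits
FROM visits
GROUP BY toDate(event_time), domain(url);
```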
- -Aucun des formats de données à l'exception des Valeurs permettent de définir des valeurs d'expressions telles que `now()`, `1 + 2` et ainsi de suite. Le format des valeurs permet une utilisation limitée des expressions, mais ce n'est pas recommandé, car dans ce cas, un code inefficace est utilisé pour leur exécution. - -Les autres requêtes de modification des parties de données ne sont pas prises en charge: `UPDATE`, `DELETE`, `REPLACE`, `MERGE`, `UPSERT`, `INSERT UPDATE`. -Cependant, vous pouvez supprimer les anciennes données en utilisant `ALTER TABLE ... DROP PARTITION`. - -`FORMAT` la clause doit être spécifié à la fin de la requête si `SELECT` la clause contient la fonction de table [entrée()](../table-functions/input.md). - -### Considérations De Performance {#performance-considerations} - -`INSERT` trie les données d'entrée par la clé primaire et les divise en partitions par une clé de partition. Si vous insérez des données dans plusieurs partitions à la fois, cela peut réduire considérablement les performances de l' `INSERT` requête. Pour éviter cela: - -- Ajoutez des données en lots assez importants, tels que 100 000 lignes à la fois. -- Groupez les données par une clé de partition avant de les télécharger sur ClickHouse. - -Les performances ne diminueront pas si: - -- Les données sont ajoutées en temps réel. -- Vous téléchargez des données qui sont généralement triées par heure. - -[Article Original](https://clickhouse.tech/docs/en/query_language/insert_into/) diff --git a/docs/fr/sql-reference/statements/misc.md b/docs/fr/sql-reference/statements/misc.md deleted file mode 100644 index 4631f856266..00000000000 --- a/docs/fr/sql-reference/statements/misc.md +++ /dev/null @@ -1,358 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: Autre ---- - -# Diverses Requêtes {#miscellaneous-queries} - -## ATTACH {#attach} - -Cette requête est exactement la même que `CREATE`, mais - -- Au lieu de la parole `CREATE` il utilise le mot `ATTACH`. -- La requête ne crée pas de données sur le disque, mais suppose que les données sont déjà aux endroits appropriés, et ajoute simplement des informations sur la table au serveur. - Après avoir exécuté une requête ATTACH, le serveur connaîtra l'existence de la table. - -Si la table a été précédemment détachée (`DETACH`), ce qui signifie que sa structure est connue, vous pouvez utiliser un raccourci sans définir la structure. - -``` sql -ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] -``` - -Cette requête est utilisée lors du démarrage du serveur. Le serveur stocke les métadonnées de la table sous forme de fichiers avec `ATTACH` requêtes, qu'il exécute simplement au lancement (à l'exception des tables système, qui sont explicitement créées sur le serveur). - -## CHECK TABLE {#check-table} - -Vérifie si les données de la table sont corrompues. - -``` sql -CHECK TABLE [db.]name -``` - -Le `CHECK TABLE` requête compare réelle des tailles de fichier avec les valeurs attendues qui sont stockés sur le serveur. Si le fichier tailles ne correspondent pas aux valeurs stockées, cela signifie que les données sont endommagées. Cela peut être causé, par exemple, par un plantage du système lors de l'exécution de la requête. - -La réponse de la requête contient `result` colonne avec une seule ligne. La ligne a une valeur de -[Booléen](../../sql-reference/data-types/boolean.md) type: - -- 0 - les données de la table sont corrompues. 
-- 1 - les données maintiennent l'intégrité. - -Le `CHECK TABLE` query prend en charge les moteurs de table suivants: - -- [Journal](../../engines/table-engines/log-family/log.md) -- [TinyLog](../../engines/table-engines/log-family/tinylog.md) -- [StripeLog](../../engines/table-engines/log-family/stripelog.md) -- [Famille MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) - -Effectué sur les tables avec un autre moteur de table provoque une exception. - -Les moteurs de la `*Log` la famille ne fournit pas de récupération automatique des données en cas d'échec. L'utilisation de la `CHECK TABLE` requête pour suivre la perte de données en temps opportun. - -Pour `MergeTree` moteurs de la famille, le `CHECK TABLE` query affiche un État de vérification pour chaque partie de données individuelle d'une table sur le serveur local. - -**Si les données sont corrompues** - -Si la table est corrompue, vous pouvez copier les données non corrompues dans une autre table. Pour ce faire: - -1. Créez une nouvelle table avec la même structure que la table endommagée. Pour ce faire exécutez la requête `CREATE TABLE AS `. -2. Définir le [max_threads](../../operations/settings/settings.md#settings-max_threads) la valeur 1 pour traiter la requête suivante dans un seul thread. Pour ce faire, exécutez la requête `SET max_threads = 1`. -3. Exécuter la requête `INSERT INTO SELECT * FROM `. Cette demande copie les données non corrompues de la table endommagée vers une autre table. Seules les données avant la partie corrompue seront copiées. -4. Redémarrez l' `clickhouse-client` pour réinitialiser l' `max_threads` valeur. - -## DESCRIBE TABLE {#misc-describe-table} - -``` sql -DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] -``` - -Renvoie ce qui suit `String` les colonnes de type: - -- `name` — Column name. -- `type`— Column type. -- `default_type` — Clause that is used in [expression par défaut](create.md#create-default-values) (`DEFAULT`, `MATERIALIZED` ou `ALIAS`). Column contient une chaîne vide, si l'expression par défaut n'est pas spécifiée. -- `default_expression` — Value specified in the `DEFAULT` clause. -- `comment_expression` — Comment text. - -Les structures de données imbriquées sont sorties dans “expanded” format. Chaque colonne est affichée séparément, avec le nom après un point. - -## DETACH {#detach} - -Supprime les informations sur le ‘name’ table du serveur. Le serveur cesse de connaître l'existence de la table. - -``` sql -DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] -``` - -Cela ne supprime pas les données ou les métadonnées de la table. Lors du prochain lancement du serveur, le serveur Lira les métadonnées et découvrira à nouveau la table. -De même, un “detached” tableau peut être re-attaché en utilisant le `ATTACH` requête (à l'exception des tables système, qui n'ont pas de stocker les métadonnées pour eux). - -Il n'y a pas de `DETACH DATABASE` requête. - -## DROP {#drop} - -Cette requête a deux types: `DROP DATABASE` et `DROP TABLE`. - -``` sql -DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] -``` - -Supprime toutes les tables à l'intérieur de la ‘db’ la base de données, puis supprime le ‘db’ la base de données elle-même. -Si `IF EXISTS` est spécifié, il ne renvoie pas d'erreur si la base de données n'existe pas. - -``` sql -DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] -``` - -Supprime la table. -Si `IF EXISTS` est spécifié, il ne renvoie pas d'erreur si la table n'existe pas ou si la base de données n'existe pas. 
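Par exemple (noms de base et de table purement illustratifs):

``` sql
DROP DATABASE IF EXISTS staging_old;
DROP TABLE IF EXISTS mydb.visits_tmp;
```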
- - DROP DICTIONARY [IF EXISTS] [db.]name - -Delets le dictionnaire. -Si `IF EXISTS` est spécifié, il ne renvoie pas d'erreur si la table n'existe pas ou si la base de données n'existe pas. - -## DROP USER {#drop-user-statement} - -Supprime un utilisateur. - -### Syntaxe {#drop-user-syntax} - -``` sql -DROP USER [IF EXISTS] name [,...] [ON CLUSTER cluster_name] -``` - -## DROP ROLE {#drop-role-statement} - -Supprime un rôle. - -Le rôle supprimé est révoqué de toutes les entités où il a été accordé. - -### Syntaxe {#drop-role-syntax} - -``` sql -DROP ROLE [IF EXISTS] name [,...] [ON CLUSTER cluster_name] -``` - -## DROP ROW POLICY {#drop-row-policy-statement} - -Supprime une stratégie de ligne. - -La stratégie de ligne supprimée est révoquée de toutes les entités sur lesquelles elle a été affectée. - -### Syntaxe {#drop-row-policy-syntax} - -``` sql -DROP [ROW] POLICY [IF EXISTS] name [,...] ON [database.]table [,...] [ON CLUSTER cluster_name] -``` - -## DROP QUOTA {#drop-quota-statement} - -Supprime un quota. - -Le quota supprimé est révoqué de toutes les entités où il a été affecté. - -### Syntaxe {#drop-quota-syntax} - -``` sql -DROP QUOTA [IF EXISTS] name [,...] [ON CLUSTER cluster_name] -``` - -## DROP SETTINGS PROFILE {#drop-settings-profile-statement} - -Supprime un quota. - -Le quota supprimé est révoqué de toutes les entités où il a été affecté. - -### Syntaxe {#drop-settings-profile-syntax} - -``` sql -DROP [SETTINGS] PROFILE [IF EXISTS] name [,...] [ON CLUSTER cluster_name] -``` - -## EXISTS {#exists-statement} - -``` sql -EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format] -``` - -Renvoie un seul `UInt8`- type colonne, qui contient la valeur unique `0` si la table ou base de données n'existe pas, ou `1` si la table existe dans la base de données spécifiée. - -## KILL QUERY {#kill-query-statement} - -``` sql -KILL QUERY [ON CLUSTER cluster] - WHERE - [SYNC|ASYNC|TEST] - [FORMAT format] -``` - -Tente de mettre fin de force aux requêtes en cours d'exécution. -Les requêtes à terminer sont sélectionnées dans le système.processus en utilisant les critères définis dans le `WHERE` la clause de la `KILL` requête. - -Exemple: - -``` sql --- Forcibly terminates all queries with the specified query_id: -KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' - --- Synchronously terminates all queries run by 'username': -KILL QUERY WHERE user='username' SYNC -``` - -Les utilisateurs en lecture seule peuvent uniquement arrêter leurs propres requêtes. - -Par défaut, la version asynchrone des requêtes est utilisé (`ASYNC`), qui n'attend pas la confirmation que les requêtes se sont arrêtées. - -La version synchrone (`SYNC`) attend que toutes les requêtes d'arrêter et affiche des informations sur chaque processus s'arrête. -La réponse contient l' `kill_status` la colonne, qui peut prendre les valeurs suivantes: - -1. ‘finished’ – The query was terminated successfully. -2. ‘waiting’ – Waiting for the query to end after sending it a signal to terminate. -3. The other values ​​explain why the query can't be stopped. - -Une requête de test (`TEST`) vérifie uniquement les droits de l'utilisateur et affiche une liste de requêtes à arrêter. - -## KILL MUTATION {#kill-mutation} - -``` sql -KILL MUTATION [ON CLUSTER cluster] - WHERE - [TEST] - [FORMAT format] -``` - -Essaie d'annuler et supprimer [mutation](alter.md#alter-mutations) actuellement en cours d'exécution. 
Les Mutations à annuler sont sélectionnées parmi [`system.mutations`](../../operations/system-tables.md#system_tables-mutations) tableau à l'aide du filtre spécifié par le `WHERE` la clause de la `KILL` requête. - -Une requête de test (`TEST`) vérifie uniquement les droits de l'utilisateur et affiche une liste de requêtes à arrêter. - -Exemple: - -``` sql --- Cancel and remove all mutations of the single table: -KILL MUTATION WHERE database = 'default' AND table = 'table' - --- Cancel the specific mutation: -KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt' -``` - -The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table). - -Les modifications déjà apportées par la mutation ne sont pas annulées. - -## OPTIMIZE {#misc_operations-optimize} - -``` sql -OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] -``` - -Cette requête tente d'initialiser une fusion non programmée de parties de données pour les tables avec un moteur de [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) famille. - -Le `OPTMIZE` la requête est également prise en charge pour [MaterializedView](../../engines/table-engines/special/materializedview.md) et la [Tampon](../../engines/table-engines/special/buffer.md) moteur. Les autres moteurs de table ne sont pas pris en charge. - -Lorsque `OPTIMIZE` est utilisé avec le [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md) famille de moteurs de table, ClickHouse crée une tâche pour la fusion et attend l'exécution sur tous les nœuds (si le `replication_alter_partitions_sync` paramètre est activé). - -- Si `OPTIMIZE` n'effectue pas de fusion pour une raison quelconque, il ne notifie pas le client. Pour activer les notifications, utilisez [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) paramètre. -- Si vous spécifiez un `PARTITION`, seule la partition spécifiée est optimisé. [Comment définir l'expression de la partition](alter.md#alter-how-to-specify-part-expr). -- Si vous spécifiez `FINAL`, l'optimisation est effectuée, même lorsque toutes les données sont déjà dans une partie. -- Si vous spécifiez `DEDUPLICATE`, alors des lignes complètement identiques seront dédupliquées (toutes les colonnes sont comparées), cela n'a de sens que pour le moteur MergeTree. - -!!! warning "Avertissement" - `OPTIMIZE` ne peut pas réparer le “Too many parts” erreur. - -## RENAME {#misc_operations-rename} - -Renomme une ou plusieurs tables. - -``` sql -RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... [ON CLUSTER cluster] -``` - -Toutes les tables sont renommées sous verrouillage global. Renommer des tables est une opération légère. Si vous avez indiqué une autre base de données après TO, la table sera déplacée vers cette base de données. Cependant, les répertoires contenant des bases de données doivent résider dans le même système de fichiers (sinon, une erreur est renvoyée). - -## SET {#query-set} - -``` sql -SET param = value -``` - -Assigner `value` à l' `param` [paramètre](../../operations/settings/index.md) pour la session en cours. Vous ne pouvez pas modifier [les paramètres du serveur](../../operations/server-configuration-parameters/index.md) de cette façon. 
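Par exemple, une esquisse minimale qui limite le nombre de threads pour la session en cours (la valeur est donnée à titre d'illustration):

``` sql
SET max_threads = 4;
```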
- -Vous pouvez également définir toutes les valeurs de certains paramètres de profil dans une seule requête. - -``` sql -SET profile = 'profile-name-from-the-settings-file' -``` - -Pour plus d'informations, voir [Paramètre](../../operations/settings/settings.md). - -## SET ROLE {#set-role-statement} - -Active les rôles pour l'utilisateur actuel. - -### Syntaxe {#set-role-syntax} - -``` sql -SET ROLE {DEFAULT | NONE | role [,...] | ALL | ALL EXCEPT role [,...]} -``` - -## SET DEFAULT ROLE {#set-default-role-statement} - -Définit les rôles par défaut à un utilisateur. - -Les rôles par défaut sont automatiquement activés lors de la connexion de l'utilisateur. Vous pouvez définir par défaut uniquement les rôles précédemment accordés. Si le rôle n'est pas accordé à un utilisateur, ClickHouse lève une exception. - -### Syntaxe {#set-default-role-syntax} - -``` sql -SET DEFAULT ROLE {NONE | role [,...] | ALL | ALL EXCEPT role [,...]} TO {user|CURRENT_USER} [,...] -``` - -### Exemple {#set-default-role-examples} - -Définir plusieurs rôles par défaut à un utilisateur: - -``` sql -SET DEFAULT ROLE role1, role2, ... TO user -``` - -Définissez tous les rôles accordés par défaut sur un utilisateur: - -``` sql -SET DEFAULT ROLE ALL TO user -``` - -Purger les rôles par défaut d'un utilisateur: - -``` sql -SET DEFAULT ROLE NONE TO user -``` - -Définissez tous les rôles accordés par défaut à l'exception de certains d'entre eux: - -``` sql -SET DEFAULT ROLE ALL EXCEPT role1, role2 TO user -``` - -## TRUNCATE {#truncate-statement} - -``` sql -TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] -``` - -Supprime toutes les données d'une table. Lorsque la clause `IF EXISTS` est omis, la requête renvoie une erreur si la table n'existe pas. - -Le `TRUNCATE` la requête n'est pas prise en charge pour [Vue](../../engines/table-engines/special/view.md), [Fichier](../../engines/table-engines/special/file.md), [URL](../../engines/table-engines/special/url.md) et [NULL](../../engines/table-engines/special/null.md) table des moteurs. - -## USE {#use} - -``` sql -USE db -``` - -Vous permet de définir la base de données actuelle pour la session. -La base de données actuelle est utilisée pour rechercher des tables si la base de données n'est pas explicitement définie dans la requête avec un point avant le nom de la table. -Cette requête ne peut pas être faite lors de l'utilisation du protocole HTTP, car il n'y a pas de concept de session. - -[Article Original](https://clickhouse.tech/docs/en/query_language/misc/) diff --git a/docs/fr/sql-reference/statements/revoke.md b/docs/fr/sql-reference/statements/revoke.md deleted file mode 100644 index 6137cc30f8c..00000000000 --- a/docs/fr/sql-reference/statements/revoke.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: REVOKE ---- - -# REVOKE {#revoke} - -Révoque les privilèges des utilisateurs ou rôles. - -## Syntaxe {#revoke-syntax} - -**Révocation des privilèges des utilisateurs** - -``` sql -REVOKE [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.table|db.*|*.*|table|*} FROM {user | CURRENT_USER} [,...] | ALL | ALL EXCEPT {user | CURRENT_USER} [,...] -``` - -**Révocation des rôles des utilisateurs** - -``` sql -REVOKE [ON CLUSTER cluster_name] [ADMIN OPTION FOR] role [,...] FROM {user | role | CURRENT_USER} [,...] | ALL | ALL EXCEPT {user_name | role_name | CURRENT_USER} [,...] 
-``` - -## Description {#revoke-description} - -Pour révoquer certains privilèges, vous pouvez utiliser un privilège de portée plus large que vous envisagez de révoquer. Par exemple, si un utilisateur a la `SELECT (x,y)` privilège, administrateur peut effectuer `REVOKE SELECT(x,y) ...`, ou `REVOKE SELECT * ...` ou même `REVOKE ALL PRIVILEGES ...` requête de révoquer ce privilège. - -### Révocations Partielles {#partial-revokes-dscr} - -Vous pouvez révoquer une partie d'un privilège. Par exemple, si un utilisateur a la `SELECT *.*` Privilège vous pouvez révoquer un privilège pour lire les données d'une table ou d'une base de données. - -## Exemple {#revoke-example} - -Subvention de l' `john` compte utilisateur avec le privilège de sélectionner parmi toutes les bases de données `accounts` un: - -``` sql -GRANT SELECT ON *.* TO john; -REVOKE SELECT ON accounts.* FROM john; -``` - -Subvention de l' `mira` compte utilisateur avec le privilège de sélectionner parmi toutes les colonnes `accounts.staff` tableau à l'exception de la `wage` un. - -``` sql -GRANT SELECT ON accounts.staff TO mira; -REVOKE SELECT(wage) ON accounts.staff FROM mira; -``` - -{## [Article Original](https://clickhouse.tech/docs/en/operations/settings/settings/) ##} diff --git a/docs/fr/sql-reference/statements/select/array-join.md b/docs/fr/sql-reference/statements/select/array-join.md deleted file mode 100644 index 07b27d5d16c..00000000000 --- a/docs/fr/sql-reference/statements/select/array-join.md +++ /dev/null @@ -1,282 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause de jointure de tableau {#select-array-join-clause} - -C'est une opération courante pour les tables qui contiennent une colonne de tableau pour produire une nouvelle table qui a une colonne avec chaque élément de tableau individuel de cette colonne initiale, tandis que les valeurs des autres colonnes sont dupliquées. C'est le cas de fond de ce `ARRAY JOIN` la clause le fait. - -Son nom vient du fait qu'il peut être regardé comme l'exécution de `JOIN` avec un tableau ou une structure de données imbriquée. L'intention est similaire à la [arrayJoin](../../functions/array-join.md#functions_arrayjoin) fonction, mais la fonctionnalité de la clause est plus large. - -Syntaxe: - -``` sql -SELECT -FROM -[LEFT] ARRAY JOIN -[WHERE|PREWHERE ] -... -``` - -Vous ne pouvez en spécifier qu'un `ARRAY JOIN` la clause dans un `SELECT` requête. - -Types pris en charge de `ARRAY JOIN` sont énumérés ci-dessous: - -- `ARRAY JOIN` - Dans le cas de base, les tableaux vides ne sont pas inclus dans le résultat de `JOIN`. -- `LEFT ARRAY JOIN` - Le résultat de `JOIN` contient des lignes avec des tableaux vides. La valeur d'un tableau vide est définie sur la valeur par défaut pour le type d'élément de tableau (généralement 0, chaîne vide ou NULL). - -## Exemples de jointure de tableau de base {#basic-array-join-examples} - -Les exemples ci-dessous illustrent l'utilisation de la `ARRAY JOIN` et `LEFT ARRAY JOIN` clause. 
Créons une table avec un [Tableau](../../../sql-reference/data-types/array.md) tapez colonne et insérez des valeurs dedans: - -``` sql -CREATE TABLE arrays_test -( - s String, - arr Array(UInt8) -) ENGINE = Memory; - -INSERT INTO arrays_test -VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); -``` - -``` text -┌─s───────────┬─arr─────┐ -│ Hello │ [1,2] │ -│ World │ [3,4,5] │ -│ Goodbye │ [] │ -└─────────────┴─────────┘ -``` - -L'exemple ci-dessous utilise la `ARRAY JOIN` clause: - -``` sql -SELECT s, arr -FROM arrays_test -ARRAY JOIN arr; -``` - -``` text -┌─s─────┬─arr─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ World │ 3 │ -│ World │ 4 │ -│ World │ 5 │ -└───────┴─────┘ -``` - -L'exemple suivant utilise l' `LEFT ARRAY JOIN` clause: - -``` sql -SELECT s, arr -FROM arrays_test -LEFT ARRAY JOIN arr; -``` - -``` text -┌─s───────────┬─arr─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ World │ 3 │ -│ World │ 4 │ -│ World │ 5 │ -│ Goodbye │ 0 │ -└─────────────┴─────┘ -``` - -## À L'Aide D'Alias {#using-aliases} - -Un alias peut être spécifié pour un tableau `ARRAY JOIN` clause. Dans ce cas, un élément de tableau peut être consulté par ce pseudonyme, mais le tableau lui-même est accessible par le nom d'origine. Exemple: - -``` sql -SELECT s, arr, a -FROM arrays_test -ARRAY JOIN arr AS a; -``` - -``` text -┌─s─────┬─arr─────┬─a─┐ -│ Hello │ [1,2] │ 1 │ -│ Hello │ [1,2] │ 2 │ -│ World │ [3,4,5] │ 3 │ -│ World │ [3,4,5] │ 4 │ -│ World │ [3,4,5] │ 5 │ -└───────┴─────────┴───┘ -``` - -En utilisant des alias, vous pouvez effectuer `ARRAY JOIN` avec un groupe externe. Exemple: - -``` sql -SELECT s, arr_external -FROM arrays_test -ARRAY JOIN [1, 2, 3] AS arr_external; -``` - -``` text -┌─s───────────┬─arr_external─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ Hello │ 3 │ -│ World │ 1 │ -│ World │ 2 │ -│ World │ 3 │ -│ Goodbye │ 1 │ -│ Goodbye │ 2 │ -│ Goodbye │ 3 │ -└─────────────┴──────────────┘ -``` - -Plusieurs tableaux peuvent être séparés par des virgules `ARRAY JOIN` clause. Dans ce cas, `JOIN` est effectuée avec eux simultanément (la somme directe, pas le produit cartésien). Notez que tous les tableaux doivent avoir la même taille. 
Exemple: - -``` sql -SELECT s, arr, a, num, mapped -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; -``` - -``` text -┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ -│ Hello │ [1,2] │ 1 │ 1 │ 2 │ -│ Hello │ [1,2] │ 2 │ 2 │ 3 │ -│ World │ [3,4,5] │ 3 │ 1 │ 4 │ -│ World │ [3,4,5] │ 4 │ 2 │ 5 │ -│ World │ [3,4,5] │ 5 │ 3 │ 6 │ -└───────┴─────────┴───┴─────┴────────┘ -``` - -L'exemple ci-dessous utilise la [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) fonction: - -``` sql -SELECT s, arr, a, num, arrayEnumerate(arr) -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; -``` - -``` text -┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ -│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ -│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ -│ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ -│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ -│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ -└───────┴─────────┴───┴─────┴─────────────────────┘ -``` - -## Jointure de tableau avec la Structure de données imbriquée {#array-join-with-nested-data-structure} - -`ARRAY JOIN` fonctionne également avec [structures de données imbriquées](../../../sql-reference/data-types/nested-data-structures/nested.md): - -``` sql -CREATE TABLE nested_test -( - s String, - nest Nested( - x UInt8, - y UInt32) -) ENGINE = Memory; - -INSERT INTO nested_test -VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); -``` - -``` text -┌─s───────┬─nest.x──┬─nest.y─────┐ -│ Hello │ [1,2] │ [10,20] │ -│ World │ [3,4,5] │ [30,40,50] │ -│ Goodbye │ [] │ [] │ -└─────────┴─────────┴────────────┘ -``` - -``` sql -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest; -``` - -``` text -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ -``` - -Lorsque vous spécifiez des noms de structures de données imbriquées dans `ARRAY JOIN` le sens est le même que `ARRAY JOIN` avec tous les éléments du tableau qui la compose. Des exemples sont énumérés ci-dessous: - -``` sql -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x`, `nest.y`; -``` - -``` text -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ -``` - -Cette variation a également du sens: - -``` sql -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x`; -``` - -``` text -┌─s─────┬─nest.x─┬─nest.y─────┐ -│ Hello │ 1 │ [10,20] │ -│ Hello │ 2 │ [10,20] │ -│ World │ 3 │ [30,40,50] │ -│ World │ 4 │ [30,40,50] │ -│ World │ 5 │ [30,40,50] │ -└───────┴────────┴────────────┘ -``` - -Un alias peut être utilisé pour une structure de données imbriquée, afin de sélectionner `JOIN` le résultat ou le tableau source. 
Exemple: - -``` sql -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest AS n; -``` - -``` text -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ -└───────┴─────┴─────┴─────────┴────────────┘ -``` - -Exemple d'utilisation de l' [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) fonction: - -``` sql -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num -FROM nested_test -ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; -``` - -``` text -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ -└───────┴─────┴─────┴─────────┴────────────┴─────┘ -``` - -## Détails De Mise En Œuvre {#implementation-details} - -L'ordre d'exécution de la requête est optimisé lors de l'exécution `ARRAY JOIN`. Bien `ARRAY JOIN` doit toujours être spécifié avant l' [WHERE](where.md)/[PREWHERE](prewhere.md) dans une requête, techniquement, ils peuvent être exécutés dans n'importe quel ordre, sauf résultat de `ARRAY JOIN` est utilisé pour le filtrage. L'ordre de traitement est contrôlée par l'optimiseur de requête. diff --git a/docs/fr/sql-reference/statements/select/distinct.md b/docs/fr/sql-reference/statements/select/distinct.md deleted file mode 100644 index 94552018c98..00000000000 --- a/docs/fr/sql-reference/statements/select/distinct.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# La Clause DISTINCT {#select-distinct} - -Si `SELECT DISTINCT` est spécifié, seules les lignes uniques restera un résultat de requête. Ainsi, une seule ligne restera hors de tous les ensembles de lignes entièrement correspondantes dans le résultat. - -## Le Traitement Null {#null-processing} - -`DISTINCT` fonctionne avec [NULL](../../syntax.md#null-literal) comme si `NULL` ont une valeur spécifique, et `NULL==NULL`. En d'autres termes, dans le `DISTINCT` résultats, différentes combinaisons avec `NULL` une fois seulement. Elle diffère de `NULL` traitement dans la plupart des autres contextes. - -## Alternative {#alternatives} - -Il est possible d'obtenir le même résultat en appliquant [GROUP BY](group-by.md) sur le même ensemble de valeurs, comme spécifié comme `SELECT` clause, sans utiliser de fonctions d'agrégation. Mais il y a peu de différences de `GROUP BY` approche: - -- `DISTINCT` peut être utilisé avec d' `GROUP BY`. -- Lorsque [ORDER BY](order-by.md) est omis et [LIMIT](limit.md) est définie, la requête s'arrête immédiatement après le nombre de lignes différentes, a été lu. -- Les blocs de données sont produits au fur et à mesure qu'ils sont traités, sans attendre que la requête entière se termine. - -## Limitation {#limitations} - -`DISTINCT` n'est pas pris en charge si `SELECT` a au moins une colonne de tableau. - -## Exemple {#examples} - -Clickhouse prend en charge l'utilisation du `DISTINCT` et `ORDER BY` clauses pour différentes colonnes dans une requête. Le `DISTINCT` la clause est exécutée avant la `ORDER BY` clause. 
- -Exemple de table: - -``` text -┌─a─┬─b─┐ -│ 2 │ 1 │ -│ 1 │ 2 │ -│ 3 │ 3 │ -│ 2 │ 4 │ -└───┴───┘ -``` - -Lors de la sélection de données avec le `SELECT DISTINCT a FROM t1 ORDER BY b ASC` requête, nous obtenons le résultat suivant: - -``` text -┌─a─┐ -│ 2 │ -│ 1 │ -│ 3 │ -└───┘ -``` - -Si nous changeons la direction de tri `SELECT DISTINCT a FROM t1 ORDER BY b DESC`, nous obtenons le résultat suivant: - -``` text -┌─a─┐ -│ 3 │ -│ 1 │ -│ 2 │ -└───┘ -``` - -Rangée `2, 4` a été coupé avant de les trier. - -Prenez en compte cette spécificité d'implémentation lors de la programmation des requêtes. diff --git a/docs/fr/sql-reference/statements/select/format.md b/docs/fr/sql-reference/statements/select/format.md deleted file mode 100644 index a88bb7831ba..00000000000 --- a/docs/fr/sql-reference/statements/select/format.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# FORMAT de la Clause {#format-clause} - -Clickhouse prend en charge une large gamme de [formats de sérialisation](../../../interfaces/formats.md) qui peut être utilisé sur les résultats de la requête entre autres choses. Il existe plusieurs façons de choisir un format pour `SELECT` de sortie, l'un d'eux est de spécifier `FORMAT format` à la fin de la requête pour obtenir les données résultantes dans tout format spécifique. - -Un format spécifique peut être utilisé pour des raisons de commodité, d'intégration avec d'autres systèmes ou d'amélioration des performances. - -## Format Par Défaut {#default-format} - -Si l' `FORMAT` la clause est omise, le format par défaut est utilisé, ce qui dépend à la fois des paramètres et de l'interface utilisée pour accéder au serveur ClickHouse. Pour l' [Interface HTTP](../../../interfaces/http.md) et la [client de ligne de commande](../../../interfaces/cli.md) en mode batch, le format par défaut est `TabSeparated`. Pour le client de ligne de commande en mode interactif, le format par défaut est `PrettyCompact` (il produit des tables compactes lisibles par l'homme). - -## Détails De Mise En Œuvre {#implementation-details} - -Lors de l'utilisation du client de ligne de commande, les données sont toujours transmises sur le réseau dans un format efficace interne (`Native`). Le client interprète indépendamment le `FORMAT` clause de la requête et formate les données elles-mêmes (soulageant ainsi le réseau et le serveur de la charge supplémentaire). diff --git a/docs/fr/sql-reference/statements/select/from.md b/docs/fr/sql-reference/statements/select/from.md deleted file mode 100644 index 964ffdd13fb..00000000000 --- a/docs/fr/sql-reference/statements/select/from.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# De la Clause {#select-from} - -Le `FROM` clause spécifie la source à partir de laquelle lire les données: - -- [Table](../../../engines/table-engines/index.md) -- [Sous-requête](index.md) {##TODO: meilleur lien ##} -- [Fonction de Table](../../table-functions/index.md#table-functions) - -[JOIN](join.md) et [ARRAY JOIN](array-join.md) les clauses peuvent également être utilisées pour étendre la fonctionnalité de la `FROM` clause. - -Subquery est un autre `SELECT` requête qui peut être spécifié entre parenthèses à l'intérieur `FROM` clause. - -`FROM` la clause peut contenir plusieurs sources de données, séparées par des virgules, ce qui équivaut à effectuer [CROSS JOIN](join.md) sur eux. 
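Esquisse illustrative des différentes sources (la fonction de table `numbers` fait partie de ClickHouse; les alias `a` et `b` sont arbitraires):

``` sql
-- Sous-requête
SELECT count() FROM (SELECT number FROM numbers(10) WHERE number % 2 = 0);

-- Fonction de table
SELECT * FROM numbers(3);

-- Plusieurs sources séparées par des virgules (équivalent à un CROSS JOIN)
SELECT a.number, b.number FROM numbers(2) AS a, numbers(2) AS b;
```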
- -## Modificateur FINAL {#select-from-final} - -Lorsque `FINAL` est spécifié, ClickHouse fusionne complètement les données avant de renvoyer le résultat et effectue ainsi toutes les transformations de données qui se produisent lors des fusions pour le moteur de table donné. - -Il est applicable lors de la sélection de données à partir de tables qui utilisent [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md)-la famille de moteurs (à l'exception de `GraphiteMergeTree`). Également pris en charge pour: - -- [Répliqué](../../../engines/table-engines/mergetree-family/replication.md) les versions de `MergeTree` moteur. -- [Vue](../../../engines/table-engines/special/view.md), [Tampon](../../../engines/table-engines/special/buffer.md), [Distribué](../../../engines/table-engines/special/distributed.md), et [MaterializedView](../../../engines/table-engines/special/materializedview.md) moteurs qui fonctionnent sur d'autres moteurs, à condition qu'ils aient été créés sur `MergeTree`-tables de moteur. - -### Inconvénient {#drawbacks} - -Requêtes qui utilisent `FINAL` sont exécutés pas aussi vite que les requêtes similaires qui ne le font pas, car: - -- La requête est exécutée dans un seul thread et les données sont fusionnées lors de l'exécution de la requête. -- Les requêtes avec `FINAL` lire les colonnes de clé primaire en plus des colonnes spécifiées dans la requête. - -**Dans la plupart des cas, évitez d'utiliser `FINAL`.** L'approche commune consiste à utiliser différentes requêtes qui supposent les processus d'arrière-plan du `MergeTree` le moteur n'est pas encore arrivé et y faire face en appliquant l'agrégation (par exemple, pour éliminer les doublons). {##TODO: exemples ##} - -## Détails De Mise En Œuvre {#implementation-details} - -Si l' `FROM` la clause est omise, les données seront lues à partir `system.one` table. -Le `system.one` table contient exactement une ligne (cette table remplit le même but que la table double trouvée dans d'autres SGBD). - -Pour exécuter une requête, toutes les colonnes mentionnées dans la requête sont extraites de la table appropriée. Toutes les colonnes non nécessaires pour la requête externe sont rejetées des sous-requêtes. -Si une requête ne répertorie aucune colonne (par exemple, `SELECT count() FROM t`), une colonne est extraite de la table de toute façon (la plus petite est préférée), afin de calculer le nombre de lignes. diff --git a/docs/fr/sql-reference/statements/select/group-by.md b/docs/fr/sql-reference/statements/select/group-by.md deleted file mode 100644 index 9d1b5c276d5..00000000000 --- a/docs/fr/sql-reference/statements/select/group-by.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause GROUP BY {#select-group-by-clause} - -`GROUP BY` la clause change le `SELECT` requête dans un mode d'agrégation, qui fonctionne comme suit: - -- `GROUP BY` clause contient une liste des expressions (ou une seule expression, qui est considéré comme la liste de longueur). Cette liste agit comme un “grouping key”, tandis que chaque expression individuelle sera appelée “key expressions”. -- Toutes les expressions dans le [SELECT](index.md), [HAVING](having.md), et [ORDER BY](order-by.md) clause **devoir** être calculé sur la base d'expressions clés **ou** sur [les fonctions d'agrégation](../../../sql-reference/aggregate-functions/index.md) sur les expressions non-clés (y compris les colonnes simples). 
En d'autres termes, chaque colonne sélectionnée dans la table doit être utilisée soit dans une expression de clé, soit dans une fonction d'agrégat, mais pas les deux. -- Résultat de l'agrégation de `SELECT` la requête contiendra autant de lignes qu'il y avait des valeurs uniques de “grouping key” dans la table source. Habituellement, cela réduit considérablement le nombre de lignes, souvent par ordre de grandeur, mais pas nécessairement: le nombre de lignes reste le même si tous “grouping key” les valeurs sont distinctes. - -!!! note "Note" - Il existe un moyen supplémentaire d'exécuter l'agrégation sur une table. Si une requête ne contient que des colonnes de table à l'intérieur des fonctions `GROUP BY clause` peut être omis, et l'agrégation par un ensemble vide de touches est supposé. Ces interrogations renvoient toujours exactement une ligne. - -## Le Traitement NULL {#null-processing} - -Pour le regroupement, ClickHouse interprète [NULL](../../syntax.md#null-literal) comme une valeur, et `NULL==NULL`. Elle diffère de `NULL` traitement dans la plupart des autres contextes. - -Voici un exemple pour montrer ce que cela signifie. - -Supposons que vous avez cette table: - -``` text -┌─x─┬────y─┐ -│ 1 │ 2 │ -│ 2 │ ᴺᵁᴸᴸ │ -│ 3 │ 2 │ -│ 3 │ 3 │ -│ 3 │ ᴺᵁᴸᴸ │ -└───┴──────┘ -``` - -Requête `SELECT sum(x), y FROM t_null_big GROUP BY y` résultats dans: - -``` text -┌─sum(x)─┬────y─┐ -│ 4 │ 2 │ -│ 3 │ 3 │ -│ 5 │ ᴺᵁᴸᴸ │ -└────────┴──────┘ -``` - -Vous pouvez voir que `GROUP BY` pour `y = NULL` résumer `x` comme si `NULL` a cette valeur. - -Si vous passez plusieurs clés `GROUP BY` le résultat vous donnera toutes les combinaisons de la sélection, comme si `NULL` ont une valeur spécifique. - -## Avec modificateur de totaux {#with-totals-modifier} - -Si l' `WITH TOTALS` modificateur est spécifié, une autre ligne sera calculée. Cette ligne aura des colonnes clés contenant des valeurs par défaut (zéros ou lignes vides), et des colonnes de fonctions d'agrégat avec les valeurs calculées sur toutes les lignes (le “total” valeur). - -Cette ligne supplémentaire est uniquement produite en `JSON*`, `TabSeparated*`, et `Pretty*` formats, séparément des autres lignes: - -- Dans `JSON*` formats, cette ligne est sortie en tant que distinct ‘totals’ champ. -- Dans `TabSeparated*` formats, la ligne vient après le résultat principal, précédé par une ligne vide (après les autres données). -- Dans `Pretty*` formats, la ligne est sortie comme une table séparée après le résultat principal. -- Dans les autres formats, il n'est pas disponible. - -`WITH TOTALS` peut être exécuté de différentes manières lorsqu'il est présent. Le comportement dépend de l' ‘totals_mode’ paramètre. - -### Configuration Du Traitement Des Totaux {#configuring-totals-processing} - -Par défaut, `totals_mode = 'before_having'`. Dans ce cas, ‘totals’ est calculé sur toutes les lignes, y compris celles qui ne passent pas par `max_rows_to_group_by`. - -Les autres alternatives incluent uniquement les lignes qui passent à travers avoir dans ‘totals’, et se comporter différemment avec le réglage `max_rows_to_group_by` et `group_by_overflow_mode = 'any'`. - -`after_having_exclusive` – Don't include rows that didn't pass through `max_rows_to_group_by`. En d'autres termes, ‘totals’ aura moins ou le même nombre de lignes que si `max_rows_to_group_by` ont été omis. - -`after_having_inclusive` – Include all the rows that didn't pass through ‘max_rows_to_group_by’ dans ‘totals’. 
En d'autres termes, ‘totals’ aura plus ou le même nombre de lignes que si `max_rows_to_group_by` ont été omis. - -`after_having_auto` – Count the number of rows that passed through HAVING. If it is more than a certain amount (by default, 50%), include all the rows that didn't pass through ‘max_rows_to_group_by’ dans ‘totals’. Sinon, ne pas les inclure. - -`totals_auto_threshold` – By default, 0.5. The coefficient for `after_having_auto`. - -Si `max_rows_to_group_by` et `group_by_overflow_mode = 'any'` ne sont pas utilisés, toutes les variations de `after_having` sont les mêmes, et vous pouvez utiliser l'un d'eux (par exemple, `after_having_auto`). - -Vous pouvez utiliser avec les totaux dans les sous-requêtes, y compris les sous-requêtes dans la clause JOIN (dans ce cas, les valeurs totales respectives sont combinées). - -## Exemple {#examples} - -Exemple: - -``` sql -SELECT - count(), - median(FetchTiming > 60 ? 60 : FetchTiming), - count() - sum(Refresh) -FROM hits -``` - -Cependant, contrairement au SQL standard, si la table n'a pas de lignes (soit il n'y en a pas du tout, soit il n'y en a pas après avoir utilisé WHERE to filter), un résultat vide est renvoyé, et non le résultat d'une des lignes contenant les valeurs initiales des fonctions d'agrégat. - -Contrairement à MySQL (et conforme à SQL standard), vous ne pouvez pas obtenir une valeur d'une colonne qui n'est pas dans une fonction clé ou agrégée (sauf les expressions constantes). Pour contourner ce problème, vous pouvez utiliser le ‘any’ fonction d'agrégation (récupère la première valeur rencontrée) ou ‘min/max’. - -Exemple: - -``` sql -SELECT - domainWithoutWWW(URL) AS domain, - count(), - any(Title) AS title -- getting the first occurred page header for each domain. -FROM hits -GROUP BY domain -``` - -Pour chaque valeur de clé différente rencontrée, GROUP BY calcule un ensemble de valeurs de fonction d'agrégation. - -GROUP BY n'est pas pris en charge pour les colonnes de tableau. - -Une constante ne peut pas être spécifiée comme arguments pour les fonctions d'agrégation. Exemple: somme(1). Au lieu de cela, vous pouvez vous débarrasser de la constante. Exemple: `count()`. - -## Détails De Mise En Œuvre {#implementation-details} - -L'agrégation est l'une des caractéristiques les plus importantes d'un SGBD orienté colonne, et donc son implémentation est l'une des parties les plus optimisées de ClickHouse. Par défaut, l'agrégation se fait en mémoire à l'aide d'une table de hachage. Il a plus de 40 spécialisations qui sont choisies automatiquement en fonction de “grouping key” types de données. - -### Groupe par dans la mémoire externe {#select-group-by-in-external-memory} - -Vous pouvez activer le dumping des données temporaires sur le disque pour limiter l'utilisation de la mémoire pendant `GROUP BY`. -Le [max_bytes_before_external_group_by](../../../operations/settings/settings.md#settings-max_bytes_before_external_group_by) réglage détermine le seuil de consommation de RAM pour le dumping `GROUP BY` données temporaires dans le système de fichiers. Si elle est définie sur 0 (valeur par défaut), elle est désactivée. - -Lors de l'utilisation de `max_bytes_before_external_group_by`, nous vous recommandons de définir `max_memory_usage` environ deux fois plus élevé. Ceci est nécessaire car il y a deux étapes à l'agrégation: la lecture des données et la formation des données intermédiaires (1) et la fusion des données intermédiaires (2). Le Dumping des données dans le système de fichiers ne peut se produire qu'au cours de l'étape 1. 
Si les données temporaires n'ont pas été vidées, l'étape 2 peut nécessiter jusqu'à la même quantité de mémoire qu'à l'étape 1. - -Par exemple, si [max_memory_usage](../../../operations/settings/settings.md#settings_max_memory_usage) a été défini sur 10000000000 et que vous souhaitez utiliser l'agrégation externe, il est logique de définir `max_bytes_before_external_group_by` à 10000000000, et `max_memory_usage` à 20000000000. Lorsque l'agrégation externe est déclenchée (s'il y a eu au moins un vidage de données temporaires), la consommation maximale de RAM n'est que légèrement supérieure à `max_bytes_before_external_group_by`. - -Avec le traitement des requêtes distribuées, l'agrégation externe est effectuée sur des serveurs distants. Pour que le serveur demandeur n'utilise qu'une petite quantité de RAM, définissez `distributed_aggregation_memory_efficient` 1. - -Lors de la fusion de données vidées sur le disque, ainsi que lors de la fusion des résultats de serveurs distants lorsque `distributed_aggregation_memory_efficient` paramètre est activé, consomme jusqu'à `1/256 * the_number_of_threads` à partir de la quantité totale de mémoire RAM. - -Lorsque l'agrégation externe est activée, s'il y a moins de `max_bytes_before_external_group_by` of data (i.e. data was not flushed), the query runs just as fast as without external aggregation. If any temporary data was flushed, the run time will be several times longer (approximately three times). - -Si vous avez un [ORDER BY](order-by.md) avec un [LIMIT](limit.md) après `GROUP BY` puis la quantité de RAM dépend de la quantité de données dans `LIMIT`, pas dans l'ensemble de la table. Mais si l' `ORDER BY` n'a pas `LIMIT`, n'oubliez pas d'activer externe de tri (`max_bytes_before_external_sort`). diff --git a/docs/fr/sql-reference/statements/select/having.md b/docs/fr/sql-reference/statements/select/having.md deleted file mode 100644 index 9425830c3d4..00000000000 --- a/docs/fr/sql-reference/statements/select/having.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause HAVING {#having-clause} - -Permet de filtrer les résultats d'agrégation produits par [GROUP BY](group-by.md). Il est similaire à la [WHERE](where.md) la clause, mais la différence est que `WHERE` est effectuée avant l'agrégation, tandis que `HAVING` est effectué d'après elle. - -Il est possible de référencer les résultats d'agrégation à partir de `SELECT` la clause dans `HAVING` clause par leur alias. Alternativement, `HAVING` clause peut filtrer sur les résultats d'agrégats supplémentaires qui ne sont pas retournés dans les résultats de la requête. - -## Limitation {#limitations} - -`HAVING` ne peut pas être utilisé si le regroupement n'est pas effectuée. Utiliser `WHERE` plutôt. diff --git a/docs/fr/sql-reference/statements/select/index.md b/docs/fr/sql-reference/statements/select/index.md deleted file mode 100644 index 1d53ae80eb4..00000000000 --- a/docs/fr/sql-reference/statements/select/index.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 33 -toc_title: SELECT ---- - -# Sélectionnez la syntaxe des requêtes {#select-queries-syntax} - -`SELECT` effectue la récupération des données. - -``` sql -[WITH expr_list|(subquery)] -SELECT [DISTINCT] expr_list -[FROM [db.]table | (subquery) | table_function] [FINAL] -[SAMPLE sample_coeff] -[ARRAY JOIN ...] 
-[GLOBAL] [ANY|ALL|ASOF] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER|SEMI|ANTI] JOIN (subquery)|table (ON )|(USING ) -[PREWHERE expr] -[WHERE expr] -[GROUP BY expr_list] [WITH TOTALS] -[HAVING expr] -[ORDER BY expr_list] [WITH FILL] [FROM expr] [TO expr] [STEP expr] -[LIMIT [offset_value, ]n BY columns] -[LIMIT [n, ]m] [WITH TIES] -[UNION ALL ...] -[INTO OUTFILE filename] -[FORMAT format] -``` - -Toutes les clauses sont facultatives, à l'exception de la liste d'expressions requise immédiatement après `SELECT` qui est abordée plus en détail [dessous](#select-clause). - -Spécificités de chaque clause facultative, sont couverts dans des sections distinctes, qui sont énumérés dans le même ordre qu'elles sont exécutées: - -- [AVEC la clause](with.md) -- [La clause DISTINCT](distinct.md) -- [De la clause](from.md) -- [Exemple de clause](sample.md) -- [Clause de JOINTURE](join.md) -- [Clause PREWHERE](prewhere.md) -- [Clause where](where.md) -- [Groupe par clause](group-by.md) -- [Limite par clause](limit-by.md) -- [Clause HAVING](having.md) -- [Clause SELECT](#select-clause) -- [Clause LIMIT](limit.md) -- [Clause UNION ALL](union.md) - -## Clause SELECT {#select-clause} - -[Expression](../../syntax.md#syntax-expressions) spécifié dans le `SELECT` clause sont calculés après toutes les opérations dans les clauses décrites ci-dessus sont terminés. Ces expressions fonctionnent comme si elles s'appliquaient à des lignes séparées dans le résultat. Si les expressions dans le `SELECT` la clause contient des fonctions d'agrégation, puis clickhouse traite les fonctions d'agrégation et les expressions utilisées [GROUP BY](group-by.md) agrégation. - -Si vous souhaitez inclure toutes les colonnes dans le résultat, utilisez l'astérisque (`*`) symbole. Exemple, `SELECT * FROM ...`. - -Pour correspondre à certaines colonnes dans le résultat avec un [re2](https://en.wikipedia.org/wiki/RE2_(software)) expression régulière, vous pouvez utiliser le `COLUMNS` expression. - -``` sql -COLUMNS('regexp') -``` - -Par exemple, considérez le tableau: - -``` sql -CREATE TABLE default.col_names (aa Int8, ab Int8, bc Int8) ENGINE = TinyLog -``` - -La requête suivante sélectionne les données de toutes les colonnes contenant les `a` symbole dans leur nom. - -``` sql -SELECT COLUMNS('a') FROM col_names -``` - -``` text -┌─aa─┬─ab─┐ -│ 1 │ 1 │ -└────┴────┘ -``` - -Les colonnes sélectionnées sont retournés pas dans l'ordre alphabétique. - -Vous pouvez utiliser plusieurs `COLUMNS` expressions dans une requête et leur appliquer des fonctions. - -Exemple: - -``` sql -SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names -``` - -``` text -┌─aa─┬─ab─┬─bc─┬─toTypeName(bc)─┐ -│ 1 │ 1 │ 1 │ Int8 │ -└────┴────┴────┴────────────────┘ -``` - -Chaque colonne renvoyée par le `COLUMNS` expression est passée à la fonction en tant qu'argument séparé. Vous pouvez également passer d'autres arguments à la fonction si elle les supporte. Soyez prudent lorsque vous utilisez des fonctions. Si une fonction ne prend pas en charge le nombre d'arguments que vous lui avez transmis, ClickHouse lève une exception. - -Exemple: - -``` sql -SELECT COLUMNS('a') + COLUMNS('c') FROM col_names -``` - -``` text -Received exception from server (version 19.14.1): -Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. -``` - -Dans cet exemple, `COLUMNS('a')` retourne deux colonnes: `aa` et `ab`. `COLUMNS('c')` renvoie la `bc` colonne. 
Le `+` l'opérateur ne peut pas s'appliquer à 3 arguments, donc ClickHouse lève une exception avec le message pertinent. - -Colonnes qui correspondent à la `COLUMNS` l'expression peut avoir différents types de données. Si `COLUMNS` ne correspond à aucune colonne et est la seule expression dans `SELECT`, ClickHouse lance une exception. - -### Astérisque {#asterisk} - -Vous pouvez mettre un astérisque dans quelque partie de la requête au lieu d'une expression. Lorsque la requête est analysée, l'astérisque est étendu à une liste de toutes les colonnes `MATERIALIZED` et `ALIAS` colonne). Il n'y a que quelques cas où l'utilisation d'un astérisque est justifiée: - -- Lors de la création d'un vidage de table. -- Pour les tables contenant seulement quelques colonnes, comme les tables système. -- Pour obtenir des informations sur ce que sont les colonnes dans une table. Dans ce cas, la valeur `LIMIT 1`. Mais il est préférable d'utiliser la `DESC TABLE` requête. -- Quand il y a une forte filtration sur un petit nombre de colonnes en utilisant `PREWHERE`. -- Dans les sous-requêtes (puisque les colonnes qui ne sont pas nécessaires pour la requête externe sont exclues des sous-requêtes). - -Dans tous les autres cas, nous ne recommandons pas d'utiliser l'astérisque, car il ne vous donne que les inconvénients d'un SGBD colonnaire au lieu des avantages. En d'autres termes, l'utilisation de l'astérisque n'est pas recommandée. - -### Les Valeurs Extrêmes {#extreme-values} - -En plus des résultats, vous pouvez également obtenir des valeurs minimales et maximales pour les colonnes de résultats. Pour ce faire, définissez la **extrême** réglage sur 1. Les Minimums et les maximums sont calculés pour les types numériques, les dates et les dates avec des heures. Pour les autres colonnes, les valeurs par défaut sont sorties. - -An extra two rows are calculated – the minimums and maximums, respectively. These extra two rows are output in `JSON*`, `TabSeparated*`, et `Pretty*` [format](../../../interfaces/formats.md), séparés des autres lignes. Ils ne sont pas Produits pour d'autres formats. - -Dans `JSON*` formats, les valeurs extrêmes sont sorties dans un ‘extremes’ champ. Dans `TabSeparated*` formats, la ligne vient après le résultat principal, et après ‘totals’ si elle est présente. Elle est précédée par une ligne vide (après les autres données). Dans `Pretty*` formats, la ligne est sortie comme une table séparée après le résultat principal, et après `totals` si elle est présente. - -Les valeurs extrêmes sont calculées pour les lignes avant `LIMIT` mais après `LIMIT BY`. Cependant, lors de l'utilisation de `LIMIT offset, size`, les lignes avant de les `offset` sont inclus dans `extremes`. Dans les requêtes de flux, le résultat peut également inclure un petit nombre de lignes qui ont traversé `LIMIT`. - -### Note {#notes} - -Vous pouvez utiliser des synonymes (`AS` alias) dans n'importe quelle partie d'une requête. - -Le `GROUP BY` et `ORDER BY` les clauses ne supportent pas les arguments positionnels. Cela contredit MySQL, mais est conforme à SQL standard. Exemple, `GROUP BY 1, 2` will be interpreted as grouping by constants (i.e. aggregation of all rows into one). - -## Détails De Mise En Œuvre {#implementation-details} - -Si la requête omet le `DISTINCT`, `GROUP BY` et `ORDER BY` les clauses et les `IN` et `JOIN` sous-requêtes, la requête sera complètement traitée en flux, en utilisant O (1) quantité de RAM. 
Sinon, la requête peut consommer beaucoup de RAM si les restrictions appropriées ne sont pas spécifiées: - -- `max_memory_usage` -- `max_rows_to_group_by` -- `max_rows_to_sort` -- `max_rows_in_distinct` -- `max_bytes_in_distinct` -- `max_rows_in_set` -- `max_bytes_in_set` -- `max_rows_in_join` -- `max_bytes_in_join` -- `max_bytes_before_external_sort` -- `max_bytes_before_external_group_by` - -Pour plus d'informations, consultez la section “Settings”. Il est possible d'utiliser le tri externe (sauvegarde des tables temporaires sur un disque) et l'agrégation externe. - -{## [Article Original](https://clickhouse.tech/docs/en/sql-reference/statements/select/) ##} diff --git a/docs/fr/sql-reference/statements/select/into-outfile.md b/docs/fr/sql-reference/statements/select/into-outfile.md deleted file mode 100644 index 0150de7cb97..00000000000 --- a/docs/fr/sql-reference/statements/select/into-outfile.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Dans OUTFILE Clause {#into-outfile-clause} - -Ajouter l' `INTO OUTFILE filename` clause (où filename est un littéral de chaîne) pour `SELECT query` pour rediriger sa sortie vers le fichier spécifié côté client. - -## Détails De Mise En Œuvre {#implementation-details} - -- Cette fonctionnalité est disponible dans les [client de ligne de commande](../../../interfaces/cli.md) et [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Ainsi, une requête envoyée par [Interface HTTP](../../../interfaces/http.md) va échouer. -- La requête échouera si un fichier portant le même nom existe déjà. -- Défaut [le format de sortie](../../../interfaces/formats.md) être `TabSeparated` (comme dans le mode batch client en ligne de commande). diff --git a/docs/fr/sql-reference/statements/select/join.md b/docs/fr/sql-reference/statements/select/join.md deleted file mode 100644 index 4233a120674..00000000000 --- a/docs/fr/sql-reference/statements/select/join.md +++ /dev/null @@ -1,187 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause de JOINTURE {#select-join} - -Join produit une nouvelle table en combinant des colonnes d'une ou plusieurs tables en utilisant des valeurs communes à chacune. C'est une opération courante dans les bases de données avec support SQL, ce qui correspond à [l'algèbre relationnelle](https://en.wikipedia.org/wiki/Relational_algebra#Joins_and_join-like_operators) rejoindre. Le cas particulier d'une jointure de table est souvent appelé “self-join”. - -Syntaxe: - -``` sql -SELECT -FROM -[GLOBAL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER|SEMI|ANTI|ANY|ASOF] JOIN -(ON )|(USING ) ... -``` - -Les Expressions de `ON` clause et colonnes de `USING` clause sont appelés “join keys”. Sauf indication contraire, joindre un produit [Produit cartésien](https://en.wikipedia.org/wiki/Cartesian_product) des lignes, avec correspondance “join keys”, ce qui pourrait produire des résultats avec beaucoup plus de lignes que les tables source. - -## Types de jointure pris en charge {#select-join-types} - -Tous les standard [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL)) les types sont pris en charge: - -- `INNER JOIN`, seules les lignes correspondantes sont retournés. -- `LEFT OUTER JOIN`, les lignes non correspondantes de la table de gauche sont retournées en plus des lignes correspondantes. 
-- `RIGHT OUTER JOIN`, les lignes non correspondantes de la table de gauche sont retournées en plus des lignes correspondantes. -- `FULL OUTER JOIN`, les lignes non correspondantes des deux tables sont renvoyées en plus des lignes correspondantes. -- `CROSS JOIN`, produit le produit cartésien des tables entières, “join keys” être **pas** défini. - -`JOIN` sans type spécifié implique `INNER`. Mot `OUTER` peut les oublier. Syntaxe Alternative pour `CROSS JOIN` spécifie plusieurs tables dans [De la clause](from.md) séparés par des virgules. - -Autres types de jointure disponibles dans ClickHouse: - -- `LEFT SEMI JOIN` et `RIGHT SEMI JOIN` une liste blanche sur “join keys”, sans produire un produit cartésien. -- `LEFT ANTI JOIN` et `RIGHT ANTI JOIN` une liste noire sur “join keys”, sans produire un produit cartésien. -- `LEFT ANY JOIN`, `RIGHT ANY JOIN` et `INNER ANY JOIN`, partially (for opposite side of `LEFT` and `RIGHT`) or completely (for `INNER` and `FULL`) disables the cartesian product for standard `JOIN` types. -- `ASOF JOIN` et `LEFT ASOF JOIN`, joining sequences with a non-exact match. `ASOF JOIN` usage is described below. - -## Setting {#join-settings} - -!!! note "Note" - La valeur de rigueur par défaut peut être remplacée à l'aide [join_default_strictness](../../../operations/settings/settings.md#settings-join_default_strictness) paramètre. - -### ASOF joindre L'utilisation {#asof-join-usage} - -`ASOF JOIN` est utile lorsque vous devez joindre des enregistrements qui n'ont pas de correspondance exacte. - -Tables pour `ASOF JOIN` doit avoir une colonne de séquence ordonnée. Cette colonne ne peut pas être seule dans une table et doit être l'un des types de données: `UInt32`, `UInt64`, `Float32`, `Float64`, `Date`, et `DateTime`. - -Syntaxe `ASOF JOIN ... ON`: - -``` sql -SELECT expressions_list -FROM table_1 -ASOF LEFT JOIN table_2 -ON equi_cond AND closest_match_cond -``` - -Vous pouvez utiliser n'importe quel nombre de conditions d'égalité et exactement une condition de correspondance la plus proche. Exemple, `SELECT count() FROM table_1 ASOF LEFT JOIN table_2 ON table_1.a == table_2.b AND table_2.t <= table_1.t`. - -Conditions prises en charge pour la correspondance la plus proche: `>`, `>=`, `<`, `<=`. - -Syntaxe `ASOF JOIN ... USING`: - -``` sql -SELECT expressions_list -FROM table_1 -ASOF JOIN table_2 -USING (equi_column1, ... equi_columnN, asof_column) -``` - -`ASOF JOIN` utiliser `equi_columnX` pour rejoindre sur l'égalité et `asof_column` pour rejoindre le match le plus proche avec le `table_1.asof_column >= table_2.asof_column` condition. Le `asof_column` colonne toujours la dernière dans le `USING` clause. - -Par exemple, considérez les tableaux suivants: - - table_1 table_2 - event | ev_time | user_id event | ev_time | user_id - ----------|---------|---------- ----------|---------|---------- - ... ... - event_1_1 | 12:00 | 42 event_2_1 | 11:59 | 42 - ... event_2_2 | 12:30 | 42 - event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 - ... ... - -`ASOF JOIN` peut prendre la date d'un événement utilisateur de `table_1` et trouver un événement dans `table_2` où le timestamp est plus proche de l'horodatage de l'événement à partir de `table_1` correspondant à la condition de correspondance la plus proche. Les valeurs d'horodatage égales sont les plus proches si elles sont disponibles. Ici, l' `user_id` la colonne peut être utilisée pour joindre sur l'égalité et le `ev_time` la colonne peut être utilisée pour se joindre à la correspondance la plus proche. 
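À titre d'illustration uniquement, une requête possible sur ces deux tables (les noms `table_1`, `table_2`, `user_id` et `ev_time` sont ceux de l'exemple ci-dessus) pourrait s'écrire ainsi:

``` sql
SELECT t1.event, t1.ev_time, t2.event AS matched_event
FROM table_1 AS t1
ASOF LEFT JOIN table_2 AS t2
ON t1.user_id = t2.user_id AND t2.ev_time <= t1.ev_time
```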
Dans notre exemple, `event_1_1` peut être jointe à `event_2_1` et `event_1_2` peut être jointe à `event_2_3`, mais `event_2_2` ne peut pas être rejoint. - -!!! note "Note" - `ASOF` jointure est **pas** pris en charge dans le [Rejoindre](../../../engines/table-engines/special/join.md) tableau moteur. - -## Jointure Distribuée {#global-join} - -Il existe deux façons d'exécuter join impliquant des tables distribuées: - -- Lors de l'utilisation normale `JOIN` la requête est envoyée aux serveurs distants. Les sous-requêtes sont exécutées sur chacune d'elles afin de créer la bonne table, et la jointure est effectuée avec cette table. En d'autres termes, la table de droite est formée sur chaque serveur séparément. -- Lors de l'utilisation de `GLOBAL ... JOIN`, d'abord le serveur demandeur exécute une sous-requête pour calculer la bonne table. Cette table temporaire est transmise à chaque serveur distant, et les requêtes sont exécutées sur eux en utilisant les données temporaires qui ont été transmises. - -Soyez prudent lorsque vous utilisez `GLOBAL`. Pour plus d'informations, voir le [Sous-requêtes distribuées](../../operators/in.md#select-distributed-subqueries) section. - -## Recommandations D'Utilisation {#usage-recommendations} - -### Traitement des cellules vides ou nulles {#processing-of-empty-or-null-cells} - -Lors de la jonction de tables, les cellules vides peuvent apparaître. Paramètre [join_use_nulls](../../../operations/settings/settings.md#join_use_nulls) définir comment clickhouse remplit ces cellules. - -Si l' `JOIN` les touches sont [Nullable](../../data-types/nullable.md) champs, les lignes où au moins une des clés a la valeur [NULL](../../../sql-reference/syntax.md#null-literal) ne sont pas jointes. - -### Syntaxe {#syntax} - -Les colonnes spécifiées dans `USING` doit avoir les mêmes noms dans les deux sous-requêtes, et les autres colonnes doivent être nommées différemment. Vous pouvez utiliser des alias pour les noms des colonnes dans les sous-requêtes. - -Le `USING` clause spécifie une ou plusieurs colonnes de jointure, qui établit l'égalité de ces colonnes. La liste des colonnes est définie sans crochets. Les conditions de jointure plus complexes ne sont pas prises en charge. - -### Limitations De Syntaxe {#syntax-limitations} - -Pour plusieurs `JOIN` clauses dans un seul `SELECT` requête: - -- Prendre toutes les colonnes via `*` n'est disponible que si les tables sont jointes, pas les sous-requêtes. -- Le `PREWHERE` la clause n'est pas disponible. - -Pour `ON`, `WHERE`, et `GROUP BY` clause: - -- Les expressions arbitraires ne peuvent pas être utilisées dans `ON`, `WHERE`, et `GROUP BY` mais vous pouvez définir une expression dans un `SELECT` clause et ensuite l'utiliser dans ces clauses via un alias. - -### Performance {#performance} - -Lors de l'exécution d'un `JOIN`, il n'y a pas d'optimisation de la commande d'exécution par rapport aux autres stades de la requête. La jointure (une recherche dans la table de droite) est exécutée avant de filtrer `WHERE` et avant l'agrégation. - -Chaque fois qu'une requête est exécutée avec la même `JOIN`, la sous-requête est exécutée à nouveau car le résultat n'est pas mis en cache. Pour éviter cela, utilisez la spéciale [Rejoindre](../../../engines/table-engines/special/join.md) table engine, qui est un tableau préparé pour l'assemblage qui est toujours en RAM. - -Dans certains cas, il est plus efficace d'utiliser [IN](../../operators/in.md) plutôt `JOIN`. 
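Par exemple, une esquisse purement illustrative (les noms `events`, `users`, `user_id`, `id` et `active` sont hypothétiques) où une jointure servant uniquement au filtrage est remplacée par `IN`:

``` sql
-- Variante avec jointure, utilisée seulement pour filtrer :
-- SELECT e.* FROM events AS e INNER JOIN users AS u ON e.user_id = u.id WHERE u.active = 1

-- Variante souvent plus efficace avec IN :
SELECT e.*
FROM events AS e
WHERE e.user_id IN (SELECT id FROM users WHERE active = 1)
```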
- -Si vous avez besoin d'un `JOIN` pour se joindre à des tables de dimension (ce sont des tables relativement petites qui contiennent des propriétés de dimension, telles que des noms pour des campagnes publicitaires), un `JOIN` peut-être pas très pratique en raison du fait que la bonne table est ré-accédée pour chaque requête. Pour de tels cas, il y a un “external dictionaries” la fonctionnalité que vous devez utiliser à la place de `JOIN`. Pour plus d'informations, voir le [Dictionnaires externes](../../dictionaries/external-dictionaries/external-dicts.md) section. - -### Limitations De Mémoire {#memory-limitations} - -Par défaut, ClickHouse utilise [jointure de hachage](https://en.wikipedia.org/wiki/Hash_join) algorithme. ClickHouse prend le `` et crée une table de hachage pour cela dans la RAM. Après un certain seuil de consommation de mémoire, ClickHouse revient à fusionner l'algorithme de jointure. - -Si vous devez restreindre la consommation de mémoire de l'opération join utilisez les paramètres suivants: - -- [max_rows_in_join](../../../operations/settings/query-complexity.md#settings-max_rows_in_join) — Limits number of rows in the hash table. -- [max_bytes_in_join](../../../operations/settings/query-complexity.md#settings-max_bytes_in_join) — Limits size of the hash table. - -Lorsque l'une de ces limites est atteinte, ClickHouse agit comme [join_overflow_mode](../../../operations/settings/query-complexity.md#settings-join_overflow_mode) réglage des instructions. - -## Exemple {#examples} - -Exemple: - -``` sql -SELECT - CounterID, - hits, - visits -FROM -( - SELECT - CounterID, - count() AS hits - FROM test.hits - GROUP BY CounterID -) ANY LEFT JOIN -( - SELECT - CounterID, - sum(Sign) AS visits - FROM test.visits - GROUP BY CounterID -) USING CounterID -ORDER BY hits DESC -LIMIT 10 -``` - -``` text -┌─CounterID─┬───hits─┬─visits─┐ -│ 1143050 │ 523264 │ 13665 │ -│ 731962 │ 475698 │ 102716 │ -│ 722545 │ 337212 │ 108187 │ -│ 722889 │ 252197 │ 10547 │ -│ 2237260 │ 196036 │ 9522 │ -│ 23057320 │ 147211 │ 7689 │ -│ 722818 │ 90109 │ 17847 │ -│ 48221 │ 85379 │ 4652 │ -│ 19762435 │ 77807 │ 7026 │ -│ 722884 │ 77492 │ 11056 │ -└───────────┴────────┴────────┘ -``` diff --git a/docs/fr/sql-reference/statements/select/limit-by.md b/docs/fr/sql-reference/statements/select/limit-by.md deleted file mode 100644 index 4d1bd766ef1..00000000000 --- a/docs/fr/sql-reference/statements/select/limit-by.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Limite par Clause {#limit-by-clause} - -Une requête avec l' `LIMIT n BY expressions` la clause sélectionne le premier `n` lignes pour chaque valeur distincte de `expressions`. La clé pour `LIMIT BY` peut contenir n'importe quel nombre de [expression](../../syntax.md#syntax-expressions). - -ClickHouse prend en charge les variantes de syntaxe suivantes: - -- `LIMIT [offset_value, ]n BY expressions` -- `LIMIT n OFFSET offset_value BY expressions` - -Pendant le traitement de la requête, ClickHouse sélectionne les données classées par clé de tri. La clé de tri est définie explicitement à l'aide [ORDER BY](order-by.md) clause ou implicitement en tant que propriété du moteur de table. Puis clickhouse s'applique `LIMIT n BY expressions` et renvoie le premier `n` lignes pour chaque combinaison distincte de `expressions`. 
Si `OFFSET` est spécifié, puis pour chaque bloc de données qui appartient à une combinaison particulière de `expressions`, Clickhouse saute `offset_value` nombre de lignes depuis le début du bloc et renvoie un maximum de `n` les lignes en conséquence. Si `offset_value` est plus grand que le nombre de lignes dans le bloc de données, ClickHouse renvoie zéro lignes du bloc. - -!!! note "Note" - `LIMIT BY` n'est pas liée à [LIMIT](limit.md). Ils peuvent tous deux être utilisés dans la même requête. - -## Exemple {#examples} - -Exemple de table: - -``` sql -CREATE TABLE limit_by(id Int, val Int) ENGINE = Memory; -INSERT INTO limit_by VALUES (1, 10), (1, 11), (1, 12), (2, 20), (2, 21); -``` - -Requête: - -``` sql -SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id -``` - -``` text -┌─id─┬─val─┐ -│ 1 │ 10 │ -│ 1 │ 11 │ -│ 2 │ 20 │ -│ 2 │ 21 │ -└────┴─────┘ -``` - -``` sql -SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id -``` - -``` text -┌─id─┬─val─┐ -│ 1 │ 11 │ -│ 1 │ 12 │ -│ 2 │ 21 │ -└────┴─────┘ -``` - -Le `SELECT * FROM limit_by ORDER BY id, val LIMIT 2 OFFSET 1 BY id` requête renvoie le même résultat. - -La requête suivante renvoie les 5 principaux référents pour chaque `domain, device_type` paire avec un maximum de 100 lignes au total (`LIMIT n BY + LIMIT`). - -``` sql -SELECT - domainWithoutWWW(URL) AS domain, - domainWithoutWWW(REFERRER_URL) AS referrer, - device_type, - count() cnt -FROM hits -GROUP BY domain, referrer, device_type -ORDER BY cnt DESC -LIMIT 5 BY domain, device_type -LIMIT 100 -``` diff --git a/docs/fr/sql-reference/statements/select/limit.md b/docs/fr/sql-reference/statements/select/limit.md deleted file mode 100644 index 69334c32cc9..00000000000 --- a/docs/fr/sql-reference/statements/select/limit.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause LIMIT {#limit-clause} - -`LIMIT m` permet de sélectionner la première `m` lignes du résultat. - -`LIMIT n, m` permet de sélectionner le `m` lignes du résultat après avoir sauté le premier `n` rangée. Le `LIMIT m OFFSET n` la syntaxe est équivalente. - -`n` et `m` doivent être des entiers non négatifs. - -Si il n'y a pas de [ORDER BY](order-by.md) clause qui trie explicitement les résultats, le choix des lignes pour le résultat peut être arbitraire et non déterministe. diff --git a/docs/fr/sql-reference/statements/select/order-by.md b/docs/fr/sql-reference/statements/select/order-by.md deleted file mode 100644 index 2a4ef58d7ad..00000000000 --- a/docs/fr/sql-reference/statements/select/order-by.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause ORDER BY {#select-order-by} - -Le `ORDER BY` clause contient une liste des expressions, qui peuvent être attribuées avec `DESC` (décroissant) ou `ASC` modificateur (ascendant) qui détermine la direction de tri. Si la direction n'est pas spécifié, `ASC` est supposé, donc il est généralement omis. La direction de tri s'applique à une seule expression, pas à la liste entière. Exemple: `ORDER BY Visits DESC, SearchPhrase` - -Les lignes qui ont des valeurs identiques pour la liste des expressions de tri sont sorties dans un ordre arbitraire, qui peut également être non déterministe (différent à chaque fois). -Si la clause ORDER BY est omise, l'ordre des lignes est également indéfini et peut également être non déterministe. 
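Par exemple, dans cette esquisse (la table `hits` et ses colonnes sont hypothétiques), l'ajout d'une colonne de départage à la fin de la clé de tri rend l'ordre du résultat entièrement déterministe:

``` sql
SELECT SearchPhrase, Visits, EventTime
FROM hits
ORDER BY Visits DESC, SearchPhrase ASC, EventTime ASC
```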
- -## Tri des valeurs spéciales {#sorting-of-special-values} - -Il existe deux approches pour `NaN` et `NULL` ordre de tri: - -- Par défaut ou avec le `NULLS LAST` modificateur: d'abord les valeurs, puis `NaN`, puis `NULL`. -- Avec l' `NULLS FIRST` modificateur: première `NULL`, puis `NaN` puis d'autres valeurs. - -### Exemple {#example} - -Pour la table - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 2 │ -│ 1 │ nan │ -│ 2 │ 2 │ -│ 3 │ 4 │ -│ 5 │ 6 │ -│ 6 │ nan │ -│ 7 │ ᴺᵁᴸᴸ │ -│ 6 │ 7 │ -│ 8 │ 9 │ -└───┴──────┘ -``` - -Exécuter la requête `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST` obtenir: - -``` text -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 7 │ ᴺᵁᴸᴸ │ -│ 1 │ nan │ -│ 6 │ nan │ -│ 2 │ 2 │ -│ 2 │ 2 │ -│ 3 │ 4 │ -│ 5 │ 6 │ -│ 6 │ 7 │ -│ 8 │ 9 │ -└───┴──────┘ -``` - -Lorsque les nombres à virgule flottante sont triés, les Nan sont séparés des autres valeurs. Quel que soit l'ordre de tri, NaNs viennent à la fin. En d'autres termes, pour le Tri ascendant, ils sont placés comme s'ils étaient plus grands que tous les autres nombres, tandis que pour le Tri descendant, ils sont placés comme s'ils étaient plus petits que les autres. - -## Classement De Soutien {#collation-support} - -Pour le tri par valeurs de chaîne, vous pouvez spécifier le classement (comparaison). Exemple: `ORDER BY SearchPhrase COLLATE 'tr'` - pour le tri par mot-clé dans l'ordre croissant, en utilisant l'alphabet turc, insensible à la casse, en supposant que les chaînes sont encodées en UTF-8. COLLATE peut être spécifié ou non pour chaque expression dans L'ordre par indépendamment. Si ASC ou DESC est spécifié, COLLATE est spécifié après. Lors de L'utilisation de COLLATE, le tri est toujours insensible à la casse. - -Nous recommandons uniquement D'utiliser COLLATE pour le tri final d'un petit nombre de lignes, car le tri avec COLLATE est moins efficace que le tri normal par octets. - -## Détails De Mise En Œuvre {#implementation-details} - -Moins de RAM est utilisé si un assez petit [LIMIT](limit.md) est précisée en plus `ORDER BY`. Sinon, la quantité de mémoire dépensée est proportionnelle au volume de données à trier. Pour le traitement des requêtes distribuées, si [GROUP BY](group-by.md) est omis, le tri est partiellement effectué sur les serveurs distants et les résultats sont fusionnés Sur le serveur demandeur. Cela signifie que pour le tri distribué, le volume de données à trier peut être supérieur à la quantité de mémoire sur un seul serveur. - -S'il N'y a pas assez de RAM, il est possible d'effectuer un tri dans la mémoire externe (création de fichiers temporaires sur un disque). Utilisez le paramètre `max_bytes_before_external_sort` pour ce but. S'il est défini sur 0 (par défaut), le tri externe est désactivé. Si elle est activée, lorsque le volume de données à trier atteint le nombre spécifié d'octets, les données collectées sont triés et déposés dans un fichier temporaire. Une fois toutes les données lues, tous les fichiers triés sont fusionnés et les résultats sont générés. Les fichiers sont écrits dans le `/var/lib/clickhouse/tmp/` dans la configuration (par défaut, mais vous pouvez `tmp_path` paramètre pour modifier ce paramètre). - -L'exécution d'une requête peut utiliser plus de mémoire que `max_bytes_before_external_sort`. Pour cette raison, ce paramètre doit avoir une valeur significativement inférieure à `max_memory_usage`. Par exemple, si votre serveur dispose de 128 Go de RAM et que vous devez exécuter une seule requête, définissez `max_memory_usage` à 100 Go, et `max_bytes_before_external_sort` à 80 Go. 
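À titre d'illustration, ces valeurs (reprises de l'exemple ci-dessus) peuvent être définies au niveau de la session avant d'exécuter la requête:

``` sql
SET max_memory_usage = 100000000000;              -- environ 100 Go
SET max_bytes_before_external_sort = 80000000000; -- environ 80 Go
```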
- -Le tri externe fonctionne beaucoup moins efficacement que le tri dans la RAM. diff --git a/docs/fr/sql-reference/statements/select/prewhere.md b/docs/fr/sql-reference/statements/select/prewhere.md deleted file mode 100644 index 2c825d050f4..00000000000 --- a/docs/fr/sql-reference/statements/select/prewhere.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause PREWHERE {#prewhere-clause} - -Prewhere est une optimisation pour appliquer le filtrage plus efficacement. Il est activé par défaut, même si `PREWHERE` la clause n'est pas explicitement spécifié. Il fonctionne en déplaçant automatiquement une partie de [WHERE](where.md) condition à prewhere étape. Le rôle de `PREWHERE` la clause est seulement pour contrôler cette optimisation si vous pensez que vous savez comment le faire mieux que par défaut. - -Avec l'optimisation prewhere, au début, seules les colonnes nécessaires à l'exécution de l'expression prewhere sont lues. Ensuite, les autres colonnes sont lues qui sont nécessaires pour exécuter le reste de la requête, mais seulement les blocs où l'expression prewhere est “true” au moins pour certaines lignes. S'il y a beaucoup de blocs où prewhere expression est “false” pour toutes les lignes et prewhere a besoin de moins de colonnes que les autres parties de la requête, cela permet souvent de lire beaucoup moins de données à partir du disque pour l'exécution de la requête. - -## Contrôle Manuel De Prewhere {#controlling-prewhere-manually} - -La clause a le même sens que la `WHERE` clause. La différence est dans laquelle les données sont lues à partir de la table. Quand à commander manuellement `PREWHERE` pour les conditions de filtration qui sont utilisées par une minorité des colonnes de la requête, mais qui fournissent une filtration de données forte. Cela réduit le volume de données à lire. - -Une requête peut spécifier simultanément `PREWHERE` et `WHERE`. Dans ce cas, `PREWHERE` précéder `WHERE`. - -Si l' `optimize_move_to_prewhere` le paramètre est défini sur 0, heuristiques pour déplacer automatiquement des parties d'expressions `WHERE` de `PREWHERE` sont désactivés. - -## Limitation {#limitations} - -`PREWHERE` est uniquement pris en charge par les tables `*MergeTree` famille. diff --git a/docs/fr/sql-reference/statements/select/sample.md b/docs/fr/sql-reference/statements/select/sample.md deleted file mode 100644 index b2ddc060a19..00000000000 --- a/docs/fr/sql-reference/statements/select/sample.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Exemple de Clause {#select-sample-clause} - -Le `SAMPLE` clause permet approchée `SELECT` le traitement de la requête. - -Lorsque l'échantillonnage de données est activé, la requête n'est pas effectuée sur toutes les données, mais uniquement sur une certaine fraction de données (échantillon). Par exemple, si vous avez besoin de calculer des statistiques pour toutes les visites, il suffit d'exécuter la requête sur le 1/10 de la fraction de toutes les visites, puis multiplier le résultat par 10. - -Le traitement approximatif des requêtes peut être utile dans les cas suivants: - -- Lorsque vous avez des exigences de synchronisation strictes (comme \<100ms), mais que vous ne pouvez pas justifier le coût des ressources matérielles supplémentaires pour y répondre. 
-- Lorsque vos données brutes ne sont pas précises, l'approximation ne dégrade pas sensiblement la qualité. -- Les exigences commerciales ciblent des résultats approximatifs (pour la rentabilité, ou pour commercialiser des résultats exacts aux utilisateurs premium). - -!!! note "Note" - Vous ne pouvez utiliser l'échantillonnage qu'avec les tables [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) famille, et seulement si l'expression d'échantillonnage a été spécifiée lors de la création de la table (voir [Moteur MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table)). - -Les caractéristiques de l'échantillonnage des données sont énumérées ci-dessous: - -- L'échantillonnage de données est un mécanisme déterministe. Le résultat de la même `SELECT .. SAMPLE` la requête est toujours le même. -- L'échantillonnage fonctionne de manière cohérente pour différentes tables. Pour les tables avec une seule clé d'échantillonnage, un échantillon avec le même coefficient sélectionne toujours le même sous-ensemble de données possibles. Par exemple, un exemple d'ID utilisateur prend des lignes avec le même sous-ensemble de tous les ID utilisateur possibles de différentes tables. Cela signifie que vous pouvez utiliser l'exemple dans les sous-requêtes dans la [IN](../../operators/in.md) clause. En outre, vous pouvez joindre des échantillons en utilisant le [JOIN](join.md) clause. -- L'échantillonnage permet de lire moins de données à partir d'un disque. Notez que vous devez spécifier l'échantillonnage clé correctement. Pour plus d'informations, voir [Création d'une Table MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table). - -Pour l' `SAMPLE` clause la syntaxe suivante est prise en charge: - -| SAMPLE Clause Syntax | Description | -|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `SAMPLE k` | Ici `k` est le nombre de 0 à 1.
La requête est exécutée sur `k` fraction des données. Exemple, `SAMPLE 0.1` exécute la requête sur 10% des données. [Lire plus](#select-sample-k) | -| `SAMPLE n` | Ici `n` est un entier suffisamment grand.
La requête est exécutée sur un échantillon d'au moins `n` lignes (mais pas significativement plus que cela). Exemple, `SAMPLE 10000000` exécute la requête sur un minimum de 10 000 000 lignes. [Lire plus](#select-sample-n) | -| `SAMPLE k OFFSET m` | Ici `k` et `m` sont les nombres de 0 à 1.
La requête est exécutée sur un échantillon de `k` fraction des données. Les données utilisées pour l'échantillon est compensée par `m` fraction. [Lire plus](#select-sample-offset) | - -## SAMPLE K {#select-sample-k} - -Ici `k` est le nombre de 0 à 1 (les notations fractionnaires et décimales sont prises en charge). Exemple, `SAMPLE 1/2` ou `SAMPLE 0.5`. - -Dans un `SAMPLE k` clause, l'échantillon est prélevé à partir de la `k` fraction des données. L'exemple est illustré ci-dessous: - -``` sql -SELECT - Title, - count() * 10 AS PageViews -FROM hits_distributed -SAMPLE 0.1 -WHERE - CounterID = 34 -GROUP BY Title -ORDER BY PageViews DESC LIMIT 1000 -``` - -Dans cet exemple, la requête est exécutée sur un échantillon de 0,1 (10%) de données. Les valeurs des fonctions d'agrégat ne sont pas corrigées automatiquement, donc pour obtenir un résultat approximatif, la valeur `count()` est multiplié manuellement par 10. - -## SAMPLE N {#select-sample-n} - -Ici `n` est un entier suffisamment grand. Exemple, `SAMPLE 10000000`. - -Dans ce cas, la requête est exécutée sur un échantillon d'au moins `n` lignes (mais pas significativement plus que cela). Exemple, `SAMPLE 10000000` exécute la requête sur un minimum de 10 000 000 lignes. - -Puisque l'unité minimale pour la lecture des données est un granule (sa taille est définie par le `index_granularity` de réglage), il est logique de définir un échantillon beaucoup plus grand que la taille du granule. - -Lors de l'utilisation de la `SAMPLE n` clause, vous ne savez pas quel pourcentage relatif de données a été traité. Donc, vous ne connaissez pas le coefficient par lequel les fonctions agrégées doivent être multipliées. L'utilisation de la `_sample_factor` colonne virtuelle pour obtenir le résultat approximatif. - -Le `_sample_factor` colonne contient des coefficients relatifs qui sont calculés dynamiquement. Cette colonne est créée automatiquement lorsque vous [créer](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) une table avec la clé d'échantillonnage spécifiée. Les exemples d'utilisation de la `_sample_factor` colonne sont indiqués ci-dessous. - -Considérons la table `visits` qui contient des statistiques sur les visites de site. Le premier exemple montre comment calculer le nombre de pages vues: - -``` sql -SELECT sum(PageViews * _sample_factor) -FROM visits -SAMPLE 10000000 -``` - -L'exemple suivant montre comment calculer le nombre total de visites: - -``` sql -SELECT sum(_sample_factor) -FROM visits -SAMPLE 10000000 -``` - -L'exemple ci-dessous montre comment calculer la durée moyenne de la session. Notez que vous n'avez pas besoin d'utiliser le coefficient relatif pour calculer les valeurs moyennes. - -``` sql -SELECT avg(Duration) -FROM visits -SAMPLE 10000000 -``` - -## SAMPLE K OFFSET M {#select-sample-offset} - -Ici `k` et `m` sont des nombres de 0 à 1. Des exemples sont présentés ci-dessous. - -**Exemple 1** - -``` sql -SAMPLE 1/10 -``` - -Dans cet exemple, l'échantillon représente 1 / 10e de toutes les données: - -`[++------------]` - -**Exemple 2** - -``` sql -SAMPLE 1/10 OFFSET 1/2 -``` - -Ici, un échantillon de 10% est prélevé à partir de la seconde moitié des données. 
- -`[------++------]` diff --git a/docs/fr/sql-reference/statements/select/union.md b/docs/fr/sql-reference/statements/select/union.md deleted file mode 100644 index 9ae65ebcf72..00000000000 --- a/docs/fr/sql-reference/statements/select/union.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause UNION ALL {#union-clause} - -Vous pouvez utiliser `UNION ALL` à combiner `SELECT` requêtes en étendant leurs résultats. Exemple: - -``` sql -SELECT CounterID, 1 AS table, toInt64(count()) AS c - FROM test.hits - GROUP BY CounterID - -UNION ALL - -SELECT CounterID, 2 AS table, sum(Sign) AS c - FROM test.visits - GROUP BY CounterID - HAVING c > 0 -``` - -Les colonnes de résultat sont appariées par leur index (ordre intérieur `SELECT`). Si les noms de colonne ne correspondent pas, les noms du résultat final sont tirés de la première requête. - -La coulée de Type est effectuée pour les syndicats. Par exemple, si deux requêtes combinées ont le même champ avec non-`Nullable` et `Nullable` types d'un type compatible, la `UNION ALL` a un `Nullable` type de champ. - -Requêtes qui font partie de `UNION ALL` ne peut pas être placée entre parenthèses. [ORDER BY](order-by.md) et [LIMIT](limit.md) sont appliqués à des requêtes séparées, pas au résultat final. Si vous devez appliquer une conversion au résultat final, vous pouvez mettre toutes les requêtes avec `UNION ALL` dans une sous-requête dans la [FROM](from.md) clause. - -## Limitation {#limitations} - -Seulement `UNION ALL` est pris en charge. Régulier `UNION` (`UNION DISTINCT`) n'est pas pris en charge. Si vous avez besoin d' `UNION DISTINCT`, vous pouvez écrire `SELECT DISTINCT` à partir d'une sous-requête contenant `UNION ALL`. - -## Détails De Mise En Œuvre {#implementation-details} - -Requêtes qui font partie de `UNION ALL` peuvent être exécutées simultanément, et leurs résultats peuvent être mélangés ensemble. diff --git a/docs/fr/sql-reference/statements/select/where.md b/docs/fr/sql-reference/statements/select/where.md deleted file mode 100644 index a4d7bc5e87a..00000000000 --- a/docs/fr/sql-reference/statements/select/where.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# Clause where {#select-where} - -`WHERE` clause permet de filtrer les données en provenance de [FROM](from.md) la clause de `SELECT`. - -Si il y a un `WHERE` , il doit contenir une expression avec la `UInt8` type. C'est généralement une expression avec comparaison et opérateurs logiques. Les lignes où cette expression est évaluée à 0 sont exclues des transformations ou des résultats ultérieurs. - -`WHERE` expression est évaluée sur la possibilité d'utiliser des index et l'élagage de partition, si le moteur de table sous-jacent le prend en charge. - -!!! note "Note" - Il y a une optimisation de filtrage appelée [prewhere](prewhere.md). 
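Par exemple, une esquisse simple de filtrage (la table `visits` et ses colonnes sont hypothétiques):

``` sql
SELECT *
FROM visits
WHERE CounterID = 34 AND EventDate >= '2020-01-01'
```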
diff --git a/docs/fr/sql-reference/statements/select/with.md b/docs/fr/sql-reference/statements/select/with.md deleted file mode 100644 index a42aedf460b..00000000000 --- a/docs/fr/sql-reference/statements/select/with.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd ---- - -# AVEC la Clause {#with-clause} - -Cette section prend en charge les Expressions de Table courantes ([CTE](https://en.wikipedia.org/wiki/Hierarchical_and_recursive_queries_in_SQL)), de sorte que les résultats de `WITH` la clause peut être utilisé à l'intérieur `SELECT` clause. - -## Limitation {#limitations} - -1. Les requêtes récursives ne sont pas prises en charge. -2. Lorsque la sous-requête est utilisée à l'intérieur avec section, son résultat doit être scalaire avec exactement une ligne. -3. Les résultats d'Expression ne sont pas disponibles dans les sous-requêtes. - -## Exemple {#examples} - -**Exemple 1:** Utilisation d'une expression constante comme “variable” - -``` sql -WITH '2019-08-01 15:23:00' as ts_upper_bound -SELECT * -FROM hits -WHERE - EventDate = toDate(ts_upper_bound) AND - EventTime <= ts_upper_bound -``` - -**Exemple 2:** De les expulser, somme(octets) résultat de l'expression de clause SELECT de la liste de colonnes - -``` sql -WITH sum(bytes) as s -SELECT - formatReadableSize(s), - table -FROM system.parts -GROUP BY table -ORDER BY s -``` - -**Exemple 3:** Utilisation des résultats de la sous-requête scalaire - -``` sql -/* this example would return TOP 10 of most huge tables */ -WITH - ( - SELECT sum(bytes) - FROM system.parts - WHERE active - ) AS total_disk_usage -SELECT - (sum(bytes) / total_disk_usage) * 100 AS table_disk_usage, - table -FROM system.parts -GROUP BY table -ORDER BY table_disk_usage DESC -LIMIT 10 -``` - -**Exemple 4:** Réutilisation de l'expression dans la sous-requête - -Comme solution de contournement pour la limitation actuelle de l'utilisation de l'expression dans les sous-requêtes, Vous pouvez la dupliquer. - -``` sql -WITH ['hello'] AS hello -SELECT - hello, - * -FROM -( - WITH ['hello'] AS hello - SELECT hello -) -``` - -``` text -┌─hello─────┬─hello─────┐ -│ ['hello'] │ ['hello'] │ -└───────────┴───────────┘ -``` diff --git a/docs/fr/sql-reference/statements/show.md b/docs/fr/sql-reference/statements/show.md deleted file mode 100644 index 129c6e30d1c..00000000000 --- a/docs/fr/sql-reference/statements/show.md +++ /dev/null @@ -1,169 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: SHOW ---- - -# Afficher les requêtes {#show-queries} - -## SHOW CREATE TABLE {#show-create-table} - -``` sql -SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format] -``` - -Renvoie un seul `String`-type ‘statement’ column, which contains a single value – the `CREATE` requête utilisée pour créer l'objet spécifié. - -## SHOW DATABASES {#show-databases} - -``` sql -SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] -``` - -Imprime une liste de toutes les bases de données. -Cette requête est identique à `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. 
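Par exemple (la sortie dépend des bases présentes sur le serveur; les bases `system` et `default` sont généralement présentes):

``` sql
SHOW DATABASES
```

``` text
┌─name────┐
│ default │
│ system  │
└─────────┘
```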
- -## SHOW PROCESSLIST {#show-processlist} - -``` sql -SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] -``` - -Sorties le contenu de la [système.processus](../../operations/system-tables.md#system_tables-processes) table, qui contient une liste de requêtes en cours de traitement en ce moment, à l'exception `SHOW PROCESSLIST` requête. - -Le `SELECT * FROM system.processes` requête renvoie des données sur toutes les requêtes en cours. - -Astuce (exécuter dans la console): - -``` bash -$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" -``` - -## SHOW TABLES {#show-tables} - -Affiche une liste de tableaux. - -``` sql -SHOW [TEMPORARY] TABLES [{FROM | IN} ] [LIKE '' | WHERE expr] [LIMIT ] [INTO OUTFILE ] [FORMAT ] -``` - -Si l' `FROM` la clause n'est pas spécifié, la requête renvoie la liste des tables de la base de données actuelle. - -Vous pouvez obtenir les mêmes résultats que l' `SHOW TABLES` requête de la façon suivante: - -``` sql -SELECT name FROM system.tables WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] -``` - -**Exemple** - -La requête suivante sélectionne les deux premières lignes de la liste des tables `system` base de données, dont les noms contiennent `co`. - -``` sql -SHOW TABLES FROM system LIKE '%co%' LIMIT 2 -``` - -``` text -┌─name───────────────────────────┐ -│ aggregate_function_combinators │ -│ collations │ -└────────────────────────────────┘ -``` - -## SHOW DICTIONARIES {#show-dictionaries} - -Affiche une liste de [dictionnaires externes](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). - -``` sql -SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] -``` - -Si l' `FROM` la clause n'est pas spécifié, la requête retourne la liste des dictionnaires de la base de données actuelle. - -Vous pouvez obtenir les mêmes résultats que l' `SHOW DICTIONARIES` requête de la façon suivante: - -``` sql -SELECT name FROM system.dictionaries WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] -``` - -**Exemple** - -La requête suivante sélectionne les deux premières lignes de la liste des tables `system` base de données, dont les noms contiennent `reg`. - -``` sql -SHOW DICTIONARIES FROM db LIKE '%reg%' LIMIT 2 -``` - -``` text -┌─name─────────┐ -│ regions │ -│ region_names │ -└──────────────┘ -``` - -## SHOW GRANTS {#show-grants-statement} - -Montre les privilèges d'un utilisateur. - -### Syntaxe {#show-grants-syntax} - -``` sql -SHOW GRANTS [FOR user] -``` - -Si l'utilisateur n'est pas spécifié, la requête renvoie les privilèges de l'utilisateur actuel. - -## SHOW CREATE USER {#show-create-user-statement} - -Affiche les paramètres qui ont été utilisés [la création d'un utilisateur](create.md#create-user-statement). - -`SHOW CREATE USER` ne produit pas de mots de passe utilisateur. 
- -### Syntaxe {#show-create-user-syntax} - -``` sql -SHOW CREATE USER [name | CURRENT_USER] -``` - -## SHOW CREATE ROLE {#show-create-role-statement} - -Affiche les paramètres qui ont été utilisés [la création de rôle](create.md#create-role-statement) - -### Syntaxe {#show-create-role-syntax} - -``` sql -SHOW CREATE ROLE name -``` - -## SHOW CREATE ROW POLICY {#show-create-row-policy-statement} - -Affiche les paramètres qui ont été utilisés [création de stratégie de ligne](create.md#create-row-policy-statement) - -### Syntaxe {#show-create-row-policy-syntax} - -``` sql -SHOW CREATE [ROW] POLICY name ON [database.]table -``` - -## SHOW CREATE QUOTA {#show-create-quota-statement} - -Affiche les paramètres qui ont été utilisés [quota de création](create.md#create-quota-statement) - -### Syntaxe {#show-create-row-policy-syntax} - -``` sql -SHOW CREATE QUOTA [name | CURRENT] -``` - -## SHOW CREATE SETTINGS PROFILE {#show-create-settings-profile-statement} - -Affiche les paramètres qui ont été utilisés [configuration création de profil](create.md#create-settings-profile-statement) - -### Syntaxe {#show-create-row-policy-syntax} - -``` sql -SHOW CREATE [SETTINGS] PROFILE name -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/show/) diff --git a/docs/fr/sql-reference/statements/system.md b/docs/fr/sql-reference/statements/system.md deleted file mode 100644 index e8c9ed85cbc..00000000000 --- a/docs/fr/sql-reference/statements/system.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: SYSTEM ---- - -# SYSTÈME de Requêtes {#query-language-system} - -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) - -## RELOAD DICTIONARIES {#query_language-system-reload-dictionaries} - -Recharge tous les dictionnaires qui ont déjà été chargés avec succès. -Par défaut, les dictionnaires sont chargés paresseusement (voir [dictionaries_lazy_load](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load)), donc au lieu d'être chargés automatiquement au démarrage, ils sont initialisés lors du premier accès via la fonction dictGet ou sélectionnez dans les tables avec ENGINE = Dictionary . Le `SYSTEM RELOAD DICTIONARIES` query recharge ces dictionnaires (chargés). -Retourne toujours `Ok.` quel que soit le résultat de la mise à jour du dictionnaire. - -## Recharger le dictionnaire Dictionary_name {#query_language-system-reload-dictionary} - -Recharge complètement un dictionnaire `dictionary_name`, quel que soit l'état du dictionnaire (LOADED / NOT_LOADED / FAILED). -Retourne toujours `Ok.` quel que soit le résultat de la mise à jour du dictionnaire. 
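Par exemple (le nom de dictionnaire `my_dictionary` est hypothétique):

``` sql
SYSTEM RELOAD DICTIONARY my_dictionary
```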
-L'état du dictionnaire peut être vérifié en interrogeant le `system.dictionaries` table. - -``` sql -SELECT name, status FROM system.dictionaries; -``` - -## DROP DNS CACHE {#query_language-system-drop-dns-cache} - -Réinitialise le cache DNS interne de ClickHouse. Parfois (pour les anciennes versions de ClickHouse), il est nécessaire d'utiliser cette commande lors de la modification de l'infrastructure (modification de l'adresse IP d'un autre serveur ClickHouse ou du serveur utilisé par les dictionnaires). - -Pour une gestion du cache plus pratique (automatique), voir paramètres disable_internal_dns_cache, dns_cache_update_period. - -## DROP MARK CACHE {#query_language-system-drop-mark-cache} - -Réinitialise le cache de marque. Utilisé dans le développement de ClickHouse et des tests de performance. - -## FLUSH LOGS {#query_language-system-flush_logs} - -Flushes buffers of log messages to system tables (e.g. system.query_log). Allows you to not wait 7.5 seconds when debugging. - -## RELOAD CONFIG {#query_language-system-reload-config} - -Recharge la configuration de ClickHouse. Utilisé lorsque la configuration est stockée dans ZooKeeeper. - -## SHUTDOWN {#query_language-system-shutdown} - -Normalement ferme ClickHouse (comme `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) - -## KILL {#query_language-system-kill} - -Annule le processus de ClickHouse (comme `kill -9 {$ pid_clickhouse-server}`) - -## Gestion Des Tables Distribuées {#query-language-system-distributed} - -ClickHouse peut gérer [distribué](../../engines/table-engines/special/distributed.md) table. Lorsqu'un utilisateur insère des données dans ces tables, ClickHouse crée d'abord une file d'attente des données qui doivent être envoyées aux nœuds de cluster, puis l'envoie de manière asynchrone. Vous pouvez gérer le traitement des files d'attente avec [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUTED](#query_language-system-flush-distributed), et [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) requête. Vous pouvez également insérer de manière synchrone des données distribuées avec `insert_distributed_sync` paramètre. - -### STOP DISTRIBUTED SENDS {#query_language-system-stop-distributed-sends} - -Désactive la distribution de données en arrière-plan lors de l'insertion de données dans des tables distribuées. - -``` sql -SYSTEM STOP DISTRIBUTED SENDS [db.] -``` - -### FLUSH DISTRIBUTED {#query_language-system-flush-distributed} - -Force ClickHouse à envoyer des données aux nœuds de cluster de manière synchrone. Si des nœuds ne sont pas disponibles, ClickHouse lève une exception et arrête l'exécution de la requête. Vous pouvez réessayer la requête jusqu'à ce qu'elle réussisse, ce qui se produira lorsque tous les nœuds seront de nouveau en ligne. - -``` sql -SYSTEM FLUSH DISTRIBUTED [db.] -``` - -### START DISTRIBUTED SENDS {#query_language-system-start-distributed-sends} - -Active la distribution de données en arrière-plan lors de l'insertion de données dans des tables distribuées. - -``` sql -SYSTEM START DISTRIBUTED SENDS [db.] -``` - -### STOP MERGES {#query_language-system-stop-merges} - -Offre la possibilité d'arrêter les fusions d'arrière-plan pour les tables de la famille MergeTree: - -``` sql -SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] -``` - -!!! 
note "Note" - `DETACH / ATTACH` table va commencer les fusions d'arrière-plan pour la table même dans le cas où les fusions ont été arrêtées pour toutes les tables MergeTree auparavant. - -### START MERGES {#query_language-system-start-merges} - -Offre la possibilité de démarrer des fusions en arrière-plan pour les tables de la famille MergeTree: - -``` sql -SYSTEM START MERGES [[db.]merge_tree_family_table_name] -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/system/) diff --git a/docs/fr/sql-reference/syntax.md b/docs/fr/sql-reference/syntax.md deleted file mode 100644 index b8b24c9bbb5..00000000000 --- a/docs/fr/sql-reference/syntax.md +++ /dev/null @@ -1,187 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 31 -toc_title: Syntaxe ---- - -# Syntaxe {#syntax} - -Il existe deux types d'analyseurs dans le système: L'analyseur SQL complet (un analyseur de descente récursif) et l'analyseur de format de données (un analyseur de flux rapide). -Dans tous les cas à l'exception de la `INSERT` requête, seul L'analyseur SQL complet est utilisé. -Le `INSERT` requête utilise les deux analyseurs: - -``` sql -INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') -``` - -Le `INSERT INTO t VALUES` fragment est analysé par l'analyseur complet, et les données `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` est analysé par l'analyseur de flux rapide. Vous pouvez également activer l'analyseur complet pour les données à l'aide de la [input_format_values_interpret_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) paramètre. Lorsque `input_format_values_interpret_expressions = 1`, ClickHouse essaie d'abord d'analyser les valeurs avec l'analyseur de flux rapide. S'il échoue, ClickHouse essaie d'utiliser l'analyseur complet pour les données, en le traitant comme un SQL [expression](#syntax-expressions). - -Les données peuvent avoir n'importe quel format. Lorsqu'une requête est reçue, le serveur calcule pas plus que [max_query_size](../operations/settings/settings.md#settings-max_query_size) octets de la requête en RAM (par défaut, 1 Mo), et le reste est analysé en flux. -Il permet d'éviter les problèmes avec de grandes `INSERT` requête. - -Lors de l'utilisation de la `Values` format dans un `INSERT` de la requête, il peut sembler que les données sont analysées de même que les expressions dans un `SELECT` requête, mais ce n'est pas vrai. Le `Values` le format est beaucoup plus limitée. - -Le reste de cet article couvre l'analyseur complet. Pour plus d'informations sur les analyseurs de format, consultez [Format](../interfaces/formats.md) section. - -## Espace {#spaces} - -Il peut y avoir n'importe quel nombre de symboles d'espace entre les constructions syntaxiques (y compris le début et la fin d'une requête). Les symboles d'espace incluent l'espace, l'onglet, le saut de ligne, Le CR et le flux de formulaire. - -## Commentaire {#comments} - -ClickHouse prend en charge les commentaires de style SQL et de style C. -Les commentaires de style SQL commencent par `--` et continuer jusqu'à la fin de la ligne, un espace après `--` peut être omis. -C-style sont de `/*` de `*/`et peut être multiligne, les espaces ne sont pas requis non plus. - -## Mot {#syntax-keywords} - -Les mots clés sont insensibles à la casse lorsqu'ils correspondent à: - -- La norme SQL. Exemple, `SELECT`, `select` et `SeLeCt` sont toutes valides. 
-- Implémentation dans certains SGBD populaires (MySQL ou Postgres). Exemple, `DateTime` est le même que `datetime`. - -Si le nom du type de données est sensible à la casse peut être vérifié `system.data_type_families` table. - -Contrairement à SQL standard, tous les autres mots clés (y compris les noms de fonctions) sont **sensible à la casse**. - -Mots-clés ne sont pas réservés; ils sont traités comme tels que dans le contexte correspondant. Si vous utilisez [identificateur](#syntax-identifiers) avec le même nom que les mots-clés, placez-les entre guillemets doubles ou backticks. Par exemple, la requête `SELECT "FROM" FROM table_name` est valide si la table `table_name` a colonne avec le nom de `"FROM"`. - -## Identificateur {#syntax-identifiers} - -Les identificateurs sont: - -- Noms de Cluster, de base de données, de table, de partition et de colonne. -- Fonction. -- Types de données. -- [Expression des alias](#syntax-expression_aliases). - -Les identificateurs peuvent être cités ou non cités. Ce dernier est préféré. - -Non identificateurs doivent correspondre à l'expression régulière `^[a-zA-Z_][0-9a-zA-Z_]*$` et ne peut pas être égale à [mot](#syntax-keywords). Exemple: `x, _1, X_y__Z123_.` - -Si vous souhaitez utiliser les identifiants de la même manière que les mots-clés ou si vous souhaitez utiliser d'autres symboles dans les identifiants, citez-le en utilisant des guillemets doubles ou des backticks, par exemple, `"id"`, `` `id` ``. - -## Littéral {#literals} - -Il y a numérique, chaîne de caractères, composé, et `NULL` littéral. - -### Numérique {#numeric} - -Littéral numérique tente d'être analysé: - -- Tout d'abord, comme un nombre signé 64 bits, en utilisant le [strtoull](https://en.cppreference.com/w/cpp/string/byte/strtoul) fonction. -- En cas d'échec, en tant que nombre non signé 64 bits, [strtoll](https://en.cppreference.com/w/cpp/string/byte/strtol) fonction. -- En cas d'échec, en tant que nombre à virgule flottante [strtod](https://en.cppreference.com/w/cpp/string/byte/strtof) fonction. -- Sinon, elle renvoie une erreur. - -La valeur littérale a le plus petit type dans lequel la valeur correspond. -Par exemple, 1 est analysé comme `UInt8`, mais 256 est analysé comme `UInt16`. Pour plus d'informations, voir [Types de données](../sql-reference/data-types/index.md). - -Exemple: `1`, `18446744073709551615`, `0xDEADBEEF`, `01`, `0.1`, `1e100`, `-1e-100`, `inf`, `nan`. - -### Chaîne {#syntax-string-literal} - -Seuls les littéraux de chaîne entre guillemets simples sont pris en charge. Le clos de caractères barre oblique inverse échappé. Les séquences d'échappement suivantes ont une valeur spéciale correspondante: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\a`, `\v`, `\xHH`. Dans tous les autres cas, des séquences d'échappement au format `\c`, où `c` est un caractère, sont convertis à `c`. Cela signifie que vous pouvez utiliser les séquences `\'`et`\\`. La valeur aurez l' [Chaîne](../sql-reference/data-types/string.md) type. - -Dans les littéraux de chaîne, vous devez vous échapper d'au moins `'` et `\`. Les guillemets simples peuvent être échappés avec le guillemet simple, littéraux `'It\'s'` et `'It''s'` sont égaux. - -### Composé {#compound} - -Les tableaux sont construits avec des crochets `[1, 2, 3]`. Nuples sont construits avec des supports ronds `(1, 'Hello, world!', 2)`. -Techniquement, ce ne sont pas des littéraux, mais des expressions avec l'opérateur de création de tableau et l'opérateur de création de tuple, respectivement. 
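Par exemple, une esquisse rapide:

``` sql
SELECT [1, 2, 3] AS arr, (1, 'Hello, world!', 2) AS tpl
```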
-Un tableau doit être composé d'au moins un élément, et un tuple doit avoir au moins deux éléments. -Il y a un cas distinct lorsque les tuples apparaissent dans le `IN` clause de a `SELECT` requête. Les résultats de la requête peuvent inclure des tuples, mais les tuples ne peuvent pas être enregistrés dans une base de données (à l'exception des tables avec [Mémoire](../engines/table-engines/special/memory.md) moteur). - -### NULL {#null-literal} - -Indique que la valeur est manquante. - -Afin de stocker `NULL` dans un champ de table, il doit être de la [Nullable](../sql-reference/data-types/nullable.md) type. - -Selon le format de données (entrée ou sortie), `NULL` peut avoir une représentation différente. Pour plus d'informations, consultez la documentation de [formats de données](../interfaces/formats.md#formats). - -Il y a beaucoup de nuances au traitement `NULL`. Par exemple, si au moins l'un des arguments d'une opération de comparaison est `NULL` le résultat de cette opération est également `NULL`. Il en va de même pour la multiplication, l'addition et d'autres opérations. Pour plus d'informations, lisez la documentation pour chaque opération. - -Dans les requêtes, vous pouvez vérifier `NULL` à l'aide de la [IS NULL](operators/index.md#operator-is-null) et [IS NOT NULL](operators/index.md) opérateurs et les fonctions connexes `isNull` et `isNotNull`. - -## Fonction {#functions} - -Les appels de fonction sont écrits comme un identifiant avec une liste d'arguments (éventuellement vide) entre parenthèses. Contrairement à SQL standard, les crochets sont requis, même pour une liste d'arguments vide. Exemple: `now()`. -Il existe des fonctions régulières et agrégées (voir la section “Aggregate functions”). Certaines fonctions d'agrégat peut contenir deux listes d'arguments entre parenthèses. Exemple: `quantile (0.9) (x)`. Ces fonctions d'agrégation sont appelés “parametric” fonctions, et les arguments dans la première liste sont appelés “parameters”. La syntaxe des fonctions d'agrégation sans paramètres est la même que pour les fonctions régulières. - -## Opérateur {#operators} - -Les opérateurs sont convertis en leurs fonctions correspondantes lors de l'analyse des requêtes, en tenant compte de leur priorité et de leur associativité. -Par exemple, l'expression `1 + 2 * 3 + 4` est transformé à `plus(plus(1, multiply(2, 3)), 4)`. - -## Types de données et moteurs de Table de base de données {#data_types-and-database-table-engines} - -Types de données et moteurs de table dans `CREATE` les requêtes sont écrites de la même manière que les identifiants ou les fonctions. En d'autres termes, ils peuvent ou ne peuvent pas contenir une liste d'arguments entre parenthèses. Pour plus d'informations, voir les sections “Data types,” “Table engines,” et “CREATE”. - -## Expression Des Alias {#syntax-expression_aliases} - -Un alias est un nom défini par l'utilisateur pour l'expression dans une requête. - -``` sql -expr AS alias -``` - -- `AS` — The keyword for defining aliases. You can define the alias for a table name or a column name in a `SELECT` clause sans utiliser le `AS` mot. - - For example, `SELECT table_name_alias.column_name FROM table_name table_name_alias`. - - In the [CAST](sql_reference/functions/type_conversion_functions.md#type_conversion_function-cast) function, the `AS` keyword has another meaning. See the description of the function. - -- `expr` — Any expression supported by ClickHouse. - - For example, `SELECT column_name * 2 AS double FROM some_table`. - -- `alias` — Name for `expr`. 
Les alias doivent être conformes à la [identificateur](#syntax-identifiers) syntaxe. - - For example, `SELECT "table t".column_name FROM table_name AS "table t"`. - -### Notes sur l'Utilisation de la {#notes-on-usage} - -Les alias sont globaux pour une requête ou d'une sous-requête, vous pouvez définir un alias dans n'importe quelle partie d'une requête de toute expression. Exemple, `SELECT (1 AS n) + 2, n`. - -Les alias ne sont pas visibles dans les sous-requêtes et entre les sous-requêtes. Par exemple, lors de l'exécution de la requête `SELECT (SELECT sum(b.a) + num FROM b) - a.a AS num FROM a` Clickhouse génère l'exception `Unknown identifier: num`. - -Si un alias est défini pour les colonnes de `SELECT` la clause d'une sous-requête, ces colonnes sont visibles dans la requête externe. Exemple, `SELECT n + m FROM (SELECT 1 AS n, 2 AS m)`. - -Soyez prudent avec les Alias qui sont les mêmes que les noms de colonnes ou de tables. Considérons l'exemple suivant: - -``` sql -CREATE TABLE t -( - a Int, - b Int -) -ENGINE = TinyLog() -``` - -``` sql -SELECT - argMax(a, b), - sum(b) AS b -FROM t -``` - -``` text -Received exception from server (version 18.14.17): -Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: Aggregate function sum(b) is found inside another aggregate function in query. -``` - -Dans cet exemple, nous avons déclaré table `t` avec la colonne `b`. Ensuite, lors de la sélection des données, nous avons défini le `sum(b) AS b` alias. Comme les alias sont globaux, ClickHouse a substitué le littéral `b` dans l'expression `argMax(a, b)` avec l'expression `sum(b)`. Cette substitution a provoqué l'exception. - -## Astérisque {#asterisk} - -Dans un `SELECT` requête, un astérisque peut remplacer l'expression. Pour plus d'informations, consultez la section “SELECT”. - -## Expression {#syntax-expressions} - -Une expression est une fonction, un identifiant, un littéral, une application d'un opérateur, une expression entre parenthèses, une sous-requête ou un astérisque. Il peut également contenir un alias. -Une liste des expressions est une ou plusieurs expressions séparées par des virgules. -Les fonctions et les opérateurs, à leur tour, peuvent avoir des expressions comme arguments. - -[Article Original](https://clickhouse.tech/docs/en/sql_reference/syntax/) diff --git a/docs/fr/sql-reference/table-functions/file.md b/docs/fr/sql-reference/table-functions/file.md deleted file mode 100644 index a58821d021d..00000000000 --- a/docs/fr/sql-reference/table-functions/file.md +++ /dev/null @@ -1,121 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 37 -toc_title: fichier ---- - -# fichier {#file} - -Crée un tableau à partir d'un fichier. Cette fonction de table est similaire à [URL](url.md) et [hdfs](hdfs.md) ceux. - -``` sql -file(path, format, structure) -``` - -**Les paramètres d'entrée** - -- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Chemin d'accès à la prise en charge des fichiers suivant les globs en mode Lecture seule: `*`, `?`, `{abc,def}` et `{N..M}` où `N`, `M` — numbers, \``'abc', 'def'` — strings. -- `format` — The [format](../../interfaces/formats.md#formats) de le fichier. -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. 
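For orientation before the detailed examples below, a minimal invocation assembled from these three parameters might look like the following sketch (the file name and column layout are invented for illustration; the path is resolved relative to `user_files_path`):

``` sql
-- Hypothetical CSV file read with an explicit schema; first rows only.
SELECT *
FROM file('data/events.csv', 'CSV', 'event_date Date, user_id UInt64, value Float64')
LIMIT 5
```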
- -**Valeur renvoyée** - -Une table avec la structure spécifiée pour lire ou écrire des données dans le fichier spécifié. - -**Exemple** - -Paramètre `user_files_path` et le contenu du fichier `test.csv`: - -``` bash -$ grep user_files_path /etc/clickhouse-server/config.xml - /var/lib/clickhouse/user_files/ - -$ cat /var/lib/clickhouse/user_files/test.csv - 1,2,3 - 3,2,1 - 78,43,45 -``` - -Table de`test.csv` et la sélection des deux premières lignes de ce: - -``` sql -SELECT * -FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 -``` - -``` text -┌─column1─┬─column2─┬─column3─┐ -│ 1 │ 2 │ 3 │ -│ 3 │ 2 │ 1 │ -└─────────┴─────────┴─────────┘ -``` - -``` sql --- getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file -SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 -``` - -**Globs dans le chemin** - -Plusieurs composants de chemin peuvent avoir des globs. Pour être traité, le fichier doit exister et correspondre à l'ensemble du modèle de chemin (pas seulement le suffixe ou le préfixe). - -- `*` — Substitutes any number of any characters except `/` y compris la chaîne vide. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. - -Les Constructions avec `{}` sont similaires à l' [fonction de table à distance](../../sql-reference/table-functions/remote.md)). - -**Exemple** - -1. Supposons que nous ayons plusieurs fichiers avec les chemins relatifs suivants: - -- ‘some_dir/some_file_1’ -- ‘some_dir/some_file_2’ -- ‘some_dir/some_file_3’ -- ‘another_dir/some_file_1’ -- ‘another_dir/some_file_2’ -- ‘another_dir/some_file_3’ - -1. Interroger la quantité de lignes dans ces fichiers: - - - -``` sql -SELECT count(*) -FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') -``` - -1. Requête de la quantité de lignes dans tous les fichiers de ces deux répertoires: - - - -``` sql -SELECT count(*) -FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') -``` - -!!! warning "Avertissement" - Si votre liste de fichiers contient des plages de nombres avec des zéros en tête, utilisez la construction avec des accolades pour chaque chiffre séparément ou utilisez `?`. - -**Exemple** - -Interroger les données des fichiers nommés `file000`, `file001`, … , `file999`: - -``` sql -SELECT count(*) -FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') -``` - -## Les Colonnes Virtuelles {#virtual-columns} - -- `_path` — Path to the file. -- `_file` — Name of the file. - -**Voir Aussi** - -- [Les colonnes virtuelles](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/file/) diff --git a/docs/fr/sql-reference/table-functions/generate.md b/docs/fr/sql-reference/table-functions/generate.md deleted file mode 100644 index 1f7eeddd0e1..00000000000 --- a/docs/fr/sql-reference/table-functions/generate.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 47 -toc_title: generateRandom ---- - -# generateRandom {#generaterandom} - -Génère des données aléatoires avec un schéma donné. 
-Permet de remplir des tables de test avec des données. -Prend en charge tous les types de données qui peuvent être stockés dans la table sauf `LowCardinality` et `AggregateFunction`. - -``` sql -generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); -``` - -**Paramètre** - -- `name` — Name of corresponding column. -- `TypeName` — Type of corresponding column. -- `max_array_length` — Maximum array length for all generated arrays. Defaults to `10`. -- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. -- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated. - -**Valeur Renvoyée** - -Un objet de table avec le schéma demandé. - -## Exemple D'Utilisation {#usage-example} - -``` sql -SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)', 1, 10, 2) LIMIT 3; -``` - -``` text -┌─a────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐ -│ [77] │ -124167.6723 │ ('2061-04-17 21:59:44.573','3f72f405-ec3e-13c8-44ca-66ef335f7835') │ -│ [32,110] │ -141397.7312 │ ('1979-02-09 03:43:48.526','982486d1-5a5d-a308-e525-7bd8b80ffa73') │ -│ [68] │ -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │ -└──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘ -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/generate/) diff --git a/docs/fr/sql-reference/table-functions/hdfs.md b/docs/fr/sql-reference/table-functions/hdfs.md deleted file mode 100644 index 51b742d8018..00000000000 --- a/docs/fr/sql-reference/table-functions/hdfs.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 45 -toc_title: hdfs ---- - -# hdfs {#hdfs} - -Crée une table à partir de fichiers dans HDFS. Cette fonction de table est similaire à [URL](url.md) et [fichier](file.md) ceux. - -``` sql -hdfs(URI, format, structure) -``` - -**Les paramètres d'entrée** - -- `URI` — The relative URI to the file in HDFS. Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` et `{N..M}` où `N`, `M` — numbers, \``'abc', 'def'` — strings. -- `format` — The [format](../../interfaces/formats.md#formats) de le fichier. -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. - -**Valeur renvoyée** - -Une table avec la structure spécifiée pour lire ou écrire des données dans le fichier spécifié. - -**Exemple** - -Table de `hdfs://hdfs1:9000/test` et la sélection des deux premières lignes de ce: - -``` sql -SELECT * -FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 -``` - -``` text -┌─column1─┬─column2─┬─column3─┐ -│ 1 │ 2 │ 3 │ -│ 3 │ 2 │ 1 │ -└─────────┴─────────┴─────────┘ -``` - -**Globs dans le chemin** - -Plusieurs composants de chemin peuvent avoir des globs. Pour être traité, le fichier doit exister et correspondre à l'ensemble du modèle de chemin (pas seulement le suffixe ou le préfixe). - -- `*` — Substitutes any number of any characters except `/` y compris la chaîne vide. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. 
-- `{N..M}` — Substitutes any number in range from N to M including both borders. - -Les Constructions avec `{}` sont similaires à l' [fonction de table à distance](../../sql-reference/table-functions/remote.md)). - -**Exemple** - -1. Supposons que nous ayons plusieurs fichiers avec les URI suivants sur HDFS: - -- ‘hdfs://hdfs1:9000/some_dir/some_file_1’ -- ‘hdfs://hdfs1:9000/some_dir/some_file_2’ -- ‘hdfs://hdfs1:9000/some_dir/some_file_3’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_1’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_2’ -- ‘hdfs://hdfs1:9000/another_dir/some_file_3’ - -1. Interroger la quantité de lignes dans ces fichiers: - - - -``` sql -SELECT count(*) -FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') -``` - -1. Requête de la quantité de lignes dans tous les fichiers de ces deux répertoires: - - - -``` sql -SELECT count(*) -FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') -``` - -!!! warning "Avertissement" - Si votre liste de fichiers contient des plages de nombres avec des zéros en tête, utilisez la construction avec des accolades pour chaque chiffre séparément ou utilisez `?`. - -**Exemple** - -Interroger les données des fichiers nommés `file000`, `file001`, … , `file999`: - -``` sql -SELECT count(*) -FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') -``` - -## Les Colonnes Virtuelles {#virtual-columns} - -- `_path` — Path to the file. -- `_file` — Name of the file. - -**Voir Aussi** - -- [Les colonnes virtuelles](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/hdfs/) diff --git a/docs/fr/sql-reference/table-functions/index.md b/docs/fr/sql-reference/table-functions/index.md deleted file mode 100644 index 89a8200e385..00000000000 --- a/docs/fr/sql-reference/table-functions/index.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Les Fonctions De Table -toc_priority: 34 -toc_title: Introduction ---- - -# Les Fonctions De Table {#table-functions} - -Les fonctions de Table sont des méthodes pour construire des tables. - -Vous pouvez utiliser les fonctions de table dans: - -- [FROM](../statements/select/from.md) la clause de la `SELECT` requête. - - The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. - -- [Créer une TABLE en tant que \< table_function ()\>](../statements/create.md#create-table-query) requête. - - It's one of the methods of creating a table. - -!!! warning "Avertissement" - Vous ne pouvez pas utiliser les fonctions de table si [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) paramètre est désactivé. - -| Fonction | Description | -|-----------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| [fichier](file.md) | Crée un [Fichier](../../engines/table-engines/special/file.md)-moteur de table. | -| [fusionner](merge.md) | Crée un [Fusionner](../../engines/table-engines/special/merge.md)-moteur de table. | -| [nombre](numbers.md) | Crée une table avec une seule colonne remplie de nombres entiers. 
| -| [distant](remote.md) | Vous permet d'accéder à des serveurs distants sans [Distribué](../../engines/table-engines/special/distributed.md)-moteur de table. | -| [URL](url.md) | Crée un [URL](../../engines/table-engines/special/url.md)-moteur de table. | -| [mysql](mysql.md) | Crée un [MySQL](../../engines/table-engines/integrations/mysql.md)-moteur de table. | -| [jdbc](jdbc.md) | Crée un [JDBC](../../engines/table-engines/integrations/jdbc.md)-moteur de table. | -| [ODBC](odbc.md) | Crée un [ODBC](../../engines/table-engines/integrations/odbc.md)-moteur de table. | -| [hdfs](hdfs.md) | Crée un [HDFS](../../engines/table-engines/integrations/hdfs.md)-moteur de table. | - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/) diff --git a/docs/fr/sql-reference/table-functions/input.md b/docs/fr/sql-reference/table-functions/input.md deleted file mode 100644 index 21e0eacb5c1..00000000000 --- a/docs/fr/sql-reference/table-functions/input.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 46 -toc_title: "entr\xE9e" ---- - -# entrée {#input} - -`input(structure)` - fonction de table qui permet effectivement convertir et insérer des données envoyées à la -serveur avec une structure donnée à la table avec une autre structure. - -`structure` - structure de données envoyées au serveur dans le format suivant `'column1_name column1_type, column2_name column2_type, ...'`. -Exemple, `'id UInt32, name String'`. - -Cette fonction peut être utilisée uniquement dans `INSERT SELECT` requête et une seule fois mais se comporte autrement comme une fonction de table ordinaire -(par exemple, il peut être utilisé dans la sous-requête, etc.). - -Les données peuvent être envoyées de quelque manière que ce soit comme pour ordinaire `INSERT` requête et passé dans tout disponible [format](../../interfaces/formats.md#formats) -qui doit être spécifié à la fin de la requête (contrairement à l'ordinaire `INSERT SELECT`). - -La caractéristique principale de cette fonction est que lorsque le serveur reçoit des données du client il les convertit simultanément -selon la liste des expressions dans le `SELECT` clause et insère dans la table cible. Table temporaire -avec toutes les données transférées n'est pas créé. - -**Exemple** - -- Laissez le `test` le tableau a la structure suivante `(a String, b String)` - et les données `data.csv` a une structure différente `(col1 String, col2 Date, col3 Int32)`. 
Requête pour insérer - les données de l' `data.csv` dans le `test` table avec conversion simultanée ressemble à ceci: - - - -``` bash -$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; -``` - -- Si `data.csv` contient les données de la même structure `test_structure` comme la table `test` puis ces deux requêtes sont égales: - - - -``` bash -$ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" -$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/input/) diff --git a/docs/fr/sql-reference/table-functions/jdbc.md b/docs/fr/sql-reference/table-functions/jdbc.md deleted file mode 100644 index 76dea0e0930..00000000000 --- a/docs/fr/sql-reference/table-functions/jdbc.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 43 -toc_title: jdbc ---- - -# jdbc {#table-function-jdbc} - -`jdbc(jdbc_connection_uri, schema, table)` - retourne la table qui est connectée via le pilote JDBC. - -Ce tableau fonction nécessite séparé `clickhouse-jdbc-bridge` programme en cours d'exécution. -Il prend en charge les types Nullable (basé sur DDL de la table distante qui est interrogée). - -**Exemple** - -``` sql -SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') -``` - -``` sql -SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') -``` - -``` sql -SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/jdbc/) diff --git a/docs/fr/sql-reference/table-functions/merge.md b/docs/fr/sql-reference/table-functions/merge.md deleted file mode 100644 index 1ec264b06bd..00000000000 --- a/docs/fr/sql-reference/table-functions/merge.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 38 -toc_title: fusionner ---- - -# fusionner {#merge} - -`merge(db_name, 'tables_regexp')` – Creates a temporary Merge table. For more information, see the section “Table engines, Merge”. - -La structure de la table est tirée de la première table rencontrée qui correspond à l'expression régulière. - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/merge/) diff --git a/docs/fr/sql-reference/table-functions/mysql.md b/docs/fr/sql-reference/table-functions/mysql.md deleted file mode 100644 index 295456914f0..00000000000 --- a/docs/fr/sql-reference/table-functions/mysql.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 42 -toc_title: mysql ---- - -# mysql {#mysql} - -Permettre `SELECT` requêtes à effectuer sur des données stockées sur un serveur MySQL distant. - -``` sql -mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); -``` - -**Paramètre** - -- `host:port` — MySQL server address. - -- `database` — Remote database name. - -- `table` — Remote table name. - -- `user` — MySQL user. - -- `password` — User password. - -- `replace_query` — Flag that converts `INSERT INTO` les requêtes de `REPLACE INTO`. Si `replace_query=1` la requête est remplacé. 
- -- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression qui est ajoutée à la `INSERT` requête. - - Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause. - - To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception. - -Simple `WHERE` des clauses telles que `=, !=, >, >=, <, <=` sont actuellement exécutés sur le serveur MySQL. - -Le reste des conditions et le `LIMIT` les contraintes d'échantillonnage sont exécutées dans ClickHouse uniquement après la fin de la requête à MySQL. - -**Valeur Renvoyée** - -Un objet table avec les mêmes colonnes que la table MySQL d'origine. - -## Exemple D'Utilisation {#usage-example} - -Table dans MySQL: - -``` text -mysql> CREATE TABLE `test`.`test` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, - -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from test; -+------+----------+-----+----------+ -| int_id | int_nullable | float | float_nullable | -+------+----------+-----+----------+ -| 1 | NULL | 2 | NULL | -+------+----------+-----+----------+ -1 row in set (0,00 sec) -``` - -Sélection des données de ClickHouse: - -``` sql -SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') -``` - -``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ -└────────┴──────────────┴───────┴────────────────┘ -``` - -## Voir Aussi {#see-also} - -- [Le ‘MySQL’ tableau moteur](../../engines/table-engines/integrations/mysql.md) -- [Utilisation de MySQL comme source de dictionnaire externe](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/mysql/) diff --git a/docs/fr/sql-reference/table-functions/numbers.md b/docs/fr/sql-reference/table-functions/numbers.md deleted file mode 100644 index 50a5ad61002..00000000000 --- a/docs/fr/sql-reference/table-functions/numbers.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 39 -toc_title: nombre ---- - -# nombre {#numbers} - -`numbers(N)` – Returns a table with the single ‘number’ colonne (UInt64) qui contient des entiers de 0 à n-1. -`numbers(N, M)` - Retourne un tableau avec le seul ‘number’ colonne (UInt64) qui contient des entiers de N À (N + M-1). - -Similaire à la `system.numbers` table, il peut être utilisé pour tester et générer des valeurs successives, `numbers(N, M)` plus efficace que `system.numbers`. 
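As a quick, self-contained illustration of generating successive values (the expression below is only an assumed example, not part of the original page):

``` sql
-- Sketch: derive the first five even numbers from the generated 'number' column.
SELECT number * 2 AS even FROM numbers(5)
```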
- -Les requêtes suivantes sont équivalentes: - -``` sql -SELECT * FROM numbers(10); -SELECT * FROM numbers(0, 10); -SELECT * FROM system.numbers LIMIT 10; -``` - -Exemple: - -``` sql --- Generate a sequence of dates from 2010-01-01 to 2010-12-31 -select toDate('2010-01-01') + number as d FROM numbers(365); -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/numbers/) diff --git a/docs/fr/sql-reference/table-functions/odbc.md b/docs/fr/sql-reference/table-functions/odbc.md deleted file mode 100644 index aae636a5eb2..00000000000 --- a/docs/fr/sql-reference/table-functions/odbc.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 44 -toc_title: ODBC ---- - -# ODBC {#table-functions-odbc} - -Renvoie la table connectée via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). - -``` sql -odbc(connection_settings, external_database, external_table) -``` - -Paramètre: - -- `connection_settings` — Name of the section with connection settings in the `odbc.ini` fichier. -- `external_database` — Name of a database in an external DBMS. -- `external_table` — Name of a table in the `external_database`. - -Pour implémenter en toute sécurité les connexions ODBC, ClickHouse utilise un programme distinct `clickhouse-odbc-bridge`. Si le pilote ODBC est chargé directement depuis `clickhouse-server`, les problèmes de pilote peuvent planter le serveur ClickHouse. Clickhouse démarre automatiquement `clickhouse-odbc-bridge` lorsque cela est nécessaire. Le programme ODBC bridge est installé à partir du même package que `clickhouse-server`. - -Les champs avec l' `NULL` les valeurs de la table externe sont converties en valeurs par défaut pour le type de données de base. Par exemple, si un champ de table MySQL distant a `INT NULL` type il est converti en 0 (la valeur par défaut pour ClickHouse `Int32` type de données). - -## Exemple D'Utilisation {#usage-example} - -**Obtenir des données de L'installation MySQL locale via ODBC** - -Cet exemple est vérifié pour Ubuntu Linux 18.04 et MySQL server 5.7. - -Assurez-vous que unixODBC et MySQL Connector sont installés. - -Par défaut (si installé à partir de paquets), ClickHouse démarre en tant qu'utilisateur `clickhouse`. Ainsi, vous devez créer et configurer cet utilisateur dans le serveur MySQL. - -``` bash -$ sudo mysql -``` - -``` sql -mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; -mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; -``` - -Puis configurez la connexion dans `/etc/odbc.ini`. - -``` bash -$ cat /etc/odbc.ini -[mysqlconn] -DRIVER = /usr/local/lib/libmyodbc5w.so -SERVER = 127.0.0.1 -PORT = 3306 -DATABASE = test -USERNAME = clickhouse -PASSWORD = clickhouse -``` - -Vous pouvez vérifier la connexion en utilisant le `isql` utilitaire de l'installation unixODBC. - -``` bash -$ isql -v mysqlconn -+-------------------------+ -| Connected! | -| | -... 
-``` - -Table dans MySQL: - -``` text -mysql> CREATE TABLE `test`.`test` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, - -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) - -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) - -mysql> select * from test; -+------+----------+-----+----------+ -| int_id | int_nullable | float | float_nullable | -+------+----------+-----+----------+ -| 1 | NULL | 2 | NULL | -+------+----------+-----+----------+ -1 row in set (0,00 sec) -``` - -Récupération des données de la table MySQL dans ClickHouse: - -``` sql -SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') -``` - -``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ 0 │ 2 │ 0 │ -└────────┴──────────────┴───────┴────────────────┘ -``` - -## Voir Aussi {#see-also} - -- [Dictionnaires externes ODBC](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) -- [Moteur de table ODBC](../../engines/table-engines/integrations/odbc.md). - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/jdbc/) diff --git a/docs/fr/sql-reference/table-functions/remote.md b/docs/fr/sql-reference/table-functions/remote.md deleted file mode 100644 index 380a9986116..00000000000 --- a/docs/fr/sql-reference/table-functions/remote.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 40 -toc_title: distant ---- - -# à distance, remoteSecure {#remote-remotesecure} - -Vous permet d'accéder à des serveurs distants sans `Distributed` table. - -Signature: - -``` sql -remote('addresses_expr', db, table[, 'user'[, 'password']]) -remote('addresses_expr', db.table[, 'user'[, 'password']]) -remoteSecure('addresses_expr', db, table[, 'user'[, 'password']]) -remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) -``` - -`addresses_expr` – An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port` ou juste `host`. L'hôte peut être spécifié comme nom de serveur ou l'adresse IPv4 ou IPv6. Une adresse IPv6 est indiquée entre crochets. Le port est le port TCP sur le serveur distant. Si le port est omis, il utilise `tcp_port` à partir du fichier de configuration du serveur (par défaut, 9000). - -!!! important "Important" - Le port est requis pour une adresse IPv6. - -Exemple: - -``` text -example01-01-1 -example01-01-1:9000 -localhost -127.0.0.1 -[::]:9000 -[2a02:6b8:0:1111::11]:9000 -``` - -Plusieurs adresses séparées par des virgules. Dans ce cas, ClickHouse utilisera le traitement distribué, donc il enverra la requête à toutes les adresses spécifiées (comme les fragments avec des données différentes). - -Exemple: - -``` text -example01-01-1,example01-02-1 -``` - -Une partie de l'expression peut être spécifiée entre crochets. L'exemple précédent peut être écrite comme suit: - -``` text -example01-0{1,2}-1 -``` - -Les accolades peuvent contenir une plage de Nombres séparés par deux points (entiers non négatifs). Dans ce cas, la gamme est étendue à un ensemble de valeurs qui génèrent fragment d'adresses. Si le premier nombre commence par zéro, les valeurs sont formées avec le même alignement zéro. 
L'exemple précédent peut être écrite comme suit: - -``` text -example01-{01..02}-1 -``` - -Si vous avez plusieurs paires d'accolades, il génère le produit direct des ensembles correspondants. - -Les adresses et les parties d'adresses entre crochets peuvent être séparées par le symbole de tuyau (\|). Dans ce cas, les ensembles correspondants de adresses sont interprétés comme des répliques, et la requête sera envoyée à la première sain réplique. Cependant, les répliques sont itérées dans l'ordre actuellement défini dans [équilibrage](../../operations/settings/settings.md) paramètre. - -Exemple: - -``` text -example01-{01..02}-{1|2} -``` - -Cet exemple spécifie deux fragments qui ont chacun deux répliques. - -Le nombre d'adresses générées est limitée par une constante. En ce moment, c'est 1000 adresses. - -À l'aide de la `remote` la fonction de table est moins optimale que la création d'un `Distributed` table, car dans ce cas, la connexion au serveur est rétablie pour chaque requête. En outre, si des noms d'hôte, les noms sont résolus, et les erreurs ne sont pas comptés lors de travail avec diverses répliques. Lors du traitement d'un grand nombre de requêtes, créez toujours `Distributed` table à l'avance, et ne pas utiliser la `remote` table de fonction. - -Le `remote` table de fonction peut être utile dans les cas suivants: - -- Accès à un serveur spécifique pour la comparaison de données, le débogage et les tests. -- Requêtes entre différents clusters ClickHouse à des fins de recherche. -- Demandes distribuées peu fréquentes qui sont faites manuellement. -- Distribué demandes où l'ensemble des serveurs est redéfinie à chaque fois. - -Si l'utilisateur n'est pas spécifié, `default` est utilisée. -Si le mot de passe n'est spécifié, un mot de passe vide est utilisé. - -`remoteSecure` - la même chose que `remote` but with secured connection. Default port — [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) de config ou 9440. - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/remote/) diff --git a/docs/fr/sql-reference/table-functions/url.md b/docs/fr/sql-reference/table-functions/url.md deleted file mode 100644 index 1df5cf55526..00000000000 --- a/docs/fr/sql-reference/table-functions/url.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 41 -toc_title: URL ---- - -# URL {#url} - -`url(URL, format, structure)` - retourne une table créée à partir du `URL` avec le -`format` et `structure`. - -URL-adresse du serveur HTTP ou HTTPS, qui peut accepter `GET` et/ou `POST` demande. - -format - [format](../../interfaces/formats.md#formats) des données. - -structure - structure de table dans `'UserID UInt64, Name String'` format. Détermine les noms et les types de colonnes. - -**Exemple** - -``` sql --- getting the first 3 lines of a table that contains columns of String and UInt32 type from HTTP-server which answers in CSV format. 
-SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 -``` - -[Article Original](https://clickhouse.tech/docs/en/query_language/table_functions/url/) diff --git a/docs/fr/whats-new/changelog/2017.md b/docs/fr/whats-new/changelog/2017.md deleted file mode 120000 index d581cbbb422..00000000000 --- a/docs/fr/whats-new/changelog/2017.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/whats-new/changelog/2017.md \ No newline at end of file diff --git a/docs/fr/whats-new/changelog/2018.md b/docs/fr/whats-new/changelog/2018.md deleted file mode 120000 index 22874fcae85..00000000000 --- a/docs/fr/whats-new/changelog/2018.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/whats-new/changelog/2018.md \ No newline at end of file diff --git a/docs/fr/whats-new/changelog/2019.md b/docs/fr/whats-new/changelog/2019.md deleted file mode 120000 index 0f3f095f8a1..00000000000 --- a/docs/fr/whats-new/changelog/2019.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/whats-new/changelog/2019.md \ No newline at end of file diff --git a/docs/fr/whats-new/changelog/index.md b/docs/fr/whats-new/changelog/index.md deleted file mode 120000 index 5461b93ec8c..00000000000 --- a/docs/fr/whats-new/changelog/index.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/whats-new/changelog/index.md \ No newline at end of file diff --git a/docs/fr/whats-new/index.md b/docs/fr/whats-new/index.md deleted file mode 100644 index 51a77da8ef4..00000000000 --- a/docs/fr/whats-new/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: Ce qui est Nouveau -toc_priority: 72 ---- - - diff --git a/docs/fr/whats-new/roadmap.md b/docs/fr/whats-new/roadmap.md deleted file mode 100644 index 87d64208f67..00000000000 --- a/docs/fr/whats-new/roadmap.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 74 -toc_title: Feuille de route ---- - -# Feuille de route {#roadmap} - -## Q1 2020 {#q1-2020} - -- Contrôle d'accès par rôle - -## Q2 2020 {#q2-2020} - -- Intégration avec les services d'authentification externes -- Pools de ressources pour une répartition plus précise de la capacité du cluster entre les utilisateurs - -{## [Article Original](https://clickhouse.tech/docs/en/roadmap/) ##} diff --git a/docs/fr/whats-new/security-changelog.md b/docs/fr/whats-new/security-changelog.md deleted file mode 100644 index 6046ef96bb2..00000000000 --- a/docs/fr/whats-new/security-changelog.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_priority: 76 -toc_title: "S\xE9curit\xE9 Changelog" ---- - -## Correction dans la version 19.14.3.3 de ClickHouse, 2019-09-10 {#fixed-in-clickhouse-release-19-14-3-3-2019-09-10} - -### CVE-2019-15024 {#cve-2019-15024} - -Аn attacker that has write access to ZooKeeper and who ican run a custom server available from the network where ClickHouse runs, can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from the malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. - -Crédits: Eldar Zaitov de L'équipe de sécurité de L'Information Yandex - -### CVE-2019-16535 {#cve-2019-16535} - -Аn OOB read, OOB write and integer underflow in decompression algorithms can be used to achieve RCE or DoS via native protocol. 
- -Crédits: Eldar Zaitov de L'équipe de sécurité de L'Information Yandex - -### CVE-2019-16536 {#cve-2019-16536} - -Le débordement de pile menant à DoS peut être déclenché par un client authentifié malveillant. - -Crédits: Eldar Zaitov de L'équipe de sécurité de L'Information Yandex - -## Correction de la version 19.13.6.1 de ClickHouse, 2019-09-20 {#fixed-in-clickhouse-release-19-13-6-1-2019-09-20} - -### CVE-2019-18657 {#cve-2019-18657} - -Fonction de Table `url` la vulnérabilité avait-elle permis à l'attaquant d'injecter des en-têtes HTTP arbitraires dans la requête. - -Crédit: [Nikita Tikhomirov](https://github.com/NSTikhomirov) - -## Correction dans la version ClickHouse 18.12.13, 2018-09-10 {#fixed-in-clickhouse-release-18-12-13-2018-09-10} - -### CVE-2018-14672 {#cve-2018-14672} - -Les fonctions de chargement des modèles CatBoost permettaient de parcourir les chemins et de lire des fichiers arbitraires via des messages d'erreur. - -Crédits: Andrey Krasichkov de L'équipe de sécurité de L'Information Yandex - -## Correction dans la version 18.10.3 de ClickHouse, 2018-08-13 {#fixed-in-clickhouse-release-18-10-3-2018-08-13} - -### CVE-2018-14671 {#cve-2018-14671} - -unixODBC a permis de charger des objets partagés arbitraires à partir du système de fichiers, ce qui a conduit à une vulnérabilité D'exécution de Code À Distance. - -Crédits: Andrey Krasichkov et Evgeny Sidorov de Yandex Information Security Team - -## Correction dans la version 1.1.54388 de ClickHouse, 2018-06-28 {#fixed-in-clickhouse-release-1-1-54388-2018-06-28} - -### CVE-2018-14668 {#cve-2018-14668} - -“remote” la fonction de table a permis des symboles arbitraires dans “user”, “password” et “default_database” champs qui ont conduit à des attaques de falsification de requêtes inter-protocoles. - -Crédits: Andrey Krasichkov de L'équipe de sécurité de L'Information Yandex - -## Correction dans la version 1.1.54390 de ClickHouse, 2018-07-06 {#fixed-in-clickhouse-release-1-1-54390-2018-07-06} - -### CVE-2018-14669 {#cve-2018-14669} - -Clickhouse client MySQL avait “LOAD DATA LOCAL INFILE” fonctionnalité activée permettant à une base de données MySQL malveillante de lire des fichiers arbitraires à partir du serveur clickhouse connecté. - -Crédits: Andrey Krasichkov et Evgeny Sidorov de Yandex Information Security Team - -## Correction dans la version 1.1.54131 de ClickHouse, 2017-01-10 {#fixed-in-clickhouse-release-1-1-54131-2017-01-10} - -### CVE-2018-14670 {#cve-2018-14670} - -Configuration incorrecte dans le paquet deb pourrait conduire à l'utilisation non autorisée de la base de données. 
- -Crédits: National Cyber Security Centre (NCSC) - -{## [Article Original](https://clickhouse.tech/docs/en/security_changelog/) ##} diff --git a/docs/ja/commercial/cloud.md b/docs/ja/commercial/cloud.md index 403b34d198c..84f58e46cdb 100644 --- a/docs/ja/commercial/cloud.md +++ b/docs/ja/commercial/cloud.md @@ -20,4 +20,16 @@ toc_title: "\u30AF\u30E9\u30A6\u30C9" - 暗号化と分離 - 自動メンテナンス +## Alibaba Cloud {#alibaba-cloud} + +ClickHouseのためのAlibaba Cloudの管理サービス [中国サイト](https://www.aliyun.com/product/clickhouse) (2021年5月に国際サイトで利用可能になります) 次の主な機能を提供します: + +- Alibaba Cloud Apsara分散システムをベースにした信頼性の高いクラウドディスクストレージエンジン +- 手動でのデータ移行を必要とせずに、オン・デマンドで容量を拡張 +- シングル・ノード、シングル・レプリカ、マルチ・ノード、マルチ・レプリカ・アーキテクチャをサポートし、ホット・データとコールド・データの階層化をサポート +- アクセスホワイトリスト、OneKey Recovery、マルチレイヤーネットワークセキュリティ保護、クラウドディスク暗号化をサポート +- クラウドログシステム、データベース、およびデータアプリケーションツールとのシームレスな統合 +- 組み込み型の監視およびデータベース管理プラットフォーム +- プロフェッショナルデータベースエキスパートによるテクニカル・サポートとサービス + {## [元の記事](https://clickhouse.tech/docs/en/commercial/cloud/) ##} diff --git a/docs/ja/sql-reference/aggregate-functions/reference.md b/docs/ja/sql-reference/aggregate-functions/reference.md index 465f36179da..c66e9b54746 100644 --- a/docs/ja/sql-reference/aggregate-functions/reference.md +++ b/docs/ja/sql-reference/aggregate-functions/reference.md @@ -624,7 +624,7 @@ uniqHLL12(x[, ...]) - HyperLogLogアルゴリズムを使用して、異なる引数値の数を近似します。 - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). + 2^12 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). 
- 決定的な結果を提供します(クエリ処理順序に依存しません)。 diff --git a/docs/ja/sql-reference/functions/bitmap-functions.md b/docs/ja/sql-reference/functions/bitmap-functions.md index cc57e762610..de3ce938444 100644 --- a/docs/ja/sql-reference/functions/bitmap-functions.md +++ b/docs/ja/sql-reference/functions/bitmap-functions.md @@ -35,7 +35,7 @@ SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) ``` text ┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ -│  │ AggregateFunction(groupBitmap, UInt8) │ +│ │ AggregateFunction(groupBitmap, UInt8) │ └─────┴──────────────────────────────────────────────┘ ``` diff --git a/docs/ja/sql-reference/functions/hash-functions.md b/docs/ja/sql-reference/functions/hash-functions.md index d48e6846bb4..a98ae60690d 100644 --- a/docs/ja/sql-reference/functions/hash-functions.md +++ b/docs/ja/sql-reference/functions/hash-functions.md @@ -434,13 +434,13 @@ A [FixedString(16)](../../sql-reference/data-types/fixedstring.md) データ型 **例** ``` sql -SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type +SELECT hex(murmurHash3_128('example_string')) AS MurmurHash3, toTypeName(MurmurHash3) AS type; ``` ``` text -┌─MurmurHash3──────┬─type────────────┐ -│ 6�1�4"S5KT�~~q │ FixedString(16) │ -└──────────────────┴─────────────────┘ +┌─MurmurHash3──────────────────────┬─type───┐ +│ 368A1A311CB7342253354B548E7E7E71 │ String │ +└──────────────────────────────────┴────────┘ ``` ## xxHash32,xxHash64 {#hash-functions-xxhash32} diff --git a/docs/ru/commercial/cloud.md b/docs/ru/commercial/cloud.md index 4f57592b4c7..e00fc3be673 100644 --- a/docs/ru/commercial/cloud.md +++ b/docs/ru/commercial/cloud.md @@ -1,6 +1,6 @@ --- toc_priority: 1 -toc_title: "\u041f\u043e\u0441\u0442\u0430\u0432\u0449\u0438\u043a\u0438\u0020\u043e\u0431\u043b\u0430\u0447\u043d\u044b\u0445\u0020\u0443\u0441\u043b\u0443\u0433\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Поставщики облачных услуг ClickHouse" --- # Поставщики облачных услуг ClickHouse {#clickhouse-cloud-service-providers} @@ -29,4 +29,30 @@ toc_title: "\u041f\u043e\u0441\u0442\u0430\u0432\u0449\u0438\u043a\u0438\u0020\u - cross-az масштабирование для повышения производительности и обеспечения высокой доступности - встроенный мониторинг и редактор SQL-запросов -{## [Оригинальная статья](https://clickhouse.tech/docs/ru/commercial/cloud/) ##} +## Alibaba Cloud {#alibaba-cloud} + +Управляемый облачный сервис Alibaba для ClickHouse: [китайская площадка](https://www.aliyun.com/product/clickhouse), будет доступен на международной площадке в мае 2021 года. Сервис предоставляет следующие возможности: + +- надежный сервер для облачного хранилища на основе распределенной системы [Alibaba Cloud Apsara](https://www.alibabacloud.com/product/apsara-stack); +- расширяемая по запросу емкость, без переноса данных вручную; +- поддержка одноузловой и многоузловой архитектуры, архитектуры с одной или несколькими репликами, а также многоуровневого хранения cold и hot data; +- поддержка прав доступа, one-key восстановления, многоуровневая защита сети, шифрование облачного диска; +- полная интеграция с облачными системами логирования, базами данных и инструментами обработки данных; +- встроенная платформа для мониторинга и управления базами данных; +- техническая поддержка от экспертов по работе с базами данных. 
+ +## SberCloud {#sbercloud} + +[Облачная платформа SberCloud.Advanced](https://sbercloud.ru/ru/advanced): + +- предоставляет более 50 высокотехнологичных сервисов; +- позволяет быстро создавать и эффективно управлять ИТ-инфраструктурой, приложениями и интернет-сервисами; +- радикально минимизирует ресурсы, требуемые для работы корпоративных ИТ-систем; +- в разы сокращает время вывода новых продуктов на рынок. + +SberCloud.Advanced предоставляет [MapReduce Service (MRS)](https://docs.sbercloud.ru/mrs/ug/topics/ug__clickhouse.html) — надежную, безопасную и простую в использовании платформу корпоративного уровня для хранения, обработки и анализа больших данных. MRS позволяет быстро создавать и управлять кластерами ClickHouse. + +- Инстанс ClickHouse состоит из трех узлов ZooKeeper и нескольких узлов ClickHouse. Выделенный режим реплики используется для обеспечения высокой надежности двойных копий данных. +- MRS предлагает возможности гибкого масштабирования при быстром росте сервисов в сценариях, когда емкости кластерного хранилища или вычислительных ресурсов процессора недостаточно. MRS в один клик предоставляет инструмент для балансировки данных при расширении узлов ClickHouse в кластере. Вы можете определить режим и время балансировки данных на основе характеристик сервиса, чтобы обеспечить доступность сервиса. +- MRS использует архитектуру развертывания высокой доступности на основе Elastic Load Balance (ELB) — сервиса для автоматического распределения трафика на несколько внутренних узлов. Благодаря ELB, данные записываются в локальные таблицы и считываются из распределенных таблиц на разных узлах. Такая архитектура повышает отказоустойчивость кластера и гарантирует высокую доступность приложений. + diff --git a/docs/ru/commercial/index.md b/docs/ru/commercial/index.md index c6c440c17e8..66b1b125823 100644 --- a/docs/ru/commercial/index.md +++ b/docs/ru/commercial/index.md @@ -1,9 +1,7 @@ --- -toc_folder_title: "\u041A\u043E\u043C\u043C\u0435\u0440\u0447\u0435\u0441\u043A\u0438\ - \u0435 \u0443\u0441\u043B\u0443\u0433\u0438" +toc_folder_title: "Коммерческие услуги" toc_priority: 70 -toc_title: "\u041A\u043E\u043C\u043C\u0435\u0440\u0447\u0435\u0441\u043A\u0438\u0435\ - \ \u0443\u0441\u043B\u0443\u0433\u0438" +toc_title: "Коммерческие услуги" --- # Коммерческие услуги {#clickhouse-commercial-services} diff --git a/docs/ru/development/architecture.md b/docs/ru/development/architecture.md index de8fba1bc4b..d2cfc44b711 100644 --- a/docs/ru/development/architecture.md +++ b/docs/ru/development/architecture.md @@ -1,6 +1,6 @@ --- toc_priority: 62 -toc_title: "\u041e\u0431\u0437\u043e\u0440\u0020\u0430\u0440\u0445\u0438\u0442\u0435\u043a\u0442\u0443\u0440\u044b\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Обзор архитектуры ClickHouse" --- # Обзор архитектуры ClickHouse {#overview-of-clickhouse-architecture} @@ -27,7 +27,7 @@ ClickHouse - полноценная колоночная СУБД. Данные `IColumn` предоставляет методы для общих реляционных преобразований данных, но они не отвечают всем потребностям. Например, `ColumnUInt64` не имеет метода для вычисления суммы двух столбцов, а `ColumnString` не имеет метода для запуска поиска по подстроке. Эти бесчисленные процедуры реализованы вне `IColumn`. -Различные функции на колонках могут быть реализованы обобщенным, неэффективным путем, используя `IColumn` методы для извлечения значений `Field`, или специальным путем, используя знания о внутреннем распределение данных в памяти в конкретной реализации `IColumn`. 
Для этого функции приводятся к конкретному типу `IColumn` и работают напрямую с его внутренним представлением. Например, в `ColumnUInt64` есть метод getData, который возвращает ссылку на внутренний массив, чтение и заполнение которого, выполняется отдельной процедурой напрямую. Фактически, мы имеем "дырявую абстракции", обеспечивающие эффективные специализации различных процедур. +Различные функции на колонках могут быть реализованы обобщенным, неэффективным путем, используя `IColumn` методы для извлечения значений `Field`, или специальным путем, используя знания о внутреннем распределение данных в памяти в конкретной реализации `IColumn`. Для этого функции приводятся к конкретному типу `IColumn` и работают напрямую с его внутренним представлением. Например, в `ColumnUInt64` есть метод `getData`, который возвращает ссылку на внутренний массив, чтение и заполнение которого, выполняется отдельной процедурой напрямую. Фактически, мы имеем "дырявые абстракции", обеспечивающие эффективные специализации различных процедур. ## Типы данных (Data Types) {#data_types} @@ -42,7 +42,7 @@ ClickHouse - полноценная колоночная СУБД. Данные ## Блоки (Block) {#block} -`Block` это контейнер, который представляет фрагмент (chunk) таблицы в памяти. Это набор троек - `(IColumn, IDataType, имя колонки)`. В процессе выполнения запроса, данные обрабатываются `Block`ами. Если у нас есть `Block`, значит у нас есть данные (в объекте `IColumn`), информация о типе (в `IDataType`), которая говорит нам, как работать с колонкой, и имя колонки (оригинальное имя колонки таблицы или служебное имя, присвоенное для получения промежуточных результатов вычислений). +`Block` это контейнер, который представляет фрагмент (chunk) таблицы в памяти. Это набор троек - `(IColumn, IDataType, имя колонки)`. В процессе выполнения запроса, данные обрабатываются `Block`-ами. Если у нас есть `Block`, значит у нас есть данные (в объекте `IColumn`), информация о типе (в `IDataType`), которая говорит нам, как работать с колонкой, и имя колонки (оригинальное имя колонки таблицы или служебное имя, присвоенное для получения промежуточных результатов вычислений). При вычислении некоторой функции на колонках в блоке мы добавляем еще одну колонку с результатами в блок, не трогая колонки аргументов функции, потому что операции иммутабельные. Позже ненужные колонки могут быть удалены из блока, но не модифицированы. Это удобно для устранения общих подвыражений. @@ -58,7 +58,7 @@ ClickHouse - полноценная колоночная СУБД. Данные 2. Реализацию форматов данных. Например, при выводе данных в терминал в формате `Pretty`, вы создаете выходной поток блоков, который форматирует поступающие в него блоки. 3. Трансформацию данных. Допустим, у вас есть `IBlockInputStream` и вы хотите создать отфильтрованный поток. Вы создаете `FilterBlockInputStream` и инициализируете его вашим потоком. Затем вы тянете (pull) блоки из `FilterBlockInputStream`, а он тянет блоки исходного потока, фильтрует их и возвращает отфильтрованные блоки вам. Таким образом построены конвейеры выполнения запросов. -Имеются и более сложные трансформации. Например, когда вы тянете блоки из `AggregatingBlockInputStream`, он считывает все данные из своего источника, агрегирует их, и возвращает поток агрегированных данных вам. Другой пример: конструктор `UnionBlockInputStream` принимает множество источников входных данных и число потоков. Такой `Stream` работает в несколько потоков и читает данные источников параллельно. +Имеются и более сложные трансформации. 
Например, когда вы тянете блоки из `AggregatingBlockInputStream`, он считывает все данные из своего источника, агрегирует их, и возвращает поток агрегированных данных вам. Другой пример: конструктор `UnionBlockInputStream` принимает множество источников входных данных и число потоков. Такой `Stream` работает в несколько потоков и читает данные источников параллельно. > Потоки блоков используют «втягивающий» (pull) подход к управлению потоком выполнения: когда вы вытягиваете блок из первого потока, он, следовательно, вытягивает необходимые блоки из вложенных потоков, так и работает весь конвейер выполнения. Ни «pull» ни «push» не имеют явного преимущества, потому что поток управления неявный, и это ограничивает в реализации различных функций, таких как одновременное выполнение нескольких запросов (слияние нескольких конвейеров вместе). Это ограничение можно преодолеть с помощью сопрограмм (coroutines) или просто запуском дополнительных потоков, которые ждут друг друга. У нас может быть больше возможностей, если мы сделаем поток управления явным: если мы локализуем логику для передачи данных из одной расчетной единицы в другую вне этих расчетных единиц. Читайте эту [статью](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) для углубленного изучения. @@ -110,9 +110,9 @@ ClickHouse - полноценная колоночная СУБД. Данные > Генераторы парсеров не используются по историческим причинам. ## Интерпретаторы {#interpreters} - + Интерпретаторы отвечают за создание конвейера выполнения запроса из `AST`. Есть простые интерпретаторы, такие как `InterpreterExistsQuery` и `InterpreterDropQuery` или более сложный `InterpreterSelectQuery`. Конвейер выполнения запроса представляет собой комбинацию входных и выходных потоков блоков. Например, результатом интерпретации `SELECT` запроса является `IBlockInputStream` для чтения результирующего набора данных; результат интерпретации `INSERT` запроса - это `IBlockOutputStream`, для записи данных, предназначенных для вставки; результат интерпретации `INSERT SELECT` запроса - это `IBlockInputStream`, который возвращает пустой результирующий набор при первом чтении, но копирует данные из `SELECT` к `INSERT`. - + `InterpreterSelectQuery` использует `ExpressionAnalyzer` и `ExpressionActions` механизмы для анализа запросов и преобразований. Именно здесь выполняется большинство оптимизаций запросов на основе правил. `ExpressionAnalyzer` написан довольно грязно и должен быть переписан: различные преобразования запросов и оптимизации должны быть извлечены в отдельные классы, чтобы позволить модульные преобразования или запросы. ## Функции {#functions} @@ -162,9 +162,9 @@ ClickHouse имеет сильную типизацию, поэтому нет Сервера в кластере в основном независимы. Вы можете создать `Распределенную` (`Distributed`) таблицу на одном или всех серверах в кластере. Такая таблица сама по себе не хранит данные - она только предоставляет возможность "просмотра" всех локальных таблиц на нескольких узлах кластера. При выполнении `SELECT` распределенная таблица переписывает запрос, выбирает удаленные узлы в соответствии с настройками балансировки нагрузки и отправляет им запрос. Распределенная таблица просит удаленные сервера обработать запрос до той стадии, когда промежуточные результаты с разных серверов могут быть объединены. Затем он получает промежуточные результаты и объединяет их. Распределенная таблица пытается возложить как можно больше работы на удаленные серверы и сократить объем промежуточных данных, передаваемых по сети. 
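A minimal sketch of what this looks like in SQL, assuming a cluster named `my_cluster` declared in the server configuration and the `default` database (all identifiers here are illustrative, not part of the original text):

``` sql
-- A local MergeTree table created on every node of the assumed cluster.
CREATE TABLE hits_local ON CLUSTER my_cluster
(
    event_date Date,
    user_id    UInt64
)
ENGINE = MergeTree()
ORDER BY (event_date, user_id);

-- A Distributed table that only routes queries to hits_local on the shards.
CREATE TABLE hits_all ON CLUSTER my_cluster AS hits_local
ENGINE = Distributed(my_cluster, default, hits_local, rand());
```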
-Ситуация усложняется, при использовании подзапросы в случае IN или JOIN, когда каждый из них использует таблицу `Distributed`. Есть разные стратегии для выполнения таких запросов. +Ситуация усложняется, при использовании подзапросов в случае `IN` или `JOIN`, когда каждый из них использует таблицу `Distributed`. Есть разные стратегии для выполнения таких запросов. -Глобального плана выполнения распределенных запросов не существует. Каждый узел имеет собственный локальный план для своей части работы. У нас есть простое однонаправленное выполнение распределенных запросов: мы отправляем запросы на удаленные узлы и затем объединяем результаты. Но это невозможно для сложных запросов GROUP BY высокой кардинальности или запросов с большим числом временных данных в JOIN: в таких случаях нам необходимо перераспределить («reshuffle») данные между серверами, что требует дополнительной координации. ClickHouse не поддерживает выполнение запросов такого рода, и нам нужно работать над этим. +Глобального плана выполнения распределенных запросов не существует. Каждый узел имеет собственный локальный план для своей части работы. У нас есть простое однонаправленное выполнение распределенных запросов: мы отправляем запросы на удаленные узлы и затем объединяем результаты. Но это невозможно для сложных запросов `GROUP BY` высокой кардинальности или запросов с большим числом временных данных в `JOIN`: в таких случаях нам необходимо перераспределить («reshuffle») данные между серверами, что требует дополнительной координации. ClickHouse не поддерживает выполнение запросов такого рода, и нам нужно работать над этим. ## Merge Tree {#merge-tree} @@ -190,7 +190,7 @@ ClickHouse имеет сильную типизацию, поэтому нет Репликация использует асинхронную multi-master схему. Вы можете вставить данные в любую реплику, которая имеет открытую сессию в `ZooKeeper`, и данные реплицируются на все другие реплики асинхронно. Поскольку ClickHouse не поддерживает UPDATE, репликация исключает конфликты (conflict-free replication). Поскольку подтверждение вставок кворумом не реализовано, только что вставленные данные могут быть потеряны в случае сбоя одного узла. -Метаданные для репликации хранятся в `ZooKeeper`. Существует журнал репликации, в котором перечислены действия, которые необходимо выполнить. Среди этих действий: получить часть (get the part); объединить части (merge parts); удалить партицию (drop a partition) и так далее. Каждая реплика копирует журнал репликации в свою очередь, а затем выполняет действия из очереди. Например, при вставке в журнале создается действие «получить часть» (get the part), и каждая реплика загружает эту часть. Слияния координируются между репликами, чтобы получить идентичные до байта результаты. Все части объединяются одинаково на всех репликах. Одна из реплик-лидеров инициирует новое слияние кусков первой и записывает действия «слияния частей» в журнал. Несколько реплик (или все) могут быть лидерами одновременно. Реплике можно запретить быть лидером с помощью `merge_tree` настройки `replicated_can_become_leader`. +Метаданные для репликации хранятся в `ZooKeeper`. Существует журнал репликации, в котором перечислены действия, которые необходимо выполнить. Среди этих действий: получить часть (get the part); объединить части (merge parts); удалить партицию (drop a partition) и так далее. Каждая реплика копирует журнал репликации в свою очередь, а затем выполняет действия из очереди. Например, при вставке в журнале создается действие «получить часть» (get the part), и каждая реплика загружает эту часть. 
Слияния координируются между репликами, чтобы получить идентичные до байта результаты. Все части объединяются одинаково на всех репликах. Одна из реплик-лидеров инициирует новое слияние кусков первой и записывает действия «слияния частей» в журнал. Несколько реплик (или все) могут быть лидерами одновременно. Реплике можно запретить быть лидером с помощью `merge_tree` настройки `replicated_can_become_leader`. Репликация является физической: между узлами передаются только сжатые части, а не запросы. Слияния обрабатываются на каждой реплике независимо, в большинстве случаев, чтобы снизить затраты на сеть, во избежание усиления роли сети. Крупные объединенные части отправляются по сети только в случае значительной задержки репликации. diff --git a/docs/ru/development/browse-code.md b/docs/ru/development/browse-code.md index ac17cf0e6f5..3f6de574abe 100644 --- a/docs/ru/development/browse-code.md +++ b/docs/ru/development/browse-code.md @@ -1,6 +1,6 @@ --- toc_priority: 71 -toc_title: "\u041d\u0430\u0432\u0438\u0433\u0430\u0446\u0438\u044f\u0020\u043f\u043e\u0020\u043a\u043e\u0434\u0443\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Навигация по коду ClickHouse" --- diff --git a/docs/ru/development/contrib.md b/docs/ru/development/contrib.md index 05367267e41..f3310836ba9 100644 --- a/docs/ru/development/contrib.md +++ b/docs/ru/development/contrib.md @@ -1,6 +1,6 @@ --- toc_priority: 70 -toc_title: "\u0418\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u044b\u0435\u0020\u0441\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0435\u0020\u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438" +toc_title: "Используемые сторонние библиотеки" --- diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index 7d88c97fe46..66bbb9030cf 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -1,21 +1,21 @@ --- toc_priority: 61 -toc_title: "\u0418\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u044f\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0447\u0438\u043a\u043e\u0432" +toc_title: "Инструкция для разработчиков" --- # Инструкция для разработчиков Сборка ClickHouse поддерживается на Linux, FreeBSD, Mac OS X. -# Если вы используете Windows {#esli-vy-ispolzuete-windows} +## Если вы используете Windows {#esli-vy-ispolzuete-windows} Если вы используете Windows, вам потребуется создать виртуальную машину с Ubuntu. Для работы с виртуальной машиной, установите VirtualBox. Скачать Ubuntu можно на сайте: https://www.ubuntu.com/#download Создайте виртуальную машину из полученного образа. Выделите для неё не менее 4 GB оперативной памяти. Для запуска терминала в Ubuntu, найдите в меню программу со словом terminal (gnome-terminal, konsole или что-то в этом роде) или нажмите Ctrl+Alt+T. -# Если вы используете 32-битную систему {#esli-vy-ispolzuete-32-bitnuiu-sistemu} +## Если вы используете 32-битную систему {#esli-vy-ispolzuete-32-bitnuiu-sistemu} ClickHouse не работает и не собирается на 32-битных системах. Получите доступ к 64-битной системе и продолжайте. -# Создание репозитория на GitHub {#sozdanie-repozitoriia-na-github} +## Создание репозитория на GitHub {#sozdanie-repozitoriia-na-github} Для работы с репозиторием ClickHouse, вам потребуется аккаунт на GitHub. Наверное, он у вас уже есть. 
@@ -34,7 +34,7 @@ ClickHouse не работает и не собирается на 32-битны Подробное руководство по использованию Git: https://git-scm.com/book/ru/v2 -# Клонирование репозитория на рабочую машину {#klonirovanie-repozitoriia-na-rabochuiu-mashinu} +## Клонирование репозитория на рабочую машину {#klonirovanie-repozitoriia-na-rabochuiu-mashinu} Затем вам потребуется загрузить исходники для работы на свой компьютер. Это называется «клонирование репозитория», потому что создаёт на вашем компьютере локальную копию репозитория, с которой вы будете работать. @@ -78,7 +78,7 @@ ClickHouse не работает и не собирается на 32-битны После этого, вы сможете добавлять в свой репозиторий обновления из репозитория Яндекса с помощью команды `git pull upstream master`. -## Работа с сабмодулями Git {#rabota-s-sabmoduliami-git} +### Работа с сабмодулями Git {#rabota-s-sabmoduliami-git} Работа с сабмодулями git может быть достаточно болезненной. Следующие команды позволят содержать их в порядке: @@ -110,7 +110,7 @@ The next commands would help you to reset all submodules to the initial state (! git submodule foreach git submodule foreach git reset --hard git submodule foreach git submodule foreach git clean -xfd -# Система сборки {#sistema-sborki} +## Система сборки {#sistema-sborki} ClickHouse использует систему сборки CMake и Ninja. @@ -130,11 +130,11 @@ Ninja - система запуска сборочных задач. Проверьте версию CMake: `cmake --version`. Если версия меньше 3.3, то установите новую версию с сайта https://cmake.org/download/ -# Необязательные внешние библиотеки {#neobiazatelnye-vneshnie-biblioteki} +## Необязательные внешние библиотеки {#neobiazatelnye-vneshnie-biblioteki} ClickHouse использует для сборки некоторое количество внешних библиотек. Но ни одну из них не требуется отдельно устанавливать, так как они собираются вместе с ClickHouse, из исходников, которые расположены в submodules. Посмотреть набор этих библиотек можно в директории contrib. -# Компилятор C++ {#kompiliator-c} +## Компилятор C++ {#kompiliator-c} В качестве компилятора C++ поддерживается GCC начиная с версии 9 или Clang начиная с версии 8. @@ -148,7 +148,7 @@ ClickHouse использует для сборки некоторое коли Если вы решили использовать Clang, вы также можете установить `libc++` и `lld`, если вы знаете, что это такое. При желании, установите `ccache`. -# Процесс сборки {#protsess-sborki} +## Процесс сборки {#protsess-sborki} Теперь вы готовы к сборке ClickHouse. Для размещения собранных файлов, рекомендуется создать отдельную директорию build внутри директории ClickHouse: @@ -206,7 +206,7 @@ Mac OS X: ls -l programs/clickhouse -# Запуск собранной версии ClickHouse {#zapusk-sobrannoi-versii-clickhouse} +## Запуск собранной версии ClickHouse {#zapusk-sobrannoi-versii-clickhouse} Для запуска сервера из под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/programs/server/` (эта директория находится не в директории build) и выполните: @@ -233,7 +233,7 @@ Mac OS X: sudo service clickhouse-server stop sudo -u clickhouse ClickHouse/build/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -# Среда разработки {#sreda-razrabotki} +## Среда разработки {#sreda-razrabotki} Если вы не знаете, какую среду разработки использовать, то рекомендуется использовать CLion. CLion является платным ПО, но его можно использовать бесплатно в течение пробного периода. Также он бесплатен для учащихся. 
CLion можно использовать как под Linux, так и под Mac OS X. @@ -243,7 +243,7 @@ Mac OS X: На всякий случай заметим, что CLion самостоятельно создаёт свою build директорию, самостоятельно выбирает тип сборки debug по-умолчанию, для конфигурации использует встроенную в CLion версию CMake вместо установленного вами, а для запуска задач использует make вместо ninja. Это нормально, просто имейте это ввиду, чтобы не возникало путаницы. -# Написание кода {#napisanie-koda} +## Написание кода {#napisanie-koda} Описание архитектуры ClickHouse: https://clickhouse.tech/docs/ru/development/architecture/ @@ -253,7 +253,7 @@ Mac OS X: Список задач: https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aopen+is%3Aissue+label%3A%22easy+task%22 -# Тестовые данные {#testovye-dannye} +## Тестовые данные {#testovye-dannye} Разработка ClickHouse часто требует загрузки реалистичных наборов данных. Особенно это важно для тестирования производительности. Специально для вас мы подготовили набор данных, представляющий собой анонимизированные данные Яндекс.Метрики. Загрузка этих данных потребует ещё 3 GB места на диске. Для выполнения большинства задач разработки, загружать эти данные не обязательно. @@ -274,7 +274,7 @@ Mac OS X: clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv -# Создание Pull Request {#sozdanie-pull-request} +## Создание Pull Request {#sozdanie-pull-request} Откройте свой форк репозитория в интерфейсе GitHub. Если вы вели разработку в бранче, выберите этот бранч. На странице будет доступна кнопка «Pull request». По сути, это означает «создать заявку на принятие моих изменений в основной репозиторий». diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index 1b211259bbb..f08ecc3c4c7 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -1,6 +1,6 @@ --- toc_priority: 68 -toc_title: "\u041a\u0430\u043a\u0020\u043f\u0438\u0441\u0430\u0442\u044c\u0020\u043a\u043e\u0434\u0020\u043d\u0430\u0020\u0043\u002b\u002b" +toc_title: "Как писать код на C++" --- @@ -911,4 +911,3 @@ function( size_t limit) ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/development/style/) diff --git a/docs/ru/engines/database-engines/atomic.md b/docs/ru/engines/database-engines/atomic.md new file mode 100644 index 00000000000..8c75be3d93b --- /dev/null +++ b/docs/ru/engines/database-engines/atomic.md @@ -0,0 +1,54 @@ +--- +toc_priority: 32 +toc_title: Atomic +--- + +# Atomic {#atomic} + +Поддерживает неблокирующие запросы [DROP TABLE](#drop-detach-table) и [RENAME TABLE](#rename-table) и атомарные запросы [EXCHANGE TABLES t1 AND t](#exchange-tables). Движок `Atomic` используется по умолчанию. + +## Создание БД {#creating-a-database} + +``` sql + CREATE DATABASE test[ ENGINE = Atomic]; +``` + +## Особенности и рекомендации {#specifics-and-recommendations} + +### UUID {#table-uuid} + +Каждая таблица в базе данных `Atomic` имеет уникальный [UUID](../../sql-reference/data-types/uuid.md) и хранит данные в папке `/clickhouse_path/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`, где `xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy` - это UUID таблицы. +Обычно UUID генерируется автоматически, но пользователь также может явно указать UUID в момент создания таблицы (однако это не рекомендуется). 
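Проверить UUID конкретной таблицы (и, соответственно, каталог в `store/`, где лежат её данные) можно через системную таблицу `system.tables`. Ниже — небольшой набросок; имена базы данных и таблицы условные:

``` sql
-- столбец uuid есть в system.tables; имена test.my_table — условные
SELECT name, uuid
FROM system.tables
WHERE database = 'test' AND name = 'my_table';
```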
Для отображения UUID в запросе `SHOW CREATE` вы можете использовать настройку [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil). Результат выполнения в таком случае будет иметь вид: + +```sql +CREATE TABLE name UUID '28f1c61c-2970-457a-bffe-454156ddcfef' (n UInt64) ENGINE = ...; +``` +### RENAME TABLE {#rename-table} + +Запросы `RENAME` выполняются без изменения UUID и перемещения табличных данных. Эти запросы не ожидают завершения использующих таблицу запросов и будут выполнены мгновенно. + +### DROP/DETACH TABLE {#drop-detach-table} + +При выполнении запроса `DROP TABLE` никакие данные не удаляются. Таблица помечается как удаленная, метаданные перемещаются в папку `/clickhouse_path/metadata_dropped/` и база данных уведомляет фоновый поток. Задержка перед окончательным удалением данных задается настройкой [database_atomic_delay_before_drop_table_sec](../../operations/server-configuration-parameters/settings.md#database_atomic_delay_before_drop_table_sec). +Вы можете задать синхронный режим, определяя модификатор `SYNC`. Используйте для этого настройку [database_atomic_wait_for_drop_and_detach_synchronously](../../operations/settings/settings.md#database_atomic_wait_for_drop_and_detach_synchronously). В этом случае запрос `DROP` ждет завершения `SELECT`, `INSERT` и других запросов, которые используют таблицу. Таблица будет фактически удалена, когда она не будет использоваться. + +### EXCHANGE TABLES {#exchange-tables} + +Запрос `EXCHANGE` меняет местами две таблицы атомарно. Вместо неатомарной операции: + +```sql +RENAME TABLE new_table TO tmp, old_table TO new_table, tmp TO old_table; +``` +вы можете использовать один атомарный запрос: + +``` sql +EXCHANGE TABLES new_table AND old_table; +``` + +### ReplicatedMergeTree in Atomic Database {#replicatedmergetree-in-atomic-database} + +Для таблиц [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) рекомендуется не указывать параметры движка - путь в ZooKeeper и имя реплики. В этом случае будут использоваться параметры конфигурации: [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) и [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Если вы хотите определить параметры движка явно, рекомендуется использовать макрос {uuid}. Это удобно, так как автоматически генерируются уникальные пути для каждой таблицы в ZooKeeper. + +## Смотрите также + +- Системная таблица [system.databases](../../operations/system-tables/databases.md). diff --git a/docs/ru/engines/database-engines/index.md b/docs/ru/engines/database-engines/index.md index 4dfe766f066..d4fad8f43a9 100644 --- a/docs/ru/engines/database-engines/index.md +++ b/docs/ru/engines/database-engines/index.md @@ -1,14 +1,14 @@ --- -toc_folder_title: "\u0414\u0432\u0438\u0436\u043a\u0438\u0020\u0431\u0430\u0437\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_folder_title: "Движки баз данных" toc_priority: 27 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- -# Движки баз данных {#dvizhki-baz-dannykh} +# Движки баз данных {#database-engines} Движки баз данных обеспечивают работу с таблицами. -По умолчанию ClickHouse использует собственный движок баз данных, который поддерживает конфигурируемые [движки таблиц](../../engines/database-engines/index.md) и [диалект SQL](../../engines/database-engines/index.md). 
+По умолчанию ClickHouse использует движок [Atomic](../../engines/database-engines/atomic.md). Он поддерживает конфигурируемые [движки таблиц](../../engines/table-engines/index.md) и [диалект SQL](../../sql-reference/syntax.md). Также можно использовать следующие движки баз данных: @@ -18,4 +18,5 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" - [Lazy](../../engines/database-engines/lazy.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/database_engines/) +- [PostgreSQL](../../engines/database-engines/postgresql.md) + diff --git a/docs/ru/engines/database-engines/lazy.md b/docs/ru/engines/database-engines/lazy.md index c01aae0284e..140a67be761 100644 --- a/docs/ru/engines/database-engines/lazy.md +++ b/docs/ru/engines/database-engines/lazy.md @@ -15,4 +15,3 @@ toc_title: Lazy CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/database_engines/lazy/) diff --git a/docs/ru/engines/database-engines/materialize-mysql.md b/docs/ru/engines/database-engines/materialize-mysql.md index 3022542e294..2067dfecca0 100644 --- a/docs/ru/engines/database-engines/materialize-mysql.md +++ b/docs/ru/engines/database-engines/materialize-mysql.md @@ -157,4 +157,3 @@ SELECT * FROM mysql.test; └───┴─────┴──────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/database-engines/materialize-mysql/) diff --git a/docs/ru/engines/database-engines/postgresql.md b/docs/ru/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..c11dab6f1aa --- /dev/null +++ b/docs/ru/engines/database-engines/postgresql.md @@ -0,0 +1,138 @@ +--- +toc_priority: 35 +toc_title: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Позволяет подключаться к БД на удаленном сервере [PostgreSQL](https://www.postgresql.org). Поддерживает операции чтения и записи (запросы `SELECT` и `INSERT`) для обмена данными между ClickHouse и PostgreSQL. + +Позволяет в реальном времени получать от удаленного сервера PostgreSQL информацию о таблицах БД и их структуре с помощью запросов `SHOW TABLES` и `DESCRIBE TABLE`. + +Поддерживает операции изменения структуры таблиц (`ALTER TABLE ... ADD|DROP COLUMN`). Если параметр `use_table_cache` (см. ниже раздел Параметры движка) установлен в значение `1`, структура таблицы кешируется, и изменения в структуре не отслеживаются, но будут обновлены, если выполнить команды `DETACH` и `ATTACH`. + +## Создание БД {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `use_table_cache`]); +``` + +**Параметры движка** + +- `host:port` — адрес сервера PostgreSQL. +- `database` — имя удаленной БД. +- `user` — пользователь PostgreSQL. +- `password` — пароль пользователя. +- `use_table_cache` — определяет кеширование структуры таблиц БД. Необязательный параметр. Значение по умолчанию: `0`. 
+ +## Поддерживаемые типы данных {#data_types-support} + +| PostgerSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Примеры использования {#examples-of-use} + +Обмен данными между БД ClickHouse и сервером PostgreSQL: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Чтение данных из таблицы PostgreSQL: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Запись данных в таблицу PostgreSQL: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Пусть структура таблицы была изменена в PostgreSQL: + +``` sql +postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +Поскольку при создании БД параметр `use_table_cache` был установлен в значение `1`, структура таблицы в ClickHouse была кеширована и поэтому не изменилась: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +После того как таблицу «отцепили» и затем снова «прицепили», структура обновилась: + +``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/database-engines/postgresql/) diff --git a/docs/ru/engines/index.md b/docs/ru/engines/index.md index 28ccc8bcfe6..fe41ada8fb6 100644 --- a/docs/ru/engines/index.md +++ b/docs/ru/engines/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0045\u006e\u0067\u0069\u006e\u0065\u0073" +toc_folder_title: "Engines" toc_hidden: true toc_priority: 25 toc_title: hidden diff --git a/docs/ru/engines/table-engines/index.md b/docs/ru/engines/table-engines/index.md index 740588c50a4..b17b2124250 100644 --- a/docs/ru/engines/table-engines/index.md +++ b/docs/ru/engines/table-engines/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: 
"\u0414\u0432\u0438\u0436\u043a\u0438\u0020\u0442\u0430\u0431\u043b\u0438\u0446" +toc_folder_title: "Движки таблиц" toc_priority: 26 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- @@ -16,7 +16,7 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" - Возможно ли многопоточное выполнение запроса. - Параметры репликации данных. -## Семейства движков {#semeistva-dvizhkov} +## Семейства движков {#engine-families} ### MergeTree {#mergetree} @@ -42,7 +42,7 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" - [StripeLog](log-family/stripelog.md#stripelog) - [Log](log-family/log.md#log) -### Движки для интеграции {#dvizhki-dlia-integratsii} +### Движки для интеграции {#integration-engines} Движки для связи с другими системами хранения и обработки данных. @@ -52,9 +52,22 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" - [MySQL](integrations/mysql.md#mysql) - [ODBC](integrations/odbc.md#table-engine-odbc) - [JDBC](integrations/jdbc.md#table-engine-jdbc) +- [S3](integrations/s3.md#table-engine-s3) ### Специальные движки {#spetsialnye-dvizhki} +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) + +### Специальные движки {#special-engines} + Движки семейства: - [Distributed](special/distributed.md#distributed) @@ -79,5 +92,3 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" Чтобы получить данные из виртуального столбца, необходимо указать его название в запросе `SELECT`. `SELECT *` не отображает данные из виртуальных столбцов. При создании таблицы со столбцом, имя которого совпадает с именем одного из виртуальных столбцов таблицы, виртуальный столбец становится недоступным. Не делайте так. Чтобы помочь избежать конфликтов, имена виртуальных столбцов обычно предваряются подчеркиванием. 
- -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/) diff --git a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md index 9b68bcfc770..5a7909f63b2 100644 --- a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +toc_priority: 9 toc_title: EmbeddedRocksDB --- @@ -41,4 +41,3 @@ ENGINE = EmbeddedRocksDB PRIMARY KEY key; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/embedded-rocksdb/) \ No newline at end of file diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index bd8e760fce4..b56bbfc0788 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +toc_priority: 6 toc_title: HDFS --- @@ -102,16 +102,103 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs Создадим таблицу с именами `file000`, `file001`, … , `file999`: ``` sql -CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') +CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') ``` +## Конфигурация {#configuration} + +Подобно движку GraphiteMergeTree, движок HDFS поддерживает расширенную конфигурацию с использованием файла конфигурации ClickHouse. Есть два раздела конфигурации, которые вы можете использовать: глобальный (`hdfs`) и на уровне пользователя (`hdfs_*`). Глобальные настройки применяются первыми, и затем применяется конфигурация уровня пользователя (если она указана).
+ +``` xml + + + /tmp/keytab/clickhouse.keytab + clickuser@TEST.CLICKHOUSE.TECH + kerberos + + + + + root@TEST.CLICKHOUSE.TECH + +``` + +### Список возможных опций конфигурации со значениями по умолчанию +#### Поддерживаемые из libhdfs3 + + +| **параметр** | **по умолчанию** | +| rpc\_client\_connect\_tcpnodelay | true | +| dfs\_client\_read\_shortcircuit | true | +| output\_replace-datanode-on-failure | true | +| input\_notretry-another-node | false | +| input\_localread\_mappedfile | true | +| dfs\_client\_use\_legacy\_blockreader\_local | false | +| rpc\_client\_ping\_interval | 10 * 1000 | +| rpc\_client\_connect\_timeout | 600 * 1000 | +| rpc\_client\_read\_timeout | 3600 * 1000 | +| rpc\_client\_write\_timeout | 3600 * 1000 | +| rpc\_client\_socekt\_linger\_timeout | -1 | +| rpc\_client\_connect\_retry | 10 | +| rpc\_client\_timeout | 3600 * 1000 | +| dfs\_default\_replica | 3 | +| input\_connect\_timeout | 600 * 1000 | +| input\_read\_timeout | 3600 * 1000 | +| input\_write\_timeout | 3600 * 1000 | +| input\_localread\_default\_buffersize | 1 * 1024 * 1024 | +| dfs\_prefetchsize | 10 | +| input\_read\_getblockinfo\_retry | 3 | +| input\_localread\_blockinfo\_cachesize | 1000 | +| input\_read\_max\_retry | 60 | +| output\_default\_chunksize | 512 | +| output\_default\_packetsize | 64 * 1024 | +| output\_default\_write\_retry | 10 | +| output\_connect\_timeout | 600 * 1000 | +| output\_read\_timeout | 3600 * 1000 | +| output\_write\_timeout | 3600 * 1000 | +| output\_close\_timeout | 3600 * 1000 | +| output\_packetpool\_size | 1024 | +| output\_heeartbeat\_interval | 10 * 1000 | +| dfs\_client\_failover\_max\_attempts | 15 | +| dfs\_client\_read\_shortcircuit\_streams\_cache\_size | 256 | +| dfs\_client\_socketcache\_expiryMsec | 3000 | +| dfs\_client\_socketcache\_capacity | 16 | +| dfs\_default\_blocksize | 64 * 1024 * 1024 | +| dfs\_default\_uri | "hdfs://localhost:9000" | +| hadoop\_security\_authentication | "simple" | +| hadoop\_security\_kerberos\_ticket\_cache\_path | "" | +| dfs\_client\_log\_severity | "INFO" | +| dfs\_domain\_socket\_path | "" | + + +[Руководство по конфигурации HDFS](https://hawq.apache.org/docs/userguide/2.3.0.0-incubating/reference/HDFSConfigurationParameterReference.html) поможет обьяснить назначения некоторых параметров. + + +#### Расширенные параметры для ClickHouse {#clickhouse-extras} + +| **параметр** | **по умолчанию** | +|hadoop\_kerberos\_keytab | "" | +|hadoop\_kerberos\_principal | "" | +|hadoop\_kerberos\_kinit\_command | kinit | + +#### Ограничения {#limitations} + * hadoop\_security\_kerberos\_ticket\_cache\_path могут быть определены только на глобальном уровне + +## Поддержика Kerberos {#kerberos-support} + +Если hadoop\_security\_authentication параметр имеет значение 'kerberos', ClickHouse аутентифицируется с помощью Kerberos. +[Расширенные параметры](#clickhouse-extras) и hadoop\_security\_kerberos\_ticket\_cache\_path помогают сделать это. +Обратите внимание что из-за ограничений libhdfs3 поддерживается только устаревший метод аутентификации, +коммуникация с узлами данных не защищена SASL (HADOOP\_SECURE\_DN\_USER надежный показатель такого +подхода к безопасности). Используйте tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh для примера настроек. + +Если hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal или hadoop\_kerberos\_kinit\_command указаны в настройках, kinit будет вызван. hadoop\_kerberos\_keytab и hadoop\_kerberos\_principal обязательны в этом случае. 
Необходимо также будет установить kinit и файлы конфигурации krb5. ## Виртуальные столбцы {#virtualnye-stolbtsy} - `_path` — Путь к файлу. - `_file` — Имя файла. -**Смотрите также** +**См. также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные колонки](../../../engines/table-engines/index.md#table_engines-virtual_columns) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/hdfs/) diff --git a/docs/ru/engines/table-engines/integrations/index.md b/docs/ru/engines/table-engines/integrations/index.md index db7e527442e..cb217270129 100644 --- a/docs/ru/engines/table-engines/integrations/index.md +++ b/docs/ru/engines/table-engines/integrations/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0414\u0432\u0438\u0436\u043a\u0438\u0020\u0442\u0430\u0431\u043b\u0438\u0446\u0020\u0434\u043b\u044f\u0020\u0438\u043d\u0442\u0435\u0433\u0440\u0430\u0446\u0438\u0438" +toc_folder_title: "Движки таблиц для интеграции" toc_priority: 30 --- @@ -14,8 +14,9 @@ toc_priority: 30 - [MySQL](../../../engines/table-engines/integrations/mysql.md) - [MongoDB](../../../engines/table-engines/integrations/mongodb.md) - [HDFS](../../../engines/table-engines/integrations/hdfs.md) +- [S3](../../../engines/table-engines/integrations/s3.md) - [Kafka](../../../engines/table-engines/integrations/kafka.md) - [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) - [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/) diff --git a/docs/ru/engines/table-engines/integrations/jdbc.md b/docs/ru/engines/table-engines/integrations/jdbc.md index d7d438e0633..fd7411a258e 100644 --- a/docs/ru/engines/table-engines/integrations/jdbc.md +++ b/docs/ru/engines/table-engines/integrations/jdbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +toc_priority: 3 toc_title: JDBC --- @@ -89,4 +89,3 @@ FROM jdbc_table - [Табличная функция JDBC](../../../engines/table-engines/integrations/jdbc.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/jdbc/) diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 940fee2452b..19e2850dd51 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +toc_priority: 8 toc_title: Kafka --- @@ -31,21 +31,26 @@ SETTINGS [kafka_schema = '',] [kafka_num_consumers = N,] [kafka_skip_broken_messages = N] + [kafka_commit_every_batch = 0,] + [kafka_thread_per_consumer = 0] ``` Обязательные параметры: -- `kafka_broker_list` – перечень брокеров, разделенный запятыми (`localhost:9092`). -- `kafka_topic_list` – перечень необходимых топиков Kafka. -- `kafka_group_name` – группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. -- `kafka_format` – формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md). +- `kafka_broker_list` — перечень брокеров, разделенный запятыми (`localhost:9092`). +- `kafka_topic_list` — перечень необходимых топиков Kafka. +- `kafka_group_name` — группа потребителя Kafka. 
Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. +- `kafka_format` — формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md). Опциональные параметры: -- `kafka_row_delimiter` – символ-разделитель записей (строк), которым завершается сообщение. -- `kafka_schema` – опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. -- `kafka_num_consumers` – количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. -- `kafka_skip_broken_messages` – максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_row_delimiter` — символ-разделитель записей (строк), которым завершается сообщение. +- `kafka_schema` — опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. +- `kafka_num_consumers` — количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. +- `kafka_max_block_size` — максимальный размер пачек (в сообщениях) для poll (по умолчанию `max_block_size`). +- `kafka_skip_broken_messages` — максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_commit_every_batch` — включает или отключает режим записи каждой принятой и обработанной пачки по отдельности вместо единой записи целого блока (по умолчанию `0`). +- `kafka_thread_per_consumer` — включает или отключает предоставление отдельного потока каждому потребителю (по умолчанию `0`). При включенном режиме каждый потребитель сбрасывает данные независимо и параллельно, при отключённом — строки с данными от нескольких потребителей собираются в один блок. 
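Для наглядности — набросок объявления таблицы, использующий описанные выше необязательные настройки, в том числе `kafka_commit_every_batch` и `kafka_thread_per_consumer`. Адрес брокера, топик и имя группы здесь условные:

``` sql
CREATE TABLE queue
(
    timestamp UInt64,
    level String,
    message String
) ENGINE = Kafka
SETTINGS kafka_broker_list = 'localhost:9092',
         kafka_topic_list = 'topic',
         kafka_group_name = 'group1',
         kafka_format = 'JSONEachRow',
         kafka_num_consumers = 2,
         kafka_commit_every_batch = 0,  -- фиксация целым блоком (значение по умолчанию)
         kafka_thread_per_consumer = 1; -- отдельный поток на каждого потребителя
```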
Примеры @@ -188,4 +193,3 @@ ClickHouse может поддерживать учетные данные Kerbe - [Виртуальные столбцы](index.md#table_engines-virtual_columns) - [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/kafka/) diff --git a/docs/ru/engines/table-engines/integrations/mongodb.md b/docs/ru/engines/table-engines/integrations/mongodb.md index 0765b3909de..97f903bdf89 100644 --- a/docs/ru/engines/table-engines/integrations/mongodb.md +++ b/docs/ru/engines/table-engines/integrations/mongodb.md @@ -1,5 +1,5 @@ --- -toc_priority: 7 +toc_priority: 5 toc_title: MongoDB --- @@ -54,4 +54,4 @@ SELECT COUNT() FROM mongo_table; └─────────┘ ``` -[Original article](https://clickhouse.tech/docs/ru/operations/table_engines/integrations/mongodb/) +[Original article](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/mongodb/) diff --git a/docs/ru/engines/table-engines/integrations/mysql.md b/docs/ru/engines/table-engines/integrations/mysql.md index 3370e9b06d0..5011c8a93c6 100644 --- a/docs/ru/engines/table-engines/integrations/mysql.md +++ b/docs/ru/engines/table-engines/integrations/mysql.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +toc_priority: 4 toc_title: MySQL --- @@ -18,12 +18,13 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` -Смотрите подробное описание запроса [CREATE TABLE](../../../engines/table-engines/integrations/mysql.md#create-table-query). +Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query). Структура таблицы может отличаться от исходной структуры таблицы MySQL: - Имена столбцов должны быть такими же, как в исходной таблице MySQL, но вы можете использовать только некоторые из этих столбцов и в любом порядке. -- Типы столбцов могут отличаться от типов в исходной таблице MySQL. ClickHouse пытается [приводить](../../../engines/table-engines/integrations/mysql.md#type_conversion_function-cast) значения к типам данных ClickHouse. +- Типы столбцов могут отличаться от типов в исходной таблице MySQL. ClickHouse пытается [приводить](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) значения к типам данных ClickHouse. +- Настройка `external_table_functions_use_nulls` определяет, как обрабатывать Nullable-столбцы. По умолчанию — 1: столбцы создаются как Nullable. Если установить 0, табличная функция не будет создавать Nullable-столбцы и вместо NULL будет подставлять значения по умолчанию для скалярного типа. Это также применимо к значениям NULL внутри массивов (см. набросок ниже).
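Настройку `external_table_functions_use_nulls` можно выставить и на уровне сессии, например перед обращением к табличной функции `mysql`. Небольшой набросок (адрес сервера и имена объектов условные):

``` sql
-- при значении 0 вместо NULL из MySQL будут подставлены значения по умолчанию для типа
SET external_table_functions_use_nulls = 0;
SELECT * FROM mysql('mysql-host:3306', 'db', 'table', 'user', 'password');
```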
**Параметры движка** @@ -100,4 +101,3 @@ SELECT * FROM mysql_table - [Табличная функция ‘mysql’](../../../engines/table-engines/integrations/mysql.md) - [Использование MySQL в качестве источника для внешнего словаря](../../../engines/table-engines/integrations/mysql.md#dicts-external_dicts_dict_sources-mysql) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/mysql/) diff --git a/docs/ru/engines/table-engines/integrations/odbc.md b/docs/ru/engines/table-engines/integrations/odbc.md index 97317d647c8..669977ff531 100644 --- a/docs/ru/engines/table-engines/integrations/odbc.md +++ b/docs/ru/engines/table-engines/integrations/odbc.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +toc_priority: 2 toc_title: ODBC --- @@ -29,6 +29,7 @@ ENGINE = ODBC(connection_settings, external_database, external_table) - Имена столбцов должны быть такими же, как в исходной таблице, но вы можете использовать только некоторые из этих столбцов и в любом порядке. - Типы столбцов могут отличаться от типов аналогичных столбцов в исходной таблице. ClickHouse пытается [приводить](../../../engines/table-engines/integrations/odbc.md#type_conversion_function-cast) значения к типам данных ClickHouse. +- Настройка `external_table_functions_use_nulls` определяет как обрабатывать Nullable столбцы. По умолчанию 1, если 0 - табличная функция не будет делать nullable столбцы и будет вместо null выставлять значения по умолчанию для скалярного типа. Это также применимо для null значений внутри массивов. **Параметры движка** @@ -127,4 +128,3 @@ SELECT * FROM odbc_t - [Внешние словари ODBC](../../../engines/table-engines/integrations/odbc.md#dicts-external_dicts_dict_sources-odbc) - [Табличная функция odbc](../../../engines/table-engines/integrations/odbc.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/odbc/) diff --git a/docs/ru/engines/table-engines/integrations/postgresql.md b/docs/ru/engines/table-engines/integrations/postgresql.md new file mode 100644 index 00000000000..cb8e38ae5c9 --- /dev/null +++ b/docs/ru/engines/table-engines/integrations/postgresql.md @@ -0,0 +1,145 @@ +--- +toc_priority: 11 +toc_title: PostgreSQL +--- + +#PostgreSQL {#postgresql} + +Движок PostgreSQL позволяет выполнять запросы `SELECT` и `INSERT` для таблиц на удаленном сервере PostgreSQL. + +## Создание таблицы {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); +``` + +Смотрите подробное описание запроса [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query). + +Структура таблицы может отличаться от исходной структуры таблицы PostgreSQL: + +- Имена столбцов должны быть такими же, как в исходной таблице PostgreSQL, но вы можете использовать только некоторые из этих столбцов и в любом порядке. +- Типы столбцов могут отличаться от типов в исходной таблице PostgreSQL. ClickHouse пытается [приводить](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Настройка `external_table_functions_use_nulls` определяет как обрабатывать Nullable столбцы. По умолчанию 1, если 0 - табличная функция не будет делать nullable столбцы и будет вместо null выставлять значения по умолчанию для скалярного типа. 
Это также применимо к значениям NULL внутри массивов. + +**Параметры движка** + +- `host:port` — адрес сервера PostgreSQL. +- `database` — имя базы данных на сервере PostgreSQL. +- `table` — имя таблицы. +- `user` — имя пользователя PostgreSQL. +- `password` — пароль пользователя PostgreSQL. +- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент. + +## Особенности реализации {#implementation-details} + +Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`. + +Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне сервера PostgreSQL. + +Все операции объединения, агрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL закончился. + +Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри транзакции PostgreSQL с автоматическим коммитом после каждого запроса `INSERT`. + +Массивы PostgreSQL конвертируются в массивы ClickHouse. + +!!! info "Внимание" + Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым количеством измерений во всех строках таблицы. + +При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`. + +В примере ниже реплика `example01-1` имеет более высокий приоритет: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name +
table_name
+ id=10 + SQL_QUERY + + +``` + +## Пример использования {#usage-example} + +Таблица в PostgreSQL: + +``` text +postgres=# CREATE TABLE "public"."test" ( +"int_id" SERIAL, +"int_nullable" INT NULL DEFAULT NULL, +"float" FLOAT NOT NULL, +"str" VARCHAR(100) NOT NULL DEFAULT '', +"float_nullable" FLOAT NULL DEFAULT NULL, +PRIMARY KEY (int_id)); + +CREATE TABLE + +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); +INSERT 0 1 + +postgresql> SELECT * FROM test; + int_id | int_nullable | float | str | float_nullable + --------+--------------+-------+------+---------------- + 1 | | 2 | test | + (1 row) +``` + +Таблица в ClickHouse, получение данных из PostgreSQL таблицы, созданной выше: + +``` sql +CREATE TABLE default.postgresql_table +( + `float_nullable` Nullable(Float32), + `str` String, + `int_id` Int32 +) +ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgres_password'); +``` + +``` sql +SELECT * FROM postgresql_table WHERE str IN ('test'); +``` + +``` text +┌─float_nullable─┬─str──┬─int_id─┐ +│ ᴺᵁᴸᴸ │ test │ 1 │ +└────────────────┴──────┴────────┘ +``` + +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См. также** + +- [Табличная функция `postgresql`](../../../sql-reference/table-functions/postgresql.md) +- [Использование PostgreSQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/postgresql/) diff --git a/docs/ru/engines/table-engines/integrations/rabbitmq.md b/docs/ru/engines/table-engines/integrations/rabbitmq.md index f55163c1988..ef8a58c4c82 100644 --- a/docs/ru/engines/table-engines/integrations/rabbitmq.md +++ b/docs/ru/engines/table-engines/integrations/rabbitmq.md @@ -155,3 +155,4 @@ Example: - `_redelivered` - флаг `redelivered`. (Не равно нулю, если есть возможность, что сообщение было получено более, чем одним каналом.) - `_message_id` - значение поля `messageID` полученного сообщения. Данное поле непусто, если указано в параметрах при отправке сообщения. - `_timestamp` - значение поля `timestamp` полученного сообщения. Данное поле непусто, если указано в параметрах при отправке сообщения. + diff --git a/docs/ru/engines/table-engines/integrations/s3.md b/docs/ru/engines/table-engines/integrations/s3.md new file mode 100644 index 00000000000..216db98077c --- /dev/null +++ b/docs/ru/engines/table-engines/integrations/s3.md @@ -0,0 +1,146 @@ +--- +toc_priority: 4 +toc_title: S3 +--- + +# Движок таблиц S3 {#table-engine-s3} + +Этот движок обеспечивает интеграцию с экосистемой [Amazon S3](https://aws.amazon.com/s3/). Он похож на движок [HDFS](../../../engines/table-engines/special/file.md#table_engines-hdfs), но обеспечивает специфические для S3 возможности. 
+ +## Создание таблицы {#creating-a-table} + +``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) +ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +``` + +**Параметры движка** + +- `path` — URL-адрес бакета с указанием пути к файлу. Поддерживает следующие подстановочные знаки в режиме "только чтение": `*`, `?`, `{abc,def}` и `{N..M}` где `N`, `M` — числа, `'abc'`, `'def'` — строки. Подробнее смотри [ниже](#wildcards-in-path). +- `format` — [формат](../../../interfaces/formats.md#formats) файла. +- `structure` — структура таблицы в формате `'column1_name column1_type, column2_name column2_type, ...'`. +- `compression` — тип сжатия. Возможные значения: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Необязательный параметр. Если не указано, то тип сжатия определяется автоматически по расширению файла. + +**Пример** + +``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip'); +INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3); +SELECT * FROM s3_engine_table LIMIT 2; +``` + +``` text +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Виртуальные столбцы {#virtual-columns} + +- `_path` — путь к файлу. +- `_file` — имя файла. + +Подробнее про виртуальные столбцы можно прочитать [здесь](../../../engines/table-engines/index.md#table_engines-virtual_columns). + +## Детали реализации {#implementation-details} + +- Чтение и запись могут быть параллельными. +- Не поддерживаются: + - запросы `ALTER` и `SELECT...SAMPLE`, + - индексы, + - репликация. + +## Символы подстановки {#wildcards-in-path} + +Аргумент `path` может указывать на несколько файлов, используя подстановочные знаки. Для обработки файл должен существовать и соответствовать всему шаблону пути. Список файлов определяется во время выполнения запроса `SELECT` (не в момент выполнения запроса `CREATE`). + +- `*` — заменяет любое количество любых символов, кроме `/`, включая пустую строку. +- `?` — заменяет любые одиночные символы. +- `{some_string, another_string, yet_another_one}` — заменяет любые строки `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — заменяет любое число от N до M, включая обе границы. N и M могут иметь ведущие нули, например `000..078`. + +Конструкции с `{}` аналогичны функции [remote](../../../sql-reference/table-functions/remote.md). + +## Настройки движка S3 {#s3-settings} + +Перед выполнением запроса или в конфигурационном файле могут быть установлены следующие настройки: + +- `s3_max_single_part_upload_size` — максимальный размер объекта для загрузки с использованием однокомпонентной загрузки в S3. Значение по умолчанию — `64 Mб`. +- `s3_min_upload_part_size` — минимальный размер объекта для загрузки при многокомпонентной загрузке в [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Значение по умолчанию — `512 Mб`. +- `s3_max_redirects` — максимальное количество разрешенных переадресаций S3. Значение по умолчанию — `10`. + +Соображение безопасности: если злонамеренный пользователь попробует указать произвольные URL-адреса S3, параметр `s3_max_redirects` должен быть установлен в ноль, чтобы избежать атак [SSRF] (https://en.wikipedia.org/wiki/Server-side_request_forgery). Как альтернатива, в конфигурации сервера должен быть указан `remote_host_filter`. 
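Эти настройки можно переопределить и на уровне сессии перед конкретным запросом. Набросок ниже использует таблицу `s3_engine_table` из примера выше; значения настроек условные:

``` sql
-- запретить переадресации (защита от SSRF) и уменьшить порог однокомпонентной загрузки
SET s3_max_redirects = 0;
SET s3_max_single_part_upload_size = 33554432; -- 32 МБ

INSERT INTO s3_engine_table VALUES ('four', 4);
```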
+ +## Настройки точки приема запроса {#endpoint-settings} + +Для точки приема запроса (которая соответствует точному префиксу URL-адреса) в конфигурационном файле могут быть заданы следующие настройки: + +Обязательная настройка: +- `endpoint` — указывает префикс точки приема запроса. + +Необязательные настройки: +- `access_key_id` и `secret_access_key` — указывают учетные данные для использования с данной точкой приема запроса. +- `use_environment_credentials` — если `true`, S3-клиент будет пытаться получить учетные данные из переменных среды и метаданных Amazon EC2 для данной точки приема запроса. Значение по умолчанию - `false`. +- `header` — добавляет указанный HTTP-заголовок к запросу на заданную точку приема запроса. Может быть определен несколько раз. +- `server_side_encryption_customer_key_base64` — устанавливает необходимые заголовки для доступа к объектам S3 с шифрованием SSE-C. + +**Пример** + +``` xml + + + https://storage.yandexcloud.net/my-test-bucket-768/ + + + + + + + +``` + +## Примеры использования {#usage-examples} + +Предположим, у нас есть несколько файлов в формате TSV со следующими URL-адресами в HDFS: + +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv' + +1. Существует несколько способов создать таблицу, включающую в себя все шесть файлов: + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}', 'CSV'); +``` + +2. Другой способ: + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_?', 'CSV'); +``` + +3. Таблица содержит все файлы в обоих каталогах (все файлы должны соответствовать формату и схеме, описанным в запросе): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV'); +``` + +Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. + +4. 
Создание таблицы из файлов с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: + +``` sql +CREATE TABLE big_table (name String, value UInt32) +ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); +``` +**Смотрите также** + +- [Табличная функция S3](../../../sql-reference/table-functions/s3.md) diff --git a/docs/ru/engines/table-engines/log-family/index.md b/docs/ru/engines/table-engines/log-family/index.md index 7c6d2f81d7c..7737eac2f43 100644 --- a/docs/ru/engines/table-engines/log-family/index.md +++ b/docs/ru/engines/table-engines/log-family/index.md @@ -1,6 +1,6 @@ --- -toc_folder_title: "\u0421\u0435\u043c\u0435\u0439\u0441\u0442\u0432\u043e\u0020\u004c\u006f\u0067" -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_folder_title: "Семейство Log" +toc_title: "Введение" toc_priority: 29 --- @@ -42,4 +42,3 @@ toc_priority: 29 Движки `Log` и `StripeLog` поддерживают параллельное чтение. При чтении данных, ClickHouse использует множество потоков. Каждый поток обрабатывает отдельный блок данных. Движок `Log` сохраняет каждый столбец таблицы в отдельном файле. Движок `StripeLog` хранит все данные в одном файле. Таким образом, движок `StripeLog` использует меньше дескрипторов в операционной системе, а движок `Log` обеспечивает более эффективное считывание данных. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/log_family/) diff --git a/docs/ru/engines/table-engines/log-family/log.md b/docs/ru/engines/table-engines/log-family/log.md index fad331454c7..6c5bf2221f8 100644 --- a/docs/ru/engines/table-engines/log-family/log.md +++ b/docs/ru/engines/table-engines/log-family/log.md @@ -11,4 +11,3 @@ toc_title: Log При конкурентном доступе к данным, чтения могут выполняться одновременно, а записи блокируют чтения и друг друга. Движок Log не поддерживает индексы. Также, если при записи в таблицу произошёл сбой, то таблица станет битой, и чтения из неё будут возвращать ошибку. Движок Log подходит для временных данных, write-once таблиц, а также для тестовых и демонстрационных целей. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/log/) diff --git a/docs/ru/engines/table-engines/log-family/stripelog.md b/docs/ru/engines/table-engines/log-family/stripelog.md index e505aae4c52..2f4b228f894 100644 --- a/docs/ru/engines/table-engines/log-family/stripelog.md +++ b/docs/ru/engines/table-engines/log-family/stripelog.md @@ -90,4 +90,3 @@ SELECT * FROM stripe_log_table ORDER BY timestamp └─────────────────────┴──────────────┴────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/stripelog/) diff --git a/docs/ru/engines/table-engines/log-family/tinylog.md b/docs/ru/engines/table-engines/log-family/tinylog.md index d5c24d41ca4..721355d8702 100644 --- a/docs/ru/engines/table-engines/log-family/tinylog.md +++ b/docs/ru/engines/table-engines/log-family/tinylog.md @@ -11,4 +11,3 @@ toc_title: TinyLog Запросы выполняются в один поток. То есть, этот движок предназначен для сравнительно маленьких таблиц (до 1 000 000 строк). Этот движок таблиц имеет смысл использовать в том случае, когда у вас есть много маленьких таблиц, так как он проще, чем движок [Log](log.md) (требуется открывать меньше файлов). 
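Небольшой набросок работы с таким движком (имена условные):

``` sql
CREATE TABLE tiny_log_table
(
    id UInt64,
    message String
) ENGINE = TinyLog;

INSERT INTO tiny_log_table VALUES (1, 'first'), (2, 'second');
SELECT * FROM tiny_log_table;
```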
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/tinylog/) diff --git a/docs/ru/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/aggregatingmergetree.md index 99b4ec06765..6e01cc2bcac 100644 --- a/docs/ru/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -97,4 +97,3 @@ GROUP BY StartDate ORDER BY StartDate; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/aggregatingmergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md index 8ea3a5a7c92..424fcbb5873 100644 --- a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -304,4 +304,3 @@ select * FROM UAct └─────────────────────┴───────────┴──────────┴──────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/collapsingmergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md index 2d26528d964..9a09618e508 100644 --- a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -1,6 +1,6 @@ --- toc_priority: 32 -toc_title: "\u041f\u0440\u043e\u0438\u0437\u0432\u043e\u043b\u044c\u043d\u044b\u0439\u0020\u043a\u043b\u044e\u0447\u0020\u043f\u0430\u0440\u0442\u0438\u0446\u0438\u043e\u043d\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f" +toc_title: "Произвольный ключ партиционирования" --- @@ -129,4 +129,3 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached ClickHouse позволяет производить различные манипуляции с кусками: удалять, копировать из одной таблицы в другую или создавать их резервные копии. Подробнее см. в разделе [Манипуляции с партициями и кусками](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#alter_manipulations-with-partitions). 
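Например, типичные манипуляции с партициями выглядят так (имя таблицы и значения ключа партиционирования условные):

``` sql
-- отцепить партицию (данные перемещаются в detached/), затем вернуть её обратно
ALTER TABLE visits DETACH PARTITION 201902;
ALTER TABLE visits ATTACH PARTITION 201902;

-- полностью удалить партицию
ALTER TABLE visits DROP PARTITION 201901;
```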
-[Оригинальная статья:](https://clickhouse.tech/docs/ru/operations/table_engines/custom_partitioning_key/) diff --git a/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md b/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md index e47c9127711..f3e915a413b 100644 --- a/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md @@ -171,4 +171,3 @@ default ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/graphitemergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/index.md b/docs/ru/engines/table-engines/mergetree-family/index.md index abdfdd77d7f..e184e51c406 100644 --- a/docs/ru/engines/table-engines/mergetree-family/index.md +++ b/docs/ru/engines/table-engines/mergetree-family/index.md @@ -1,5 +1,5 @@ --- toc_folder_title: MergeTree Family toc_priority: 28 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index 6fc566b7c31..b8bd259167a 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -56,13 +56,13 @@ ORDER BY expr ClickHouse использует ключ сортировки в качестве первичного ключа, если первичный ключ не задан в секции `PRIMARY KEY`. - Чтобы отключить сортировку, используйте синтаксис `ORDER BY tuple()`. Смотрите [выбор первичного ключа](#vybor-pervichnogo-kliucha). + Чтобы отключить сортировку, используйте синтаксис `ORDER BY tuple()`. Смотрите [выбор первичного ключа](#primary-keys-and-indexes-in-queries). - `PARTITION BY` — [ключ партиционирования](custom-partitioning-key.md). Необязательный параметр. Для партиционирования по месяцам используйте выражение `toYYYYMM(date_column)`, где `date_column` — столбец с датой типа [Date](../../../engines/table-engines/mergetree-family/mergetree.md). В этом случае имена партиций имеют формат `"YYYYMM"`. -- `PRIMARY KEY` — первичный ключ, если он [отличается от ключа сортировки](#pervichnyi-kliuch-otlichnyi-ot-kliucha-sortirovki). Необязательный параметр. +- `PRIMARY KEY` — первичный ключ, если он [отличается от ключа сортировки](#choosing-a-primary-key-that-differs-from-the-sorting-key). Необязательный параметр. По умолчанию первичный ключ совпадает с ключом сортировки (который задаётся секцией `ORDER BY`.) Поэтому в большинстве случаев секцию `PRIMARY KEY` отдельно указывать не нужно. @@ -188,7 +188,7 @@ ClickHouse не требует уникального первичного кл При сортировке с использованием выражения `ORDER BY` для значений `NULL` всегда работает принцип [NULLS_LAST](../../../sql-reference/statements/select/order-by.md#sorting-of-special-values). -### Выбор первичного ключа {#vybor-pervichnogo-kliucha} +### Выбор первичного ключа {#selecting-the-primary-key} Количество столбцов в первичном ключе не ограничено явным образом. В зависимости от структуры данных в первичный ключ можно включать больше или меньше столбцов. 
Это может: @@ -217,7 +217,7 @@ ClickHouse не требует уникального первичного кл -### Первичный ключ, отличный от ключа сортировки {#pervichnyi-kliuch-otlichnyi-ot-kliucha-sortirovki} +### Первичный ключ, отличный от ключа сортировки {#choosing-a-primary-key-that-differs-from-the-sorting-key} Существует возможность задать первичный ключ (выражение, значения которого будут записаны в индексный файл для каждой засечки), отличный от ключа сортировки (выражение, по которому будут упорядочены строки в кусках @@ -236,7 +236,7 @@ ClickHouse не требует уникального первичного кл [ALTER ключа сортировки](../../../engines/table-engines/mergetree-family/mergetree.md) — лёгкая операция, так как при одновременном добавлении нового столбца в таблицу и ключ сортировки не нужно изменять данные кусков (они остаются упорядоченными и по новому выражению ключа). -### Использование индексов и партиций в запросах {#ispolzovanie-indeksov-i-partitsii-v-zaprosakh} +### Использование индексов и партиций в запросах {#use-of-indexes-and-partitions-in-queries} Для запросов `SELECT` ClickHouse анализирует возможность использования индекса. Индекс может использоваться, если в секции `WHERE/PREWHERE`, в качестве одного из элементов конъюнкции, или целиком, есть выражение, представляющее операции сравнения на равенства, неравенства, а также `IN` или `LIKE` с фиксированным префиксом, над столбцами или выражениями, входящими в первичный ключ или ключ партиционирования, либо над некоторыми частично монотонными функциями от этих столбцов, а также логические связки над такими выражениями. @@ -270,7 +270,7 @@ SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' Ключ партиционирования по месяцам обеспечивает чтение только тех блоков данных, которые содержат даты из нужного диапазона. При этом блок данных может содержать данные за многие даты (до целого месяца). В пределах одного блока данные упорядочены по первичному ключу, который может не содержать дату в качестве первого столбца. В связи с этим, при использовании запроса с указанием условия только на дату, но не на префикс первичного ключа, будет читаться данных больше, чем за одну дату. -### Использование индекса для частично-монотонных первичных ключей {#ispolzovanie-indeksa-dlia-chastichno-monotonnykh-pervichnykh-kliuchei} +### Использование индекса для частично-монотонных первичных ключей {#use-of-index-for-partially-monotonic-primary-keys} Рассмотрим, например, дни месяца. Они образуют последовательность [монотонную](https://ru.wikipedia.org/wiki/Монотонная_последовательность) в течение одного месяца, но не монотонную на более длительных периодах. Это частично-монотонная последовательность. Если пользователь создаёт таблицу с частично-монотонным первичным ключом, ClickHouse как обычно создаёт разреженный индекс. Когда пользователь выбирает данные из такого рода таблиц, ClickHouse анализирует условия запроса. Если пользователь хочет получить данные между двумя метками индекса, и обе эти метки находятся внутри одного месяца, ClickHouse может использовать индекс в данном конкретном случае, поскольку он может рассчитать расстояние между параметрами запроса и индексными метками. 
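Условный набросок, иллюстрирующий описанное поведение (структура таблицы придумана для примера):

``` sql
-- toDayOfMonth монотонна только в пределах одного месяца,
-- поэтому такой ключ сортировки частично монотонный
CREATE TABLE events
(
    EventDate Date,
    UserID UInt64
)
ENGINE = MergeTree
ORDER BY (toDayOfMonth(EventDate), UserID);

-- Обе границы диапазона лежат внутри одного месяца,
-- поэтому разреженный индекс может быть использован
SELECT count() FROM events WHERE EventDate BETWEEN '2021-03-05' AND '2021-03-20';
```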
@@ -312,7 +312,7 @@ SELECT count() FROM table WHERE s < 'z' SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` -#### Доступные индексы {#dostupnye-indeksy} +#### Доступные индексы {#available-types-of-indices} - `minmax` — Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. @@ -375,7 +375,7 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARIT - `s != 1` - `NOT startsWith(s, 'test')` -## Конкурентный доступ к данным {#konkurentnyi-dostup-k-dannym} +## Конкурентный доступ к данным {#concurrent-data-access} Для конкурентного доступа к таблице используется мультиверсионность. То есть, при одновременном чтении и обновлении таблицы, данные будут читаться из набора кусочков, актуального на момент запроса. Длинных блокировок нет. Вставки никак не мешают чтениям. @@ -517,7 +517,7 @@ CREATE TABLE table_for_aggregation y Int ) ENGINE = MergeTree -ORDER BY k1, k2 +ORDER BY (k1, k2) TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y); ``` @@ -531,13 +531,13 @@ TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y); ## Хранение данных таблицы на нескольких блочных устройствах {#table_engine-mergetree-multiple-volumes} -### Введение {#vvedenie} +### Введение {#introduction} Движки таблиц семейства `MergeTree` могут хранить данные на нескольких блочных устройствах. Это может оказаться полезным, например, при неявном разделении данных одной таблицы на «горячие» и «холодные». Наиболее свежая часть занимает малый объём и запрашивается регулярно, а большой хвост исторических данных запрашивается редко. При наличии в системе нескольких дисков, «горячая» часть данных может быть размещена на быстрых дисках (например, на NVMe SSD или в памяти), а холодная на более медленных (например, HDD). Минимальной перемещаемой единицей для `MergeTree` является кусок данных (data part). Данные одного куска могут находится только на одном диске. Куски могут перемещаться между дисками в фоне, согласно пользовательским настройкам, а также с помощью запросов [ALTER](../../../engines/table-engines/mergetree-family/mergetree.md#alter_move-partition). -### Термины {#terminy} +### Термины {#terms} - Диск — примонтированное в файловой системе блочное устройство. - Диск по умолчанию — диск, на котором находится путь, указанный в конфигурационной настройке сервера [path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path). @@ -689,7 +689,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' Количество потоков для фоновых перемещений кусков между дисками можно изменить с помощью настройки [background_move_pool_size](../../../operations/settings/settings.md#background_move_pool_size) -### Особенности работы {#osobennosti-raboty} +### Особенности работы {#details} В таблицах `MergeTree` данные попадают на диск несколькими способами: @@ -712,4 +712,97 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' После выполнения фоновых слияний или мутаций старые куски не удаляются сразу, а через некоторое время (табличная настройка `old_parts_lifetime`). Также они не перемещаются на другие тома или диски, поэтому до момента удаления они продолжают учитываться при подсчёте занятого дискового пространства. 
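Помимо фоновых перемещений, партиции можно переносить между томами и дисками вручную; примерный вид таких запросов (имена таблицы, тома и диска условные):

``` sql
-- Перенести партицию на конкретный том
ALTER TABLE example_table MOVE PARTITION 201902 TO VOLUME 'external';

-- Или на конкретный диск
ALTER TABLE example_table MOVE PARTITION 201902 TO DISK 'fast_ssd';
```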
-[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/mergetree-family/mergetree/)
+## Использование сервиса S3 для хранения данных {#table_engine-mergetree-s3}
+
+Таблицы семейства `MergeTree` могут хранить данные в сервисе [S3](https://aws.amazon.com/s3/) при использовании диска типа `s3`.
+
+Конфигурация:
+
+``` xml
+<yandex>
+    <storage_configuration>
+        ...
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
+                <access_key_id>your_access_key_id</access_key_id>
+                <secret_access_key>your_secret_access_key</secret_access_key>
+                <proxy>
+                    <uri>http://proxy1</uri>
+                    <uri>http://proxy2</uri>
+                </proxy>
+                <connect_timeout_ms>10000</connect_timeout_ms>
+                <request_timeout_ms>5000</request_timeout_ms>
+                <retry_attempts>10</retry_attempts>
+                <min_bytes_for_seek>1000</min_bytes_for_seek>
+                <metadata_path>/var/lib/clickhouse/disks/s3/</metadata_path>
+                <cache_enabled>true</cache_enabled>
+                <cache_path>/var/lib/clickhouse/disks/s3/cache/</cache_path>
+                <skip_access_check>false</skip_access_check>
+            </s3>
+        </disks>
+        ...
+    </storage_configuration>
+</yandex>
+```
+
+Обязательные параметры:
+
+- `endpoint` — URL точки приема запроса на стороне S3 в [форматах](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html) `path` или `virtual hosted`. URL точки должен содержать бакет и путь к корневой директории на сервере, где хранятся данные.
+- `access_key_id` — id ключа доступа к S3.
+- `secret_access_key` — секретный ключ доступа к S3.
+
+Необязательные параметры:
+
+- `use_environment_credentials` — признак, нужно ли считывать учетные данные AWS из сетевого окружения, а также из переменных окружения `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` и `AWS_SESSION_TOKEN`, если они есть. Значение по умолчанию: `false`.
+- `use_insecure_imds_request` — признак, нужно ли использовать менее безопасное соединение при выполнении запроса к IMDS при получении учётных данных из метаданных Amazon EC2. Значение по умолчанию: `false`.
+- `proxy` — конфигурация прокси-сервера для конечной точки S3. Каждый элемент `uri` внутри блока `proxy` должен содержать URL прокси-сервера.
+- `connect_timeout_ms` — таймаут подключения к сокету в миллисекундах. Значение по умолчанию: 10 секунд.
+- `request_timeout_ms` — таймаут выполнения запроса в миллисекундах. Значение по умолчанию: 5 секунд.
+- `retry_attempts` — число попыток выполнения запроса в случае возникновения ошибки. Значение по умолчанию: `10`.
+- `min_bytes_for_seek` — минимальное количество байтов, которые используются для операций поиска вместо последовательного чтения. Значение по умолчанию: 1 МБайт.
+- `metadata_path` — путь к локальному файловому хранилищу для хранения файлов с метаданными для S3. Значение по умолчанию: `/var/lib/clickhouse/disks/<disk_name>/`.
+- `cache_enabled` — признак, разрешено ли хранение кэша засечек и файлов индекса в локальной файловой системе. Значение по умолчанию: `true`.
+- `cache_path` — путь в локальной файловой системе, где будут храниться кэш засечек и файлы индекса. Значение по умолчанию: `/var/lib/clickhouse/disks/<disk_name>/cache/`.
+- `skip_access_check` — признак, выполнять ли проверку доступов при запуске диска. Если установлено значение `true`, то проверка не выполняется. Значение по умолчанию: `false`.
+
+Диск S3 может быть сконфигурирован как `main` или `cold`:
+
+``` xml
+<yandex>
+    <storage_configuration>
+        ...
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
+                <access_key_id>your_access_key_id</access_key_id>
+                <secret_access_key>your_secret_access_key</secret_access_key>
+            </s3>
+        </disks>
+        <policies>
+            <s3_main>
+                <volumes>
+                    <main>
+                        <disk>s3</disk>
+                    </main>
+                </volumes>
+            </s3_main>
+            <s3_cold>
+                <volumes>
+                    <main>
+                        <disk>default</disk>
+                    </main>
+                    <external>
+                        <disk>s3</disk>
+                    </external>
+                </volumes>
+                <move_factor>0.2</move_factor>
+            </s3_cold>
+        </policies>
+        ...
+    </storage_configuration>
+</yandex>
+``` + +Если диск сконфигурирован как `cold`, данные будут переноситься в S3 при срабатывании правил TTL или когда свободное место на локальном диске станет меньше порогового значения, которое определяется как `move_factor * disk_size`. + + diff --git a/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md index a4e47b161ad..ec0b339e8c9 100644 --- a/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md @@ -66,4 +66,3 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/replacingmergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/replication.md b/docs/ru/engines/table-engines/mergetree-family/replication.md index a8a308b104f..848adbee4da 100644 --- a/docs/ru/engines/table-engines/mergetree-family/replication.md +++ b/docs/ru/engines/table-engines/mergetree-family/replication.md @@ -1,6 +1,6 @@ --- toc_priority: 31 -toc_title: "\u0420\u0435\u043f\u043b\u0438\u043a\u0430\u0446\u0438\u044f\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_title: "Репликация данных" --- # Репликация данных {#table_engines-replication} @@ -251,4 +251,3 @@ $ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data - [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) - [execute_merges_on_single_replica_time_threshold](../../../operations/settings/settings.md#execute-merges-on-single-replica-time-threshold) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/replication/) diff --git a/docs/ru/engines/table-engines/mergetree-family/summingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/summingmergetree.md index 7b9c11adc2e..adb40037319 100644 --- a/docs/ru/engines/table-engines/mergetree-family/summingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/summingmergetree.md @@ -136,4 +136,3 @@ ClickHouse может слить куски данных таким образо Для вложенной структуры данных не нужно указывать её столбцы в кортеже столбцов для суммирования. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/summingmergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index 2adb8cc0d77..61688b1f00f 100644 --- a/docs/ru/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -233,4 +233,3 @@ SELECT * FROM UAct FINAL Это очень неэффективный способ выбора данных. Не используйте его для больших таблиц. 
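Вместо `FINAL` для таких таблиц обычно эффективнее агрегировать данные с учётом знака. Примерный вариант запроса (имена столбцов условные и приведены только для иллюстрации):

``` sql
SELECT
    UserID,
    sum(PageViews * Sign) AS PageViews,
    sum(Duration * Sign) AS Duration,
    Version
FROM UAct
GROUP BY UserID, Version
HAVING sum(Sign) > 0;
```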
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/versionedcollapsingmergetree/) diff --git a/docs/ru/engines/table-engines/special/buffer.md b/docs/ru/engines/table-engines/special/buffer.md index 75ce12f50fa..ba865b72b78 100644 --- a/docs/ru/engines/table-engines/special/buffer.md +++ b/docs/ru/engines/table-engines/special/buffer.md @@ -66,4 +66,3 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел [«Производительность»](../../../introduction/performance/). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/buffer/) diff --git a/docs/ru/engines/table-engines/special/dictionary.md b/docs/ru/engines/table-engines/special/dictionary.md index 048da157b2d..243fd5395c0 100644 --- a/docs/ru/engines/table-engines/special/dictionary.md +++ b/docs/ru/engines/table-engines/special/dictionary.md @@ -90,4 +90,3 @@ select * from products limit 1; └───────────────┴─────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/dictionary/) diff --git a/docs/ru/engines/table-engines/special/distributed.md b/docs/ru/engines/table-engines/special/distributed.md index 7ab0b916337..86eef35ebbc 100644 --- a/docs/ru/engines/table-engines/special/distributed.md +++ b/docs/ru/engines/table-engines/special/distributed.md @@ -136,4 +136,3 @@ logs - имя кластера в конфигурационном файле с При выставлении опции max_parallel_replicas выполнение запроса распараллеливается по всем репликам внутри одного шарда. Подробнее смотрите раздел [max_parallel_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/distributed/) diff --git a/docs/ru/engines/table-engines/special/external-data.md b/docs/ru/engines/table-engines/special/external-data.md index 7e383c0c12d..29075837aba 100644 --- a/docs/ru/engines/table-engines/special/external-data.md +++ b/docs/ru/engines/table-engines/special/external-data.md @@ -1,6 +1,6 @@ --- toc_priority: 45 -toc_title: "\u0412\u043d\u0435\u0448\u043d\u0438\u0435\u0020\u0434\u0430\u043d\u043d\u044b\u0435\u0020\u0434\u043b\u044f\u0020\u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438\u0020\u0437\u0430\u043f\u0440\u043e\u0441\u0430" +toc_title: "Внешние данные для обработки запроса" --- # Внешние данные для обработки запроса {#vneshnie-dannye-dlia-obrabotki-zaprosa} @@ -65,4 +65,3 @@ $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+coun При распределённой обработке запроса, временные таблицы передаются на все удалённые серверы. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/external_data/) diff --git a/docs/ru/engines/table-engines/special/file.md b/docs/ru/engines/table-engines/special/file.md index 6a55ef31732..6f1c723d2a7 100644 --- a/docs/ru/engines/table-engines/special/file.md +++ b/docs/ru/engines/table-engines/special/file.md @@ -63,7 +63,7 @@ SELECT * FROM file_engine_table ## Использование движка в Clickhouse-local {#ispolzovanie-dvizhka-v-clickhouse-local} -В [clickhouse-local](../../../engines/table-engines/special/file.md) движок в качестве параметра принимает не только формат, но и путь к файлу. 
В том числе можно указать стандартные потоки ввода/вывода цифровым или буквенным обозначением `0` или `stdin`, `1` или `stdout`. +В [clickhouse-local](../../../engines/table-engines/special/file.md) движок в качестве параметра принимает не только формат, но и путь к файлу. В том числе можно указать стандартные потоки ввода/вывода цифровым или буквенным обозначением `0` или `stdin`, `1` или `stdout`. Можно записывать и читать сжатые файлы. Для этого нужно задать дополнительный параметр движка или расширение файла (`gz`, `br` или `xz`). **Пример:** @@ -81,4 +81,3 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64 - индексы; - репликация. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/file/) diff --git a/docs/ru/engines/table-engines/special/index.md b/docs/ru/engines/table-engines/special/index.md index 0d86461dd2d..231bf2979ed 100644 --- a/docs/ru/engines/table-engines/special/index.md +++ b/docs/ru/engines/table-engines/special/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0421\u043f\u0435\u0446\u0438\u0430\u043b\u044c\u043d\u044b\u0435\u0020\u0434\u0432\u0438\u0436\u043a\u0438\u0020\u0442\u0430\u0431\u043b\u0438\u0446" +toc_folder_title: "Специальные движки таблиц" toc_priority: 31 --- @@ -13,4 +13,3 @@ toc_priority: 31 Остальные движки таблиц уникальны по своему назначению и еще не сгруппированы в семейства, поэтому они помещены в эту специальную категорию. -[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/special/) diff --git a/docs/ru/engines/table-engines/special/join.md b/docs/ru/engines/table-engines/special/join.md index 8cb7acd91e1..ef27ac3f10f 100644 --- a/docs/ru/engines/table-engines/special/join.md +++ b/docs/ru/engines/table-engines/special/join.md @@ -107,4 +107,3 @@ SELECT joinGet('id_val_join', 'val', toUInt32(1)) При аварийном перезапуске сервера блок данных на диске может быть потерян или повреждён. В последнем случае, может потребоваться вручную удалить файл с повреждёнными данными. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/join/) diff --git a/docs/ru/engines/table-engines/special/materializedview.md b/docs/ru/engines/table-engines/special/materializedview.md index 1281d1db9ab..6b82f95df92 100644 --- a/docs/ru/engines/table-engines/special/materializedview.md +++ b/docs/ru/engines/table-engines/special/materializedview.md @@ -7,4 +7,3 @@ toc_title: MaterializedView Используется для реализации материализованных представлений (подробнее см. запрос [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query)). Для хранения данных, использует другой движок, который был указан при создании представления. При чтении из таблицы, просто использует этот движок. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/materializedview/) diff --git a/docs/ru/engines/table-engines/special/memory.md b/docs/ru/engines/table-engines/special/memory.md index 9ca189ef3b2..5a242238a02 100644 --- a/docs/ru/engines/table-engines/special/memory.md +++ b/docs/ru/engines/table-engines/special/memory.md @@ -14,4 +14,3 @@ toc_title: Memory Движок Memory используется системой для временных таблиц - внешних данных запроса (смотрите раздел «Внешние данные для обработки запроса»), для реализации `GLOBAL IN` (смотрите раздел «Операторы IN»). 
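Условный пример использования движка Memory (имена таблицы и столбцов произвольные):

``` sql
-- Данные хранятся только в оперативной памяти и теряются при перезапуске сервера
CREATE TABLE memory_example (k String, v UInt32) ENGINE = Memory;

INSERT INTO memory_example VALUES ('a', 1), ('b', 2);

SELECT * FROM memory_example WHERE k = 'a';
```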
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/memory/) diff --git a/docs/ru/engines/table-engines/special/merge.md b/docs/ru/engines/table-engines/special/merge.md index 656aa7cfd6b..714b087c201 100644 --- a/docs/ru/engines/table-engines/special/merge.md +++ b/docs/ru/engines/table-engines/special/merge.md @@ -65,4 +65,3 @@ FROM WatchLog - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/merge/) diff --git a/docs/ru/engines/table-engines/special/null.md b/docs/ru/engines/table-engines/special/null.md index 2c3af1ce11e..05f5c88bacb 100644 --- a/docs/ru/engines/table-engines/special/null.md +++ b/docs/ru/engines/table-engines/special/null.md @@ -7,4 +7,3 @@ toc_title: 'Null' Тем не менее, есть возможность создать материализованное представление над таблицей типа Null. Тогда данные, записываемые в таблицу, будут попадать в представление. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/null/) diff --git a/docs/ru/engines/table-engines/special/set.md b/docs/ru/engines/table-engines/special/set.md index 14b7f123a34..ced9abf55dc 100644 --- a/docs/ru/engines/table-engines/special/set.md +++ b/docs/ru/engines/table-engines/special/set.md @@ -20,4 +20,3 @@ toc_title: Set - [persistent](../../../operations/settings/settings.md#persistent) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/set/) diff --git a/docs/ru/engines/table-engines/special/url.md b/docs/ru/engines/table-engines/special/url.md index cdb5afddf75..b8fcd27204f 100644 --- a/docs/ru/engines/table-engines/special/url.md +++ b/docs/ru/engines/table-engines/special/url.md @@ -77,4 +77,3 @@ SELECT * FROM url_engine_table - индексы; - репликация. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/url/) diff --git a/docs/ru/engines/table-engines/special/view.md b/docs/ru/engines/table-engines/special/view.md index 18813a55da2..45aeb55cd85 100644 --- a/docs/ru/engines/table-engines/special/view.md +++ b/docs/ru/engines/table-engines/special/view.md @@ -7,4 +7,3 @@ toc_title: View Используется для реализации представлений (подробнее см. запрос `CREATE VIEW`). Не хранит данные, а хранит только указанный запрос `SELECT`. При чтении из таблицы, выполняет его (с удалением из запроса всех ненужных столбцов). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/view/) diff --git a/docs/ru/faq/general/columnar-database.md b/docs/ru/faq/general/columnar-database.md new file mode 100644 index 00000000000..f38e46cfe93 --- /dev/null +++ b/docs/ru/faq/general/columnar-database.md @@ -0,0 +1,25 @@ +--- +title: Что такое столбцовая база данных? +toc_hidden: true +toc_priority: 101 +--- + +# Что такое столбцовая (колоночная) база данных? {#what-is-a-columnar-database} + +В столбцовой БД данные каждого столбца хранятся отдельно (независимо) от других столбцов. Такой принцип хранения позволяет при выполнении запроса считывать с диска данные только тех столбцов, которые непосредственно участвуют в этом запросе. Обратная сторона такого принципа хранения заключается в том, что выполнение операций над строками становится более затратным. ClickHouse — типичный пример столбцовой СУБД. + +Ключевые преимущества столбцовой СУБД: + +- выполнение запросов над отдельными столбцами таблицы, а не над всей таблицей сразу; +- агрегация запросов на больших объемах данных; +- сжатие данных в столбцах. 
+ +Ниже — иллюстрация того, как извлекаются данные для отчетов при использовании обычной строковой СУБД и столбцовой СУБД: + +**Стандартная строковая СУБД** +![Стандартная строковая СУБД](https://clickhouse.tech/docs/en/images/row-oriented.gif#) + +**Столбцовая СУБД** +![Столбцовая СУБД](https://clickhouse.tech/docs/en/images/column-oriented.gif#) + +Для аналитических приложений столбцовые СУБД предпочтительнее, так как в них можно хранить много столбцов в таблице просто на всякий случай, и это не будет сказываться на скорости чтения данных. Столбцовые СУБД предназначены для обработки и хранения больших данных. Они прекрасно масштабируются при помощи распределенных кластеров на относительно недорогих серверах — для увеличения производительности. В ClickHouse для этого используются [распределенные](../../engines/table-engines/special/distributed.md) и [реплицированные](../../engines/table-engines/mergetree-family/replication.md) таблицы. diff --git a/docs/ru/faq/general/dbms-naming.md b/docs/ru/faq/general/dbms-naming.md new file mode 100644 index 00000000000..2611daeffcc --- /dev/null +++ b/docs/ru/faq/general/dbms-naming.md @@ -0,0 +1,17 @@ +--- +title: "Что означает название ClickHouse?" +toc_hidden: true +toc_priority: 10 +--- + +# Что означает название ClickHouse? {#what-does-clickhouse-mean} + +Это комбинация терминов **Click**stream и Data ware**House**. Название пришло из Яндекс.Метрики, для которой первоначально был разработан ClickHouse — там он использовался для хранения истории визитов пользователей на сайты и всех пользовательских действий — "кликов". Кстати, ClickHouse по-прежнему выполняет эту функцию. Узнать об этом больше можно на странице [истории ClickHouse](../../introduction/history.md). + +Поскольку название составное, использовать его нужно следующим образом: + +- единственно правильный способ написания — Click**H**ouse — с заглавной буквой H; +- если нужно сокращеннное название, используйте **CH**. Исторически сложилось, что в Китае также популярно сокращение CK — в основном, из-за того, что это название использовалось в одном из первых обсуждений ClickHouse на китайском языке. + +!!! info "Забавный факт" + Спустя годы после того, как ClickHouse получил свое название, принцип комбинирования двух слов, каждое из которых имеет подходящий смысл, был признан лучшим способом назвать базу данных в [исследовании Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), Associate Professor of Databases в Carnegie Mellon University. ClickHouse разделил награду "за лучшее название СУБД" с Postgres. diff --git a/docs/ru/faq/general/index.md b/docs/ru/faq/general/index.md index cf0f8451450..cf105eaf8bb 100644 --- a/docs/ru/faq/general/index.md +++ b/docs/ru/faq/general/index.md @@ -1,25 +1,24 @@ --- -title: General questions about ClickHouse +title: Общие вопросы о ClickHouse toc_hidden_folder: true toc_priority: 1 toc_title: Общие вопросы --- -# Общие вопросы о ClickHouse {#obshchie-voprosy} +# Общие вопросы о ClickHouse {#general-questions} Вопросы: -- Что такое ClickHouse? -- Почему ClickHouse такой быстрый? -- Кто пользуется ClickHouse? -- Что обозначает название “ClickHouse”? -- Что значит “Не тормозит”? -- Что такое OLAP? -- Что такое колоночная база данных? 
+- [Что такое ClickHouse?](../../index.md#what-is-clickhouse) +- [Почему ClickHouse такой быстрый?](why-clickhouse-is-so-fast.md) +- [Кто пользуется ClickHouse?](who-is-using-clickhouse.md) +- [Что обозначает название ClickHouse?](dbms-naming.md) +- [Как фраза “Не тормозит” осталась на всех футболках?](ne-tormozit.md) +- [Что такое OLAP?](olap.md) +- [Что такое столбцовая база данных?](columnar-database.md) - [Почему бы не использовать системы типа MapReduce?](mapreduce.md) !!! info "Если вы не нашли то, что искали:" - Загляните в другие категории F.A.Q. или поищите в других разделах документации, ориентируйтесь по оглавлению слева. - -{## [Original article](https://clickhouse.tech/docs/ru/faq/general/) ##} + Загляните в другие категории F.A.Q. или поищите в остальных разделах документации, ориентируясь по оглавлению слева. +[Original article](https://clickhouse.tech/docs/ru/faq/general/) diff --git a/docs/ru/faq/general/mapreduce.md b/docs/ru/faq/general/mapreduce.md index 645391493c3..8a524c9f680 100644 --- a/docs/ru/faq/general/mapreduce.md +++ b/docs/ru/faq/general/mapreduce.md @@ -4,13 +4,10 @@ toc_hidden: true toc_priority: 110 --- -## Почему бы не использовать системы типа MapReduce? {#pochemu-by-ne-ispolzovat-sistemy-tipa-mapreduce} +# Почему бы не использовать системы типа MapReduce? {#why-not-use-something-like-mapreduce} -Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция reduce сделана на основе распределённой сортировки. Наиболее распространённым opensource решением данного класса является [Apache Hadoop](http://hadoop.apache.org). Яндекс использует собственное решение — YT. +Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция свёртки реализована на основе распределённой сортировки. Наиболее распространённое решение с открытым кодом в данном классе — [Apache Hadoop](http://hadoop.apache.org). Яндекс пользуется собственным решением — YT. -Такие системы не подходят для онлайн запросов в силу слишком большой latency. То есть, не могут быть использованы в качестве бэкенда для веб-интерфейса. -Такие системы не подходят для обновления данных в реальном времени. -Распределённая сортировка не является оптимальным способом выполнения операции reduce, если результат выполнения операции и все промежуточные результаты, при их наличии, помещаются в оперативку на одном сервере, как обычно бывает в запросах, выполняющихся в режиме онлайн. В таком случае, оптимальным способом выполнения операции reduce является хэш-таблица. Частым способом оптимизации map-reduce задач является предагрегация (частичный reduce) с использованием хэш-таблицы в оперативной памяти. Эта оптимизация делается пользователем в ручном режиме. -Распределённая сортировка является основной причиной тормозов при выполнении несложных map-reduce задач. +Такие системы не подходят для онлайн запросов в силу слишком большой задержки. То есть не могут быть использованы в качестве бэкенда для веб-интерфейса. Также эти системы не подходят для обновления данных в реальном времени. Распределённая сортировка является не оптимальным способом для выполнения операции свёртки в случае запросов, выполняющихся в режиме онлайн, потому что результат выполнения операции и все промежуточные результаты (если такие есть) помещаются в оперативную память на одном сервере. В таком случае оптимальным способом выполнения операции свёртки является хеш-таблица. 
Частым способом оптимизации "map-reduce" задач является предагрегация (частичная свёртка) с использованием хеш-таблицы в оперативной памяти. Пользователь делает эту оптимизацию в ручном режиме. Распределённая сортировка — основная причина тормозов при выполнении несложных задач типа "map-reduce". -Большинство реализаций MapReduce позволяют выполнять произвольный код на кластере. Но для OLAP задач лучше подходит декларативный язык запросов, который позволяет быстро проводить исследования. Для примера, для Hadoop существует Hive и Pig. Также смотрите Cloudera Impala, Shark (устаревший) для Spark, а также Spark SQL, Presto, Apache Drill. Впрочем, производительность при выполнении таких задач является сильно неоптимальной по сравнению со специализированными системами, а сравнительно высокая latency не позволяет использовать эти системы в качестве бэкенда для веб-интерфейса. +Большинство реализаций MapReduce позволяют выполнять произвольный код на кластере. Но для OLAP-задач лучше подходит декларативный язык запросов, который позволяет быстро проводить исследования. Например, для Hadoop существуют Hive и Pig. Также посмотрите на Cloudera Impala, Shark (устаревший) для Spark, а также Spark SQL, Presto, Apache Drill. Впрочем, производительность при выполнении таких задач очень неоптимальная, если сравнивать со специализированными системами, а относительно высокая задержка не позволяет использовать эти системы в качестве бэкенда для веб-интерфейса. diff --git a/docs/ru/faq/general/ne-tormozit.md b/docs/ru/faq/general/ne-tormozit.md new file mode 100644 index 00000000000..1230e34c475 --- /dev/null +++ b/docs/ru/faq/general/ne-tormozit.md @@ -0,0 +1,25 @@ +--- +title: "What does “не тормозит” mean?" +toc_hidden: true +toc_priority: 11 +--- + +# Что значит “Не тормозит”? {#what-does-ne-tormozit-mean} + +Обычно этот вопрос возникает, когда люди видят официальные футболки ClickHouse. На них большими буквами написано **“ClickHouse не тормозит”**. + +До того, как код ClickHouse стал открытым, его разрабатывали как собственную систему хранения данных в крупнейшей российской ИТ-компании [Яндекс](https://yandex.com/company/). Поэтому оригинальный слоган написан по-русски. После выхода версии с открытым исходным кодом мы впервые выпустили некоторое количество таких футболок для мероприятий в России, и просто оставили прежний слоган. + +Когда мы решили отправить партию этих футболок на мероприятия вне России, мы пробовали подобрать подходящий английский слоган. К сожалению, мы так и не смогли придумать достаточно точный и выразительный перевод, ведь на русском этот слоган звучит очень ёмко и при этом довольно элегантно. К тому же, существовало ограничение по количеству символов на футболках. В итоге мы решили оставить русский вариант даже для международных событий. И это стало прекрасным решением, потому что люди по всему миру приятно удивлялись, когда видели фразу и интересовались, что же там написано. + +Итак, как же объяснить эту фразу на английском? Вот несколько вариантов: + +- Если переводить буквально, то получится что-то подобное: *“ClickHouse doesn’t press the brake pedal”*. +- Если же вы хотите максимально сохранить том смысл, который вкладывает в эту фразу человек из ИТ-сферы, то будет примерно следующее: *“If your larger system lags, it’s not because it uses ClickHouse”*. +- Более короткие, но не такие точные версии: *“ClickHouse is not slow”*, *“ClickHouse doesn’t lag”* или просто *“ClickHouse is fast”*. + +Если вы не видели наших футболок, посмотрите видео о ClickHouse. 
Например, вот это: + +![iframe](https://www.youtube.com/embed/bSyQahMVZ7w) + +P.S. Эти футболки не продаются, а распространяются бесплатно на большинстве митапов [ClickHouse](https://clickhouse.tech/#meet), обычно в награду за самые интересные вопросы или другие виды активного участия. diff --git a/docs/ru/faq/general/olap.md b/docs/ru/faq/general/olap.md new file mode 100644 index 00000000000..9dce0ffbdf7 --- /dev/null +++ b/docs/ru/faq/general/olap.md @@ -0,0 +1,39 @@ +--- +title: Что такое OLAP? +toc_hidden: true +toc_priority: 100 +--- + +# Что такое OLAP? {#what-is-olap} + +[OLAP](https://ru.wikipedia.org/wiki/OLAP) (OnLine Analytical Processing) переводится как обработка данных в реальном времени. Это широкий термин, который можно рассмотреть с двух сторон: с технической и с точки зрения бизнеса. Для самого общего понимания можно просто прочитать его с конца: + +**Processing** + Обрабатываются некие исходные данные… + +**Analytical** +: … чтобы получить какие-то аналитические отчеты или новые знания… + +**OnLine** +: … в реальном времени, практически без задержек на обработку. + +## OLAP с точки зрения бизнеса {#olap-from-the-business-perspective} + +В последние годы бизнес-сообщество стало осознавать ценность данных. Компании, которые принимают решения вслепую, чаще всего отстают от конкурентов. Управление бизнесом на основе данных, которое применяется успешными компаниями, побуждает собирать все данные, которые могут быть полезны в будущем для принятия бизнес-решений, а также подбирать механизмы, чтобы своевременно эти данные анализировать. Именно для этого и нужны СУБД с OLAP. + +С точки зрения бизнеса, OLAP позволяет компаниям постоянно планировать, анализировать и оценивать операционную деятельность, чтобы повышать её эффективность, уменьшать затраты и как следствие — увеличивать долю рынка. Это можно делать как в собственной системе, так и в облачной (SaaS), в веб или мобильных аналитических приложениях, CRM-системах и т.д. Технология OLAP используется во многих приложениях BI (Business Intelligence — бизнес-аналитика). + +ClickHouse — это СУБД с OLAP, которая часто используется для поддержки SaaS-решений для анализа данных в различных предметных областях. Но поскольку некоторые компании все еще не слишком охотно размещают свои данные в облаке (у сторонних провайдеров), ClickHouse может быть развернут и на собственных серверах заказчика. + +## OLAP с технической точки зрения {#olap-from-the-technical-perspective} + +Все СУБД можно разделить на две группы: OLAP (**аналитическая** обработка в реальном времени) и OLTP (обработка **транзакций** в реальном времени). OLAP используются для построения отчетов на основе больших объемов накопленных исторических данных, но эти отчеты обновляются не слишком часто. OLTP обычно применяются для обработки непрерывных потоков операций (транзакций), каждая из которых изменяет состояние данных. + +На практике OLAP и OLTP — это не строго разделённые категории, а скорее спектр возможностей. Большинство СУБД специализируются на каком-то одном виде обработки данных, но имеют инструменты и для выполнения других операций, когда это необходимо. Из-за такой специализации часто приходится использовать несколько СУБД и интегрировать их между собой. Это вполне реальная и решаемая задача, но, как известно, чем больше систем, тем выше расходы на их содержание. Поэтому в последние годы становятся популярны гибридные СУБД — HTAP (**Hybrid Transactional/Analytical Processing**), которые одинаково эффективно выполняют оба вида операций по обработке данных. 
+ +Даже если СУБД сначала развивались исключительно как OLAP или как OLTP, разработчики постепенно двигаются в сторону HTAP, чтобы сохранять конкурентоспособность. И ClickHouse не исключение. Изначально он создавался как [OLAP СУБД с максимальной производительностью](../../faq/general/why-clickhouse-is-so-fast.md), и на сегодняшний день в нем нет полноценной поддержки обработки тразакций, но уже реализованы некоторые возможности, такие как постоянная скорость чтения/записи данных и мутации при изменении/удалении данных. + +Принципиальное "разделение труда" между OLAP и OLTP СУБД сохраняется: + +- Чтобы эффективно строить аналитические отчеты, нужно уметь обрабатывать колонки по отдельности, поэтому большинство OLAP СУБД — [столбцовые](../../faq/general/columnar-database.md). +- Хранение данных по столбцам снижает скорость выполнения операций над строками (таких как добавление или изменение данных) пропорционально числу столбцов, а это число может быть огромным для систем, ориентированных на сбор разнообразных детальных данных о событиях. Поэтому большинство OLTP систем используют строковые СУБД. diff --git a/docs/ru/faq/general/who-is-using-clickhouse.md b/docs/ru/faq/general/who-is-using-clickhouse.md new file mode 100644 index 00000000000..02a4d198deb --- /dev/null +++ b/docs/ru/faq/general/who-is-using-clickhouse.md @@ -0,0 +1,19 @@ +--- +title: Кто пользуется ClickHouse? +toc_hidden: true +toc_priority: 9 +--- + +# Кто пользуется ClickHouse? {#who-is-using-clickhouse} + +Так как CH является продуктом с открытым исходным кодом, на этот вопрос не так просто ответить. Вы не должны сообщать кому-либо о том, что вы начали пользоваться ClickHouse, достаточно взять исходный код или предкомпилированный установочный пакет. Не нужно подписывать контракт, а [лицензия Apache 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) позволяет распространять ПО без ограничений. + +Кроме того, стек используемых технологий часто не раскрывается из-за NDA. Некоторые компании рассматривают технологии, которыми пользуются, как своё конкурентное преимущество, даже если это продукты с открытым исходным кодом. Такие компании не позволяют сотрудникам рассказывать о том, с каким ПО они работают, или требуют согласовывать это с PR-отделом. + +Итак, как же узнать, кто пользуется ClickHouse? + +Один из способов — **поспрашивать в своем окружении**. В разговорах люди более охотно делятся тем, какие технологии внедрены в их компаниях, какие задачи решаются с их помощью, могут назвать характеристики аппаратного обеспечения, объемы данных и т.д. Мы регулярно разговариваем с пользователями во время [митапов ClickHouse](https://www.youtube.com/channel/UChtmrD-dsdpspr42P_PyRAw/playlists) по всему миру и слышали о более чем 1000 компаний, которые пользуются ClickHouse. К сожалению, мы не можем раскрывать подробности, потому что по умолчанию считаем такие истории защищенными NDA, чтобы избежать любых возможных проблем. Вы можете прийти на любой из наших будущих митапов и самостоятельно поговорить с другими пользователями. Мы анонсируем события по разным каналам, например, вы можете подписаться на [наш Twitter](http://twitter.com/ClickHouseDB/). + +Второй способ узнать — посмотреть, что компании **говорят публично** о том, как именно они пользуются ClickHouse. Это более существенная информация, потому что ее можно найти в публикациях в блогах, видеозаписях разговоров, презентациях и т.д. Мы собираем ссылки на такие материалы на своей странице **[Пользователи ClickHouse](../../introduction/adopters.md)**. 
Будем рады, если вы поделитесь историей вашей компании или ссылками по теме (но всегда помните о том, что не стоит нарушать NDA). + +В числе пользователей есть множество очень крупных компаний, знакомых вам, таких как Bloomberg, Cisco, China Telecom, Tencent или Uber, но на самом деле это далеко не полный перечень. К примеру, если вы возьмете [список Forbes крупнейших ИТ-компаний в 2020](https://www.forbes.com/sites/hanktucker/2020/05/13/worlds-largest-technology-companies-2020-apple-stays-on-top-zoom-and-uber-debut/), то увидите, что более половины из этих компаний так или иначе пользуются ClickHouse. Также необходимо упомянуть Яндекс — компанию, которая открыла исходный код ClickHouse в 2016 году и является одной из самых крупных ИТ-компаний в Европе. diff --git a/docs/ru/faq/general/why-clickhouse-is-so-fast.md b/docs/ru/faq/general/why-clickhouse-is-so-fast.md new file mode 100644 index 00000000000..694488b40a9 --- /dev/null +++ b/docs/ru/faq/general/why-clickhouse-is-so-fast.md @@ -0,0 +1,63 @@ +--- +title: Почему ClickHouse так быстро работает? +toc_hidden: true +toc_priority: 8 +--- + +# Почему ClickHouse так быстро работает? {#why-clickhouse-is-so-fast} + +Производительность изначально заложена в архитектуре ClickHouse. Высокая скорость выполнения запросов была и остается самым важным критерием, который учитывается при разработке. Но мы обращаем внимание и на другие характеристики, такие как удобство использования, масштабируемость, безопасность. Всё это делает ClickHouse настоящей промышленной разработкой. + +Сначала ClickHouse создавался как прототип, который должен был отлично справляться с одной единственной задачей — отбирать и агрегировать данные с максимальной скоростью. Это необходимо, чтобы создать обычный аналитический отчет, и именно это делает стандартный запрос [GROUP BY](../../sql-reference/statements/select/group-by.md). Для решения такой задачи команда разработки ClickHouse приняла несколько архитектурных решений: + +Столбцовое хранение данных +: Исходные данные часто содержат сотни или даже тысячи столбцов, в то время как для конкретного отчета нужны только несколько из них. Система не должна читать ненужные столбцы, поскольку операции чтения данных с диска — самые дорогостоящие. + +Индексы +: ClickHouse хранит структуры данных в оперативной памяти, что позволяет считывать не только нужные столбцы, но и нужные диапазоны строк для этих столбцов. + +Сжатие данных +: Различные способы хранения смежных значений в столбце позволяют достигать более высокой степени сжатия данных (по сравнению с обычными строковыми СУБД), т.к. в смежных строках значения часто бывают одинаковыми или близкими. В дополнение к универсальному сжатию ClickHouse поддерживает [специализированные кодеки](../../sql-reference/statements/create/table.md#create-query-specialized-codecs), которые позволяют еще больше уменьшить объемы хранимых данных. + +Векторные запросы +: ClickHouse не только хранит, но и обрабатывает данные в столбцах. Это приводит к лучшей утилизации кеша процессора и позволяет использовать инструкции [SIMD](https://en.wikipedia.org/wiki/SIMD). + +Масштабируемость +: ClickHouse может задействовать все доступные мощности процессоров и объемы дисков, чтобы выполнить даже одиночный запрос. Не только на отдельном сервере, но и в целом кластере. + +Похожие техники используют и многие другие СУБД. **Внимание к мельчайшим деталям** — вот что на самом деле выделяет ClickHouse. 
Большинство языков программирования поддерживают большинство распространенных алгоритмов и структур данных, но как правило, они бывают слишком универсальными, чтобы быть по-настоящему эффективными. Мы рассматриваем каждую задачу как тонкий инструмент со множеством настроек, вместо того чтобы просто взять какую-то случайную реализацию. Например, если вам нужна хеш-таблица, вот несколько ключевых вопросов, которые нужно продумать: + +- Какую хеш-функцию выбрать? +- Каким способом разрешать коллизии: [открытая адресация](https://en.wikipedia.org/wiki/Open_addressing) или [метод цепочек](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)? +- Как хранить данные в памяти: ключи и значения в одном массиве или в отдельных? Будут ли там храниться маленькие или большие значения? +- Фактор заполнения: когда и как менять размер таблицы? Как перемещать значения при изменении размера? +- Будут ли значения удаляться и если да, то какой алгоритм сработает лучше? +- Понадобится ли быстрое зондирование с использованием битовых масок, встроенное хранение строковых ключей, поддержка неперемещаемых значений, предварительная выборка и пакетная обработка? + +Хеш-таблица — ключевая структура данных для реализации `GROUP BY`, и ClickHouse автоматически выбирает одну из [более 30 вариаций](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) для каждого специфического запроса. + +Для алгоритмов сортировки, например, следует продумать следующие вопросы: + +- Что будет сортироваться: массив чисел, кортежей, строк или структур? +- Доступны ли все данные в оперативной памяти? +- Нужна ли стабильная сортировка? +- Нужна ли полная сортировка? Может быть, будет достаточно частичной или выборочной сортировки? +- Как сравнивать данные? +- Не являются ли данные частично отсортированными? + +Алгоритмы, основанные на характеристиках рабочих данных, обычно дают лучшие результаты, чем их более универсальные аналоги. Если заранее неизвестно, с какими данными придется работать, ClickHouse будет в процессе выполнения пробовать различные реализации и в итоге выберет оптимальный вариант. Например, рекомендуем прочитать [статью о том, как в ClickHouse реализуется распаковка LZ4](https://habr.com/en/company/yandex/blog/457612/). + +Ну и последнее, но тем не менее важное условие: команда ClickHouse постоянно отслеживает в интернете сообщения пользователей о найденных ими удачных реализациях, алгоритмах или структурах данных, анализирует и пробует новые идеи. Иногда в этом потоке сообщений попадаются действительно ценные предложения. + +!!! info "Советы о том, как создать собственную высокопроизводительную систему" + + + - При проектировании системы обращайте внимание на мельчайшие детали реализации. + - Учитывайте возможности аппаратного обеспечения. + - Выбирайте структуры и представления данных исходя из требований конкретной задачи. + - Для особых случаев разрабатывайте специализированные решения. + - Пробуйте новые алгоритмы, о которых вы вчера прочитали в интернете. Ищите возможности для совершенствования. + - Выбирайте алгоритмы динамически, в процессе выполнения, на основе статистики. + - Ориентируйтесь на показатели, собранные при работе с реальными данными. + - Проверяйте производительность в процессе CI. + - Измеряйте и анализируйте всё, что только возможно. diff --git a/docs/ru/faq/index.md b/docs/ru/faq/index.md index 611e604bc8e..08deec5f7ce 100644 --- a/docs/ru/faq/index.md +++ b/docs/ru/faq/index.md @@ -4,14 +4,42 @@ toc_hidden: true toc_priority: 76 --- -# Содержание F.A.Q. 
{#soderzhanie} +# ClickHouse F.A.Q. {#clickhouse-f-a-q} -В этом разделе документации собрали вопросы о ClickHouse, которые задают чаще всего. +В этом разделе документации собраны ответы на вопросы о ClickHouse, которые задают чаще всего. Категории: -- **[Общие вопросы](../faq/general/index.md)** -- **[Применение](../faq/use-cases/index.md)** -- **[Операции](../faq/operations/index.md)** -- **[Интеграция](../faq/integration/index.md)** +- **[Общие вопросы](general/index.md)** + - [Что такое ClickHouse?](../index.md#what-is-clickhouse) + - [Почему ClickHouse такой быстрый?](general/why-clickhouse-is-so-fast.md) + - [Кто пользуется ClickHouse?](general/who-is-using-clickhouse.md) + - [Что обозначает название ClickHouse?](general/dbms-naming.md) + - [Как фраза “Не тормозит” осталась на всех футболках?](general/ne-tormozit.md) + - [Что такое OLAP?](general/olap.md) + - [Что такое столбцовая база данных?](general/columnar-database.md) + - [Почему бы не использовать системы типа MapReduce?](general/mapreduce.md) +- **[Применение](use-cases/index.md)** + - [Можно ли использовать ClickHouse как БД временных рядов?](use-cases/time-series.md) + - [Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"?](use-cases/key-value.md) +- **[Операции](operations/index.md)** + - [Какую версию ClickHouse использовать?](operations/production.md) + - [Возможно ли удалить старые записи из таблицы ClickHouse?](operations/delete-old-data.md) +- **[Интеграция](integration/index.md)** + - [Как экспортировать данные из ClickHouse в файл?](integration/file-export.md) + - [Как импортировать JSON в ClickHouse?](integration/json-import.md) + - [Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC?](integration/oracle-odbc.md) +{## TODO +Question candidates: +- How to choose a primary key? +- How to add a column in ClickHouse? +- Too many parts +- How to filter ClickHouse table by an array column contents? +- How to insert all rows from one table to another of identical structure? +- How to kill a process (query) in ClickHouse? +- How to implement pivot (like in pandas)? +- How to remove the default ClickHouse user through users.d? +- Importing MySQL dump to Clickhouse +- Window function workarounds (row\_number, lag/lead, running diff/sum/average) +##} diff --git a/docs/ru/faq/integration/file-export.md b/docs/ru/faq/integration/file-export.md index d21ffbd3349..b9a54dc0766 100644 --- a/docs/ru/faq/integration/file-export.md +++ b/docs/ru/faq/integration/file-export.md @@ -1,27 +1,27 @@ --- -title: How do I export data from ClickHouse to a file? +title: Как экспортировать данные из ClickHouse в файл? toc_hidden: true toc_priority: 10 --- -## Как экспортировать данные из ClickHouse в файл? {#how-to-export-to-file-rus} +# Как экспортировать данные из ClickHouse в файл? {#how-to-export-to-file-rus} -### Секция INTO OUTFILE {#sektsiia-into-outfile-rus} +## Секция INTO OUTFILE {#using-into-outfile-clause} -Добавьте секцию [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause) к своему запросу. +Добавьте к своему запросу секцию [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause). Например: ``` sql -SELECT * FROM table INTO OUTFILE 'file' +SELECT * FROM table INTO OUTFILE 'file'; ``` -По умолчанию, для выдачи данных ClickHouse использует формат [TabSeparated](../../interfaces/formats.md#tabseparated). 
Чтобы выбрать [формат данных](../../interfaces/formats.md), используйте секцию [FORMAT](../../sql-reference/statements/select/format.md#format-clause). +По умолчанию при выдаче данных ClickHouse использует формат [TabSeparated](../../interfaces/formats.md#tabseparated). Чтобы выбрать другой [формат данных](../../interfaces/formats.md), используйте секцию [FORMAT](../../sql-reference/statements/select/format.md#format-clause). Например: ``` sql -SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV +SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV; ``` ## Таблица с движком File {#using-a-file-engine-table} diff --git a/docs/ru/faq/integration/index.md b/docs/ru/faq/integration/index.md index 59fd84a8139..fdb9039d066 100644 --- a/docs/ru/faq/integration/index.md +++ b/docs/ru/faq/integration/index.md @@ -1,19 +1,19 @@ --- -title: Questions about integrating ClickHouse and other systems +title: Интеграция ClickHouse с другими системами toc_hidden_folder: true toc_priority: 4 toc_title: Интеграция --- -# Вопросы об интеграции ClickHouse с другими системами {#question-about-integrating-clickhouse-and-other-systems-rus} +# Интеграция ClickHouse с другими системами {#question-about-integrating-clickhouse-and-other-systems-rus} Вопросы: - [Как экспортировать данные из ClickHouse в файл?](file-export.md) -- Как импортировать JSON в ClickHouse? +- [Как импортировать JSON в ClickHouse?](json-import.md) - [Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC?](oracle-odbc.md) !!! info "Если вы не нашли то, что искали" Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируйтесь по оглавлению слева. -{## [Original article](https://clickhouse.tech/docs/ru/faq/integration/) ##} +[Original article](https://clickhouse.tech/docs/ru/faq/integration/) diff --git a/docs/ru/faq/integration/json-import.md b/docs/ru/faq/integration/json-import.md new file mode 100644 index 00000000000..17911dd1ef8 --- /dev/null +++ b/docs/ru/faq/integration/json-import.md @@ -0,0 +1,33 @@ +--- +title: Как импортировать JSON в ClickHouse? +toc_hidden: true +toc_priority: 11 +--- + +# Как импортировать JSON в ClickHouse? {#how-to-import-json-into-clickhouse} + +ClickHouse поддерживает широкий спектр [входных и выходных форматов данных](../../interfaces/formats.md). Среди них есть множество вариаций JSON, но чаще всего для импорта данных используют [JSONEachRow](../../interfaces/formats.md#jsoneachrow): один JSON-объект в строке, каждый объект с новой строки. + +## Примеры {#examples} + +С помощью [HTTP-интерфейса](../../interfaces/http.md): + +``` bash +$ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @- +``` + +При помощи [интефейса CLI](../../interfaces/cli.md): + +``` bash +$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow" +``` + +Чтобы не вставлять данные вручную, используйте одну из [готовых библиотек](../../interfaces/index.md). + +## Полезные настройки {#useful-settings} + +- `input_format_skip_unknown_fields` позволяет импортировать JSON, даже если он содержит дополнительные поля, которых нет в таблице (отбрасывая лишние поля). +- `input_format_import_nested_json` позволяет импортировать вложенные JSON-объекты в столбцы типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md). + +!!! 
note "Примечание" + В HTTP-интерфейсе настройки передаются через параметры `GET` запроса, в `CLI` interface — как дополнительные аргументы командной строки, начинающиеся с `--`. diff --git a/docs/ru/faq/integration/oracle-odbc.md b/docs/ru/faq/integration/oracle-odbc.md index f41f1ae7fc1..1997cdcc1d2 100644 --- a/docs/ru/faq/integration/oracle-odbc.md +++ b/docs/ru/faq/integration/oracle-odbc.md @@ -1,10 +1,10 @@ --- -title: What if I have a problem with encodings when using Oracle via ODBC? +title: Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? toc_hidden: true toc_priority: 20 --- -## Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? {#oracle-odbc-encodings-rus} +# Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? {#oracle-odbc-encodings} Если вы используете Oracle через драйвер ODBC в качестве источника внешних словарей, необходимо задать правильное значение для переменной окружения `NLS_LANG` в `/etc/default/clickhouse`. Подробнее читайте в [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). diff --git a/docs/ru/faq/operations/delete-old-data.md b/docs/ru/faq/operations/delete-old-data.md new file mode 100644 index 00000000000..7870f9b3bfe --- /dev/null +++ b/docs/ru/faq/operations/delete-old-data.md @@ -0,0 +1,42 @@ +--- +title: Возможно ли удалить старые записи из таблицы ClickHouse? +toc_hidden: true +toc_priority: 20 +--- + +# Возможно ли удалить старые записи из таблицы ClickHouse? {#is-it-possible-to-delete-old-records-from-a-clickhouse-table} + +Если отвечать коротко, то да. В ClickHouse есть множество механизмов, которые позволяют освобождать место на диске, удаляя старые данные. Каждый механизм подходит для разных сценариев. + +## TTL {#ttl} + +ClickHouse позволяет автоматически удалять данные при выполнении некоторых условий. Эти условия задаются как выражение, вычисляемое на основе значений любых столбцов, обычно это просто разница между текущим моментом времени и значением какого-то столбца, содержащего дату и время. + +Ключевое преимущество такого подхода в том, что не нужно использовать внешнюю систему, чтобы запустить процесс — когда заданы условия TTL, удаление данных выполняется автоматически в фоновом режиме. + +!!! note "Note" + TTL можно использовать не только для перемещения в [/dev/null](https://en.wikipedia.org/wiki/Null_device), но еще и между дисками, например, с SSD на HDD. + +[Подробнее о конфигурировании TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +## ALTER DELETE {#alter-delete} + +ClickHouse не удаляет данные в реальном времени, как СУБД [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing). Больше всего на такое удаление похожи мутации. Они выполняются с помощью запросов `ALTER ... DELETE` или `ALTER ... UPDATE`. В отличие от обычных запросов `DELETE` и `UPDATE`, мутации выполняются асинхронно, в пакетном режиме, не в реальном времени. В остальном после слов `ALTER TABLE` синтаксис обычных запросов и мутаций одинаковый. + +`ALTER DELETE` можно использовать для гибкого удаления устаревших данных. Если вам нужно делать это регулярно, единственный недостаток такого подхода будет заключаться в том, что потребуется внешняя система для запуска запроса. Кроме того, могут возникнуть некоторые проблемы с производительностью, поскольку мутации перезаписывают целые куски данных если в них содержится хотя бы одна строка, которую нужно удалить. 
+ +Это самый распространенный подход к тому, чтобы обеспечить соблюдение принципов [GDPR](https://gdpr-info.eu) в вашей системе на ClickHouse. + +Подробнее смотрите в разделе [Мутации](../../sql-reference/statements/alter/index.md#alter-mutations). + +## DROP PARTITION {#drop-partition} + +Запрос `ALTER TABLE ... DROP PARTITION` позволяет эффективно удалять целые партиции. Этот способ не такой гибкий, важно правильно сконфигурировать партиции при создании таблицы, но он подходит для достаточно широкого спектра типовых задач. Как и для мутаций, для регулярного запуска таких запросов нужна внешняя система. + +Подробнее смотрите в разделе [Манипулирование с партициями и кусками](../../sql-reference/statements/alter/partition.md#alter_drop-partition). + +## TRUNCATE {#truncate} + +Это достаточно радикальный способ, он удаляет все данные в таблице, но хорошо подходит для отдельных случаев. + +Подробнее смотрите в разделе об [удалении партиций](../../sql-reference/statements/alter/partition.md#alter_drop-partition). diff --git a/docs/ru/faq/operations/index.md b/docs/ru/faq/operations/index.md index 98a04213fb7..723ae2d10d3 100644 --- a/docs/ru/faq/operations/index.md +++ b/docs/ru/faq/operations/index.md @@ -1,18 +1,18 @@ --- -title: Question about operating ClickHouse servers and clusters +title: Вопросы о производительности серверов и кластеров ClickHouse toc_hidden_folder: true toc_priority: 3 toc_title: Операции --- -# Вопросы о производительности серверов и кластеров ClickHouse {#voprosy-ob-operating-clickhouse-servers-and-clusters} +# Вопросы о производительности серверов и кластеров ClickHouse {#question-about-operating-clickhouse-servers-and-clusters} Вопросы: -- Which ClickHouse version to use in production? -- Is it possible to delete old records from a ClickHouse table? +- [Какую версию ClickHouse использовать?](production.md) +- [Возможно ли удалить старые записи из таблицы ClickHouse?](delete-old-data.md) - !!! info "Don’t see what you were looking for?" - Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. +!!! info "Если вы не нашли то, что искали" + Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируйтесь по оглавлению слева. - {## [Original article](https://clickhouse.tech/docs/en/faq/production/) ##} +[Original article](https://clickhouse.tech/docs/en/faq/operations/) diff --git a/docs/ru/faq/operations/production.md b/docs/ru/faq/operations/production.md new file mode 100644 index 00000000000..a82a7f5e888 --- /dev/null +++ b/docs/ru/faq/operations/production.md @@ -0,0 +1,70 @@ +--- +title: Какую версию ClickHouse использовать? +toc_hidden: true +toc_priority: 10 +--- + +# Какую версию ClickHouse использовать? {#which-clickhouse-version-to-use-in-production} + +Во-первых, давайте обсудим, почему возникает этот вопрос. Есть две основные причины: + +1. ClickHouse развивается достаточно быстро, и обычно мы выпускаем более 10 стабильных релизов в год. Так что есть из чего выбрать, а это не всегда просто. +2. Некоторые пользователи не хотят тратить время на анализ того, какая версия лучше подходит для их задач, и просто хотят получить совет от эксперта. + +Вторая причина более весомая, так что начнем с нее, а затем рассмотрим, какие бывают релизы ClickHouse. + +## Какую версию ClickHouse вы посоветуете?
{#which-clickhouse-version-do-you-recommend} + +Казалось бы, самый удобный вариант — нанять консультанта или довериться эксперту, и делегировать ему ответственность за вашу систему. Вы устанавливаете ту версию ClickHouse, которую вам рекомендовали, и теперь если что-то пойдет не так — это уже не ваша вина. На самом деле это не так. Никто не может знать лучше вас, что происходит в вашей системе. + +Как же правильно выбрать версию ClickHouse, на которую стоит обновиться? Или как выбрать версию, с которой следует начать, если вы только внедряете ClickHouse? Во-первых, мы рекомендуем позаботиться о создании **реалистичной тестовой среды** (pre-production). В идеальном мире это была бы полная копия рабочей среды, но чаще всего такое решение оказывается слишком дорогостоящим. + +Чтобы тестовая среда была достаточно надежной, но не слишком дорогостоящей, учитывайте следующие моменты: + +- В тестовой среде нужно выполнять набор запросов, максимально близкий к тому, который будет выполняться в реальной среде: + - Не используйте тестовую среду в режиме "только для чтения", работая с каким-то статичным набором данных. + - Не используйте её в режиме "только для записи", проверяя лишь копирование данных, без построения типовых отчетов. + - Не очищайте её, удаляя все данные подчистую вместо тестирования рабочих схем миграции. +- Выполняйте реальные запросы на выборке из реальных рабочих данных. Постарайтесь подготовить репрезентативную выборку, на которой запрос `SELECT` будет возвращать адекватные результаты. Если регламенты безопасности не позволяют использовать реальные данные за пределами защищенной рабочей среды, используйте обфускацию. +- Убедитесь, что тестовая среда находится под контролем тех же систем мониторинга и оповещения, что и рабочая. +- Если ваша рабочая среда распределена между разными дата-центрами и регионами, тестовая среда должна быть такой же. +- Если в рабочей среде используются сложные инструменты типа репликации, распределённых таблиц или каскадных материализованных представлений, тестовая среда должна быть сконфигурирована так же. +- Обычно в тестовой среде стараются использовать то же количество серверов и виртуальных машин, что и в рабочей, но делают их меньшего объема. Либо наоборот, используют существенно меньшее число серверов и ВМ, но тех же объемов. Первый вариант скорее позволит обнаружить проблемы, связанные с работой сети, а второй вариант более прост в управлении. + +Второе направление — **автоматизированное тестирование**. Не думайте, что если какой-то запрос отработал успешно один раз, так будет всегда. Считается приемлемым выполнять некоторые юнит-тесты, используя "заглушки" вместо запросов к СУБД. Но вы должны проводить достаточное количество автотестов, где запросы выполняются в реальном ClickHouse, чтобы убедиться, что все важные задачи отрабатывают должным образом. + +В продолжение этой темы, вы можете поделиться вашими автотестами и передать их [в открытую тестовую среду ClickHouse](https://github.com/ClickHouse/ClickHouse/tree/master/tests), которая используется для постоянного развития нашей СУБД. Вам придётся потратить немного времени и сил, чтобы научиться [составлять и выполнять тесты](../../development/tests.md), а также чтобы перенести ваши тесты на эту платформу. Наградой за это станет уверенность в том, что новые стабильные релизы ClickHouse будут корректно работать на ваших задачах. 
Это гораздо лучше, чем тратить время на то, чтобы вновь отлавливать прежние ошибки в новых версиях, а затем ждать, пока их исправят и включат эти исправления в очередной релиз. Некоторые компании уже включили в корпоративные регламенты необходимость передачи своих тестов в ClickHouse, прежде всего стоит упомянуть [правило Beyonce](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well), действующее в Google. + +После того, как вы подготовили тестовую среду и инфраструктуру, выбор версии ClickHouse упрощается: + +1. Проверяйте новые релизы ClickHouse с помощью подготовленных автотестов. Вы можете проверять не только стабильные релизы, но и тестовые, хотя работать с такими релизами не рекомендуется. +2. Если новый релиз ClickHouse успешно прошел ваши автотесты, внедряйте его в тестовой среде и проверяйте работоспособность всех ваших задач. +3. Сообщайте обо всех обнаруженных проблемах в [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues). +4. Если никаких серьезных проблем не было выявлено, можно установить новый релиз ClickHouse в рабочую среду. Чтобы еще больше снизить риски, вы можете внедрить специальные техники поэтапного перехода на новые релизы, такие как [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) или [green-blue deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html). + +Как вы уже поняли, ClickHouse не требует какого-то особенного подхода — описанные выше правила широко используются для любых элементов инфраструктуры, если нужно обеспечить ее надежность и если компании серьезно подходят к вопросам стабильности своих систем. + +## Какой вид релиза ClickHouse выбрать? {#how-to-choose-between-clickhouse-releases} + +Если вы заглянете в раздел, где публикуются установочные пакеты ClickHouse, вы увидите там следующие виды пакетов: + +1. `testing` +2. `prestable` +3. `stable` +4. `lts` (long-term support) + +Как уже упоминалось выше, тестовые релизы (`testing`) стоит использовать для раннего обнаружения ошибок, в рабочей среде мы не рекомендуем использовать такие релизы, поскольку они еще не протестированы так же тщательно, как остальные. + +Подготовительные (`prestable`) — это релизы-кандидаты, которые с большой вероятностью скоро будут доведены до стабильного состояния. Вы можете использовать их в тестовой среде и сообщать нам об обнаруженных ошибках. + +В рабочей среде мы рекомендуем использовать либо стабильный релиз (`stable`), либо релиз с долговременной поддержкой (`lts`). Если вы выбираете между этими двумя видами релизов, примите во внимание следующее: + +- По умолчанию мы рекомендуем релизы `stable`. Новый стабильный релиз выпускается примерно раз в месяц, что открывает доступ к новым функциям. Три последних стабильных релиза находятся на поддержке — это означает, что в них интегрируются исправленные ошибки и доработки. +- Релизы `lts` выпускаются дважды в год и находятся на поддержке в течение года с момента выхода. Они более предпочтительны в следующих случаях: + - ваши корпоративные регламенты запрещают частые обновления или использование любых релизов, кроме LTS; + - вы используете ClickHouse в продуктах, которые не задействуют сложные инструменты ClickHouse, или у вас не хватает ресурсов для частого их обновления. + +Часто компании, которые изначально ориентировались на релизы `lts`, позднее переходят на `stable`, поскольку хотят быстрее получать доступ к новым возможностям. + +!!!
warning "Важно" + Мы всегда стремимся поддерживать совместимость релизов, но иногда это правило нарушается, и какие-то отдельные возможности в новых релизах становятся недоступны. Перед обновлением ClickHouse обязательно изучите [журнал изменений](../../whats-new/changelog/index.md), чтобы убедиться, что в нем нет объявлений о нарушении обратной совместимости. diff --git a/docs/ru/faq/use-cases/index.md b/docs/ru/faq/use-cases/index.md index 6e7f2b36fe0..3507d69ad7c 100644 --- a/docs/ru/faq/use-cases/index.md +++ b/docs/ru/faq/use-cases/index.md @@ -1,14 +1,13 @@ --- -title: Questions about ClickHouse use cases +title: Вопросы о применении ClickHouse toc_hidden_folder: true toc_priority: 2 toc_title: Применение --- -# Вопросы о применении ClickHouse {#voprosy-o-primenenii} +# Вопросы о применении ClickHouse {#questions-about-clickhouse-use-cases} Вопросы: -- Can I use ClickHouse as a time-series database? -- Can I use ClickHouse as a key-value storage? - +- [Можно ли использовать ClickHouse как БД временных рядов?](time-series.md) +- [Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"?](key-value.md) \ No newline at end of file diff --git a/docs/ru/faq/use-cases/key-value.md b/docs/ru/faq/use-cases/key-value.md new file mode 100644 index 00000000000..4daa9773f84 --- /dev/null +++ b/docs/ru/faq/use-cases/key-value.md @@ -0,0 +1,19 @@ +--- +title: Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"? +toc_hidden: true +toc_priority: 101 +--- + +# Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"? {#can-i-use-clickhouse-as-a-key-value-storage} + +Если отвечать коротко, то **"нет"**. Операции над данными вида "ключ-значение" занимают одну из верхних позиций в списке ситуаций, когда категорически **не стоит**{.text-danger} использовать ClickHouse. Это [OLAP](../../faq/general/olap.md) СУБД, в то время как есть много специализированных СУБД для данных вида "ключ-значение". + +Тем не менее, в некоторых ситуациях имеет смысл использовать ClickHouse для запросов над данными вида "ключ-значение". Чаще всего это относится к системам с относительно невысокой нагрузкой, в которых основной объем операций относится к аналитической обработке данных и отлично подходит для ClickHouse. Однако в них есть некий второстепенный процесс, в котором нужно обрабатывать данные вида "ключ-значение", при этом процесс не требует слишком высокой производительности и не имеет строгих ограничений по задержкам выполнения запросов. Если у вас нет ограничений по бюджету, вы можете использовать для таких операций вспомогательную базу данных "ключ-значение", но это увеличит расходы на обслуживание еще одной СУБД (мониторинг, бэкапы и т.д.). + +Если вы все же решите не следовать рекомендациям и использовать ClickHouse для работы с данными вида "ключ-значение", вот несколько советов: + +- Главная причина, по которой точечный запрос в ClickHouse становится ресурсозатратным — это разреженный индекс для первичного ключа в [таблице семейства MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). Этот индекс не может обращаться напрямую к каждой строке данных, вместо этого он обращается к каждой N-ой строке, а затем сканирует соседние строки вплоть до указанной, обрабатывая по пути лишние данные. При обработке данных вида "ключ-значение" может быть полезно уменьшить значение N при помощи настройки `index_granularity`. + +- ClickHouse хранит столбцы в отдельных файлах, поэтому чтобы собрать одну полную строку, ему приходится обрабатывать все эти файлы. 
Их количество растет линейно в зависимости от количества столбцов, поэтому при обработке данных вида "ключ-значение" стоит избегать использования множества столбцов и поместить все нужные данные в один столбец с типом `String` в формате JSON, Protobuf или другом подходящем формате. + +- Подумайте об использовании табличного движка [Join](../../engines/table-engines/special/join.md) вместо обычных таблиц `MergeTree` и функции [joinGet](../../sql-reference/functions/other-functions.md#joinget) для получения данных. В этом случае производительность выполнения запросов может быть выше, но могут появиться проблемы с надежностью и удобством. Пример такого использования описан [здесь](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00800_versatile_storage_join.sql#L49-L51). diff --git a/docs/ru/faq/use-cases/time-series.md b/docs/ru/faq/use-cases/time-series.md new file mode 100644 index 00000000000..ea56660cc10 --- /dev/null +++ b/docs/ru/faq/use-cases/time-series.md @@ -0,0 +1,15 @@ +--- +title: Можно ли использовать ClickHouse как базу данных временных рядов? +toc_hidden: true +toc_priority: 101 +--- + +# Можно ли использовать ClickHouse как базу данных временных рядов? {#can-i-use-clickhouse-as-a-time-series-database} + +ClickHouse — это универсальное решение для [OLAP](../../faq/general/olap.md) операций, в то время как существует много специализированных СУБД временных рядов. Однако [высокая скорость выполнения запросов](../../faq/general/why-clickhouse-is-so-fast.md) позволяет ClickHouse во многих случаях "побеждать" специализированные аналоги. В подтверждение этому есть много [примеров](https://medium.com/@AltinityDB/clickhouse-for-time-series-scalability-benchmarks-e181132a895b) с конкретными показателями производительности, так что мы не будем останавливаться на этом подробно. Лучше рассмотрим те возможности ClickHouse, которые стоит использовать. + +Во-первых, есть **[специальные кодеки](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)**, которые рассчитаны на типичные временные ряды. Это могут быть либо стандартные алгоритмы, такие как `DoubleDelta` или `Gorilla`, либо специфические для ClickHouse, например `T64`. + +Во-вторых, запросы по временным рядам часто затрагивают только недавние данные, не старше одного дня или недели. Имеет смысл использовать серверы, где есть как быстрые диски NVMe/SSD, так и более медленные, но ёмкие HDD диски. С помощью [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) можно сконфигурировать таблицы так, чтобы свежие данные хранились на быстрых дисках, а по мере устаревания перемещались на медленные диски. Для архивных данных можно также настроить сворачивание или даже удаление, если это необходимо. + +Наконец, несмотря на то, что философия ClickHouse предполагает работу с "сырыми" данными, вы можете использовать [материализованные представления](../../sql-reference/statements/create/view.md), если нужно соответствовать очень жестким требованиям по скорости обработки запросов.
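Чтобы показать, как эти возможности сочетаются, ниже приведен условный набросок таблицы для временных рядов с упомянутыми кодеками и переносом устаревших данных на медленный том по TTL. Имена таблицы и столбцов, а также том `cold` и политика хранения `hot_cold` — предположения для иллюстрации: их нужно заранее описать в конфигурации хранилища сервера.

``` sql
-- Условный пример: временные метки сжимаем кодеком DoubleDelta, значения — Gorilla.
CREATE TABLE sensor_readings
(
    sensor_id UInt32,
    time DateTime CODEC(DoubleDelta, LZ4),
    value Float64 CODEC(Gorilla, LZ4)
)
ENGINE = MergeTree
ORDER BY (sensor_id, time)
-- Данные старше недели переезжают на медленный том 'cold' (он должен быть описан в политике 'hot_cold').
TTL time + INTERVAL 1 WEEK TO VOLUME 'cold'
SETTINGS storage_policy = 'hot_cold';
```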
diff --git a/docs/ru/getting-started/example-datasets/amplab-benchmark.md b/docs/ru/getting-started/example-datasets/amplab-benchmark.md index bc59672ab26..8a75852aad9 100644 --- a/docs/ru/getting-started/example-datasets/amplab-benchmark.md +++ b/docs/ru/getting-started/example-datasets/amplab-benchmark.md @@ -125,4 +125,3 @@ ORDER BY totalRevenue DESC LIMIT 1 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md new file mode 100644 index 00000000000..f1aad06b743 --- /dev/null +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -0,0 +1,415 @@ +--- +toc_priority: 20 +toc_title: Brown University Benchmark +--- + +# Brown University Benchmark + +`MgBench` — это аналитический тест производительности для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). + +Скачать данные: +``` +wget https://datasets.clickhouse.tech/mgbench{1..3}.csv.xz +``` + +Распаковать данные: +``` +xz -v -d mgbench{1..3}.csv.xz +``` + +Создание таблиц: +``` +CREATE DATABASE mgbench; + + +CREATE TABLE mgbench.logs1 ( + log_time DateTime, + machine_name LowCardinality(String), + machine_group LowCardinality(String), + cpu_idle Nullable(Float32), + cpu_nice Nullable(Float32), + cpu_system Nullable(Float32), + cpu_user Nullable(Float32), + cpu_wio Nullable(Float32), + disk_free Nullable(Float32), + disk_total Nullable(Float32), + part_max_used Nullable(Float32), + load_fifteen Nullable(Float32), + load_five Nullable(Float32), + load_one Nullable(Float32), + mem_buffers Nullable(Float32), + mem_cached Nullable(Float32), + mem_free Nullable(Float32), + mem_shared Nullable(Float32), + swap_free Nullable(Float32), + bytes_in Nullable(Float32), + bytes_out Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (machine_group, machine_name, log_time); + + +CREATE TABLE mgbench.logs2 ( + log_time DateTime, + client_ip IPv4, + request String, + status_code UInt16, + object_size UInt64 +) +ENGINE = MergeTree() +ORDER BY log_time; + + +CREATE TABLE mgbench.logs3 ( + log_time DateTime64, + device_id FixedString(15), + device_name LowCardinality(String), + device_type LowCardinality(String), + device_floor UInt8, + event_type LowCardinality(String), + event_unit FixedString(1), + event_value Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (event_type, log_time); +``` + +Вставка данных: + +``` +clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv +clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv +clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv +``` + +Запуск тестов производительности: +``` +-- Q1.1: What is the CPU/network utilization for each web server since midnight? 
+ +SELECT machine_name, + MIN(cpu) AS cpu_min, + MAX(cpu) AS cpu_max, + AVG(cpu) AS cpu_avg, + MIN(net_in) AS net_in_min, + MAX(net_in) AS net_in_max, + AVG(net_in) AS net_in_avg, + MIN(net_out) AS net_out_min, + MAX(net_out) AS net_out_max, + AVG(net_out) AS net_out_avg +FROM ( + SELECT machine_name, + COALESCE(cpu_user, 0.0) AS cpu, + COALESCE(bytes_in, 0.0) AS net_in, + COALESCE(bytes_out, 0.0) AS net_out + FROM logs1 + WHERE machine_name IN ('anansi','aragog','urd') + AND log_time >= TIMESTAMP '2017-01-11 00:00:00' +) AS r +GROUP BY machine_name; + + +-- Q1.2: Which computer lab machines have been offline in the past day? + +SELECT machine_name, + log_time +FROM logs1 +WHERE (machine_name LIKE 'cslab%' OR + machine_name LIKE 'mslab%') + AND load_one IS NULL + AND log_time >= TIMESTAMP '2017-01-10 00:00:00' +ORDER BY machine_name, + log_time; + + +-- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation? + +SELECT dt, + hr, + AVG(load_fifteen) AS load_fifteen_avg, + AVG(load_five) AS load_five_avg, + AVG(load_one) AS load_one_avg, + AVG(mem_free) AS mem_free_avg, + AVG(swap_free) AS swap_free_avg +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + load_fifteen, + load_five, + load_one, + mem_free, + swap_free + FROM logs1 + WHERE machine_name = 'babbage' + AND load_fifteen IS NOT NULL + AND load_five IS NOT NULL + AND load_one IS NOT NULL + AND mem_free IS NOT NULL + AND swap_free IS NOT NULL + AND log_time >= TIMESTAMP '2017-01-01 00:00:00' +) AS r +GROUP BY dt, + hr +ORDER BY dt, + hr; + + +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? + +SELECT machine_name, + COUNT(*) AS spikes +FROM logs1 +WHERE machine_group = 'Servers' + AND cpu_wio > 0.99 + AND log_time >= TIMESTAMP '2016-12-01 00:00:00' + AND log_time < TIMESTAMP '2017-01-01 00:00:00' +GROUP BY machine_name +ORDER BY spikes DESC +LIMIT 10; + + +-- Q1.5: Which externally reachable VMs have run low on memory? + +SELECT machine_name, + dt, + MIN(mem_free) AS mem_free_min +FROM ( + SELECT machine_name, + CAST(log_time AS DATE) AS dt, + mem_free + FROM logs1 + WHERE machine_group = 'DMZ' + AND mem_free IS NOT NULL +) AS r +GROUP BY machine_name, + dt +HAVING MIN(mem_free) < 10000 +ORDER BY machine_name, + dt; + + +-- Q1.6: What is the total hourly network traffic across all file servers? + +SELECT dt, + hr, + SUM(net_in) AS net_in_sum, + SUM(net_out) AS net_out_sum, + SUM(net_in) + SUM(net_out) AS both_sum +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in, + COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out + FROM logs1 + WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon', + 'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey', + 'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps', + 'poprocks','razzles','runts','smarties','smuggler','spree','stride', + 'tootsie','trident','wrigley','york') +) AS r +GROUP BY dt, + hr +ORDER BY both_sum DESC +LIMIT 10; + + +-- Q2.1: Which requests have caused server errors within the past 2 weeks? + +SELECT * +FROM logs2 +WHERE status_code >= 500 + AND log_time >= TIMESTAMP '2012-12-18 00:00:00' +ORDER BY log_time; + + +-- Q2.2: During a specific 2-week period, was the user password file leaked? 
+ +SELECT * +FROM logs2 +WHERE status_code >= 200 + AND status_code < 300 + AND request LIKE '%/etc/passwd%' + AND log_time >= TIMESTAMP '2012-05-06 00:00:00' + AND log_time < TIMESTAMP '2012-05-20 00:00:00'; + + +-- Q2.3: What was the average path depth for top-level requests in the past month? + +SELECT top_level, + AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg +FROM ( + SELECT SUBSTRING(request FROM 1 FOR len) AS top_level, + request + FROM ( + SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len, + request + FROM logs2 + WHERE status_code >= 200 + AND status_code < 300 + AND log_time >= TIMESTAMP '2012-12-01 00:00:00' + ) AS r + WHERE len > 0 +) AS s +WHERE top_level IN ('/about','/courses','/degrees','/events', + '/grad','/industry','/news','/people', + '/publications','/research','/teaching','/ugrad') +GROUP BY top_level +ORDER BY top_level; + + +-- Q2.4: During the last 3 months, which clients have made an excessive number of requests? + +SELECT client_ip, + COUNT(*) AS num_requests +FROM logs2 +WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00' +GROUP BY client_ip +HAVING COUNT(*) >= 100000 +ORDER BY num_requests DESC; + + +-- Q2.5: What are the daily unique visitors? + +SELECT dt, + COUNT(DISTINCT client_ip) +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + client_ip + FROM logs2 +) AS r +GROUP BY dt +ORDER BY dt; + + +-- Q2.6: What are the average and maximum data transfer rates (Gbps)? + +SELECT AVG(transfer) / 125000000.0 AS transfer_avg, + MAX(transfer) / 125000000.0 AS transfer_max +FROM ( + SELECT log_time, + SUM(object_size) AS transfer + FROM logs2 + GROUP BY log_time +) AS r; + + +-- Q3.1: Did the indoor temperature reach freezing over the weekend? + +SELECT * +FROM logs3 +WHERE event_type = 'temperature' + AND event_value <= 32.0 + AND log_time >= '2019-11-29 17:00:00.000'; + + +-- Q3.4: Over the past 6 months, how frequently were each door opened? + +SELECT device_name, + device_floor, + COUNT(*) AS ct +FROM logs3 +WHERE event_type = 'door_open' + AND log_time >= '2019-06-01 00:00:00.000' +GROUP BY device_name, + device_floor +ORDER BY ct DESC; + + +-- Q3.5: Where in the building do large temperature variations occur in winter and summer? + +WITH temperature AS ( + SELECT dt, + device_name, + device_type, + device_floor + FROM ( + SELECT dt, + hr, + device_name, + device_type, + device_floor, + AVG(event_value) AS temperature_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + device_name, + device_type, + device_floor, + event_value + FROM logs3 + WHERE event_type = 'temperature' + ) AS r + GROUP BY dt, + hr, + device_name, + device_type, + device_floor + ) AS s + GROUP BY dt, + device_name, + device_type, + device_floor + HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0 +) +SELECT DISTINCT device_name, + device_type, + device_floor, + 'WINTER' +FROM temperature +WHERE dt >= DATE '2018-12-01' + AND dt < DATE '2019-03-01' +UNION +SELECT DISTINCT device_name, + device_type, + device_floor, + 'SUMMER' +FROM temperature +WHERE dt >= DATE '2019-06-01' + AND dt < DATE '2019-09-01'; + + +-- Q3.6: For each device category, what are the monthly power consumption metrics? 
+ +SELECT yr, + mo, + SUM(coffee_hourly_avg) AS coffee_monthly_sum, + AVG(coffee_hourly_avg) AS coffee_monthly_avg, + SUM(printer_hourly_avg) AS printer_monthly_sum, + AVG(printer_hourly_avg) AS printer_monthly_avg, + SUM(projector_hourly_avg) AS projector_monthly_sum, + AVG(projector_hourly_avg) AS projector_monthly_avg, + SUM(vending_hourly_avg) AS vending_monthly_sum, + AVG(vending_hourly_avg) AS vending_monthly_avg +FROM ( + SELECT dt, + yr, + mo, + hr, + AVG(coffee) AS coffee_hourly_avg, + AVG(printer) AS printer_hourly_avg, + AVG(projector) AS projector_hourly_avg, + AVG(vending) AS vending_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(YEAR FROM log_time) AS yr, + EXTRACT(MONTH FROM log_time) AS mo, + EXTRACT(HOUR FROM log_time) AS hr, + CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee, + CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer, + CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector, + CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending + FROM logs3 + WHERE device_type = 'meter' + ) AS r + GROUP BY dt, + yr, + mo, + hr +) AS s +GROUP BY yr, + mo +ORDER BY yr, + mo; +``` + +Данные также доступны для работы с интерактивными запросами через [Playground](https://gh-api.clickhouse.tech/play?user=play), [пример](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + diff --git a/docs/ru/getting-started/example-datasets/cell-towers.md b/docs/ru/getting-started/example-datasets/cell-towers.md new file mode 100644 index 00000000000..a5524248019 --- /dev/null +++ b/docs/ru/getting-started/example-datasets/cell-towers.md @@ -0,0 +1,128 @@ +--- +toc_priority: 21 +toc_title: Вышки сотовой связи +--- + +# Вышки сотовой связи {#cell-towers} + +Источник этого набора данных (dataset) - самая большая в мире открытая база данных о сотовых вышках - [OpenCellid](https://www.opencellid.org/). К 2021-му году здесь накопилось более, чем 40 миллионов записей о сотовых вышках (GSM, LTE, UMTS, и т.д.) по всему миру с их географическими координатами и метаданными (код страны, сети, и т.д.). + +OpenCelliD Project имеет лицензию Creative Commons Attribution-ShareAlike 4.0 International License, и мы распространяем снэпшот набора данных по условиям этой же лицензии. После авторизации можно загрузить последнюю версию набора данных. + +## Как получить набор данных {#get-the-dataset} + +1. Загрузите снэпшот набора данных за февраль 2021 [отсюда](https://datasets.clickhouse.tech/cell_towers.csv.xz) (729 MB). + +2. Если нужно, проверьте полноту и целостность при помощи команды: + +``` +md5sum cell_towers.csv.xz +8cf986f4a0d9f12c6f384a0e9192c908 cell_towers.csv.xz +``` + +3. 
Распакуйте набор данных при помощи команды: + +``` +xz -d cell_towers.csv.xz +``` + +4. Создайте таблицу: + +``` +CREATE TABLE cell_towers +( + radio Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5), + mcc UInt16, + net UInt16, + area UInt16, + cell UInt64, + unit Int16, + lon Float64, + lat Float64, + range UInt32, + samples UInt32, + changeable UInt8, + created DateTime, + updated DateTime, + averageSignal UInt8 +) +ENGINE = MergeTree ORDER BY (radio, mcc, net, created); +``` + +5. Вставьте данные: +``` +clickhouse-client --query "INSERT INTO cell_towers FORMAT CSVWithNames" < cell_towers.csv +``` + +## Примеры {#examples} + +1. Количество вышек по типам: + +``` +SELECT radio, count() AS c FROM cell_towers GROUP BY radio ORDER BY c DESC + +┌─radio─┬────────c─┐ +│ UMTS │ 20686487 │ +│ LTE │ 12101148 │ +│ GSM │ 9931312 │ +│ CDMA │ 556344 │ +│ NR │ 867 │ +└───────┴──────────┘ + +5 rows in set. Elapsed: 0.011 sec. Processed 43.28 million rows, 43.28 MB (3.83 billion rows/s., 3.83 GB/s.) +``` + +2. Количество вышек по [мобильному коду страны (MCC)](https://ru.wikipedia.org/wiki/Mobile_Country_Code): + +``` +SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 + +┌─mcc─┬─count()─┐ +│ 310 │ 5024650 │ +│ 262 │ 2622423 │ +│ 250 │ 1953176 │ +│ 208 │ 1891187 │ +│ 724 │ 1836150 │ +│ 404 │ 1729151 │ +│ 234 │ 1618924 │ +│ 510 │ 1353998 │ +│ 440 │ 1343355 │ +│ 311 │ 1332798 │ +└─────┴─────────┘ + +10 rows in set. Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.) +``` + +Можно увидеть, что по количеству вышек лидируют следующие страны: США, Германия, Россия. + +Вы также можете создать [внешний словарь](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) в ClickHouse для того, чтобы расшифровать эти значения. + +## Пример использования {#use-case} + +Рассмотрим применение функции `pointInPolygon`. + +1. Создаем таблицу, в которой будем хранить многоугольники: + +``` +CREATE TEMPORARY TABLE moscow (polygon Array(Tuple(Float64, Float64))); +``` + +2. 
Очертания Москвы выглядят приблизительно так ("Новая Москва" в них не включена): + +``` +INSERT INTO moscow VALUES ([(37.84172564285271, 55.78000432402266), (37.8381207618713, 55.775874525970494), (37.83979446823122, 55.775626746008065), (37.84243326983639, 55.77446586811748), (37.84262672750849, 55.771974101091104), (37.84153238623039, 55.77114545193181), (37.841124690460184, 55.76722010265554), (37.84239076983644, 55.76654891107098), (37.842283558197025, 55.76258709833121), (37.8421759312134, 55.758073999993734), (37.84198330422974, 55.75381499999371), (37.8416827275085, 55.749277102484484), (37.84157576190186, 55.74794544108413), (37.83897929098507, 55.74525257875241), (37.83739676451868, 55.74404373042019), (37.838732481460525, 55.74298009816793), (37.841183997352545, 55.743060321833575), (37.84097476190185, 55.73938799999373), (37.84048155819702, 55.73570799999372), (37.840095812164286, 55.73228210777237), (37.83983814285274, 55.73080491981639), (37.83846476321406, 55.729799917464675), (37.83835745269769, 55.72919751082619), (37.838636380279524, 55.72859509486539), (37.8395161005249, 55.727705075632784), (37.83897964285276, 55.722727886185154), (37.83862557539366, 55.72034817326636), (37.83559735744853, 55.71944437307499), (37.835370708803126, 55.71831419154461), (37.83738169402022, 55.71765218986692), (37.83823396494291, 55.71691750159089), (37.838056931213345, 55.71547311301385), (37.836812846557606, 55.71221445615604), (37.83522525396725, 55.709331054395555), (37.83269301586908, 55.70953687463627), (37.829667367706236, 55.70903403789297), (37.83311126588435, 55.70552351822608), (37.83058993121339, 55.70041317726053), (37.82983872750851, 55.69883771404813), (37.82934501586913, 55.69718947487017), (37.828926414016685, 55.69504441658371), (37.82876530422971, 55.69287499999378), (37.82894754100031, 55.690759754047335), (37.827697554878185, 55.68951421135665), (37.82447346292115, 55.68965045405069), (37.83136543914793, 55.68322046195302), (37.833554015869154, 55.67814012759211), (37.83544184655761, 55.67295011628339), (37.837480388885474, 55.6672498719639), (37.838960677246064, 55.66316274139358), (37.83926093121332, 55.66046999999383), (37.839025050262435, 55.65869897264431), (37.83670784390257, 55.65794084879904), (37.835656529083245, 55.65694309303843), (37.83704060449217, 55.65689306460552), (37.83696819873806, 55.65550363526252), (37.83760389616388, 55.65487847246661), (37.83687972750851, 55.65356745541324), (37.83515216004943, 55.65155951234079), (37.83312418518067, 55.64979413590619), (37.82801726983639, 55.64640836412121), (37.820614174591, 55.64164525405531), (37.818908190475426, 55.6421883258084), (37.81717543386075, 55.64112490388471), (37.81690987037274, 55.63916106913107), (37.815099354492155, 55.637925371757085), (37.808769150787356, 55.633798276884455), (37.80100123544311, 55.62873670012244), (37.79598013491824, 55.62554336109055), (37.78634567724606, 55.62033499605651), (37.78334147619623, 55.618768681480326), (37.77746201055901, 55.619855533402706), (37.77527329626457, 55.61909966711279), (37.77801986242668, 55.618770300976294), (37.778212973541216, 55.617257701952106), (37.77784818518065, 55.61574504433011), (37.77016867724609, 55.61148576294007), (37.760191219573976, 55.60599579539028), (37.75338926983641, 55.60227892751446), (37.746329965606634, 55.59920577639331), (37.73939925396728, 55.59631430313617), (37.73273665739439, 55.5935318803559), (37.7299954450912, 55.59350760316188), (37.7268679946899, 55.59469840523759), (37.72626726983634, 55.59229549697373), 
(37.7262673598022, 55.59081598950582), (37.71897193121335, 55.5877595845419), (37.70871550793456, 55.58393177431724), (37.700497489410374, 55.580917323756644), (37.69204305026244, 55.57778089778455), (37.68544477378839, 55.57815154690915), (37.68391050793454, 55.57472945079756), (37.678803592590306, 55.57328235936491), (37.6743402539673, 55.57255251445782), (37.66813862698363, 55.57216388774464), (37.617927457672096, 55.57505691895805), (37.60443099999999, 55.5757737568051), (37.599683515869145, 55.57749105910326), (37.59754177842709, 55.57796291823627), (37.59625834786988, 55.57906686095235), (37.59501783265684, 55.57746616444403), (37.593090671936025, 55.57671634534502), (37.587018007904, 55.577944600233785), (37.578692203704804, 55.57982895000019), (37.57327546607398, 55.58116294118248), (37.57385012109279, 55.581550362779), (37.57399562266922, 55.5820107079112), (37.5735356072979, 55.58226289171689), (37.57290393054962, 55.582393529795155), (37.57037722355653, 55.581919415056234), (37.5592298306885, 55.584471614867844), (37.54189249206543, 55.58867650795186), (37.5297256269836, 55.59158133551745), (37.517837865081766, 55.59443656218868), (37.51200186508174, 55.59635625174229), (37.506808949737554, 55.59907823904434), (37.49820432275389, 55.6062944994944), (37.494406071441674, 55.60967103463367), (37.494760001358024, 55.61066689753365), (37.49397137107085, 55.61220931698269), (37.49016528606031, 55.613417718449064), (37.48773249206542, 55.61530616333343), (37.47921386508177, 55.622640129112334), (37.470652153442394, 55.62993723476164), (37.46273446298218, 55.6368075123157), (37.46350692265317, 55.64068225239439), (37.46050283203121, 55.640794546982576), (37.457627470916734, 55.64118904154646), (37.450718034393326, 55.64690488145138), (37.44239252645875, 55.65397824729769), (37.434587576721185, 55.66053543155961), (37.43582144975277, 55.661693766520735), (37.43576786245721, 55.662755031737014), (37.430982915344174, 55.664610641628116), (37.428547447097685, 55.66778515273695), (37.42945134592044, 55.668633314343566), (37.42859571562949, 55.66948145750025), (37.4262836402282, 55.670813882451405), (37.418709037048295, 55.6811141674414), (37.41922139651101, 55.68235377885389), (37.419218771842885, 55.68359335082235), (37.417196501327446, 55.684375235224735), (37.41607020370478, 55.68540557585352), (37.415640857147146, 55.68686637150793), (37.414632153442334, 55.68903015131686), (37.413344899475064, 55.690896881757396), (37.41171432275391, 55.69264232162232), (37.40948282275393, 55.69455101638112), (37.40703674603271, 55.69638690385348), (37.39607169577025, 55.70451821283731), (37.38952706878662, 55.70942491932811), (37.387778313491815, 55.71149057784176), (37.39049275399779, 55.71419814298992), (37.385557272491454, 55.7155489617061), (37.38388335714726, 55.71849856042102), (37.378368238098155, 55.7292763261685), (37.37763597123337, 55.730845879211614), (37.37890062088197, 55.73167906388319), (37.37750451918789, 55.734703664681774), (37.375610832015965, 55.734851959522246), (37.3723813571472, 55.74105626086403), (37.37014935714723, 55.746115620904355), (37.36944173016362, 55.750883999993725), (37.36975304365541, 55.76335905525834), (37.37244070571134, 55.76432079697595), (37.3724259757175, 55.76636979670426), (37.369922155757884, 55.76735417953104), (37.369892695770275, 55.76823419316575), (37.370214730163575, 55.782312184391266), (37.370493611114505, 55.78436801120489), (37.37120164550783, 55.78596427165359), (37.37284851456452, 55.7874378183096), (37.37608325135799, 55.7886695054807), 
(37.3764587460632, 55.78947647305964), (37.37530000265506, 55.79146512926804), (37.38235915344241, 55.79899647809345), (37.384344043655396, 55.80113596939471), (37.38594269577028, 55.80322699999366), (37.38711208598329, 55.804919036911976), (37.3880239841309, 55.806610999993666), (37.38928977249147, 55.81001864976979), (37.39038389947512, 55.81348641242801), (37.39235781481933, 55.81983538336746), (37.393709457672124, 55.82417822811877), (37.394685720901464, 55.82792275755836), (37.39557615344238, 55.830447148154136), (37.39844478226658, 55.83167107969975), (37.40019761214057, 55.83151823557964), (37.400398790382326, 55.83264967594742), (37.39659544313046, 55.83322180909622), (37.39667059524539, 55.83402792148566), (37.39682089947515, 55.83638877400216), (37.39643489154053, 55.83861656112751), (37.3955338994751, 55.84072348043264), (37.392680272491454, 55.84502158126453), (37.39241188227847, 55.84659117913199), (37.392529730163616, 55.84816071336481), (37.39486835714723, 55.85288092980303), (37.39873052645878, 55.859893456073635), (37.40272161111449, 55.86441833633205), (37.40697072750854, 55.867579567544375), (37.410007082016016, 55.868369880337), (37.4120992989502, 55.86920843741314), (37.412668021163924, 55.87055369615854), (37.41482461111453, 55.87170587948249), (37.41862266137694, 55.873183961039565), (37.42413732540892, 55.874879126654704), (37.4312182698669, 55.875614937236705), (37.43111093783558, 55.8762723478417), (37.43332105622856, 55.87706546369396), (37.43385747619623, 55.87790681284802), (37.441303050262405, 55.88027084462084), (37.44747234260555, 55.87942070143253), (37.44716141796871, 55.88072960917233), (37.44769797085568, 55.88121221323979), (37.45204320500181, 55.882080694420715), (37.45673176190186, 55.882346110794586), (37.463383999999984, 55.88252729504517), (37.46682797486874, 55.88294937719063), (37.470014457672086, 55.88361266759345), (37.47751410450743, 55.88546991372396), (37.47860317658232, 55.88534929207307), (37.48165826025772, 55.882563306475106), (37.48316434442331, 55.8815803226785), (37.483831555817645, 55.882427612793315), (37.483182967125686, 55.88372791409729), (37.483092277908824, 55.88495581062434), (37.4855716508179, 55.8875561994203), (37.486440636245746, 55.887827444039566), (37.49014203439328, 55.88897899871799), (37.493210285705544, 55.890208937135604), (37.497512451065035, 55.891342397444696), (37.49780744510645, 55.89174030252967), (37.49940333499519, 55.89239745507079), (37.50018383334346, 55.89339220941865), (37.52421672750851, 55.903869074155224), (37.52977457672118, 55.90564076517974), (37.53503220370484, 55.90661661218259), (37.54042858064267, 55.90714113744566), (37.54320461007303, 55.905645048442985), (37.545686966066306, 55.906608607018505), (37.54743976120755, 55.90788552162358), (37.55796999999999, 55.90901557907218), (37.572711542327866, 55.91059395704873), (37.57942799999998, 55.91073854155573), (37.58502865872187, 55.91009969268444), (37.58739968913264, 55.90794809960554), (37.59131567193598, 55.908713267595054), (37.612687423278814, 55.902866854295375), (37.62348079629517, 55.90041967242986), (37.635797880950896, 55.898141151686396), (37.649487626983664, 55.89639275532968), (37.65619302513125, 55.89572360207488), (37.66294133862307, 55.895295577183965), (37.66874564418033, 55.89505457604897), (37.67375601586915, 55.89254677027454), (37.67744661901856, 55.8947775867987), (37.688347, 55.89450045676125), (37.69480554232789, 55.89422926332761), (37.70107096560668, 55.89322256101114), (37.705962965606716, 55.891763491662616), 
(37.711885134918205, 55.889110234998974), (37.71682005026245, 55.886577568759876), (37.7199315476074, 55.88458159806678), (37.72234560316464, 55.882281005794134), (37.72364385977171, 55.8809452036196), (37.725371142837474, 55.8809722706006), (37.727870902099546, 55.88037213862385), (37.73394330422971, 55.877941504088696), (37.745339592590376, 55.87208120378722), (37.75525267724611, 55.86703807949492), (37.76919976190188, 55.859821640197474), (37.827835219574, 55.82962968399116), (37.83341438888553, 55.82575289922351), (37.83652584655761, 55.82188784027888), (37.83809213491821, 55.81612575504693), (37.83605359521481, 55.81460347077685), (37.83632178569025, 55.81276696067908), (37.838623105812026, 55.811486181656385), (37.83912198147584, 55.807329380532785), (37.839079078033414, 55.80510270463816), (37.83965844708251, 55.79940712529036), (37.840581150787344, 55.79131399999368), (37.84172564285271, 55.78000432402266)]); +``` + +3. Проверяем, сколько сотовых вышек находится в Москве: + +``` +SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) + +┌─count()─┐ +│ 310463 │ +└─────────┘ + +1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) +``` + +Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://gh-api.clickhouse.tech/play?user=play). Например, [вот так](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы. diff --git a/docs/ru/getting-started/example-datasets/criteo.md b/docs/ru/getting-started/example-datasets/criteo.md index 7a58da5b695..bfa428a0e1c 100644 --- a/docs/ru/getting-started/example-datasets/criteo.md +++ b/docs/ru/getting-started/example-datasets/criteo.md @@ -1,6 +1,6 @@ --- toc_priority: 18 -toc_title: "\u0422\u0435\u0440\u0430\u0431\u0430\u0439\u0442\u0020\u043b\u043e\u0433\u043e\u0432\u0020\u043a\u043b\u0438\u043a\u043e\u0432\u0020\u043e\u0442\u0020\u0043\u0072\u0069\u0074\u0065\u006f" +toc_title: "Терабайт логов кликов от Criteo" --- # Терабайт логов кликов от Criteo {#terabait-logov-klikov-ot-criteo} @@ -76,4 +76,3 @@ INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int DROP TABLE criteo_log; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/criteo/) diff --git a/docs/ru/getting-started/example-datasets/index.md b/docs/ru/getting-started/example-datasets/index.md index eff944a7980..756b3a75dee 100644 --- a/docs/ru/getting-started/example-datasets/index.md +++ b/docs/ru/getting-started/example-datasets/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0422\u0435\u0441\u0442\u043e\u0432\u044b\u0435\u0020\u043c\u0430\u0441\u0441\u0438\u0432\u044b\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_folder_title: "Тестовые массивы данных" toc_priority: 14 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- # Тестовые массивы данных {#testovye-massivy-dannykh} @@ -16,5 +16,5 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" - [AMPLab Big Data Benchmark](amplab-benchmark.md) - [Данные о такси в Нью-Йорке](nyc-taxi.md) - [OnTime](ontime.md) +- [Вышки сотовой связи](../../getting-started/example-datasets/cell-towers.md) -[Оригинальная статья](https://clickhouse.tech/docs/en/getting_started/example_datasets) diff --git 
a/docs/ru/getting-started/example-datasets/metrica.md b/docs/ru/getting-started/example-datasets/metrica.md index 3246eb5178c..7deacdb836c 100644 --- a/docs/ru/getting-started/example-datasets/metrica.md +++ b/docs/ru/getting-started/example-datasets/metrica.md @@ -1,6 +1,6 @@ --- toc_priority: 15 -toc_title: "\u0410\u043d\u043e\u043d\u0438\u043c\u0438\u0437\u0438\u0440\u043e\u0432\u0430\u043d\u043d\u044b\u0435\u0020\u0434\u0430\u043d\u043d\u044b\u0435\u0020\u042f\u043d\u0434\u0435\u043a\u0441\u002e\u041c\u0435\u0442\u0440\u0438\u043a\u0438" +toc_title: "Анонимизированные данные Яндекс.Метрики" --- # Анонимизированные данные Яндекс.Метрики {#anonimizirovannye-dannye-iandeks-metriki} diff --git a/docs/ru/getting-started/example-datasets/nyc-taxi.md b/docs/ru/getting-started/example-datasets/nyc-taxi.md index a4472751a99..38a60ed1b2d 100644 --- a/docs/ru/getting-started/example-datasets/nyc-taxi.md +++ b/docs/ru/getting-started/example-datasets/nyc-taxi.md @@ -1,6 +1,6 @@ --- toc_priority: 20 -toc_title: "\u0414\u0430\u043d\u043d\u044b\u0435\u0020\u043e\u0020\u0442\u0430\u043a\u0441\u0438\u0020\u0432\u0020\u041d\u044c\u044e\u002d\u0419\u043e\u0440\u043a\u0435" +toc_title: "Данные о такси в Нью-Йорке" --- # Данные о такси в Нью-Йорке {#dannye-o-taksi-v-niu-iorke} @@ -390,4 +390,3 @@ Q4: 0.072 sec. | 3 | 0.212 | 0.438 | 0.733 | 1.241 | | 140 | 0.028 | 0.043 | 0.051 | 0.072 | -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/ru/getting-started/example-datasets/ontime.md b/docs/ru/getting-started/example-datasets/ontime.md index 41a1c0d3142..be5b1cd1b70 100644 --- a/docs/ru/getting-started/example-datasets/ontime.md +++ b/docs/ru/getting-started/example-datasets/ontime.md @@ -407,4 +407,3 @@ LIMIT 10; - https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ - http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/ontime/) diff --git a/docs/ru/getting-started/example-datasets/recipes.md b/docs/ru/getting-started/example-datasets/recipes.md new file mode 100644 index 00000000000..75e385150e8 --- /dev/null +++ b/docs/ru/getting-started/example-datasets/recipes.md @@ -0,0 +1,342 @@ +--- +toc_priority: 16 +toc_title: Набор данных кулинарных рецептов +--- + +# Набор данных кулинарных рецептов + +Набор данных кулинарных рецептов от RecipeNLG доступен для загрузки [здесь](https://recipenlg.cs.put.poznan.pl/dataset). Он содержит 2.2 миллиона рецептов, а его размер чуть меньше 1 ГБ. + +## Загрузите и распакуйте набор данных + +1. Перейдите на страницу загрузки [https://recipenlg.cs.put.poznan.pl/dataset](https://recipenlg.cs.put.poznan.pl/dataset). +1. Примите Правила и условия и скачайте zip-архив с набором данных. +1. Распакуйте zip-архив и вы получите файл `full_dataset.csv`. 
+ +## Создайте таблицу + +Запустите клиент ClickHouse и выполните следующий запрос для создания таблицы `recipes`: + +``` sql +CREATE TABLE recipes +( + title String, + ingredients Array(String), + directions Array(String), + link String, + source LowCardinality(String), + NER Array(String) +) ENGINE = MergeTree ORDER BY title; +``` + +## Добавьте данные в таблицу + +Чтобы добавить данные из файла `full_dataset.csv` в таблицу `recipes`, выполните команду: + +``` bash +clickhouse-client --query " + INSERT INTO recipes + SELECT + title, + JSONExtract(ingredients, 'Array(String)'), + JSONExtract(directions, 'Array(String)'), + link, + source, + JSONExtract(NER, 'Array(String)') + FROM input('num UInt32, title String, ingredients String, directions String, link String, source LowCardinality(String), NER String') + FORMAT CSVWithNames +" --input_format_with_names_use_header 0 --format_csv_allow_single_quote 0 --input_format_allow_errors_num 10 < full_dataset.csv +``` + +Это один из примеров анализа пользовательских CSV-файлов с применением специальных настроек. + +Пояснение: +- набор данных представлен в формате CSV и требует некоторой предварительной обработки при вставке. Для предварительной обработки используется табличная функция [input](../../sql-reference/table-functions/input.md); +- структура CSV-файла задается в аргументе табличной функции `input`; +- поле `num` (номер строки) не нужно — оно считывается из файла, но игнорируется; +- при загрузке используется `FORMAT CSVWithNames`, но заголовок в CSV будет проигнорирован (параметром командной строки `--input_format_with_names_use_header 0`), поскольку заголовок не содержит имени первого поля; +- в файле CSV для обрамления строк используются только двойные кавычки. Но некоторые строки не заключены в двойные кавычки, и чтобы одинарная кавычка не рассматривалась как заключающая, используется параметр `--format_csv_allow_single_quote 0`; +- некоторые строки из CSV не могут быть считаны корректно, поскольку они начинаются с символов `\M/`, тогда как в CSV начинаться с обратной косой черты могут только символы `\N`, которые распознаются как `NULL` в SQL. Поэтому используется параметр `--input_format_allow_errors_num 10`, разрешающий пропустить до десяти некорректных записей; +- массивы `ingredients`, `directions` и `NER` представлены в необычном виде: они сериализуются в строку формата JSON, а затем помещаются в CSV — тогда они могут считываться и обрабатываться как обычные строки (`String`). Чтобы преобразовать строку в массив, используется функция [JSONExtract](../../sql-reference/functions/json-functions.md). + +## Проверьте добавленные данные + +Чтобы проверить добавленные данные, подсчитайте количество строк в таблице: + +Запрос: + +``` sql +SELECT count() FROM recipes; +``` + +Результат: + +``` text +┌─count()─┐ +│ 2231141 │ +└─────────┘ +``` + +## Примеры запросов + +### Самые упоминаемые ингредиенты в рецептах + +В этом примере вы узнаете, как развернуть массив в набор строк с помощью функции [arrayJoin](../../sql-reference/functions/array-join.md).
+ +Запрос: + +``` sql +SELECT + arrayJoin(NER) AS k, + count() AS c +FROM recipes +GROUP BY k +ORDER BY c DESC +LIMIT 50 +``` + +Результат: + +``` text +┌─k────────────────────┬──────c─┐ +│ salt │ 890741 │ +│ sugar │ 620027 │ +│ butter │ 493823 │ +│ flour │ 466110 │ +│ eggs │ 401276 │ +│ onion │ 372469 │ +│ garlic │ 358364 │ +│ milk │ 346769 │ +│ water │ 326092 │ +│ vanilla │ 270381 │ +│ olive oil │ 197877 │ +│ pepper │ 179305 │ +│ brown sugar │ 174447 │ +│ tomatoes │ 163933 │ +│ egg │ 160507 │ +│ baking powder │ 148277 │ +│ lemon juice │ 146414 │ +│ Salt │ 122557 │ +│ cinnamon │ 117927 │ +│ sour cream │ 116682 │ +│ cream cheese │ 114423 │ +│ margarine │ 112742 │ +│ celery │ 112676 │ +│ baking soda │ 110690 │ +│ parsley │ 102151 │ +│ chicken │ 101505 │ +│ onions │ 98903 │ +│ vegetable oil │ 91395 │ +│ oil │ 85600 │ +│ mayonnaise │ 84822 │ +│ pecans │ 79741 │ +│ nuts │ 78471 │ +│ potatoes │ 75820 │ +│ carrots │ 75458 │ +│ pineapple │ 74345 │ +│ soy sauce │ 70355 │ +│ black pepper │ 69064 │ +│ thyme │ 68429 │ +│ mustard │ 65948 │ +│ chicken broth │ 65112 │ +│ bacon │ 64956 │ +│ honey │ 64626 │ +│ oregano │ 64077 │ +│ ground beef │ 64068 │ +│ unsalted butter │ 63848 │ +│ mushrooms │ 61465 │ +│ Worcestershire sauce │ 59328 │ +│ cornstarch │ 58476 │ +│ green pepper │ 58388 │ +│ Cheddar cheese │ 58354 │ +└──────────────────────┴────────┘ + +50 rows in set. Elapsed: 0.112 sec. Processed 2.23 million rows, 361.57 MB (19.99 million rows/s., 3.24 GB/s.) +``` + +### Самые сложные рецепты с клубникой + +Запрос: + +``` sql +SELECT + title, + length(NER), + length(directions) +FROM recipes +WHERE has(NER, 'strawberry') +ORDER BY length(directions) DESC +LIMIT 10; +``` + +Результат: + +``` text +┌─title────────────────────────────────────────────────────────────┬─length(NER)─┬─length(directions)─┐ +│ Chocolate-Strawberry-Orange Wedding Cake │ 24 │ 126 │ +│ Strawberry Cream Cheese Crumble Tart │ 19 │ 47 │ +│ Charlotte-Style Ice Cream │ 11 │ 45 │ +│ Sinfully Good a Million Layers Chocolate Layer Cake, With Strawb │ 31 │ 45 │ +│ Sweetened Berries With Elderflower Sherbet │ 24 │ 44 │ +│ Chocolate-Strawberry Mousse Cake │ 15 │ 42 │ +│ Rhubarb Charlotte with Strawberries and Rum │ 20 │ 42 │ +│ Chef Joey's Strawberry Vanilla Tart │ 7 │ 37 │ +│ Old-Fashioned Ice Cream Sundae Cake │ 17 │ 37 │ +│ Watermelon Cake │ 16 │ 36 │ +└──────────────────────────────────────────────────────────────────┴─────────────┴────────────────────┘ + +10 rows in set. Elapsed: 0.215 sec. Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) +``` + +В этом примере используется функция [has](../../sql-reference/functions/array-functions.md#hasarr-elem) для проверки вхождения элемента в массив, а также сортировка по количеству шагов (`length(directions)`). + +Существует свадебный торт, который требует целых 126 шагов для производства! Рассмотрим эти шаги: + +Запрос: + +``` sql +SELECT arrayJoin(directions) +FROM recipes +WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake'; +``` + +Результат: + +``` text +┌─arrayJoin(directions)───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Position 1 rack in center and 1 rack in bottom third of oven and preheat to 350F. │ +│ Butter one 5-inch-diameter cake pan with 2-inch-high sides, one 8-inch-diameter cake pan with 2-inch-high sides and one 12-inch-diameter cake pan with 2-inch-high sides. │ +│ Dust pans with flour; line bottoms with parchment. 
│ +│ Combine 1/3 cup orange juice and 2 ounces unsweetened chocolate in heavy small saucepan. │ +│ Stir mixture over medium-low heat until chocolate melts. │ +│ Remove from heat. │ +│ Gradually mix in 1 2/3 cups orange juice. │ +│ Sift 3 cups flour, 2/3 cup cocoa, 2 teaspoons baking soda, 1 teaspoon salt and 1/2 teaspoon baking powder into medium bowl. │ +│ using electric mixer, beat 1 cup (2 sticks) butter and 3 cups sugar in large bowl until blended (mixture will look grainy). │ +│ Add 4 eggs, 1 at a time, beating to blend after each. │ +│ Beat in 1 tablespoon orange peel and 1 tablespoon vanilla extract. │ +│ Add dry ingredients alternately with orange juice mixture in 3 additions each, beating well after each addition. │ +│ Mix in 1 cup chocolate chips. │ +│ Transfer 1 cup plus 2 tablespoons batter to prepared 5-inch pan, 3 cups batter to prepared 8-inch pan and remaining batter (about 6 cups) to 12-inch pan. │ +│ Place 5-inch and 8-inch pans on center rack of oven. │ +│ Place 12-inch pan on lower rack of oven. │ +│ Bake cakes until tester inserted into center comes out clean, about 35 minutes. │ +│ Transfer cakes in pans to racks and cool completely. │ +│ Mark 4-inch diameter circle on one 6-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Mark 7-inch-diameter circle on one 8-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Mark 11-inch-diameter circle on one 12-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Cut around sides of 5-inch-cake to loosen. │ +│ Place 4-inch cardboard over pan. │ +│ Hold cardboard and pan together; turn cake out onto cardboard. │ +│ Peel off parchment.Wrap cakes on its cardboard in foil. │ +│ Repeat turning out, peeling off parchment and wrapping cakes in foil, using 7-inch cardboard for 8-inch cake and 11-inch cardboard for 12-inch cake. │ +│ Using remaining ingredients, make 1 more batch of cake batter and bake 3 more cake layers as described above. │ +│ Cool cakes in pans. │ +│ Cover cakes in pans tightly with foil. │ +│ (Can be prepared ahead. │ +│ Let stand at room temperature up to 1 day or double-wrap all cake layers and freeze up to 1 week. │ +│ Bring cake layers to room temperature before using.) │ +│ Place first 12-inch cake on its cardboard on work surface. │ +│ Spread 2 3/4 cups ganache over top of cake and all the way to edge. │ +│ Spread 2/3 cup jam over ganache, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1 3/4 cups white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub some cocoa powder over second 12-inch cardboard. │ +│ Cut around sides of second 12-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. │ +│ Peel off parchment. │ +│ Carefully slide cake off cardboard and onto filling on first 12-inch cake. │ +│ Refrigerate. │ +│ Place first 8-inch cake on its cardboard on work surface. │ +│ Spread 1 cup ganache over top all the way to edge. │ +│ Spread 1/4 cup jam over, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1 cup white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub some cocoa over second 8-inch cardboard. │ +│ Cut around sides of second 8-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. │ +│ Peel off parchment. │ +│ Slide cake off cardboard and onto filling on first 8-inch cake. 
│ +│ Refrigerate. │ +│ Place first 5-inch cake on its cardboard on work surface. │ +│ Spread 1/2 cup ganache over top of cake and all the way to edge. │ +│ Spread 2 tablespoons jam over, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1/3 cup white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub cocoa over second 6-inch cardboard. │ +│ Cut around sides of second 5-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. │ +│ Peel off parchment. │ +│ Slide cake off cardboard and onto filling on first 5-inch cake. │ +│ Chill all cakes 1 hour to set filling. │ +│ Place 12-inch tiered cake on its cardboard on revolving cake stand. │ +│ Spread 2 2/3 cups frosting over top and sides of cake as a first coat. │ +│ Refrigerate cake. │ +│ Place 8-inch tiered cake on its cardboard on cake stand. │ +│ Spread 1 1/4 cups frosting over top and sides of cake as a first coat. │ +│ Refrigerate cake. │ +│ Place 5-inch tiered cake on its cardboard on cake stand. │ +│ Spread 3/4 cup frosting over top and sides of cake as a first coat. │ +│ Refrigerate all cakes until first coats of frosting set, about 1 hour. │ +│ (Cakes can be made to this point up to 1 day ahead; cover and keep refrigerate.) │ +│ Prepare second batch of frosting, using remaining frosting ingredients and following directions for first batch. │ +│ Spoon 2 cups frosting into pastry bag fitted with small star tip. │ +│ Place 12-inch cake on its cardboard on large flat platter. │ +│ Place platter on cake stand. │ +│ Using icing spatula, spread 2 1/2 cups frosting over top and sides of cake; smooth top. │ +│ Using filled pastry bag, pipe decorative border around top edge of cake. │ +│ Refrigerate cake on platter. │ +│ Place 8-inch cake on its cardboard on cake stand. │ +│ Using icing spatula, spread 1 1/2 cups frosting over top and sides of cake; smooth top. │ +│ Using pastry bag, pipe decorative border around top edge of cake. │ +│ Refrigerate cake on its cardboard. │ +│ Place 5-inch cake on its cardboard on cake stand. │ +│ Using icing spatula, spread 3/4 cup frosting over top and sides of cake; smooth top. │ +│ Using pastry bag, pipe decorative border around top edge of cake, spooning more frosting into bag if necessary. │ +│ Refrigerate cake on its cardboard. │ +│ Keep all cakes refrigerated until frosting sets, about 2 hours. │ +│ (Can be prepared 2 days ahead. │ +│ Cover loosely; keep refrigerated.) │ +│ Place 12-inch cake on platter on work surface. │ +│ Press 1 wooden dowel straight down into and completely through center of cake. │ +│ Mark dowel 1/4 inch above top of frosting. │ +│ Remove dowel and cut with serrated knife at marked point. │ +│ Cut 4 more dowels to same length. │ +│ Press 1 cut dowel back into center of cake. │ +│ Press remaining 4 cut dowels into cake, positioning 3 1/2 inches inward from cake edges and spacing evenly. │ +│ Place 8-inch cake on its cardboard on work surface. │ +│ Press 1 dowel straight down into and completely through center of cake. │ +│ Mark dowel 1/4 inch above top of frosting. │ +│ Remove dowel and cut with serrated knife at marked point. │ +│ Cut 3 more dowels to same length. │ +│ Press 1 cut dowel back into center of cake. │ +│ Press remaining 3 cut dowels into cake, positioning 2 1/2 inches inward from edges and spacing evenly. │ +│ Using large metal spatula as aid, place 8-inch cake on its cardboard atop dowels in 12-inch cake, centering carefully. 
│ +│ Gently place 5-inch cake on its cardboard atop dowels in 8-inch cake, centering carefully. │ +│ Using citrus stripper, cut long strips of orange peel from oranges. │ +│ Cut strips into long segments. │ +│ To make orange peel coils, wrap peel segment around handle of wooden spoon; gently slide peel off handle so that peel keeps coiled shape. │ +│ Garnish cake with orange peel coils, ivy or mint sprigs, and some berries. │ +│ (Assembled cake can be made up to 8 hours ahead. │ +│ Let stand at cool room temperature.) │ +│ Remove top and middle cake tiers. │ +│ Remove dowels from cakes. │ +│ Cut top and middle cakes into slices. │ +│ To cut 12-inch cake: Starting 3 inches inward from edge and inserting knife straight down, cut through from top to bottom to make 6-inch-diameter circle in center of cake. │ +│ Cut outer portion of cake into slices; cut inner portion into slices and serve with strawberries. │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +126 rows in set. Elapsed: 0.011 sec. Processed 8.19 thousand rows, 5.34 MB (737.75 thousand rows/s., 480.59 MB/s.) +``` + +### Online Playground + +Этот набор данных доступен в [Online Playground](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). + +[Оригинальная статья](https://clickhouse.tech/docs/ru/getting-started/example-datasets/recipes/) diff --git a/docs/ru/getting-started/example-datasets/wikistat.md b/docs/ru/getting-started/example-datasets/wikistat.md index c5a877ff8fd..f224c24e6ac 100644 --- a/docs/ru/getting-started/example-datasets/wikistat.md +++ b/docs/ru/getting-started/example-datasets/wikistat.md @@ -30,4 +30,3 @@ $ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/page $ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/wikistat/) diff --git a/docs/ru/getting-started/index.md b/docs/ru/getting-started/index.md index ab72ce4a1d2..599cb8b9434 100644 --- a/docs/ru/getting-started/index.md +++ b/docs/ru/getting-started/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u041d\u0430\u0447\u0430\u043b\u043e\u0020\u0440\u0430\u0431\u043e\u0442\u044b" +toc_folder_title: "Начало работы" toc_hidden: true toc_priority: 8 toc_title: hidden @@ -14,4 +14,3 @@ toc_title: hidden - [Пройти подробное руководство для начинающих](tutorial.md) - [Поэкспериментировать с тестовыми наборами данных](example-datasets/ontime.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/) diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index 04efe77712b..4ae27a910ea 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -1,6 +1,6 @@ --- toc_priority: 11 -toc_title: "\u0423\u0441\u0442\u0430\u043d\u043e\u0432\u043a\u0430" +toc_title: "Установка" --- # Установка {#ustanovka} @@ -173,4 +173,3 @@ SELECT 1 Для дальнейших экспериментов можно попробовать загрузить один из тестовых наборов данных или пройти [пошаговое руководство для 
начинающих](https://clickhouse.tech/tutorial.html). -[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/install/) diff --git a/docs/ru/getting-started/playground.md b/docs/ru/getting-started/playground.md index 86a5cd5272c..b51a9b2b436 100644 --- a/docs/ru/getting-started/playground.md +++ b/docs/ru/getting-started/playground.md @@ -36,10 +36,10 @@ ClickHouse Playground дает возможность поработать с [ - запрещены INSERT запросы Также установлены следующие опции: -- [max_result_bytes=10485760](../operations/settings/query_complexity/#max-result-bytes) -- [max_result_rows=2000](../operations/settings/query_complexity/#setting-max_result_rows) -- [result_overflow_mode=break](../operations/settings/query_complexity/#result-overflow-mode) -- [max_execution_time=60000](../operations/settings/query_complexity/#max-execution-time) +- [max_result_bytes=10485760](../operations/settings/query-complexity.md#max-result-bytes) +- [max_result_rows=2000](../operations/settings/query-complexity.md#setting-max_result_rows) +- [result_overflow_mode=break](../operations/settings/query-complexity.md#result-overflow-mode) +- [max_execution_time=60000](../operations/settings/query-complexity.md#max-execution-time) ## Примеры {#examples} diff --git a/docs/ru/getting-started/tutorial.md b/docs/ru/getting-started/tutorial.md index f5455ba2b9a..68b3e4dbae7 100644 --- a/docs/ru/getting-started/tutorial.md +++ b/docs/ru/getting-started/tutorial.md @@ -644,7 +644,7 @@ If there are no replicas at the moment on replicated table creation, a new first ``` sql CREATE TABLE tutorial.hits_replica (...) -ENGINE = ReplcatedMergeTree( +ENGINE = ReplicatedMergeTree( '/clickhouse_perftest/tables/{shard}/hits', '{replica}' ) diff --git a/docs/ru/guides/apply-catboost-model.md b/docs/ru/guides/apply-catboost-model.md index 026b4d9d75e..db2be63692f 100644 --- a/docs/ru/guides/apply-catboost-model.md +++ b/docs/ru/guides/apply-catboost-model.md @@ -1,6 +1,6 @@ --- toc_priority: 41 -toc_title: "\u041f\u0440\u0438\u043c\u0435\u043d\u0435\u043d\u0438\u0435\u0020\u043c\u043e\u0434\u0435\u043b\u0438\u0020\u0043\u0061\u0074\u0042\u006f\u006f\u0073\u0074\u0020\u0432\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Применение модели CatBoost в ClickHouse" --- # Применение модели CatBoost в ClickHouse {#applying-catboost-model-in-clickhouse} @@ -158,7 +158,9 @@ FROM amazon_train /home/catboost/data/libcatboostmodel.so /home/catboost/models/*_model.xml ``` - +!!! note "Примечание" + Вы можете позднее изменить путь к конфигурации модели CatBoost без перезагрузки сервера. + ## 4. Запустите вывод модели из SQL {#run-model-inference} Для тестирования модели запустите клиент ClickHouse `$ clickhouse client`. 
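+
+Например, вывод модели из SQL может выглядеть примерно так (это набросок: имя модели `amazon` и набор признаков таблицы `amazon_train` условны и должны соответствовать вашей конфигурации модели):
+
+``` sql
+SELECT
+    modelEvaluate('amazon',
+        RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2,
+        ROLE_DEPTNAME, ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) AS prediction
+FROM amazon_train
+LIMIT 10;
+```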
diff --git a/docs/ru/guides/index.md b/docs/ru/guides/index.md index 2c38de275a7..5b305a6a135 100644 --- a/docs/ru/guides/index.md +++ b/docs/ru/guides/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0420\u0443\u043A\u043E\u0432\u043E\u0434\u0441\u0442\u0432\u0430" +toc_folder_title: "Руководства" toc_priority: 38 -toc_title: "\u041E\u0431\u0437\u043E\u0440" +toc_title: "Обзор" --- # Руководства {#rukovodstva} diff --git a/docs/ru/index.md b/docs/ru/index.md index 7c129f9d113..e16f2afed82 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -1,9 +1,9 @@ --- toc_priority: 0 -toc_title: "\u041E\u0431\u0437\u043E\u0440" +toc_title: "Обзор" --- -# Что такое ClickHouse {#chto-takoe-clickhouse} +# Что такое ClickHouse {#what-is-clickhouse} ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP). @@ -97,4 +97,3 @@ ClickHouse - столбцовая система управления базам Стоит заметить, что для эффективности по CPU требуется, чтобы язык запросов был декларативным (SQL, MDX) или хотя бы векторным (J, K). То есть, чтобы запрос содержал циклы только в неявном виде, открывая возможности для оптимизации. -[Оригинальная статья](https://clickhouse.tech/docs/ru/) diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index b1d8c4f0732..277b73a6d36 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -1,6 +1,6 @@ --- toc_priority: 17 -toc_title: "\u041a\u043b\u0438\u0435\u043d\u0442\u0020\u043a\u043e\u043c\u0430\u043d\u0434\u043d\u043e\u0439\u0020\u0441\u0442\u0440\u043e\u043a\u0438" +toc_title: "Клиент командной строки" --- # Клиент командной строки {#klient-komandnoi-stroki} @@ -121,6 +121,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe - `--user, -u` — имя пользователя, по умолчанию — ‘default’. - `--password` — пароль, по умолчанию — пустая строка. - `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме. +- `--queries-file, -qf` - путь к файлу с запросами для выполнения. Необходимо указать только одну из опций: `query` или `queries-file`. - `--database, -d` — выбрать текущую БД. Без указания значение берется из настроек сервера (по умолчанию — БД ‘default’). - `--multiline, -m` — если указано — разрешить многострочные запросы, не отправлять запрос по нажатию Enter. - `--multiquery, -n` — если указано — разрешить выполнять несколько запросов, разделённых точкой с запятой. @@ -130,6 +131,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe - `--stacktrace` — если указано, в случае исключения, выводить также его стек-трейс. - `--config-file` — имя конфигурационного файла. - `--secure` — если указано, будет использован безопасный канал. +- `--history_file` - путь к файлу с историей команд. - `--param_` — значение параметра для [запроса с параметрами](#cli-queries-with-parameters). Начиная с версии 20.5, в `clickhouse-client` есть автоматическая подсветка синтаксиса (включена всегда). 
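+
+Например, новую опцию `--queries-file` можно опробовать примерно так (набросок; имя файла условное):
+
+``` bash
+$ echo "SELECT version();" > queries.sql
+$ clickhouse-client --queries-file queries.sql
+```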
@@ -153,4 +155,3 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/cli/) diff --git a/docs/ru/interfaces/cpp.md b/docs/ru/interfaces/cpp.md index 264b4f82500..f0691453fe6 100644 --- a/docs/ru/interfaces/cpp.md +++ b/docs/ru/interfaces/cpp.md @@ -1,10 +1,9 @@ --- toc_priority: 24 -toc_title: "\u0043\u002b\u002b\u0020\u043a\u043b\u0438\u0435\u043d\u0442\u0441\u043a\u0430\u044f\u0020\u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0430" +toc_title: "C++ клиентская библиотека" --- # C++ клиентская библиотека {#c-klientskaia-biblioteka} См. README в репозитории [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp). -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/cpp/) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 98426b489e8..f67997b58d6 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1,6 +1,6 @@ --- toc_priority: 21 -toc_title: "\u0424\u043e\u0440\u043c\u0430\u0442\u044b\u0020\u0432\u0445\u043e\u0434\u043d\u044b\u0445\u0020\u0438\u0020\u0432\u044b\u0445\u043e\u0434\u043d\u044b\u0445\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_title: "Форматы входных и выходных данных" --- # Форматы входных и выходных данных {#formats} @@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT | [Parquet](#data-format-parquet) | ✔ | ✔ | | [Arrow](#data-format-arrow) | ✔ | ✔ | | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | +| [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | @@ -1173,7 +1173,7 @@ ClickHouse поддерживает настраиваемую точность Неподдержанные типы данных Parquet: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных столбцов в ClickHouse могут отличаться от типов данных соответствующих полей файла в формате Parquet. При вставке данных, ClickHouse интерпретирует типы данных в соответствии с таблицей выше, а затем [приводит](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) данные к тому типу, который установлен для столбца таблицы. +Типы данных столбцов в ClickHouse могут отличаться от типов данных соответствующих полей файла в формате Parquet. При вставке данных, ClickHouse интерпретирует типы данных в соответствии с таблицей выше, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к тому типу, который установлен для столбца таблицы. ### Вставка и выборка данных {#vstavka-i-vyborka-dannykh} @@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_ ## ORC {#data-format-orc} -[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse. +[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/). ### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1} -Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`. 
+Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`. -| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | -|---------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) | +|---------------------------|-----------------------------------------------------|---------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`. +ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`. -Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. 
+Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. ### Вставка данных {#vstavka-dannykh-1} -Данные ORC можно вставить в таблицу ClickHouse командой: +Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида: ``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). +### Вывод данных {#vyvod-dannykh-1} +Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + +Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). ## LineAsString {#lineasstring} @@ -1268,7 +1276,7 @@ SELECT * FROM line_as_string; ## Regexp {#data-format-regexp} -Каждая строка импортируемых данных разбирается в соответствии с регулярным выражением. +Каждая строка импортируемых данных разбирается в соответствии с регулярным выражением. При работе с форматом `Regexp` можно использовать следующие параметры: @@ -1279,15 +1287,15 @@ SELECT * FROM line_as_string; - Escaped (как в [TSV](#tabseparated)) - Quoted (как в [Values](#data-format-values)) - Raw (данные импортируются как есть, без сериализации) -- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Признак, будет ли генерироваться исключение в случае, если импортируемые данные не соответствуют регулярному выражению `format_regexp`. Может принимать значение `0` или `1`. +- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Признак, будет ли генерироваться исключение в случае, если импортируемые данные не соответствуют регулярному выражению `format_regexp`. Может принимать значение `0` или `1`. -**Использование** +**Использование** -Регулярное выражение (шаблон) из параметра `format_regexp` применяется к каждой строке импортируемых данных. Количество частей в шаблоне (подшаблонов) должно соответствовать количеству колонок в импортируемых данных. +Регулярное выражение (шаблон) из параметра `format_regexp` применяется к каждой строке импортируемых данных. Количество частей в шаблоне (подшаблонов) должно соответствовать количеству колонок в импортируемых данных. -Строки импортируемых данных должны разделяться символом новой строки `'\n'` или символами `"\r\n"` (перенос строки в формате DOS). +Строки импортируемых данных должны разделяться символом новой строки `'\n'` или символами `"\r\n"` (перенос строки в формате DOS). -Данные, выделенные по подшаблонам, интерпретируются в соответствии с типом, указанным в параметре `format_regexp_escaping_rule`. +Данные, выделенные по подшаблонам, интерпретируются в соответствии с типом, указанным в параметре `format_regexp_escaping_rule`. Если строка импортируемых данных не соответствует регулярному выражению и параметр `format_regexp_skip_unmatched` равен 1, строка просто игнорируется. Если же параметр `format_regexp_skip_unmatched` равен 0, генерируется исключение. 
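+
+Ниже приведён набросок импорта в формате `Regexp` (имена таблицы и файла условные; настройки переданы как параметры командной строки `clickhouse-client`):
+
+``` bash
+# предполагается, что существует таблица imported (id UInt32, name String),
+# а файл data.txt содержит строки вида "id: 1 name: str1"
+$ cat data.txt | clickhouse-client \
+    --format_regexp 'id: (.+?) name: (.+?)' \
+    --format_regexp_escaping_rule 'Escaped' \
+    --format_regexp_skip_unmatched 0 \
+    --query "INSERT INTO imported FORMAT Regexp"
+```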
@@ -1390,4 +1398,3 @@ $ clickhouse-client --query "SELECT * FROM {some_table} FORMAT RawBLOB" | md5sum f9725a22f9191e064120d718e26862a9 - ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/formats/) diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 096fb6c05bc..9e553c12dc0 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -1,6 +1,6 @@ --- toc_priority: 19 -toc_title: "\u0048\u0054\u0054\u0050\u002d\u0438\u043d\u0442\u0435\u0440\u0444\u0435\u0439\u0441" +toc_title: "HTTP-интерфейс" --- # HTTP-интерфейс {#http-interface} @@ -149,28 +149,48 @@ $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- Для запросов, которые не возвращают таблицу с данными, в случае успеха, выдаётся пустое тело ответа. -Вы можете использовать внутренний формат сжатия Clickhouse при передаче данных. Формат сжатых данных нестандартный, и вам придётся использовать для работы с ним специальную программу `clickhouse-compressor` (устанавливается вместе с пакетом `clickhouse-client`). Для повышения эффективности вставки данных можно отключить проверку контрольной суммы на стороне сервера с помощью настройки[http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress). -Если вы указали `compress = 1` в URL, то сервер сжимает данные, которые он отправляет. -Если вы указали `decompress = 1` в URL, сервер распаковывает те данные, которые вы передаёте методом `POST`. +## Сжатие {#compression} -Также, можно использовать [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). Для отправки сжатого запроса `POST`, добавьте заголовок `Content-Encoding: compression_method`. Чтобы ClickHouse сжимал ответ, добавьте заголовок `Accept-Encoding: compression_method`. ClickHouse поддерживает следующие [методы сжатия](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): `gzip`, `br`, and `deflate`. Чтобы включить HTTP compression, используйте настройку ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression). Уровень сжатия данных для всех методов сжатия можно настроить с помощью настройки [http_zlib_compression_level](#settings-http_zlib_compression_level). +Сжатие можно использовать для уменьшения трафика по сети при передаче большого количества данных, а также для создания сразу сжатых дампов. -Это может быть использовано для уменьшения трафика по сети при передаче большого количества данных, а также для создания сразу сжатых дампов. +Вы можете использовать внутренний формат сжатия Clickhouse при передаче данных. Формат сжатых данных нестандартный, и вам придётся использовать для работы с ним специальную программу `clickhouse-compressor`. Она устанавливается вместе с пакетом `clickhouse-client`. Для повышения эффективности вставки данных можно отключить проверку контрольной суммы на стороне сервера с помощью настройки [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress). -Примеры отправки данных со сжатием: +Если вы указали `compress=1` в URL, то сервер сжимает данные, которые он отправляет. Если вы указали `decompress=1` в URL, сервер распаковывает те данные, которые вы передаёте методом `POST`. 
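+
+Например, получить ответ во внутреннем формате сжатия и распаковать его можно примерно так (набросок; предполагается, что утилита `clickhouse-compressor` установлена):
+
+``` bash
+$ curl -sS 'http://localhost:8123/?compress=1' \
+    -d 'SELECT number FROM system.numbers LIMIT 3' | clickhouse-compressor --decompress
+0
+1
+2
+```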
-``` bash -$ #Отправка данных на сервер: -$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' +Также можно использовать [сжатие HTTP](https://en.wikipedia.org/wiki/HTTP_compression). ClickHouse поддерживает следующие [методы сжатия](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): -$ #Отправка данных клиенту: -$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' -``` +- `gzip` +- `br` +- `deflate` +- `xz` + +Для отправки сжатого запроса `POST`, добавьте заголовок `Content-Encoding: compression_method`. +Чтобы ClickHouse сжимал ответ, разрешите сжатие настройкой [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) и добавьте заголовок `Accept-Encoding: compression_method`. Уровень сжатия данных для всех методов сжатия можно задать с помощью настройки [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level). !!! note "Примечание" Некоторые HTTP-клиенты могут по умолчанию распаковывать данные (`gzip` и `deflate`) с сервера в фоновом режиме и вы можете получить распакованные данные, даже если правильно используете настройки сжатия. +**Примеры** + +``` bash +# Отправка сжатых данных на сервер +$ echo "SELECT 1" | gzip -c | \ + curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +``` + +``` bash +# Получение сжатых данных с сервера +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' +$ zcat result.gz +0 +1 +2 +``` + +## База данных по умолчанию {#default-database} + Вы можете использовать параметр URL `database` или заголовок `X-ClickHouse-Database`, чтобы указать БД по умолчанию. ``` bash @@ -615,4 +635,3 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' * Connection #0 to host localhost left intact ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/http_interface/) diff --git a/docs/ru/interfaces/index.md b/docs/ru/interfaces/index.md index ea381c46206..12e8853823e 100644 --- a/docs/ru/interfaces/index.md +++ b/docs/ru/interfaces/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0418\u043D\u0442\u0435\u0440\u0444\u0435\u0439\u0441\u044B" +toc_folder_title: "Интерфейсы" toc_priority: 14 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" +toc_title: "Введение" --- # Интерфейсы {#interfaces} @@ -24,4 +24,3 @@ ClickHouse предоставляет два сетевых интерфейса - [Библиотеки для интеграции](third-party/integrations.md); - [Визуальные интерфейсы](third-party/gui.md). 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/) diff --git a/docs/ru/interfaces/jdbc.md b/docs/ru/interfaces/jdbc.md index 196dba64933..30270322f7a 100644 --- a/docs/ru/interfaces/jdbc.md +++ b/docs/ru/interfaces/jdbc.md @@ -1,6 +1,6 @@ --- toc_priority: 22 -toc_title: "\u004a\u0044\u0042\u0043\u002d\u0434\u0440\u0430\u0439\u0432\u0435\u0440" +toc_title: "JDBC-драйвер" --- # JDBC-драйвер {#jdbc-draiver} @@ -10,4 +10,3 @@ toc_title: "\u004a\u0044\u0042\u0043\u002d\u0434\u0440\u0430\u0439\u0432\u0435\u - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/jdbc/) diff --git a/docs/ru/interfaces/mysql.md b/docs/ru/interfaces/mysql.md index fa0003e0bea..925b1113109 100644 --- a/docs/ru/interfaces/mysql.md +++ b/docs/ru/interfaces/mysql.md @@ -1,6 +1,6 @@ --- toc_priority: 20 -toc_title: "\u004d\u0079\u0053\u0051\u004c\u002d\u0438\u043d\u0442\u0435\u0440\u0444\u0435\u0439\u0441" +toc_title: "MySQL-интерфейс" --- # MySQL-интерфейс {#mysql-interface} diff --git a/docs/ru/interfaces/odbc.md b/docs/ru/interfaces/odbc.md index 728c4bd6979..22153865298 100644 --- a/docs/ru/interfaces/odbc.md +++ b/docs/ru/interfaces/odbc.md @@ -1,6 +1,6 @@ --- toc_priority: 23 -toc_title: "\u004f\u0044\u0042\u0043\u002d\u0434\u0440\u0430\u0439\u0432\u0435\u0440" +toc_title: "ODBC-драйвер" --- @@ -8,4 +8,3 @@ toc_title: "\u004f\u0044\u0042\u0043\u002d\u0434\u0440\u0430\u0439\u0432\u0435\u - [Официальный драйвер](https://github.com/ClickHouse/clickhouse-odbc). -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/odbc/) diff --git a/docs/ru/interfaces/tcp.md b/docs/ru/interfaces/tcp.md index d89646f15b7..5261e1eafef 100644 --- a/docs/ru/interfaces/tcp.md +++ b/docs/ru/interfaces/tcp.md @@ -1,10 +1,9 @@ --- toc_priority: 18 -toc_title: "\u0420\u043e\u0434\u043d\u043e\u0439\u0020\u0438\u043d\u0442\u0435\u0440\u0444\u0435\u0439\u0441\u0020\u0028\u0054\u0043\u0050\u0029" +toc_title: "Родной интерфейс (TCP)" --- # Родной интерфейс (TCP) {#rodnoi-interfeis-tcp} Нативный протокол используется в [клиенте командной строки](cli.md), для взаимодействия между серверами во время обработки распределенных запросов, а также в других программах на C++. К сожалению, у родного протокола ClickHouse пока нет формальной спецификации, но в нем можно разобраться с использованием исходного кода ClickHouse (начиная с [примерно этого места](https://github.com/ClickHouse/ClickHouse/tree/master/src/Client)) и/или путем перехвата и анализа TCP трафика. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/tcp/) diff --git a/docs/ru/interfaces/third-party/client-libraries.md b/docs/ru/interfaces/third-party/client-libraries.md index 26e05b02509..411475f0aaa 100644 --- a/docs/ru/interfaces/third-party/client-libraries.md +++ b/docs/ru/interfaces/third-party/client-libraries.md @@ -1,6 +1,6 @@ --- toc_priority: 26 -toc_title: "\u041a\u043b\u0438\u0435\u043d\u0442\u0441\u043a\u0438\u0435\u0020\u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438\u0020\u043e\u0442\u0020\u0441\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0445\u0020\u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0447\u0438\u043a\u043e\u0432" +toc_title: "Клиентские библиотеки от сторонних разработчиков" --- # Клиентские библиотеки от сторонних разработчиков {#klientskie-biblioteki-ot-storonnikh-razrabotchikov} @@ -22,6 +22,7 @@ toc_title: "\u041a\u043b\u0438\u0435\u043d\u0442\u0441\u043a\u0438\u0435\u0020\u - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) - [SeasClick C++ client](https://github.com/SeasX/SeasClick) - [glushkovds/phpclickhouse-laravel](https://packagist.org/packages/glushkovds/phpclickhouse-laravel) + - [kolya7k ClickHouse PHP extension](https://github.com//kolya7k/clickhouse-php) - Go - [clickhouse](https://github.com/kshvakov/clickhouse/) - [go-clickhouse](https://github.com/roistat/go-clickhouse) @@ -57,4 +58,3 @@ toc_title: "\u041a\u043b\u0438\u0435\u043d\u0442\u0441\u043a\u0438\u0435\u0020\u - Nim - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/client_libraries/) diff --git a/docs/ru/interfaces/third-party/gui.md b/docs/ru/interfaces/third-party/gui.md index 1fabdb8a31c..156f7130bc5 100644 --- a/docs/ru/interfaces/third-party/gui.md +++ b/docs/ru/interfaces/third-party/gui.md @@ -1,6 +1,6 @@ --- toc_priority: 28 -toc_title: "\u0412\u0438\u0437\u0443\u0430\u043b\u044c\u043d\u044b\u0435\u0020\u0438\u043d\u0442\u0435\u0440\u0444\u0435\u0439\u0441\u044b\u0020\u043e\u0442\u0020\u0441\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0445\u0020\u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0447\u0438\u043a\u043e\u0432" +toc_title: "Визуальные интерфейсы от сторонних разработчиков" --- @@ -103,7 +103,11 @@ toc_title: "\u0412\u0438\u0437\u0443\u0430\u043b\u044c\u043d\u044b\u0435\u0020\u [xeus-clickhouse](https://github.com/wangfenjin/xeus-clickhouse) — это ядро Jupyter для ClickHouse, которое поддерживает запрос ClickHouse-данных с использованием SQL в Jupyter. -## Коммерческие {#kommercheskie} +### MindsDB Studio {#mindsdb} + +[MindsDB](https://mindsdb.com/) — это продукт с открытым исходным кодом, реализующий слой искусственного интеллекта (Artificial Intelligence, AI) для различных СУБД, в том числе для ClickHouse. MindsDB облегчает процессы создания, обучения и развертывания современных моделей машинного обучения. Графический пользовательский интерфейс MindsDB Studio позволяет обучать новые модели на основе данных в БД, интерпретировать сделанные моделями прогнозы, выявлять потенциальные ошибки в данных, визуализировать и оценивать достоверность моделей с помощью функции Explainable AI, так чтобы вы могли быстрее адаптировать и настраивать ваши модели машинного обучения. + +## Коммерческие {#commercial} ### DataGrip {#datagrip} @@ -146,7 +150,6 @@ toc_title: "\u0412\u0438\u0437\u0443\u0430\u043b\u044c\u043d\u044b\u0435\u0020\u - Подготовка данных и возможности ETL. 
- Моделирование данных с помощью SQL для их реляционного отображения. -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/gui/) ### Looker {#looker} @@ -163,4 +166,19 @@ toc_title: "\u0412\u0438\u0437\u0443\u0430\u043b\u044c\u043d\u044b\u0435\u0020\u [Как сконфигурировать ClickHouse в Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) -[Original article](https://clickhouse.tech/docs/ru/interfaces/third-party/gui/) +### SeekTable {#seektable} + +[SeekTable](https://www.seektable.com) — это аналитический инструмент для самостоятельного анализа и обработки данных бизнес-аналитики. Он доступен как в виде облачного сервиса, так и в виде локальной версии. Отчеты из SeekTable могут быть встроены в любое веб-приложение. + +Основные возможности: + +- Удобный конструктор отчетов. +- Гибкая настройка отчетов SQL и создание запросов для специфичных отчетов. +- Интегрируется с ClickHouse, используя собственную точку приема запроса TCP/IP или интерфейс HTTP(S) (два разных драйвера). +- Поддерживает всю мощь диалекта ClickHouse SQL для построения запросов по различным измерениям и показателям. +- [WEB-API](https://www.seektable.com/help/web-api-integration) для автоматизированной генерации отчетов. +- Процесс разработки отчетов поддерживает [резервное копирование/восстановление данных](https://www.seektable.com/help/self-hosted-backup-restore); конфигурация моделей данных (кубов) / отчетов представляет собой удобочитаемый XML-файл, который может храниться в системе контроля версий. + +SeekTable [бесплатен](https://www.seektable.com/help/cloud-pricing) для личного/индивидуального использования. + +[Как сконфигурировать подключение ClickHouse в SeekTable.](https://www.seektable.com/help/clickhouse-pivot-table) diff --git a/docs/ru/interfaces/third-party/index.md b/docs/ru/interfaces/third-party/index.md index a57169df73b..bbf5a237000 100644 --- a/docs/ru/interfaces/third-party/index.md +++ b/docs/ru/interfaces/third-party/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0421\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0435\u0020\u0438\u043d\u0442\u0435\u0440\u0444\u0435\u0439\u0441\u044b" +toc_folder_title: "Сторонние интерфейсы" toc_priority: 24 --- @@ -15,4 +15,3 @@ toc_priority: 24 !!! note "Примечание" С ClickHouse работают также универсальные инструменты, поддерживающие общий API, такие как [ODBC](../../interfaces/odbc.md) или [JDBC](../../interfaces/jdbc.md). 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/) diff --git a/docs/ru/interfaces/third-party/integrations.md b/docs/ru/interfaces/third-party/integrations.md index 60d6181ab3f..198e9d6be76 100644 --- a/docs/ru/interfaces/third-party/integrations.md +++ b/docs/ru/interfaces/third-party/integrations.md @@ -1,6 +1,6 @@ --- toc_priority: 27 -toc_title: "\u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438\u0020\u0434\u043b\u044f\u0020\u0438\u043d\u0442\u0435\u0433\u0440\u0430\u0446\u0438\u0438\u0020\u043e\u0442\u0020\u0441\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0445\u0020\u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0447\u0438\u043a\u043e\u0432" +toc_title: "Библиотеки для интеграции от сторонних разработчиков" --- # Библиотеки для интеграции от сторонних разработчиков {#biblioteki-dlia-integratsii-ot-storonnikh-razrabotchikov} @@ -69,6 +69,9 @@ toc_title: "\u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438\u0020\u - Гео - [MaxMind](https://dev.maxmind.com/geoip/) - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +- AutoML + - [MindsDB](https://mindsdb.com/) + - [MindsDB](https://github.com/mindsdb/mindsdb) - Слой предиктивной аналитики и искусственного интеллекта для СУБД ClickHouse. ## Экосистемы вокруг языков программирования {#ekosistemy-vokrug-iazykov-programmirovaniia} @@ -105,4 +108,3 @@ toc_title: "\u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438\u0020\u - [GraphQL](https://github.com/graphql) - [activecube-graphql](https://github.com/bitquery/activecube-graphql) -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/integrations/) diff --git a/docs/ru/interfaces/third-party/proxy.md b/docs/ru/interfaces/third-party/proxy.md index fc66ecde293..6d85c960c0e 100644 --- a/docs/ru/interfaces/third-party/proxy.md +++ b/docs/ru/interfaces/third-party/proxy.md @@ -1,6 +1,6 @@ --- toc_priority: 29 -toc_title: "\u041f\u0440\u043e\u043a\u0441\u0438\u002d\u0441\u0435\u0440\u0432\u0435\u0440\u044b\u0020\u043e\u0442\u0020\u0441\u0442\u043e\u0440\u043e\u043d\u043d\u0438\u0445\u0020\u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u0447\u0438\u043a\u043e\u0432" +toc_title: "Прокси-серверы от сторонних разработчиков" --- # Прокси-серверы от сторонних разработчиков {#proksi-servery-ot-storonnikh-razrabotchikov} @@ -41,4 +41,3 @@ toc_title: "\u041f\u0440\u043e\u043a\u0441\u0438\u002d\u0441\u0435\u0440\u0432\u Реализован на Go. -[Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/proxy/) diff --git a/docs/ru/introduction/distinctive-features.md b/docs/ru/introduction/distinctive-features.md index 4eeeef4a443..dedb1412dbf 100644 --- a/docs/ru/introduction/distinctive-features.md +++ b/docs/ru/introduction/distinctive-features.md @@ -1,6 +1,6 @@ --- toc_priority: 4 -toc_title: "\u041e\u0442\u043b\u0438\u0447\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0435\u0020\u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0441\u0442\u0438\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Отличительные возможности ClickHouse" --- # Отличительные возможности ClickHouse {#otlichitelnye-vozmozhnosti-clickhouse} @@ -73,4 +73,3 @@ ClickHouse предоставляет различные способы разм 3. Разреженный индекс делает ClickHouse плохо пригодным для точечных чтений одиночных строк по своим ключам. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/introduction/distinctive_features/) diff --git a/docs/ru/introduction/history.md b/docs/ru/introduction/history.md index ab740954bbe..dc4aa935c27 100644 --- a/docs/ru/introduction/history.md +++ b/docs/ru/introduction/history.md @@ -1,6 +1,6 @@ --- toc_priority: 7 -toc_title: "\u0418\u0441\u0442\u043e\u0440\u0438\u044f\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "История ClickHouse" --- @@ -52,4 +52,3 @@ OLAPServer хорошо подходил для неагрегированных Чтобы снять ограничения OLAPServer-а и решить задачу работы с неагрегированными данными для всех отчётов, разработана СУБД ClickHouse. -[Оригинальная статья](https://clickhouse.tech/docs/ru/introduction/ya_metrika_task/) diff --git a/docs/ru/introduction/index.md b/docs/ru/introduction/index.md index 28a8e10e15b..99f8aad0531 100644 --- a/docs/ru/introduction/index.md +++ b/docs/ru/introduction/index.md @@ -1,6 +1,4 @@ --- -toc_folder_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" +toc_folder_title: "Введение" toc_priority: 1 --- - - diff --git a/docs/ru/introduction/info.md b/docs/ru/introduction/info.md index a9398b8c9cd..a5e7efffc7e 100644 --- a/docs/ru/introduction/info.md +++ b/docs/ru/introduction/info.md @@ -9,4 +9,3 @@ toc_priority: 100 - Адрес электронной почты: - Телефон: +7-495-780-6510 -[Оригинальная статья](https://clickhouse.tech/docs/ru/introduction/info/) diff --git a/docs/ru/introduction/performance.md b/docs/ru/introduction/performance.md index c449e76a6ea..eec1dcf4d0a 100644 --- a/docs/ru/introduction/performance.md +++ b/docs/ru/introduction/performance.md @@ -1,6 +1,6 @@ --- toc_priority: 6 -toc_title: "\u041f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c" +toc_title: "Производительность" --- # Производительность {#proizvoditelnost} @@ -27,4 +27,3 @@ toc_title: "\u041f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0438\u0442\u0435\u Данные рекомендуется вставлять пачками не менее 1000 строк или не более одного запроса в секунду. При вставке в таблицу типа MergeTree из tab-separated дампа, скорость вставки будет в районе 50-200 МБ/сек. Если вставляются строчки размером около 1 КБ, то скорость будет в районе 50 000 - 200 000 строчек в секунду. Если строчки маленькие - производительность в строчках в секунду будет выше (на данных БК - `>` 500 000 строк в секунду, на данных Graphite - `>` 1 000 000 строк в секунду). Для увеличения производительности, можно производить несколько запросов INSERT параллельно - при этом производительность растёт линейно. -[Оригинальная статья](https://clickhouse.tech/docs/ru/introduction/performance/) diff --git a/docs/ru/operations/access-rights.md b/docs/ru/operations/access-rights.md index 00e55da7a82..a0ad7664131 100644 --- a/docs/ru/operations/access-rights.md +++ b/docs/ru/operations/access-rights.md @@ -1,6 +1,6 @@ --- toc_priority: 48 -toc_title: "\u0423\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u0438\u0435\u0020\u0434\u043e\u0441\u0442\u0443\u043f\u043e\u043c" +toc_title: "Управление доступом" --- # Управление доступом {#access-control} @@ -146,4 +146,3 @@ ClickHouse поддерживает управление доступом на По умолчанию управление доступом на основе SQL выключено для всех пользователей. Вам необходимо настроить хотя бы одного пользователя в файле конфигурации `users.xml` и присвоить значение 1 параметру [access_management](settings/settings-users.md#access_management-user-setting). 
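+
+Например, для пользователя `default` это может выглядеть примерно так (набросок фрагмента `users.xml`):
+
+```xml
+<users>
+    <default>
+        <!-- ... пароль, профиль и другие настройки пользователя ... -->
+        <access_management>1</access_management>
+    </default>
+</users>
+```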
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/access_rights/) diff --git a/docs/ru/operations/backup.md b/docs/ru/operations/backup.md index 165b54d9b62..ed0adeb5e6f 100644 --- a/docs/ru/operations/backup.md +++ b/docs/ru/operations/backup.md @@ -1,6 +1,6 @@ --- toc_priority: 49 -toc_title: "\u0420\u0435\u0437\u0435\u0440\u0432\u043d\u043e\u0435\u0020\u043a\u043e\u043f\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_title: "Резервное копирование данных" --- # Резервное копирование данных {#rezervnoe-kopirovanie-dannykh} @@ -36,4 +36,3 @@ ClickHouse позволяет использовать запрос `ALTER TABLE Для автоматизации этого подхода доступен инструмент от сторонних разработчиков: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/backup/) diff --git a/docs/ru/operations/caches.md b/docs/ru/operations/caches.md new file mode 100644 index 00000000000..a0b71d1782a --- /dev/null +++ b/docs/ru/operations/caches.md @@ -0,0 +1,28 @@ +--- +toc_priority: 65 +toc_title: Кеши +--- + +# Типы кеша {#cache-types} + +При выполнении запросов ClickHouse использует различные типы кеша. + +Основные типы кеша: + +- `mark_cache` — кеш засечек, используемых движками таблиц семейства [MergeTree](../engines/table-engines/mergetree-family/mergetree.md). +- `uncompressed_cache` — кеш несжатых данных, используемых движками таблиц семейства [MergeTree](../engines/table-engines/mergetree-family/mergetree.md). + +Дополнительные типы кеша: + +- DNS-кеш. +- Кеш данных формата [regexp](../interfaces/formats.md#data-format-regexp). +- Кеш скомпилированных выражений. +- Кеш схем формата [Avro](../interfaces/formats.md#data-format-avro). +- Кеш данных в [словарях](../sql-reference/dictionaries/index.md). + +Непрямое использование: + +- Кеш страницы ОС. + +Чтобы очистить кеш, используйте выражение [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md). + diff --git a/docs/ru/operations/configuration-files.md b/docs/ru/operations/configuration-files.md index a4cc9182427..11a01d1e6d2 100644 --- a/docs/ru/operations/configuration-files.md +++ b/docs/ru/operations/configuration-files.md @@ -1,6 +1,6 @@ --- toc_priority: 50 -toc_title: "\u041a\u043e\u043d\u0444\u0438\u0433\u0443\u0440\u0430\u0446\u0438\u043e\u043d\u043d\u044b\u0435\u0020\u0444\u0430\u0439\u043b\u044b" +toc_title: "Конфигурационные файлы" --- @@ -52,4 +52,3 @@ $ cat /etc/clickhouse-server/users.d/alice.xml Сервер следит за изменениями конфигурационных файлов, а также файлов и ZooKeeper-узлов, которые были использованы при выполнении подстановок и переопределений, и перезагружает настройки пользователей и кластеров на лету. То есть, можно изменять кластера, пользователей и их настройки без перезапуска сервера. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/configuration_files/)
diff --git a/docs/ru/operations/external-authenticators/index.md b/docs/ru/operations/external-authenticators/index.md
new file mode 100644
index 00000000000..c2ed9750562
--- /dev/null
+++ b/docs/ru/operations/external-authenticators/index.md
@@ -0,0 +1,16 @@
+---
+toc_folder_title: "Внешние аутентификаторы пользователей и каталоги"
+toc_priority: 48
+toc_title: "Введение"
+---
+
+# Внешние аутентификаторы пользователей и каталоги {#external-authenticators}
+
+ClickHouse поддерживает аутентификацию и управление пользователями при помощи внешних сервисов.
+
+Поддерживаются следующие внешние аутентификаторы и каталоги:
+
+- [LDAP](./ldap.md#external-authenticators-ldap) [аутентификатор](./ldap.md#ldap-external-authenticator) и [каталог](./ldap.md#ldap-external-user-directory)
+- Kerberos [аутентификатор](./kerberos.md#external-authenticators-kerberos)
+
+[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/external-authenticators/index/)
diff --git a/docs/ru/operations/external-authenticators/kerberos.md b/docs/ru/operations/external-authenticators/kerberos.md
new file mode 100644
index 00000000000..b90714d14fd
--- /dev/null
+++ b/docs/ru/operations/external-authenticators/kerberos.md
@@ -0,0 +1,118 @@
+# Kerberos {#external-authenticators-kerberos}
+
+ClickHouse предоставляет возможность аутентификации существующих (и правильно сконфигурированных) пользователей с использованием Kerberos.
+
+В настоящее время возможно использование Kerberos только как внешнего аутентификатора, то есть для аутентификации уже существующих пользователей с помощью Kerberos. Пользователи, настроенные для Kerberos-аутентификации, могут работать с ClickHouse только через HTTP-интерфейс, причём сами клиенты должны иметь возможность аутентификации с использованием механизма GSS-SPNEGO.
+
+!!! info "Примечание"
+    Для Kerberos-аутентификации необходимо предварительно корректно настроить Kerberos на стороне клиента, на сервере и в конфигурационных файлах самого ClickHouse. Ниже описана лишь конфигурация ClickHouse.
+
+## Настройка Kerberos в ClickHouse {#enabling-kerberos-in-clickhouse}
+
+Для того, чтобы задействовать Kerberos-аутентификацию в ClickHouse, в первую очередь необходимо добавить одну-единственную секцию `kerberos` в `config.xml`.
+
+В секции могут быть указаны дополнительные параметры:
+
+- `principal` — задаёт имя принципала (canonical service principal name, SPN), используемое при авторизации ClickHouse на Kerberos-сервере.
+    - Это опциональный параметр, при его отсутствии будет использовано стандартное имя.
+
+- `realm` — обеспечивает фильтрацию по реалм (realm). Пользователям, чей реалм не совпадает с указанным, будет отказано в аутентификации.
+    - Это опциональный параметр, при его отсутствии фильтр по реалм применяться не будет.
+
+Примеры, как должен выглядеть файл `config.xml`:
+
+```xml
+<yandex>
+    <!-- ... -->
+    <kerberos />
+</yandex>
+```
+
+Или, с указанием принципала:
+
+```xml
+<yandex>
+    <!-- ... -->
+    <kerberos>
+        <principal>HTTP/clickhouse.example.com@EXAMPLE.COM</principal>
+    </kerberos>
+</yandex>
+```
+
+Или, с фильтрацией по реалм:
+
+```xml
+<yandex>
+    <!-- ... -->
+    <kerberos>
+        <realm>EXAMPLE.COM</realm>
+    </kerberos>
+</yandex>
+```
+
+!!! warning "Важно"
+    В конфигурационном файле не могут быть указаны одновременно оба параметра.
+    В противном случае, аутентификация с помощью Kerberos будет недоступна для всех пользователей.
+
+!!! warning "Важно"
+    В конфигурационном файле может быть не более одной секции `kerberos`. В противном случае, аутентификация с помощью Kerberos будет отключена для всех пользователей.
+
+## Аутентификация пользователей с помощью Kerberos {#kerberos-as-an-external-authenticator-for-existing-users}
+
+Уже существующие пользователи могут воспользоваться аутентификацией с помощью Kerberos. Однако Kerberos-аутентификация возможна только при использовании HTTP-интерфейса.
+
+Имя принципала (principal name) обычно имеет вид:
+
+- *primary/instance@REALM*
+
+Для успешной аутентификации необходимо, чтобы *primary* совпало с именем пользователя ClickHouse, настроенного для использования Kerberos.
+
+### Настройка Kerberos в `users.xml` {#enabling-kerberos-in-users-xml}
+
+Для того, чтобы пользователь имел возможность производить аутентификацию с помощью Kerberos, достаточно включить секцию `kerberos` в описание пользователя в `users.xml` (например, вместо секции `password` или аналогичной ей).
+
+В секции могут быть указаны дополнительные параметры:
+
+- `realm` — обеспечивает фильтрацию по реалм (realm): аутентификация будет возможна только при совпадении реалм клиента с указанным.
+    - Этот параметр является опциональным, при его отсутствии фильтрация применяться не будет.
+
+Пример, как выглядит конфигурация Kerberos в `users.xml`:
+
+```xml
+<yandex>
+    <!-- ... -->
+    <users>
+        <!-- ... -->
+        <my_user>
+            <!-- ... -->
+            <kerberos>
+                <realm>EXAMPLE.COM</realm>
+            </kerberos>
+        </my_user>
+    </users>
+</yandex>
+```
+
+!!! warning "Важно"
+    Если пользователь настроен для Kerberos-аутентификации, другие виды аутентификации будут для него недоступны. Если наряду с `kerberos` в определении пользователя будет указан какой-либо другой способ аутентификации, ClickHouse завершит работу.
+
+!!! info ""
+    Ещё раз отметим, что кроме `users.xml`, необходимо также включить Kerberos в `config.xml`.
+
+### Настройка Kerberos через SQL {#enabling-kerberos-using-sql}
+
+Пользователей, использующих Kerberos-аутентификацию, можно создать не только с помощью изменения конфигурационных файлов.
+Если SQL-ориентированное управление доступом включено в ClickHouse, можно также создать пользователя, работающего через Kerberos, с помощью SQL.
+
+```sql
+CREATE USER my_user IDENTIFIED WITH kerberos REALM 'EXAMPLE.COM'
+```
+
+Или, без фильтрации по реалм:
+
+```sql
+CREATE USER my_user IDENTIFIED WITH kerberos
+```
diff --git a/docs/ru/operations/external-authenticators/ldap.md b/docs/ru/operations/external-authenticators/ldap.md
new file mode 100644
index 00000000000..312020000ea
--- /dev/null
+++ b/docs/ru/operations/external-authenticators/ldap.md
@@ -0,0 +1,148 @@
+# LDAP {#external-authenticators-ldap}
+
+Для аутентификации пользователей ClickHouse можно использовать сервер LDAP. Существуют два подхода:
+
+- Использовать LDAP как внешний аутентификатор для существующих пользователей, которые определены в `users.xml` или в локальных параметрах управления доступом.
+- Использовать LDAP как внешний пользовательский каталог и разрешить аутентификацию локально неопределенных пользователей, если они есть на LDAP сервере.
+
+Для обоих подходов необходимо определить внутреннее имя LDAP сервера в конфигурации ClickHouse, чтобы другие параметры конфигурации могли ссылаться на это имя.
+
+## Определение LDAP сервера {#ldap-server-definition}
+
+Чтобы определить LDAP сервер, необходимо добавить секцию `ldap_servers` в `config.xml`.
+ +**Пример** + +```xml + + + + + localhost + 636 + uid={user_name},ou=users,dc=example,dc=com + 300 + yes + tls1.2 + demand + /path/to/tls_cert_file + /path/to/tls_key_file + /path/to/tls_ca_cert_file + /path/to/tls_ca_cert_dir + ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384 + + + +``` + +Обратите внимание, что можно определить несколько LDAP серверов внутри секции `ldap_servers`, используя различные имена. + +**Параметры** + +- `host` — имя хоста сервера LDAP или его IP. Этот параметр обязательный и не может быть пустым. +- `port` — порт сервера LDAP. Если настройка `enable_tls` равна `true`, то по умолчанию используется порт `636`, иначе — порт `389`. +- `bind_dn` — шаблон для создания DN для привязки. + - При формировании DN все подстроки `{user_name}` в шаблоне будут заменяться на фактическое имя пользователя при каждой попытке аутентификации. +- `verification_cooldown` — промежуток времени (в секундах) после успешной попытки привязки, в течение которого пользователь будет считаться аутентифицированным и сможет выполнять запросы без повторного обращения к серверам LDAP. + - Чтобы отключить кеширование и заставить обращаться к серверу LDAP для каждого запроса аутентификации, укажите `0` (значение по умолчанию). +- `enable_tls` — флаг, включающий использование защищенного соединения с сервером LDAP. + - Укажите `no` для использования текстового протокола `ldap://` (не рекомендовано). + - Укажите `yes` для обращения к LDAP по протоколу SSL/TLS `ldaps://` (рекомендовано, используется по умолчанию). + - Укажите `starttls` для использования устаревшего протокола StartTLS (текстовый `ldap://` протокол, модернизированный до TLS). +- `tls_minimum_protocol_version` — минимальная версия протокола SSL/TLS. + - Возможные значения: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (по-умолчанию). +- `tls_require_cert` — поведение при проверке сертификата SSL/TLS. + - Возможные значения: `never`, `allow`, `try`, `demand` (по-умолчанию). +- `tls_cert_file` — путь к файлу сертификата. +- `tls_key_file` — путь к файлу ключа сертификата. +- `tls_ca_cert_file` — путь к файлу ЦС (certification authority) сертификата. +- `tls_ca_cert_dir` — путь к каталогу, содержащему сертификаты ЦС. +- `tls_cipher_suite` — разрешенный набор шифров (в нотации OpenSSL). + +## Внешний аутентификатор LDAP {#ldap-external-authenticator} + +Удаленный сервер LDAP можно использовать для верификации паролей локально определенных пользователей (пользователей, которые определены в `users.xml` или в локальных параметрах управления доступом). Для этого укажите имя определенного ранее сервера LDAP вместо `password` или другой аналогичной секции в настройках пользователя. + +При каждой попытке авторизации ClickHouse пытается "привязаться" к DN, указанному в [определении LDAP сервера](#ldap-server-definition), используя параметр `bind_dn` и предоставленные реквизиты для входа. Если попытка оказалась успешной, пользователь считается аутентифицированным. Обычно это называют методом "простой привязки". + +**Пример** + +```xml + + + + + + + + my_ldap_server + + + + +``` + +Обратите внимание, что пользователь `my_user` ссылается на `my_ldap_server`. Этот LDAP сервер должен быть настроен в основном файле `config.xml`, как это было описано ранее. + +При включенном SQL-ориентированном [управлении доступом](../access-rights.md#access-control) пользователи, аутентифицированные LDAP серверами, могут также быть созданы запросом [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement). 
+ +Запрос: + +```sql +CREATE USER my_user IDENTIFIED WITH ldap SERVER 'my_ldap_server'; +``` + +## Внешний пользовательский каталог LDAP {#ldap-external-user-directory} + +В дополнение к локально определенным пользователям, удаленный LDAP сервер может служить источником определения пользователей. Для этого укажите имя определенного ранее сервера LDAP (см. [Определение LDAP сервера](#ldap-server-definition)) в секции `ldap` внутри секции `users_directories` файла `config.xml`. + +При каждой попытке аутентификации ClickHouse пытается локально найти определение пользователя и аутентифицировать его как обычно. Если пользователь не находится локально, ClickHouse предполагает, что он определяется во внешнем LDAP каталоге и пытается "привязаться" к DN, указанному на LDAP сервере, используя предоставленные реквизиты для входа. Если попытка оказалась успешной, пользователь считается существующим и аутентифицированным. Пользователю присваиваются роли из списка, указанного в секции `roles`. Кроме того, если настроена секция `role_mapping`, то выполняется LDAP поиск, а его результаты преобразуются в имена ролей и присваиваются пользователям. Все это работает при условии, что SQL-ориентированное [управлением доступом](../access-rights.md#access-control) включено, а роли созданы запросом [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement). + +**Пример** + +В `config.xml`. + +```xml + + + + + + my_ldap_server + + + + + + ou=groups,dc=example,dc=com + subtree + (&(objectClass=groupOfNames)(member={bind_dn})) + cn + clickhouse_ + + + + +``` + +Обратите внимание, что `my_ldap_server`, указанный в секции `ldap` внутри секции `user_directories`, должен быть настроен в файле `config.xml`, как это было описано ранее. (см. [Определение LDAP сервера](#ldap-server-definition)). + +**Параметры** + +- `server` — имя одного из серверов LDAP, определенных в секции `ldap_servers` в файле конфигурации (см.выше). Этот параметр обязательный и не может быть пустым. +- `roles` — секция со списком локально определенных ролей, которые будут присвоены каждому пользователю, полученному от сервера LDAP. + - Если роли не указаны ни здесь, ни в секции `role_mapping` (см. ниже), пользователь после аутентификации не сможет выполнять никаких действий. +- `role_mapping` — секция c параметрами LDAP поиска и правилами отображения. + - При аутентификации пользователя, пока еще связанного с LDAP, производится LDAP поиск с помощью `search_filter` и имени этого пользователя. Для каждой записи, найденной в ходе поиска, выделяется значение указанного атрибута. У каждого атрибута, имеющего указанный префикс, этот префикс удаляется, а остальная часть значения становится именем локальной роли, определенной в ClickHouse, причем предполагается, что эта роль была ранее создана запросом [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) до этого. + - Внутри одной секции `ldap` может быть несколько секций `role_mapping`. Все они будут применены. + - `base_dn` — шаблон, который используется для создания базового DN для LDAP поиска. + - При формировании DN все подстроки `{user_name}` и `{bind_dn}` в шаблоне будут заменяться на фактическое имя пользователя и DN привязки соответственно при каждом LDAP поиске. + - `scope` — Область LDAP поиска. + - Возможные значения: `base`, `one_level`, `children`, `subtree` (по умолчанию). + - `search_filter` — шаблон, который используется для создания фильтра для каждого LDAP поиска. 
+ - при формировании фильтра все подстроки `{user_name}`, `{bind_dn}` и `{base_dn}` в шаблоне будут заменяться на фактическое имя пользователя, DN привязки и базовый DN соответственно при каждом LDAP поиске. + - Обратите внимание, что специальные символы должны быть правильно экранированы в XML. + - `attribute` — имя атрибута, значение которого будет возвращаться LDAP поиском. + - `prefix` — префикс, который, как предполагается, будет находиться перед началом каждой строки в исходном списке строк, возвращаемых LDAP поиском. Префикс будет удален из исходных строк, а сами они будут рассматриваться как имена локальных ролей. По умолчанию: пустая строка. + +[Оригинальная статья](https://clickhouse.tech/docs/en/operations/external-authenticators/ldap) diff --git a/docs/ru/operations/index.md b/docs/ru/operations/index.md index 74a1d135967..88212e6804f 100644 --- a/docs/ru/operations/index.md +++ b/docs/ru/operations/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u042d\u043a\u0441\u043f\u043b\u0443\u0430\u0442\u0430\u0446\u0438\u044f" +toc_folder_title: "Эксплуатация" toc_priority: 41 -toc_title: "\u042d\u043a\u0441\u043f\u043b\u0443\u0430\u0442\u0430\u0446\u0438\u044f" +toc_title: "Эксплуатация" --- # Эксплуатация {#operations} @@ -23,4 +23,3 @@ toc_title: "\u042d\u043a\u0441\u043f\u043b\u0443\u0430\u0442\u0430\u0446\u0438\u - [Настройки](settings/index.md#settings) - [Утилиты](utilities/index.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/) diff --git a/docs/ru/operations/monitoring.md b/docs/ru/operations/monitoring.md index 52d0b5ecc8a..da51d27ded2 100644 --- a/docs/ru/operations/monitoring.md +++ b/docs/ru/operations/monitoring.md @@ -1,6 +1,6 @@ --- toc_priority: 45 -toc_title: "\u041c\u043e\u043d\u0438\u0442\u043e\u0440\u0438\u043d\u0433" +toc_title: "Мониторинг" --- # Мониторинг {#monitoring} @@ -43,4 +43,3 @@ ClickHouse собирает: Для мониторинга серверов в кластерной конфигурации необходимо установить параметр [max_replica_delay_for_distributed_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) и использовать HTTP ресурс `/replicas_status`. Если реплика доступна и не отстаёт от других реплик, то запрос к `/replicas_status` возвращает `200 OK`. Если реплика отстаёт, то запрос возвращает `503 HTTP_SERVICE_UNAVAILABLE`, включая информацию о размере отставания. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/monitoring) diff --git a/docs/ru/operations/opentelemetry.md b/docs/ru/operations/opentelemetry.md new file mode 100644 index 00000000000..073e7c67e9c --- /dev/null +++ b/docs/ru/operations/opentelemetry.md @@ -0,0 +1,36 @@ +--- +toc_priority: 62 +toc_title: Поддержка OpenTelemetry +--- + +# [экспериментально] Поддержка OpenTelemetry + +ClickHouse поддерживает [OpenTelemetry](https://opentelemetry.io/) — открытый стандарт для сбора трассировок и метрик из распределенного приложения. + +!!! warning "Предупреждение" +Поддержка стандарта экспериментальная и будет со временем меняться. + +## Обеспечение поддержки контекста трассировки в ClickHouse + +ClickHouse принимает контекстную информацию трассировки через HTTP заголовок `tracecontext`, как описано в [рекомендации W3C](https://www.w3.org/TR/trace-context/). Также он принимает контекстную информацию через нативный протокол, который используется для связи между серверами ClickHouse или между клиентом и сервером. 
Для ручного тестирования стандартный заголовок `tracecontext`, содержащий контекст трассировки, может быть передан в `clickhouse-client` через флаги: `--opentelemetry-traceparent` и `--opentelemetry-tracestate`. + +Если входящий контекст трассировки не указан, ClickHouse может начать трассировку с вероятностью, задаваемой настройкой [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability). + +## Распространение контекста трассировки + +Контекст трассировки распространяется на нижестоящие сервисы в следующих случаях: + +* При использовании запросов к удаленным серверам ClickHouse, например, при использовании движка таблиц [Distributed](../engines/table-engines/special/distributed.md). + +* При использовании табличной функции [url](../sql-reference/table-functions/url.md). Информация контекста трассировки передается в HTTP заголовки. + +## Как ClickHouse выполняет трассировку + +ClickHouse создает `trace spans` для каждого запроса и некоторых этапов выполнения запроса, таких как планирование запросов или распределенные запросы. + +Чтобы анализировать информацию трассировки, ее следует экспортировать в систему мониторинга, поддерживающую OpenTelemetry, такую как [Jaeger](https://jaegertracing.io/) или [Prometheus](https://prometheus.io/). ClickHouse не зависит от конкретной системы мониторинга, вместо этого предоставляя данные трассировки только через системную таблицу. Информация о диапазоне трассировки в OpenTelemetry, [требуемая стандартом](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span), хранится в системной таблице [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md). + +Таблица должна быть включена в конфигурации сервера, смотрите элемент `opentelemetry_span_log` в файле конфигурации `config.xml`. По умолчанию таблица включена всегда. + +Теги или атрибуты сохраняются в виде двух параллельных массивов, содержащих ключи и значения. Для работы с ними используйте [ARRAY JOIN](../sql-reference/statements/select/array-join.md). + diff --git a/docs/ru/operations/quotas.md b/docs/ru/operations/quotas.md index 92533eef0c1..78966492f25 100644 --- a/docs/ru/operations/quotas.md +++ b/docs/ru/operations/quotas.md @@ -1,6 +1,6 @@ --- toc_priority: 51 -toc_title: "\u041a\u0432\u043e\u0442\u044b" +toc_title: "Квоты" --- # Квоты {#quotas} @@ -29,6 +29,8 @@ toc_title: "\u041a\u0432\u043e\u0442\u044b" 0 + 0 + 0 0 0 0 @@ -48,6 +50,8 @@ toc_title: "\u041a\u0432\u043e\u0442\u044b" 3600 1000 + 100 + 100 100 1000000000 100000000000 @@ -58,6 +62,8 @@ toc_title: "\u041a\u0432\u043e\u0442\u044b" 86400 10000 + 10000 + 10000 1000 5000000000 500000000000 @@ -74,6 +80,10 @@ toc_title: "\u041a\u0432\u043e\u0442\u044b" `queries` - общее количество запросов; +`query_selects` – общее количество запросов `SELECT`. + +`query_inserts` – общее количество запросов `INSERT`. + `errors` - количество запросов, при выполнении которых было выкинуто исключение; `result_rows` - суммарное количество строк, отданных в виде результата; @@ -107,4 +117,3 @@ toc_title: "\u041a\u0432\u043e\u0442\u044b" При перезапуске сервера, квоты сбрасываются. 
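Where SQL-driven access control is enabled, the same per-interval limits, including the new `query_selects` and `query_inserts` counters documented above, can also be set with `CREATE QUOTA`. A minimal sketch (the quota name and target user are illustrative, not taken from this document):

```sql
-- Illustrative quota: separate hourly limits for SELECT and INSERT queries.
-- Assumes SQL-driven access control is enabled and a user named api_user exists.
CREATE QUOTA hourly_api_quota
    FOR INTERVAL 1 hour
        MAX query_selects = 100, query_inserts = 100, errors = 1000
    TO api_user;
```

As with quotas defined in the configuration file, the accumulated counters are reset when the server restarts.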
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/quotas/) diff --git a/docs/ru/operations/requirements.md b/docs/ru/operations/requirements.md index 36a7dd30b34..6567dcc9695 100644 --- a/docs/ru/operations/requirements.md +++ b/docs/ru/operations/requirements.md @@ -1,6 +1,6 @@ --- toc_priority: 44 -toc_title: "\u0422\u0440\u0435\u0431\u043e\u0432\u0430\u043d\u0438\u044f" +toc_title: "Требования" --- # Требования {#trebovaniia} diff --git a/docs/ru/operations/server-configuration-parameters/index.md b/docs/ru/operations/server-configuration-parameters/index.md index a691fe69fef..503c5d32163 100644 --- a/docs/ru/operations/server-configuration-parameters/index.md +++ b/docs/ru/operations/server-configuration-parameters/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u041a\u043e\u043d\u0444\u0438\u0433\u0443\u0440\u0430\u0446\u0438\u043e\u043d\u043d\u044b\u0435\u0020\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b\u0020\u0441\u0435\u0440\u0432\u0435\u0440\u0430" +toc_folder_title: "Конфигурационные параметры сервера" toc_priority: 54 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- # Конфигурационные параметры сервера {#server-settings} @@ -14,4 +14,3 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" Перед изучением настроек ознакомьтесь с разделом [Конфигурационные файлы](../configuration-files.md#configuration_files), обратите внимание на использование подстановок (атрибуты `incl` и `optional`). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/server_configuration_parameters/) diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 15ab13836e3..be9e2deab74 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -1,6 +1,6 @@ --- toc_priority: 57 -toc_title: "\u041a\u043e\u043d\u0444\u0438\u0433\u0443\u0440\u0430\u0446\u0438\u043e\u043d\u043d\u044b\u0435\u0020\u043f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u044b\u0020\u0441\u0435\u0440\u0432\u0435\u0440\u0430" +toc_title: "Конфигурационные параметры сервера" --- # Конфигурационные параметры сервера {#server-configuration-parameters-reference} @@ -101,6 +101,12 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ``` +## database_atomic_delay_before_drop_table_sec {#database_atomic_delay_before_drop_table_sec} + +Устанавливает задержку перед удалением табличных данных, в секундах. Если запрос имеет идентификатор `SYNC`, эта настройка игнорируется. + +Значение по умолчанию: `480` (8 минут). + ## default\_database {#default-database} База данных по умолчанию. @@ -285,7 +291,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## interserver_http_host {#interserver-http-host} -Имя хоста, которое могут использовать другие серверы для обращения к этому. +Имя хоста, которое могут использовать другие серверы для обращения к этому хосту. Если не указано, то определяется аналогично команде `hostname -f`. @@ -297,11 +303,36 @@ ClickHouse проверяет условия для `min_part_size` и `min_part example.yandex.ru ``` +## interserver_https_port {#interserver-https-port} + +Порт для обмена данными между репликами ClickHouse по протоколу `HTTPS`. + +**Пример** + +``` xml +9010 +``` + +## interserver_https_host {#interserver-https-host} + +Имя хоста, которое могут использовать другие реплики для обращения к нему по протоколу `HTTPS`. 
+ +**Пример** + +``` xml +example.yandex.ru +``` + + + ## interserver_http_credentials {#server-settings-interserver-http-credentials} Имя пользователя и пароль, использующиеся для аутентификации при [репликации](../../operations/server-configuration-parameters/settings.md) движками Replicated\*. Это имя пользователя и пароль используются только для взаимодействия между репликами кластера и никак не связаны с аутентификацией клиентов ClickHouse. Сервер проверяет совпадение имени и пароля для соединяющихся с ним реплик, а также использует это же имя и пароль для соединения с другими репликами. Соответственно, эти имя и пароль должны быть прописаны одинаковыми для всех реплик кластера. По умолчанию аутентификация не используется. +!!! note "Примечание" + Эти учетные данные являются общими для обмена данными по протоколам `HTTP` и `HTTPS`. + Раздел содержит следующие параметры: - `user` — имя пользователя. @@ -481,7 +512,15 @@ ClickHouse проверяет условия для `min_part_size` и `min_part ## max_concurrent_queries {#max-concurrent-queries} -Максимальное количество одновременно обрабатываемых запросов. +Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. **Пример** @@ -509,6 +548,21 @@ ClickHouse проверяет условия для `min_part_size` и `min_part - [max_concurrent_queries](#max-concurrent-queries) +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +Определяет минимальное количество засечек, считываемых запросом для применения настройки [max_concurrent_queries](#max-concurrent-queries). + +Возможные значения: + +- Положительное целое число. +- 0 — выключена. + +**Пример** + +``` xml +10 +``` + ## max_connections {#max-connections} Максимальное количество входящих соединений. @@ -1159,5 +1213,3 @@ ClickHouse использует ZooKeeper для хранения метадан ``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/server_configuration_parameters/settings/) diff --git a/docs/ru/operations/settings/constraints-on-settings.md b/docs/ru/operations/settings/constraints-on-settings.md index b23be22958c..754d6cbba8a 100644 --- a/docs/ru/operations/settings/constraints-on-settings.md +++ b/docs/ru/operations/settings/constraints-on-settings.md @@ -1,6 +1,6 @@ --- toc_priority: 62 -toc_title: "\u041e\u0433\u0440\u0430\u043d\u0438\u0447\u0435\u043d\u0438\u044f\u0020\u043d\u0430\u0020\u0438\u0437\u043c\u0435\u043d\u0435\u043d\u0438\u0435\u0020\u043d\u0430\u0441\u0442\u0440\u043e\u0435\u043a" +toc_title: "Ограничения на изменение настроек" --- # Ограничения на изменение настроек {#constraints-on-settings} @@ -71,4 +71,3 @@ Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should n **Примечание:** профиль с именем `default` обрабатывается специальным образом: все ограничения на изменение настроек из этого профиля становятся дефолтными и влияют на всех пользователей, кроме тех, где эти ограничения явно переопределены. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/constraints_on_settings/) diff --git a/docs/ru/operations/settings/index.md b/docs/ru/operations/settings/index.md index 2ef1d4730a3..050df975b47 100644 --- a/docs/ru/operations/settings/index.md +++ b/docs/ru/operations/settings/index.md @@ -54,4 +54,3 @@ SELECT getSetting('custom_a'); - [Конфигурационные параметры сервера](../../operations/server-configuration-parameters/settings.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/) diff --git a/docs/ru/operations/settings/merge-tree-settings.md b/docs/ru/operations/settings/merge-tree-settings.md index bfc0b0a2644..f9093d379e3 100644 --- a/docs/ru/operations/settings/merge-tree-settings.md +++ b/docs/ru/operations/settings/merge-tree-settings.md @@ -55,6 +55,26 @@ Eсли число кусков в партиции превышает знач ClickHouse искусственно выполняет `INSERT` дольше (добавляет ‘sleep’), чтобы фоновый механизм слияния успевал слиять куски быстрее, чем они добавляются. +## inactive_parts_to_throw_insert {#inactive-parts-to-throw-insert} + +Если число неактивных кусков в партиции превышает значение `inactive_parts_to_throw_insert`, `INSERT` прерывается с исключением «Too many inactive parts (N). Parts cleaning are processing significantly slower than inserts». + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: 0 (не ограничено). + +## inactive_parts_to_delay_insert {#inactive-parts-to-delay-insert} + +Если число неактивных кусков в партиции больше или равно значению `inactive_parts_to_delay_insert`, `INSERT` искусственно замедляется. Это полезно, когда сервер не может быстро очистить неактивные куски. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: 0 (не ограничено). + ## max_delay_to_insert {#max-delay-to-insert} Величина в секундах, которая используется для расчета задержки `INSERT`, если число кусков в партиции превышает значение [parts_to_delay_insert](#parts-to-delay-insert). diff --git a/docs/ru/operations/settings/permissions-for-queries.md b/docs/ru/operations/settings/permissions-for-queries.md index ae896dac77c..8cd5a2570ca 100644 --- a/docs/ru/operations/settings/permissions-for-queries.md +++ b/docs/ru/operations/settings/permissions-for-queries.md @@ -1,6 +1,6 @@ --- toc_priority: 58 -toc_title: "\u0420\u0430\u0437\u0440\u0435\u0448\u0435\u043d\u0438\u044f\u0020\u0434\u043b\u044f\u0020\u0437\u0430\u043f\u0440\u043e\u0441\u043e\u0432" +toc_title: "Разрешения для запросов" --- # Разрешения для запросов {#permissions_for_queries} @@ -59,4 +59,3 @@ toc_title: "\u0420\u0430\u0437\u0440\u0435\u0448\u0435\u043d\u0438\u044f\u0020\u 1 -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/permissions_for_queries/) diff --git a/docs/ru/operations/settings/query-complexity.md b/docs/ru/operations/settings/query-complexity.md index b0eac5d96e7..c2e00302d18 100644 --- a/docs/ru/operations/settings/query-complexity.md +++ b/docs/ru/operations/settings/query-complexity.md @@ -1,6 +1,6 @@ --- toc_priority: 59 -toc_title: "\u041e\u0433\u0440\u0430\u043d\u0438\u0447\u0435\u043d\u0438\u044f\u0020\u043d\u0430\u0020\u0441\u043b\u043e\u0436\u043d\u043e\u0441\u0442\u044c\u0020\u0437\u0430\u043f\u0440\u043e\u0441\u0430" +toc_title: "Ограничения на сложность запроса" --- # Ограничения на сложность запроса {#restrictions-on-query-complexity} @@ -314,4 +314,3 @@ FORMAT Null; > «Too many partitions for single INSERT block (more than» + toString(max_parts) + «). 
The limit is controlled by ‘max_partitions_per_insert_block’ setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).» -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/query_complexity/) diff --git a/docs/ru/operations/settings/settings-profiles.md b/docs/ru/operations/settings/settings-profiles.md index 10feda01850..d3b3d29db94 100644 --- a/docs/ru/operations/settings/settings-profiles.md +++ b/docs/ru/operations/settings/settings-profiles.md @@ -1,6 +1,6 @@ --- toc_priority: 61 -toc_title: "\u041f\u0440\u043e\u0444\u0438\u043b\u0438\u0020\u043d\u0430\u0441\u0442\u0440\u043e\u0435\u043a" +toc_title: "Профили настроек" --- # Профили настроек {#settings-profiles} @@ -77,4 +77,3 @@ SET profile = 'web' Профиль `web` — обычный профиль, который может быть установлен с помощью запроса `SET` или параметра URL при запросе по HTTP. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings_profiles/) diff --git a/docs/ru/operations/settings/settings-users.md b/docs/ru/operations/settings/settings-users.md index 2069922d0ea..6a10e518817 100644 --- a/docs/ru/operations/settings/settings-users.md +++ b/docs/ru/operations/settings/settings-users.md @@ -1,6 +1,6 @@ --- toc_priority: 63 -toc_title: "\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0438\u0020\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u0435\u0439" +toc_title: "Настройки пользователей" --- # Настройки пользователей {#nastroiki-polzovatelei} @@ -162,4 +162,3 @@ toc_title: "\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0438\u0020\u043f\u Элемент `filter` содержать любое выражение, возвращающее значение типа [UInt8](../../sql-reference/data-types/int-uint.md). Обычно он содержит сравнения и логические операторы. Строки `database_name.table1`, для которых фильтр возвращает 0 не выдаются пользователю. Фильтрация несовместима с операциями `PREWHERE` и отключает оптимизацию `WHERE→PREWHERE`. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings_users/) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 7322b6c9184..7acdd65051b 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1,6 +1,6 @@ --- toc_priority: 60 -toc_title: "\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0438" +toc_title: "Настройки" --- # Настройки {#settings} @@ -134,7 +134,7 @@ ClickHouse применяет настройку в тех случаях, ко ## max_http_get_redirects {#setting-max_http_get_redirects} -Ограничивает максимальное количество переходов по редиректам в таблицах с движком [URL](../../engines/table-engines/special/url.md) при выполнении HTTP запросов методом GET. Настройка применяется для обоих типов таблиц: созданных запросом [CREATE TABLE](../../sql_reference/create/#create-table-query) и с помощью табличной функции [url](../../sql-reference/table-functions/url.md). +Ограничивает максимальное количество переходов по редиректам в таблицах с движком [URL](../../engines/table-engines/special/url.md) при выполнении HTTP запросов методом GET. 
Настройка применяется для обоих типов таблиц: созданных запросом [CREATE TABLE](../../sql-reference/statements/create/table.md#create-table-query) и с помощью табличной функции [url](../../sql-reference/table-functions/url.md). Возможные значения: @@ -306,7 +306,7 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), ( CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); ``` -При включенной настройке `input_format_tsv_enum_as_number`: +При включенной настройке `input_format_tsv_enum_as_number`: ```sql SET input_format_tsv_enum_as_number = 1; @@ -556,7 +556,7 @@ ClickHouse может парсить только базовый формат `Y Возможные значения: -- 0 — Устаревшее поведение отключено. +- 0 — Устаревшее поведение отключено. - 1 — Устаревшее поведение включено. Значение по умолчанию: 0. @@ -759,6 +759,38 @@ log_queries_min_type='EXCEPTION_WHILE_PROCESSING' log_query_threads=1 ``` +## log_comment {#settings-log-comment} + +Задаёт значение поля `log_comment` таблицы [system.query_log](../system-tables/query_log.md) и текст комментария в логе сервера. + +Может быть использована для улучшения читабельности логов сервера. Кроме того, помогает быстро выделить связанные с тестом запросы из `system.query_log` после запуска [clickhouse-test](../../development/tests.md). + +Возможные значения: + +- Любая строка не длиннее [max_query_size](#settings-max_query_size). При превышении длины сервер сгенерирует исключение. + +Значение по умолчанию: пустая строка. + +**Пример** + +Запрос: + +``` sql +SET log_comment = 'log_comment test', log_queries = 1; +SELECT 1; +SYSTEM FLUSH LOGS; +SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; +``` + +Результат: + +``` text +┌─type────────┬─query─────┐ +│ QueryStart │ SELECT 1; │ +│ QueryFinish │ SELECT 1; │ +└─────────────┴───────────┘ +``` + ## max_insert_block_size {#settings-max_insert_block_size} Формировать блоки указанного размера, при вставке в таблицу. @@ -1087,8 +1119,23 @@ load_balancing = round_robin ## max_parallel_replicas {#settings-max_parallel_replicas} Максимальное количество используемых реплик каждого шарда при выполнении запроса. -Для консистентности (чтобы получить разные части одного и того же разбиения), эта опция работает только при заданном ключе сэмплирования. -Отставание реплик не контролируется. + +Возможные значения: + +- Целое положительное число. + +**Дополнительная информация** + +Эта настройка полезна для реплицируемых таблиц с ключом сэмплирования. Запрос может обрабатываться быстрее, если он выполняется на нескольких серверах параллельно. Однако производительность обработки запроса, наоборот, может упасть в следующих ситуациях: + +- Позиция ключа сэмплирования в ключе партиционирования не позволяет выполнять эффективное сканирование. +- Добавление ключа сэмплирования в таблицу делает фильтрацию по другим столбцам менее эффективной. +- Ключ сэмплирования является выражением, которое сложно вычисляется. +- У распределения сетевых задержек в кластере длинный «хвост», из-за чего при параллельных запросах к нескольким серверам увеличивается среднее время задержки. + +!!! warning "Предупреждение" + Параллельное выполнение запроса может привести к неверному результату, если в запросе есть объединение или подзапросы и при этом таблицы не удовлетворяют определенным требованиям. 
Подробности смотрите в разделе [Распределенные подзапросы и max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries). + ## compile {#compile} @@ -1236,7 +1283,7 @@ SELECT area/period FROM account_orders FORMAT JSON; CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); ``` -При включенной настройке `input_format_csv_enum_as_number`: +При включенной настройке `input_format_csv_enum_as_number`: ```sql SET input_format_csv_enum_as_number = 1; @@ -1731,7 +1778,7 @@ ClickHouse генерирует исключение Включает или отключает режим синхронного добавления данных в распределенные таблицы (таблицы с движком [Distributed](../../engines/table-engines/special/distributed.md#distributed)). -По умолчанию ClickHouse вставляет данные в распределённую таблицу в асинхронном режиме. Если `insert_distributed_sync=1`, то данные вставляются сихронно, а запрос `INSERT` считается выполненным успешно, когда данные записаны на все шарды (по крайней мере на одну реплику для каждого шарда, если `internal_replication = true`). +По умолчанию ClickHouse вставляет данные в распределённую таблицу в асинхронном режиме. Если `insert_distributed_sync=1`, то данные вставляются сихронно, а запрос `INSERT` считается выполненным успешно, когда данные записаны на все шарды (по крайней мере на одну реплику для каждого шарда, если `internal_replication = true`). Возможные значения: @@ -1744,6 +1791,67 @@ ClickHouse генерирует исключение - [Движок Distributed](../../engines/table-engines/special/distributed.md#distributed) - [Управление распределёнными таблицами](../../sql-reference/statements/system.md#query-language-system-distributed) + +## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} + +Включает или отключает режим вставки данных в [Distributed](../../engines/table-engines/special/distributed.md#distributed)) таблицу в случайный шард при отсутствии ключ шардирования. + +По умолчанию при вставке данных в `Distributed` таблицу с несколькими шардами и при отсутствии ключа шардирования сервер ClickHouse будет отклонять любой запрос на вставку данных. Когда `insert_distributed_one_random_shard = 1`, вставки принимаются, а данные записываются в случайный шард. + +Возможные значения: + +- 0 — если у таблицы несколько шардов, но ключ шардирования отсутствует, вставка данных отклоняется. +- 1 — если ключ шардирования отсутствует, то вставка данных осуществляется в случайный шард среди всех доступных шардов. + +Значение по умолчанию: `0`. + +## insert_shard_id {#insert_shard_id} + +Если не `0`, указывает, в какой шард [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы данные будут вставлены синхронно. + +Если значение настройки `insert_shard_id` указано неверно, сервер выдаст ошибку. + +Узнать количество шардов `shard_num` на кластере `requested_cluster` можно из конфигурации сервера, либо используя запрос: + +``` sql +SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; +``` + +Возможные значения: + +- 0 — выключено. +- Любое число от `1` до `shards_num` соответствующей [Distributed](../../engines/table-engines/special/distributed.md#distributed) таблицы. + +Значение по умолчанию: `0`. 
+ +**Пример** + +Запрос: + +```sql +CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; +CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); +INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; +SELECT * FROM x_dist ORDER BY number ASC; +``` + +Результат: + +``` text +┌─number─┐ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +│ 2 │ +│ 2 │ +│ 3 │ +│ 3 │ +│ 4 │ +│ 4 │ +└────────┘ +``` + ## validate_polygons {#validate_polygons} Включает или отключает генерирование исключения в функции [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon), если многоугольник самопересекающийся или самокасающийся. @@ -1937,6 +2045,21 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; Значение по умолчанию: 16. +## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size} + +Задает количество потоков для фонового потокового вывода сообщений. Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе. + +Допустимые значения: + +- Положительное целое число. + +Значение по умолчанию: 16. + +**Смотрите также** + +- Движок [Kafka](../../engines/table-engines/integrations/kafka.md#kafka). +- Движок [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine). + ## format_avro_schema_registry_url {#format_avro_schema_registry_url} Задает URL реестра схем [Confluent](https://docs.confluent.io/current/schema-registry/index.html) для использования с форматом [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent). @@ -2052,11 +2175,11 @@ SELECT * FROM a; ## ttl_only_drop_parts {#ttl_only_drop_parts} -Для таблиц [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) включает или отключает возможность полного удаления кусков данных, в которых все записи устарели. +Для таблиц [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) включает или отключает возможность полного удаления кусков данных, в которых все записи устарели. -Когда настройка `ttl_only_drop_parts` отключена (т.е. по умолчанию), сервер лишь удаляет устаревшие записи в соответствии с их временем жизни (TTL). +Когда настройка `ttl_only_drop_parts` отключена (т.е. по умолчанию), сервер лишь удаляет устаревшие записи в соответствии с их временем жизни (TTL). -Когда настройка `ttl_only_drop_parts` включена, сервер целиком удаляет куски данных, в которых все записи устарели. +Когда настройка `ttl_only_drop_parts` включена, сервер целиком удаляет куски данных, в которых все записи устарели. Удаление целых кусков данных вместо удаления отдельных записей позволяет устанавливать меньший таймаут `merge_with_ttl_timeout` и уменьшает нагрузку на сервер, что способствует росту производительности. @@ -2067,18 +2190,18 @@ SELECT * FROM a; Значение по умолчанию: `0`. -**См. также** +**См. также** - [Секции и настройки запроса CREATE TABLE](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) (настройка `merge_with_ttl_timeout`) - [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) ## output_format_pretty_max_value_width {#output_format_pretty_max_value_width} -Ограничивает длину значения, выводимого в формате [Pretty](../../interfaces/formats.md#pretty). Если значение длиннее указанного количества символов, оно обрезается. +Ограничивает длину значения, выводимого в формате [Pretty](../../interfaces/formats.md#pretty). 
Если значение длиннее указанного количества символов, оно обрезается. Возможные значения: -- Положительное целое число. +- Положительное целое число. - 0 — значение обрезается полностью. Значение по умолчанию: `10000` символов. @@ -2227,17 +2350,17 @@ SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; Включает или отключает сохранение типа `Nullable` для аргумента функции [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast). -Если настройка включена, то когда в функцию `CAST` передается аргумент с типом `Nullable`, функция возвращает результат, также преобразованный к типу `Nullable`. -Если настройка отключена, то функция `CAST` всегда возвращает результат строго указанного типа. +Если настройка включена, то когда в функцию `CAST` передается аргумент с типом `Nullable`, функция возвращает результат, также преобразованный к типу `Nullable`. +Если настройка отключена, то функция `CAST` всегда возвращает результат строго указанного типа. Возможные значения: - 0 — функция `CAST` преобразует аргумент строго к указанному типу. -- 1 — если аргумент имеет тип `Nullable`, то функция `CAST` преобразует его к типу `Nullable` для указанного типа. +- 1 — если аргумент имеет тип `Nullable`, то функция `CAST` преобразует его к типу `Nullable` для указанного типа. Значение по умолчанию: `0`. -**Примеры** +**Примеры** Запрос возвращает аргумент, преобразованный строго к указанному типу: @@ -2269,9 +2392,9 @@ SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); └───┴───────────────────────────────────────────────────┘ ``` -**См. также** +**См. также** -- Функция [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) +- Функция [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) ## persistent {#persistent} @@ -2349,7 +2472,7 @@ SELECT number FROM numbers(3) FORMAT JSONEachRow; [ {"number":"0"}, {"number":"1"}, -{"number":"2"} +{"number":"2"} ] ``` @@ -2537,4 +2660,101 @@ SELECT * FROM test2; Обратите внимание на то, что эта настройка влияет на поведение [материализованных представлений](../../sql-reference/statements/create/view.md#materialized) и БД [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md). +## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} + +Включает или отключает возможность выполнять запрос `SELECT` к таблице на движке [File](../../engines/table-engines/special/file.md), не содержащей файл. + +Возможные значения: +- 0 — запрос `SELECT` генерирует исключение. +- 1 — запрос `SELECT` возвращает пустой результат. + +Значение по умолчанию: `0`. + +## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} + +Включает или выключает удаление данных из таблицы до вставки в таблицу на движке [File](../../engines/table-engines/special/file.md). + +Возможные значения: +- 0 — запрос `INSERT` добавляет данные в конец файла после существующих. +- 1 — `INSERT` удаляет имеющиеся в файле данные и замещает их новыми. + +Значение по умолчанию: `0`. + +## allow_experimental_geo_types {#allow-experimental-geo-types} + +Разрешает использование экспериментальных типов данных для работы с [географическими структурами](../../sql-reference/data-types/geo.md). + +Возможные значения: +- 0 — использование типов данных для работы с географическими структурами не поддерживается. +- 1 — использование типов данных для работы с географическими структурами поддерживается. + +Значение по умолчанию: `0`. 
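As a quick illustration of the setting above, a hypothetical session could enable the experimental geo types and store a `Point` value (the table name is made up for this sketch):

```sql
-- Geo data types are only accepted when the experimental flag is enabled.
SET allow_experimental_geo_types = 1;

CREATE TABLE geo_point (p Point) ENGINE = Memory();
INSERT INTO geo_point VALUES ((10, 10));

-- Point is stored as a tuple of two Float64 coordinates.
SELECT p, toTypeName(p) FROM geo_point;
```

Without the setting, the `CREATE TABLE` statement above is expected to be rejected, since the experimental geo types are not allowed by default.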
+ +## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} + +Добавляет модификатор `SYNC` ко всем запросам `DROP` и `DETACH`. + +Возможные значения: + +- 0 — Запросы будут выполняться с задержкой. +- 1 — Запросы будут выполняться без задержки. + +Значение по умолчанию: `0`. + +## show_table_uuid_in_table_create_query_if_not_nil {#show_table_uuid_in_table_create_query_if_not_nil} + +Устанавливает отображение запроса `SHOW TABLE`. + +Возможные значения: + +- 0 — Запрос будет отображаться без UUID таблицы. +- 1 — Запрос будет отображаться с UUID таблицы. + +Значение по умолчанию: `0`. + +## allow_experimental_live_view {#allow-experimental-live-view} + +Включает экспериментальную возможность использования [LIVE-представлений](../../sql-reference/statements/create/view.md#live-view). + +Возможные значения: +- 0 — живые представления не поддерживаются. +- 1 — живые представления поддерживаются. + +Значение по умолчанию: `0`. + +## live_view_heartbeat_interval {#live-view-heartbeat-interval} + +Задает интервал в секундах для периодической проверки существования [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view). + +Значение по умолчанию: `15`. + +## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} + +Задает наибольшее число вставок, после которых запрос на формирование [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) исполняется снова. + +Значение по умолчанию: `64`. + +## temporary_live_view_timeout {#temporary-live-view-timeout} + +Задает время в секундах, после которого [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) удаляется. + +Значение по умолчанию: `5`. + +## periodic_live_view_refresh {#periodic-live-view-refresh} + +Задает время в секундах, по истечении которого [LIVE VIEW](../../sql-reference/statements/create/view.md#live-view) с установленным автообновлением обновляется. + +Значение по умолчанию: `60`. + +## check_query_single_value_result {#check_query_single_value_result} + +Определяет уровень детализации результата для запросов [CHECK TABLE](../../sql-reference/statements/check-table.md#checking-mergetree-tables) для таблиц семейства `MergeTree`. + +Возможные значения: + +- 0 — запрос возвращает статус каждого куска данных таблицы. +- 1 — запрос возвращает статус таблицы в целом. + +Значение по умолчанию: `0`. + [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings/) diff --git a/docs/ru/operations/system-tables/asynchronous_metric_log.md b/docs/ru/operations/system-tables/asynchronous_metric_log.md index 2fe617e48af..979b63f0cc8 100644 --- a/docs/ru/operations/system-tables/asynchronous_metric_log.md +++ b/docs/ru/operations/system-tables/asynchronous_metric_log.md @@ -34,4 +34,3 @@ SELECT * FROM system.asynchronous_metric_log LIMIT 10 - [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — Содержит метрики, которые периодически вычисляются в фоновом режиме. - [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/asynchronous_metric_log) diff --git a/docs/ru/operations/system-tables/asynchronous_metrics.md b/docs/ru/operations/system-tables/asynchronous_metrics.md index 5ff010bc79f..9d12a119c43 100644 --- a/docs/ru/operations/system-tables/asynchronous_metrics.md +++ b/docs/ru/operations/system-tables/asynchronous_metrics.md @@ -35,5 +35,4 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10 - [system.events](#system_tables-events) — таблица с количеством произошедших событий. - [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. - [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/asynchronous_metrics) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/clusters.md b/docs/ru/operations/system-tables/clusters.md index 9cf84ea5f02..ddc6849b44d 100644 --- a/docs/ru/operations/system-tables/clusters.md +++ b/docs/ru/operations/system-tables/clusters.md @@ -13,4 +13,3 @@ - `port` (UInt16) — порт, на который обращаться для соединения с сервером. - `user` (String) — имя пользователя, которого использовать для соединения с сервером. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/clusters) diff --git a/docs/ru/operations/system-tables/columns.md b/docs/ru/operations/system-tables/columns.md index 8cb9408e7d8..b8a0aef2299 100644 --- a/docs/ru/operations/system-tables/columns.md +++ b/docs/ru/operations/system-tables/columns.md @@ -4,7 +4,9 @@ С помощью этой таблицы можно получить информацию аналогично запросу [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table), но для многих таблиц сразу. -Таблица `system.columns` содержит столбцы (тип столбца указан в скобках): +Колонки [временных таблиц](../../sql-reference/statements/create/table.md#temporary-tables) содержатся в `system.columns` только в тех сессиях, в которых эти таблицы были созданы. Поле `database` у таких колонок пустое. + +Cтолбцы: - `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных. - `table` ([String](../../sql-reference/data-types/string.md)) — имя таблицы. @@ -23,4 +25,46 @@ - `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — флаг, показывающий включение столбца в ключ выборки. - `compression_codec` ([String](../../sql-reference/data-types/string.md)) — имя кодека сжатия. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/columns) +**Пример** + +```sql +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: system +table: aggregate_function_combinators +name: name +type: String +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: + +Row 2: +────── +database: system +table: aggregate_function_combinators +name: is_internal +type: UInt8 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +``` diff --git a/docs/ru/operations/system-tables/contributors.md b/docs/ru/operations/system-tables/contributors.md index 64c9a863bc3..6e11219e044 100644 --- a/docs/ru/operations/system-tables/contributors.md +++ b/docs/ru/operations/system-tables/contributors.md @@ -39,4 +39,3 @@ SELECT * FROM system.contributors WHERE name='Olga Khvostikova' └──────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/contributors) diff --git a/docs/ru/operations/system-tables/current-roles.md b/docs/ru/operations/system-tables/current-roles.md index a948b7b1e97..42ed4260fde 100644 --- a/docs/ru/operations/system-tables/current-roles.md +++ b/docs/ru/operations/system-tables/current-roles.md @@ -8,4 +8,3 @@ - `with_admin_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Флаг, который показывает, обладает ли `current_role` роль привилегией `ADMIN OPTION`. - `is_default` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Флаг, который показывает, является ли `current_role` ролью по умолчанию. - [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/current-roles) diff --git a/docs/ru/operations/system-tables/data_type_families.md b/docs/ru/operations/system-tables/data_type_families.md index d8d0b5e1074..ba4e5e64ec3 100644 --- a/docs/ru/operations/system-tables/data_type_families.md +++ b/docs/ru/operations/system-tables/data_type_families.md @@ -1,6 +1,6 @@ # system.data_type_families {#system_tables-data_type_families} -Содержит информацию о поддерживаемых [типах данных](../../sql-reference/data-types/). +Содержит информацию о поддерживаемых [типах данных](../../sql-reference/data-types/index.md). Столбцы: @@ -33,4 +33,3 @@ SELECT * FROM system.data_type_families WHERE alias_to = 'String' - [Синтаксис](../../sql-reference/syntax.md) — поддерживаемый SQL синтаксис. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/data_type_families) diff --git a/docs/ru/operations/system-tables/databases.md b/docs/ru/operations/system-tables/databases.md index 00a4b543717..026f49c0d5d 100644 --- a/docs/ru/operations/system-tables/databases.md +++ b/docs/ru/operations/system-tables/databases.md @@ -4,4 +4,3 @@ Для каждой базы данных, о которой знает сервер, будет присутствовать соответствующая запись в таблице. Эта системная таблица используется для реализации запроса `SHOW DATABASES`. 
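A short query against this table, for example to see which engine each database uses, might look like the sketch below (the `engine` column is assumed here; the exact set of columns depends on the server version):

```sql
-- List the databases known to the server together with their database engines.
SELECT name, engine
FROM system.databases;
```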
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/databases) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/detached_parts.md b/docs/ru/operations/system-tables/detached_parts.md index c59daa3985c..7abed6500aa 100644 --- a/docs/ru/operations/system-tables/detached_parts.md +++ b/docs/ru/operations/system-tables/detached_parts.md @@ -1,7 +1,6 @@ # system.detached_parts {#system_tables-detached_parts} Содержит информацию об отсоединённых кусках таблиц семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). Столбец `reason` содержит причину, по которой кусок был отсоединён. Для кусов, отсоединённых пользователем, `reason` содержит пустую строку. -Такие куски могут быть присоединены с помощью [ALTER TABLE ATTACH PARTITION\|PART](../../sql_reference/alter/#alter_attach-partition). Остальные столбцы описаны в [system.parts](#system_tables-parts). -Если имя куска некорректно, значения некоторых столбцов могут быть `NULL`. Такие куски могут быть удалены с помощью [ALTER TABLE DROP DETACHED PART](../../sql_reference/alter/#alter_drop-detached). +Такие куски могут быть присоединены с помощью [ALTER TABLE ATTACH PARTITION|PART](../../sql-reference/statements/alter/index.md#alter_attach-partition). Остальные столбцы описаны в [system.parts](#system_tables-parts). +Если имя куска некорректно, значения некоторых столбцов могут быть `NULL`. Такие куски могут быть удалены с помощью [ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter/index.md#alter_drop-detached). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/detached_parts) diff --git a/docs/ru/operations/system-tables/dictionaries.md b/docs/ru/operations/system-tables/dictionaries.md index cd1a4acab72..6a49904aae9 100644 --- a/docs/ru/operations/system-tables/dictionaries.md +++ b/docs/ru/operations/system-tables/dictionaries.md @@ -59,4 +59,3 @@ SELECT * FROM system.dictionaries └──────────┴──────┴────────┴─────────────┴──────┴────────┴──────────────────────────────────────┴─────────────────────┴─────────────────┴─────────────┴──────────┴───────────────┴───────────────────────┴────────────────────────────┴──────────────┴──────────────┴─────────────────────┴──────────────────────────────┘───────────────────────┴────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/dictionaries) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/disks.md b/docs/ru/operations/system-tables/disks.md index 2832e7a1a32..186dfbd7819 100644 --- a/docs/ru/operations/system-tables/disks.md +++ b/docs/ru/operations/system-tables/disks.md @@ -10,4 +10,3 @@ Cодержит информацию о дисках, заданных в [ко - `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — объём диска в байтах. - `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — место, которое должно остаться свободным на диске в байтах. Задаётся значением параметра `keep_free_space_bytes` конфигурации дисков. 
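For example, free and total space per configured disk can be checked with a query along these lines (a sketch; `formatReadableSize` is used only to make the byte counts readable):

```sql
-- Show each configured disk with human-readable free and total space.
SELECT
    name,
    path,
    formatReadableSize(free_space)  AS free,
    formatReadableSize(total_space) AS total
FROM system.disks;
```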
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/disks) diff --git a/docs/ru/operations/system-tables/distributed_ddl_queue.md b/docs/ru/operations/system-tables/distributed_ddl_queue.md index 058ed06f639..99d92574a0b 100644 --- a/docs/ru/operations/system-tables/distributed_ddl_queue.md +++ b/docs/ru/operations/system-tables/distributed_ddl_queue.md @@ -14,7 +14,7 @@ - `initiator` ([String](../../sql-reference/data-types/string.md)) — узел, выполнивший запрос. - `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время начала запроса. - `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время окончания запроса. -- `query_duration_ms` ([UInt64](../../sql-reference/data-types/datetime64.md)) — продолжительность выполнения запроса (в миллисекундах). +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — продолжительность выполнения запроса (в миллисекундах). - `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — код исключения из [ZooKeeper](../../operations/tips.md#zookeeper). **Пример** @@ -61,5 +61,4 @@ exception_code: ZOK 2 rows in set. Elapsed: 0.025 sec. ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/distributed_ddl_queuedistributed_ddl_queue.md) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/distribution_queue.md b/docs/ru/operations/system-tables/distribution_queue.md index 18346b34e04..5b811ab2be8 100644 --- a/docs/ru/operations/system-tables/distribution_queue.md +++ b/docs/ru/operations/system-tables/distribution_queue.md @@ -43,4 +43,3 @@ last_exception: - [Движок таблиц Distributed](../../engines/table-engines/special/distributed.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/distribution_queue) diff --git a/docs/ru/operations/system-tables/enabled-roles.md b/docs/ru/operations/system-tables/enabled-roles.md index cd3b0846718..a3f5ba179b3 100644 --- a/docs/ru/operations/system-tables/enabled-roles.md +++ b/docs/ru/operations/system-tables/enabled-roles.md @@ -9,4 +9,3 @@ - `is_current` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Флаг, который показывает, является ли `enabled_role` текущей ролью текущего пользователя. - `is_default` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Флаг, который показывает, является ли `enabled_role` ролью по умолчанию. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/enabled-roles) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/events.md b/docs/ru/operations/system-tables/events.md index 0a48617bb5c..c05be74eea6 100644 --- a/docs/ru/operations/system-tables/events.md +++ b/docs/ru/operations/system-tables/events.md @@ -31,4 +31,3 @@ SELECT * FROM system.events LIMIT 5 - [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. - [Мониторинг](../../operations/monitoring.md) — основы мониторинга в ClickHouse. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/events) diff --git a/docs/ru/operations/system-tables/functions.md b/docs/ru/operations/system-tables/functions.md index c51adb2c109..de752e2018c 100644 --- a/docs/ru/operations/system-tables/functions.md +++ b/docs/ru/operations/system-tables/functions.md @@ -7,4 +7,3 @@ - `name` (`String`) – Имя функции. 
- `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/functions) diff --git a/docs/ru/operations/system-tables/grants.md b/docs/ru/operations/system-tables/grants.md index 58d8a9e1e06..76a014f62dd 100644 --- a/docs/ru/operations/system-tables/grants.md +++ b/docs/ru/operations/system-tables/grants.md @@ -21,4 +21,3 @@ - `grant_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Разрешение предоставлено с опцией `WITH GRANT OPTION`, подробнее см. [GRANT](../../sql-reference/statements/grant.md#grant-privigele-syntax). -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/grants) diff --git a/docs/ru/operations/system-tables/graphite_retentions.md b/docs/ru/operations/system-tables/graphite_retentions.md index 66fca7ba299..1098a29aac6 100644 --- a/docs/ru/operations/system-tables/graphite_retentions.md +++ b/docs/ru/operations/system-tables/graphite_retentions.md @@ -14,4 +14,3 @@ - `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`. - `Tables.table` (Array(String)) - Массив имён таблиц, использующих параметр `config_name`. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/graphite_retentions) diff --git a/docs/ru/operations/system-tables/index.md b/docs/ru/operations/system-tables/index.md index 93ea1c92068..fce93f33a27 100644 --- a/docs/ru/operations/system-tables/index.md +++ b/docs/ru/operations/system-tables/index.md @@ -1,6 +1,6 @@ --- toc_priority: 52 -toc_title: "\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0435\u0020\u0442\u0430\u0431\u043b\u0438\u0446\u044b" +toc_title: "Системные таблицы" --- # Системные таблицы {#system-tables} @@ -9,25 +9,54 @@ toc_title: "\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0435\u0020\u0442\u Системные таблицы содержат информацию о: -- Состоянии сервера, процессов и окружении. -- Внутренних процессах сервера. +- состоянии сервера, процессов и окружении. +- внутренних процессах сервера. Системные таблицы: -- Находятся в базе данных `system`. -- Доступны только для чтения данных. -- Не могут быть удалены или изменены, но их можно отсоединить. +- находятся в базе данных `system`. +- доступны только для чтения данных. +- не могут быть удалены или изменены, но их можно отсоединить. -Системные таблицы `metric_log`, `query_log`, `query_thread_log`, `trace_log` системные таблицы хранят данные в файловой системе. Остальные системные таблицы хранят свои данные в оперативной памяти. Сервер ClickHouse создает такие системные таблицы при запуске. +Большинство системных таблиц хранят свои данные в оперативной памяти. Сервер ClickHouse создает эти системные таблицы при старте. + +В отличие от других системных таблиц, таблицы с системными логами [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) и [text_log](../../operations/system-tables/text_log.md) используют движок таблиц [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) и по умолчанию хранят свои данные в файловой системе. Если удалить таблицу из файловой системы, сервер ClickHouse снова создаст пустую таблицу во время следующей записи данных. 
Если схема системной таблицы изменилась в новом релизе, то ClickHouse переименует текущую таблицу и создаст новую. + +Таблицы с системными логами можно настроить, создав конфигурационный файл с тем же именем, что и таблица, в разделе `/etc/clickhouse-server/config.d/`, или указав соответствующие элементы в `/etc/clickhouse-server/config.xml`. Настраиваться могут следующие элементы: + +- `database` — база данных, к которой принадлежит системная таблица. Эта опция на текущий момент устарела. Все системные таблицы находятся в базе данных `system`. +- `table` — таблица для добавления данных. +- `partition_by` — [ключ партиционирования](../../engines/table-engines/mergetree-family/custom-partitioning-key.md). +- `ttl` — [время жизни](../../sql-reference/statements/alter/ttl.md) записей в таблице. +- `flush_interval_milliseconds` — интервал сброса данных на диск, в миллисекундах. +- `engine` — полное имя движка (начиная с `ENGINE =`) с параметрами. Эта опция несовместима с `partition_by` и `ttl`. Если указать оба параметра вместе, сервер вернет ошибку и завершит работу. + +Пример: + +```xml
+<yandex>
+    <query_log>
+        <database>system</database>
+        <table>query_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_log>
+</yandex>
+``` + +По умолчанию размер таблицы не ограничен. Управлять размером таблицы можно используя [TTL](../../sql-reference/statements/alter/ttl.md#manipuliatsii-s-ttl-tablitsy) для удаления устаревших записей журнала. Также вы можете использовать функцию партиционирования для таблиц `MergeTree`. ### Источники системных показателей Для сбора системных показателей сервер ClickHouse использует: -- Возможности `CAP_NET_ADMIN`. +- возможности `CAP_NET_ADMIN`. - [procfs](https://ru.wikipedia.org/wiki/Procfs) (только Linux). -**procfs** Если для сервера ClickHouse не включено `CAP_NET_ADMIN`, он пытается обратиться к `ProcfsMetricsProvider`. `ProcfsMetricsProvider` позволяет собирать системные показатели для каждого запроса (для CPU и I/O). @@ -41,4 +70,3 @@ toc_title: "\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0435\u0020\u0442\u - `OSReadBytes` - `OSWriteBytes` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system-tables/) diff --git a/docs/ru/operations/system-tables/licenses.md b/docs/ru/operations/system-tables/licenses.md index a6a49d5e0be..598da1e72ee 100644 --- a/docs/ru/operations/system-tables/licenses.md +++ b/docs/ru/operations/system-tables/licenses.md @@ -36,4 +36,3 @@ SELECT library_name, license_type, license_path FROM system.licenses LIMIT 15 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/licenses) diff --git a/docs/ru/operations/system-tables/merges.md b/docs/ru/operations/system-tables/merges.md index 021a95981e6..f48f0d1ac27 100644 --- a/docs/ru/operations/system-tables/merges.md +++ b/docs/ru/operations/system-tables/merges.md @@ -18,4 +18,3 @@ - `bytes_written_uncompressed UInt64` — Количество записанных байт, несжатых. - `rows_written UInt64` — Количество записанных строк. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/merges) diff --git a/docs/ru/operations/system-tables/metric_log.md b/docs/ru/operations/system-tables/metric_log.md index 2458c93da59..5160b32927b 100644 --- a/docs/ru/operations/system-tables/metric_log.md +++ b/docs/ru/operations/system-tables/metric_log.md @@ -48,4 +48,3 @@ CurrentMetric_ReplicatedChecks: 0 - [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. - [Мониторинг](../../operations/monitoring.md) — основы мониторинга в ClickHouse. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/metric_log) diff --git a/docs/ru/operations/system-tables/metrics.md b/docs/ru/operations/system-tables/metrics.md index db4016687d6..13d5fbc750a 100644 --- a/docs/ru/operations/system-tables/metrics.md +++ b/docs/ru/operations/system-tables/metrics.md @@ -38,4 +38,3 @@ SELECT * FROM system.metrics LIMIT 10 - [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. - [Мониторинг](../../operations/monitoring.md) — основы мониторинга в ClickHouse. 
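+
+Историю значений метрик из `system.metrics` можно посмотреть в таблице `system.metric_log` (примерный запрос: набор столбцов `CurrentMetric_*` зависит от версии сервера):
+
+```sql
+-- последние зафиксированные значения метрики ReplicatedChecks
+SELECT event_time, CurrentMetric_ReplicatedChecks
+FROM system.metric_log
+ORDER BY event_time DESC
+LIMIT 3;
+```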
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/metrics) diff --git a/docs/ru/operations/system-tables/mutations.md b/docs/ru/operations/system-tables/mutations.md index 044677030ba..4370ab593e7 100644 --- a/docs/ru/operations/system-tables/mutations.md +++ b/docs/ru/operations/system-tables/mutations.md @@ -45,4 +45,3 @@ - [Движок MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) - [Репликация данных](../../engines/table-engines/mergetree-family/replication.md) (семейство ReplicatedMergeTree) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/mutations) diff --git a/docs/ru/operations/system-tables/numbers.md b/docs/ru/operations/system-tables/numbers.md index 02192184aa1..0be4a4ce05d 100644 --- a/docs/ru/operations/system-tables/numbers.md +++ b/docs/ru/operations/system-tables/numbers.md @@ -4,4 +4,3 @@ Эту таблицу можно использовать для тестов, а также если вам нужно сделать перебор. Чтения из этой таблицы не распараллеливаются. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/numbers) diff --git a/docs/ru/operations/system-tables/numbers_mt.md b/docs/ru/operations/system-tables/numbers_mt.md index 12409d831a1..d66c4515ddb 100644 --- a/docs/ru/operations/system-tables/numbers_mt.md +++ b/docs/ru/operations/system-tables/numbers_mt.md @@ -3,4 +3,3 @@ То же самое, что и [system.numbers](../../operations/system-tables/numbers.md), но чтение распараллеливается. Числа могут возвращаться в произвольном порядке. Используется для тестов. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/numbers_mt) diff --git a/docs/ru/operations/system-tables/one.md b/docs/ru/operations/system-tables/one.md index 4231277ffe4..5cb297f06d4 100644 --- a/docs/ru/operations/system-tables/one.md +++ b/docs/ru/operations/system-tables/one.md @@ -4,4 +4,3 @@ Эта таблица используется, если в `SELECT` запросе не указана секция `FROM`. То есть, это - аналог таблицы `DUAL`, которую можно найти в других СУБД. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/one) diff --git a/docs/ru/operations/system-tables/opentelemetry_span_log.md b/docs/ru/operations/system-tables/opentelemetry_span_log.md index 96555064b0e..c421a602300 100644 --- a/docs/ru/operations/system-tables/opentelemetry_span_log.md +++ b/docs/ru/operations/system-tables/opentelemetry_span_log.md @@ -46,4 +46,3 @@ attribute.names: [] attribute.values: [] ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/opentelemetry_span_log) diff --git a/docs/ru/operations/system-tables/part_log.md b/docs/ru/operations/system-tables/part_log.md index bba4fda6135..a8d892f3b67 100644 --- a/docs/ru/operations/system-tables/part_log.md +++ b/docs/ru/operations/system-tables/part_log.md @@ -16,6 +16,7 @@ - `MOVE_PART` — перемещение куска между дисками. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — дата события. - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время события. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — время события с точностью до микросекунд. - `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — длительность. - `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных, в которой находится кусок. - `table` ([String](../../sql-reference/data-types/string.md)) — имя таблицы, в которой находится кусок. 
@@ -47,6 +48,7 @@ query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31 event_type: NewPart event_date: 2021-02-02 event_time: 2021-02-02 11:14:28 +event_time_microseconds: 2021-02-02 11:14:28.861919 duration_ms: 35 database: default table: log_mt_2 @@ -64,4 +66,3 @@ error: 0 exception: ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/part_log) diff --git a/docs/ru/operations/system-tables/parts.md b/docs/ru/operations/system-tables/parts.md index 950e652332d..1c7f0ad2e9a 100644 --- a/docs/ru/operations/system-tables/parts.md +++ b/docs/ru/operations/system-tables/parts.md @@ -155,4 +155,3 @@ move_ttl_info.max: [] - [Движок MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) - [TTL для столбцов и таблиц](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/parts) diff --git a/docs/ru/operations/system-tables/parts_columns.md b/docs/ru/operations/system-tables/parts_columns.md index db4d453e8f1..5640929d810 100644 --- a/docs/ru/operations/system-tables/parts_columns.md +++ b/docs/ru/operations/system-tables/parts_columns.md @@ -145,4 +145,3 @@ column_marks_bytes: 48 - [Движок MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) -[Оригинальная статья](https://clickhouse.tech/docs/en/operations/system_tables/parts_columns) diff --git a/docs/ru/operations/system-tables/processes.md b/docs/ru/operations/system-tables/processes.md index c9216e162b3..682b174c483 100644 --- a/docs/ru/operations/system-tables/processes.md +++ b/docs/ru/operations/system-tables/processes.md @@ -14,4 +14,3 @@ - `query` (String) – текст запроса. Для запросов `INSERT` не содержит встаявляемые данные. - `query_id` (String) – идентификатор запроса, если был задан. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/processes) diff --git a/docs/ru/operations/system-tables/query_log.md b/docs/ru/operations/system-tables/query_log.md index 39f685288d8..d3872e1ef18 100644 --- a/docs/ru/operations/system-tables/query_log.md +++ b/docs/ru/operations/system-tables/query_log.md @@ -44,9 +44,15 @@ ClickHouse не удаляет данные из таблица автомати - `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — количество строк в результате запроса `SELECT` или количество строк в запросе `INSERT`. - `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — объём RAM в байтах, использованный для хранения результата запроса. - `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — потребление RAM запросом. +- `current_database` ([String](../../sql-reference/data-types/string.md)) — имя текущей базы данных. - `query` ([String](../../sql-reference/data-types/string.md)) — текст запроса. +- `normalized_query_hash` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — идентичная хэш-сумма без значений литералов для аналогичных запросов. +- `query_kind` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — тип запроса. +- `databases` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — имена баз данных, присутствующих в запросе. +- `tables` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — имена таблиц, присутствующих в запросе. 
+- `columns` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — имена столбцов, присутствующих в запросе. +- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — код исключения. - `exception` ([String](../../sql-reference/data-types/string.md)) — сообщение исключения, если запрос завершился по исключению. -- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — код исключения. - `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [stack trace](https://en.wikipedia.org/wiki/Stack_trace). Пустая строка, если запрос успешно завершен. - `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md)) — вид запроса. Возможные значения: - 1 — запрос был инициирован клиентом. @@ -74,68 +80,97 @@ ClickHouse не удаляет данные из таблица автомати - 1 — `GET`. - 2 — `POST`. - `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — HTTP заголовок `UserAgent`. -- `quota_key` ([String](../../sql-reference/data-types/string.md)) — «ключ квоты» из настроек [квот](quotas.md) (см. `keyed`). +- `http_referer` ([String](../../sql-reference/data-types/string.md)) — HTTP заголовок `Referer` (содержит полный или частичный адрес страницы, с которой был выполнен запрос). +- `forwarded_for` ([String](../../sql-reference/data-types/string.md)) — HTTP заголовок `X-Forwarded-For`. +- `quota_key` ([String](../../sql-reference/data-types/string.md)) — `ключ квоты` из настроек [квот](quotas.md) (см. `keyed`). - `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ревизия ClickHouse. -- `thread_numbers` ([Array(UInt32)](../../sql-reference/data-types/array.md)) — количество потоков, участвующих в обработке запросов. +- `log_comment` ([String](../../sql-reference/data-types/string.md)) — комментарий к записи в логе. Представляет собой произвольную строку, длина которой должна быть не больше, чем [max_query_size](../../operations/settings/settings.md#settings-max_query_size). Если нет комментария, то пустая строка. +- `thread_ids` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — идентификаторы потоков, участвующих в обработке запросов. - `ProfileEvents.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — счетчики для изменения различных метрик. Описание метрик можно получить из таблицы [system.events](#system_tables-events)(#system_tables-events - `ProfileEvents.Values` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — метрики, перечисленные в столбце `ProfileEvents.Names`. - `Settings.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — имена настроек, которые меняются, когда клиент выполняет запрос. Чтобы разрешить логирование изменений настроек, установите параметр `log_query_settings` равным 1. - `Settings.Values` ([Array(String)](../../sql-reference/data-types/array.md)) — значения настроек, которые перечислены в столбце `Settings.Names`. +- `used_aggregate_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `агрегатных функций`, использованных при выполнении запроса. +- `used_aggregate_function_combinators` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `комбинаторов агрегатных функций`, использованных при выполнении запроса. +- `used_database_engines` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `движков баз данных`, использованных при выполнении запроса. 
+- `used_data_type_families` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `семейств типов данных`, использованных при выполнении запроса. +- `used_dictionaries` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `источников словарей`, использованных при выполнении запроса. +- `used_formats` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `форматов`, использованных при выполнении запроса. +- `used_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `функций`, использованных при выполнении запроса. +- `used_storages` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `движков таблиц`, использованных при выполнении запроса. +- `used_table_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — канонические имена `табличных функций`, использованных при выполнении запроса. **Пример** ``` sql -SELECT * FROM system.query_log LIMIT 1 \G +SELECT * FROM system.query_log WHERE type = 'QueryFinish' AND (query LIKE '%toDate(\'2000-12-05\')%') ORDER BY query_start_time DESC LIMIT 1 FORMAT Vertical; ``` ``` text Row 1: ────── -type: QueryStart -event_date: 2020-09-11 -event_time: 2020-09-11 10:08:17 -event_time_microseconds: 2020-09-11 10:08:17.063321 -query_start_time: 2020-09-11 10:08:17 -query_start_time_microseconds: 2020-09-11 10:08:17.063321 -query_duration_ms: 0 -read_rows: 0 -read_bytes: 0 -written_rows: 0 -written_bytes: 0 -result_rows: 0 -result_bytes: 0 -memory_usage: 0 -current_database: default -query: INSERT INTO test1 VALUES -exception_code: 0 -exception: -stack_trace: -is_initial_query: 1 -user: default -query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef -address: ::ffff:127.0.0.1 -port: 33452 -initial_user: default -initial_query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef -initial_address: ::ffff:127.0.0.1 -initial_port: 33452 -interface: 1 -os_user: bharatnc -client_hostname: tower -client_name: ClickHouse -client_revision: 54437 -client_version_major: 20 -client_version_minor: 7 -client_version_patch: 2 -http_method: 0 -http_user_agent: -quota_key: -revision: 54440 -thread_ids: [] -ProfileEvents.Names: [] -ProfileEvents.Values: [] -Settings.Names: ['use_uncompressed_cache','load_balancing','log_queries','max_memory_usage','allow_introspection_functions'] -Settings.Values: ['0','random','1','10000000000','1'] +type: QueryFinish +event_date: 2021-03-18 +event_time: 2021-03-18 20:54:18 +event_time_microseconds: 2021-03-18 20:54:18.676686 +query_start_time: 2021-03-18 20:54:18 +query_start_time_microseconds: 2021-03-18 20:54:18.673934 +query_duration_ms: 2 +read_rows: 100 +read_bytes: 800 +written_rows: 0 +written_bytes: 0 +result_rows: 2 +result_bytes: 4858 +memory_usage: 0 +current_database: default +query: SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), CAST(arrayJoin([NULL, NULL]) AS Nullable(TEXT)), avgOrDefaultIf(number, number % 2), sumOrNull(number), toTypeName(sumOrNull(number)), countIf(toDate('2000-12-05') + number as d, toDayOfYear(d) % 2) FROM numbers(100) +normalized_query_hash: 17858008518552525706 +query_kind: Select +databases: ['_table_function'] +tables: ['_table_function.numbers'] +columns: ['_table_function.numbers.number'] +exception_code: 0 +exception: +stack_trace: +is_initial_query: 1 +user: default +query_id: 58f3d392-0fa0-4663-ae1d-29917a1a9c9c +address: ::ffff:127.0.0.1 +port: 37486 
+initial_user: default +initial_query_id: 58f3d392-0fa0-4663-ae1d-29917a1a9c9c +initial_address: ::ffff:127.0.0.1 +initial_port: 37486 +interface: 1 +os_user: sevirov +client_hostname: clickhouse.ru-central1.internal +client_name: ClickHouse +client_revision: 54447 +client_version_major: 21 +client_version_minor: 4 +client_version_patch: 1 +http_method: 0 +http_user_agent: +http_referer: +forwarded_for: +quota_key: +revision: 54449 +log_comment: +thread_ids: [587,11939] +ProfileEvents.Names: ['Query','SelectQuery','ReadCompressedBytes','CompressedReadBufferBlocks','CompressedReadBufferBytes','IOBufferAllocs','IOBufferAllocBytes','ArenaAllocChunks','ArenaAllocBytes','FunctionExecute','TableFunctionExecute','NetworkSendElapsedMicroseconds','SelectedRows','SelectedBytes','ContextLock','RWLockAcquiredReadLocks','RealTimeMicroseconds','UserTimeMicroseconds','SystemTimeMicroseconds','SoftPageFaults','OSCPUVirtualTimeMicroseconds','OSWriteBytes'] +ProfileEvents.Values: [1,1,36,1,10,2,1048680,1,4096,36,1,110,100,800,77,1,3137,1476,1101,8,2577,8192] +Settings.Names: ['load_balancing','max_memory_usage'] +Settings.Values: ['random','10000000000'] +used_aggregate_functions: ['groupBitAnd','avg','sum','count','uniq'] +used_aggregate_function_combinators: ['OrDefault','If','OrNull','Array'] +used_database_engines: [] +used_data_type_families: ['String','Array','Int32','Nullable'] +used_dictionaries: [] +used_formats: [] +used_functions: ['toWeek','CAST','arrayFlatten','toTypeName','toDayOfYear','addDays','array','toDate','modulo','substring','plus'] +used_storages: [] +used_table_functions: ['numbers'] ``` **Смотрите также** @@ -143,4 +178,3 @@ Settings.Values: ['0','random','1','10000000000','1'] - [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — в этой таблице содержится информация о цепочке каждого выполненного запроса. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/query_log) - diff --git a/docs/ru/operations/system-tables/query_thread_log.md b/docs/ru/operations/system-tables/query_thread_log.md index 052baf98035..0292a321524 100644 --- a/docs/ru/operations/system-tables/query_thread_log.md +++ b/docs/ru/operations/system-tables/query_thread_log.md @@ -114,4 +114,3 @@ ProfileEvents.Values: [1,1,11,11,591,148,3,71,29,6533808,1,11,72,18,47, - [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) — описание системной таблицы `query_log`, которая содержит общую информацию о выполненных запросах. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/query_thread_log) diff --git a/docs/ru/operations/system-tables/quota_limits.md b/docs/ru/operations/system-tables/quota_limits.md index a9ab87055d4..4103391cfd6 100644 --- a/docs/ru/operations/system-tables/quota_limits.md +++ b/docs/ru/operations/system-tables/quota_limits.md @@ -4,17 +4,17 @@ Столбцы: -- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Имя квоты. -- `duration` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Длина временного интервала для расчета потребления ресурсов, в секундах. -- `is_randomized_interval` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Логическое значение. Оно показывает, является ли интервал рандомизированным. Интервал всегда начинается в одно и то же время, если он не рандомизирован. 
Например, интервал в 1 минуту всегда начинается с целого числа минут (то есть он может начинаться в 11:20:00, но никогда не начинается в 11:20:01), интервал в один день всегда начинается в полночь UTC. Если интервал рандомизирован, то самый первый интервал начинается в произвольное время, а последующие интервалы начинаются один за другим. Значения: - - `0` — Интервал рандомизирован. - - `1` — Интервал не рандомизирован. -- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное число запросов. -- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество ошибок. -- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество строк результата. -- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальный объем оперативной памяти в байтах, используемый для хранения результата запроса. -- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросе. -- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество байтов, считываемых из всех таблиц и табличных функций, участвующих в запросе. -- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Максимальное время выполнения запроса, в секундах. - -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/quota_limits) +- `quota_name` ([String](../../sql-reference/data-types/string.md)) — имя квоты. +- `duration` ([UInt32](../../sql-reference/data-types/int-uint.md)) — длина временного интервала для расчета потребления ресурсов, в секундах. +- `is_randomized_interval` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — логическое значение. Оно показывает, является ли интервал рандомизированным. Интервал всегда начинается в одно и то же время, если он не рандомизирован. Например, интервал в 1 минуту всегда начинается с целого числа минут (то есть он может начинаться в 11:20:00, но никогда не начинается в 11:20:01), интервал в один день всегда начинается в полночь UTC. Если интервал рандомизирован, то самый первый интервал начинается в произвольное время, а последующие интервалы начинаются один за другим. Значения: + - `0` — интервал рандомизирован. + - `1` — интервал не рандомизирован. +- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число запросов. +- `max_query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число запросов `SELECT`. +- `max_query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число запросов `INSERT`. +- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество ошибок. 
+- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк результата. +- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальный объем оперативной памяти в байтах, используемый для хранения результата запроса. +- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросе. +- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество байтов, считываемых из всех таблиц и табличных функций, участвующих в запросе. +- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса, в секундах. \ No newline at end of file diff --git a/docs/ru/operations/system-tables/quota_usage.md b/docs/ru/operations/system-tables/quota_usage.md index cea3c4b2daa..19e9397ebaa 100644 --- a/docs/ru/operations/system-tables/quota_usage.md +++ b/docs/ru/operations/system-tables/quota_usage.md @@ -4,28 +4,28 @@ Столбцы: -- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Имя квоты. -- `quota_key`([String](../../sql-reference/data-types/string.md)) — Значение ключа. Например, если keys = `ip_address`, `quota_key` может иметь значение '192.168.1.1'. -- `start_time`([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Время начала расчета потребления ресурсов. -- `end_time`([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Время окончания расчета потребления ресурс -- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Длина временного интервала для расчета потребления ресурсов, в секундах. -- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество запросов на этом интервале. -- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество запросов. -- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Число запросов, вызвавших ошибки. -- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное число ошибок. -- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество строк результата. -- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество строк результата. -- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Объем оперативной памяти в байтах, используемый для хранения результата запроса. 
-- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальный объем оперативной памяти, используемый для хранения результата запроса, в байтах. -- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее число исходных строк, считываемых из таблиц для выполнения запроса на всех удаленных серверах. -- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. -- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. -- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество байт, считываемых из всех таблиц и табличных функций. -- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Общее время выполнения запроса, в секундах. -- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Максимальное время выполнения запроса. +- `quota_name` ([String](../../sql-reference/data-types/string.md)) — имя квоты. +- `quota_key`([String](../../sql-reference/data-types/string.md)) — значение ключа. Например, если keys = `ip_address`, `quota_key` может иметь значение '192.168.1.1'. +- `start_time`([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — время начала расчета потребления ресурсов. +- `end_time`([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — время окончания расчета потребления ресурс +- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — длина временного интервала для расчета потребления ресурсов, в секундах. +- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов на этом интервале. +- `query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов `SELECT` на этом интервале. +- `query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов `INSERT` на этом интервале. +- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество запросов. +- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — число запросов, вызвавших ошибки. +- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число ошибок. +- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество строк результата. 
+- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк результата. +- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — объем оперативной памяти в байтах, используемый для хранения результата запроса. +- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальный объем оперативной памяти, используемый для хранения результата запроса, в байтах. +- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее число исходных строк, считываемых из таблиц для выполнения запроса на всех удаленных серверах. +- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. +- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. +- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество байт, считываемых из всех таблиц и табличных функций. +- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — общее время выполнения запроса, в секундах. +- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса. ## Смотрите также {#see-also} -- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) - -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/quota_usage) +- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/quotas.md b/docs/ru/operations/system-tables/quotas.md index 15bb41a85bf..fe6b78cc44b 100644 --- a/docs/ru/operations/system-tables/quotas.md +++ b/docs/ru/operations/system-tables/quotas.md @@ -25,5 +25,4 @@ - [SHOW QUOTAS](../../sql-reference/statements/show.md#show-quotas-statement) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/quotas) diff --git a/docs/ru/operations/system-tables/quotas_usage.md b/docs/ru/operations/system-tables/quotas_usage.md index 9d6d339c434..fe066e38add 100644 --- a/docs/ru/operations/system-tables/quotas_usage.md +++ b/docs/ru/operations/system-tables/quotas_usage.md @@ -4,29 +4,31 @@ Столбцы: -- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Имя квоты. -- `quota_key` ([String](../../sql-reference/data-types/string.md)) — Ключ квоты. -- `is_current` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Квота используется для текущего пользователя. -- `start_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md)))) — Время начала расчета потребления ресурсов. -- `end_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md)))) — Время окончания расчета потребления ресурсов. 
-- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt32](../../sql-reference/data-types/int-uint.md))) — Длина временного интервала для расчета потребления ресурсов, в секундах. -- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество запросов на этом интервале. -- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное число запросов. -- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Число запросов, вызвавших ошибки. -- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное число ошибок. -- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of rows given as a result. -- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum of source rows read from tables. -- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Объем оперативной памяти в байтах, используемый для хранения результата запроса. -- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальный объем оперативной памяти, используемый для хранения результата запроса, в байтах. -- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее число исходных строк, считываемых из таблиц для выполнения запроса на всех удаленных серверах. -- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. -- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. -- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество байт, считываемых из всех таблиц и табличных функций. -- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Общее время выполнения запроса, в секундах. -- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Максимальное время выполнения запроса. +- `quota_name` ([String](../../sql-reference/data-types/string.md)) — имя квоты. +- `quota_key` ([String](../../sql-reference/data-types/string.md)) — ключ квоты. +- `is_current` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — квота используется для текущего пользователя. +- `start_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md)))) — время начала расчета потребления ресурсов. +- `end_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md)))) — время окончания расчета потребления ресурсов. 
+- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt32](../../sql-reference/data-types/int-uint.md))) — длина временного интервала для расчета потребления ресурсов, в секундах. +- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов на этом интервале. +- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число запросов. +- `query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов `SELECT` на этом интервале. +- `max_query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество запросов `SELECT` на этом интервале. +- `query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество запросов `INSERT` на этом интервале. +- `max_query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество запросов `INSERT` на этом интервале. +- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — число запросов, вызвавших ошибки. +- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное число ошибок. +- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество строк, приведенных в результате. +- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество исходных строк, считываемых из таблиц. +- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — объем оперативной памяти в байтах, используемый для хранения результата запроса. +- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальный объем оперативной памяти, используемый для хранения результата запроса, в байтах. +- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее число исходных строк, считываемых из таблиц для выполнения запроса на всех удаленных серверах. +- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. +- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. +- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество байт, считываемых из всех таблиц и табличных функций. +- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — общее время выполнения запроса, в секундах. 
+- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса. ## Смотрите также {#see-also} -- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) - -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/quotas_usage) +- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/replicas.md b/docs/ru/operations/system-tables/replicas.md index 8d4eb60c56a..7879ee707a4 100644 --- a/docs/ru/operations/system-tables/replicas.md +++ b/docs/ru/operations/system-tables/replicas.md @@ -120,5 +120,4 @@ WHERE Если этот запрос ничего не возвращает - значит всё хорошо. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/replicas) diff --git a/docs/ru/operations/system-tables/replicated_fetches.md b/docs/ru/operations/system-tables/replicated_fetches.md index 94584f390ee..31d5a5cfe08 100644 --- a/docs/ru/operations/system-tables/replicated_fetches.md +++ b/docs/ru/operations/system-tables/replicated_fetches.md @@ -67,4 +67,3 @@ thread_id: 54 - [Управление таблицами ReplicatedMergeTree](../../sql-reference/statements/system/#query-language-system-replicated) -[Оригинальная статья](https://clickhouse.tech/docs/en/operations/system_tables/replicated_fetches) diff --git a/docs/ru/operations/system-tables/replication_queue.md b/docs/ru/operations/system-tables/replication_queue.md index 47f64aea55d..56e8c695a21 100644 --- a/docs/ru/operations/system-tables/replication_queue.md +++ b/docs/ru/operations/system-tables/replication_queue.md @@ -70,12 +70,11 @@ num_tries: 36 last_exception: Code: 226, e.displayText() = DB::Exception: Marks file '/opt/clickhouse/data/merge/visits_v2/tmp_fetch_20201130_121373_121384_2/CounterID.mrk' doesn't exist (version 20.8.7.15 (official build)) last_attempt_time: 2020-12-08 17:35:54 num_postponed: 0 -postpone_reason: +postpone_reason: last_postpone_time: 1970-01-01 03:00:00 ``` **Смотрите также** -- [Управление таблицами ReplicatedMergeTree](../../sql-reference/statements/system.md/#query-language-system-replicated) +- [Управление таблицами ReplicatedMergeTree](../../sql-reference/statements/system.md#query-language-system-replicated) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/replication_queue) diff --git a/docs/ru/operations/system-tables/role-grants.md b/docs/ru/operations/system-tables/role-grants.md index f014af1fe3d..2c80a597857 100644 --- a/docs/ru/operations/system-tables/role-grants.md +++ b/docs/ru/operations/system-tables/role-grants.md @@ -14,4 +14,3 @@ - 1 — Роль обладает привилегией `ADMIN OPTION`. - 0 — Роль не обладает привилегией `ADMIN OPTION`. 
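+
+Например, роли, выданные с правом передачи (`ADMIN OPTION`), можно найти примерно так (иллюстративный запрос: предполагается, что флаг хранится в столбце `with_admin_option`, а пользователь и роль — в `user_name` и `granted_role_name`):
+
+```sql
+-- роли, которые получатель может передавать дальше
+SELECT user_name, granted_role_name
+FROM system.role_grants
+WHERE with_admin_option = 1;
+```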
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/role-grants) \ No newline at end of file diff --git a/docs/ru/operations/system-tables/roles.md b/docs/ru/operations/system-tables/roles.md index 1b548e85be2..c2b94214012 100644 --- a/docs/ru/operations/system-tables/roles.md +++ b/docs/ru/operations/system-tables/roles.md @@ -14,4 +14,3 @@ - [SHOW ROLES](../../sql-reference/statements/show.md#show-roles-statement) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/roles) diff --git a/docs/ru/operations/system-tables/row_policies.md b/docs/ru/operations/system-tables/row_policies.md index 7d0a490f01c..f1e84a201cb 100644 --- a/docs/ru/operations/system-tables/row_policies.md +++ b/docs/ru/operations/system-tables/row_policies.md @@ -31,4 +31,3 @@ - [SHOW POLICIES](../../sql-reference/statements/show.md#show-policies-statement) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/row_policies) diff --git a/docs/ru/operations/system-tables/settings.md b/docs/ru/operations/system-tables/settings.md index c1ada37131c..c9d63d336b6 100644 --- a/docs/ru/operations/system-tables/settings.md +++ b/docs/ru/operations/system-tables/settings.md @@ -48,5 +48,5 @@ SELECT * FROM system.settings WHERE changed AND name='load_balancing' - [Настройки](../settings/index.md#settings) - [Разрешения для запросов](../settings/permissions-for-queries.md#settings_readonly) - [Ограничения для значений настроек](../settings/constraints-on-settings.md) +- Выражение [SHOW SETTINGS](../../sql-reference/statements/show.md#show-settings) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/settings) diff --git a/docs/ru/operations/system-tables/settings_profile_elements.md b/docs/ru/operations/system-tables/settings_profile_elements.md index cd801468e21..8a1461c6bb0 100644 --- a/docs/ru/operations/system-tables/settings_profile_elements.md +++ b/docs/ru/operations/system-tables/settings_profile_elements.md @@ -27,4 +27,3 @@ - `inherit_profile` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Родительский профиль для данного профиля настроек. `NULL` если не задано. Профиль настроек может наследовать все значения и ограничения настроек (`min`, `max`, `readonly`) от своего родительского профиля. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/settings_profile_elements) diff --git a/docs/ru/operations/system-tables/settings_profiles.md b/docs/ru/operations/system-tables/settings_profiles.md index e1401553a4a..f8101fb0cb7 100644 --- a/docs/ru/operations/system-tables/settings_profiles.md +++ b/docs/ru/operations/system-tables/settings_profiles.md @@ -21,4 +21,3 @@ - [SHOW PROFILES](../../sql-reference/statements/show.md#show-profiles-statement) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/settings_profiles) diff --git a/docs/ru/operations/system-tables/stack_trace.md b/docs/ru/operations/system-tables/stack_trace.md index 0689e15c35c..58d0a1c4b6a 100644 --- a/docs/ru/operations/system-tables/stack_trace.md +++ b/docs/ru/operations/system-tables/stack_trace.md @@ -85,4 +85,3 @@ res: /lib/x86_64-linux-gnu/libc-2.27.so - [arrayMap](../../sql-reference/functions/array-functions.md#array-map) — Описание и пример использования функции `arrayMap`. - [arrayFilter](../../sql-reference/functions/array-functions.md#array-filter) — Описание и пример использования функции `arrayFilter`. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/stack_trace) diff --git a/docs/ru/operations/system-tables/storage_policies.md b/docs/ru/operations/system-tables/storage_policies.md index e62266af131..b2005d5f31e 100644 --- a/docs/ru/operations/system-tables/storage_policies.md +++ b/docs/ru/operations/system-tables/storage_policies.md @@ -14,4 +14,3 @@ Если политика хранения содержит несколько томов, то каждому тому соответствует отдельная запись в таблице. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/storage_policies) diff --git a/docs/ru/operations/system-tables/table_engines.md b/docs/ru/operations/system-tables/table_engines.md index eb198475e43..b6f6d3decc2 100644 --- a/docs/ru/operations/system-tables/table_engines.md +++ b/docs/ru/operations/system-tables/table_engines.md @@ -6,8 +6,8 @@ - `name` (String) — имя движка. - `supports_settings` (UInt8) — флаг, показывающий поддержку секции `SETTINGS`. -- `supports_skipping_indices` (UInt8) — флаг, показывающий поддержку [индексов пропуска данных](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes). -- `supports_ttl` (UInt8) — флаг, показывающий поддержку [TTL](table_engines/mergetree/#table_engine-mergetree-ttl). +- `supports_skipping_indices` (UInt8) — флаг, показывающий поддержку [индексов пропуска данных](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes). +- `supports_ttl` (UInt8) — флаг, показывающий поддержку [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). - `supports_sort_order` (UInt8) — флаг, показывающий поддержку секций `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` и `SAMPLE_BY`. - `supports_replication` (UInt8) — флаг, показывающий поддержку [репликации](../../engines/table-engines/mergetree-family/replication.md). - `supports_deduplication` (UInt8) — флаг, показывающий наличие в движке дедупликации данных. @@ -34,4 +34,3 @@ WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') - [Настройки](../../engines/table-engines/integrations/kafka.md#table_engine-kafka-creating-a-table) Kafka - [Настройки](../../engines/table-engines/special/join.md#join-limitations-and-settings) Join -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/table_engines) diff --git a/docs/ru/operations/system-tables/tables.md b/docs/ru/operations/system-tables/tables.md index 52de10871b2..11bb6a9eda2 100644 --- a/docs/ru/operations/system-tables/tables.md +++ b/docs/ru/operations/system-tables/tables.md @@ -1,40 +1,94 @@ # system.tables {#system-tables} -Содержит метаданные каждой таблицы, о которой знает сервер. Отсоединённые таблицы не отображаются в `system.tables`. +Содержит метаданные каждой таблицы, о которой знает сервер. -Эта таблица содержит следующие столбцы (тип столбца показан в скобках): +Отсоединённые таблицы ([DETACH](../../sql-reference/statements/detach.md)) не отображаются в `system.tables`. -- `database String` — имя базы данных, в которой находится таблица. -- `name` (String) — имя таблицы. -- `engine` (String) — движок таблицы (без параметров). -- `is_temporary` (UInt8) — флаг, указывающий на то, временная это таблица или нет. -- `data_path` (String) — путь к данным таблицы в файловой системе. -- `metadata_path` (String) — путь к табличным метаданным в файловой системе. -- `metadata_modification_time` (DateTime) — время последней модификации табличных метаданных. 
-- `dependencies_database` (Array(String)) — зависимости базы данных. -- `dependencies_table` (Array(String)) — табличные зависимости (таблицы [MaterializedView](../../engines/table-engines/special/materializedview.md), созданные на базе текущей таблицы). -- `create_table_query` (String) — запрос, которым создавалась таблица. -- `engine_full` (String) — параметры табличного движка. -- `partition_key` (String) — ключ партиционирования таблицы. -- `sorting_key` (String) — ключ сортировки таблицы. -- `primary_key` (String) - первичный ключ таблицы. -- `sampling_key` (String) — ключ сэмплирования таблицы. -- `storage_policy` (String) - политика хранения данных: +Информация о [временных таблицах](../../sql-reference/statements/create/table.md#temporary-tables) содержится в `system.tables` только в тех сессиях, в которых эти таблицы были созданы. Поле `database` у таких таблиц пустое, а флаг `is_temporary` включен. + +Столбцы: + +- `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных, в которой находится таблица. +- `name` ([String](../../sql-reference/data-types/string.md)) — имя таблицы. +- `engine` ([String](../../sql-reference/data-types/string.md)) — движок таблицы (без параметров). +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) — флаг, указывающий на то, временная это таблица или нет. +- `data_path` ([String](../../sql-reference/data-types/string.md)) — путь к данным таблицы в файловой системе. +- `metadata_path` ([String](../../sql-reference/data-types/string.md)) — путь к табличным метаданным в файловой системе. +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время последней модификации табличных метаданных. +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — зависимости базы данных. +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — табличные зависимости (таблицы [MaterializedView](../../engines/table-engines/special/materializedview.md), созданные на базе текущей таблицы). +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) — запрос, при помощи которого создавалась таблица. +- `engine_full` ([String](../../sql-reference/data-types/string.md)) — параметры табличного движка. +- `partition_key` ([String](../../sql-reference/data-types/string.md)) — ключ партиционирования таблицы. +- `sorting_key` ([String](../../sql-reference/data-types/string.md)) — ключ сортировки таблицы. +- `primary_key` ([String](../../sql-reference/data-types/string.md)) - первичный ключ таблицы. +- `sampling_key` ([String](../../sql-reference/data-types/string.md)) — ключ сэмплирования таблицы. +- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - политика хранения данных: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` (Nullable(UInt64)) - общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `Null` (включая базовую таблицу `Buffer`). 
+- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `NULL` (включая базовую таблицу `Buffer`). -- `total_bytes` (Nullable(UInt64)) - общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `Null` (**не включает** в себя никакого базового хранилища). +- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `NULL` (не включает в себя никакого базового хранилища). - Если таблица хранит данные на диске, возвращает используемое пространство на диске (т. е. сжатое). - Если таблица хранит данные в памяти, возвращает приблизительное количество используемых байт в памяти. -- `lifetime_rows` (Nullable(UInt64)) - общее количество строк, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). +- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество строк, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). -- `lifetime_bytes` (Nullable(UInt64)) - общее количество байт, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). +- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество байт, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). Таблица `system.tables` используется при выполнении запроса `SHOW TABLES`. 
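Например, следующие запросы возвращают один и тот же список таблиц (иллюстративный набросок, не претендующий на точное воспроизведение реализации `SHOW TABLES`; имена таблиц условные):

``` sql
-- список таблиц базы system, имена которых содержат "log"
SHOW TABLES FROM system LIKE '%log%';

-- эквивалентная выборка напрямую из system.tables
SELECT name FROM system.tables WHERE database = 'system' AND name LIKE '%log%';
```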
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/tables) +**Пример** + +```sql +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: system +name: aggregate_function_combinators +uuid: 00000000-0000-0000-0000-000000000000 +engine: SystemAggregateFunctionCombinators +is_temporary: 0 +data_paths: [] +metadata_path: /var/lib/clickhouse/metadata/system/aggregate_function_combinators.sql +metadata_modification_time: 1970-01-01 03:00:00 +dependencies_database: [] +dependencies_table: [] +create_table_query: +engine_full: +partition_key: +sorting_key: +primary_key: +sampling_key: +storage_policy: +total_rows: ᴺᵁᴸᴸ +total_bytes: ᴺᵁᴸᴸ + +Row 2: +────── +database: system +name: asynchronous_metrics +uuid: 00000000-0000-0000-0000-000000000000 +engine: SystemAsynchronousMetrics +is_temporary: 0 +data_paths: [] +metadata_path: /var/lib/clickhouse/metadata/system/asynchronous_metrics.sql +metadata_modification_time: 1970-01-01 03:00:00 +dependencies_database: [] +dependencies_table: [] +create_table_query: +engine_full: +partition_key: +sorting_key: +primary_key: +sampling_key: +storage_policy: +total_rows: ᴺᵁᴸᴸ +total_bytes: ᴺᵁᴸᴸ +``` diff --git a/docs/ru/operations/system-tables/text_log.md b/docs/ru/operations/system-tables/text_log.md index 141c3680c07..97c6ef9e2cd 100644 --- a/docs/ru/operations/system-tables/text_log.md +++ b/docs/ru/operations/system-tables/text_log.md @@ -50,4 +50,3 @@ source_file: /ClickHouse/src/Interpreters/DNSCacheUpdater.cpp; void source_line: 45 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/text_log) diff --git a/docs/ru/operations/system-tables/trace_log.md b/docs/ru/operations/system-tables/trace_log.md index 3f0a16199d5..3d22e4eabfd 100644 --- a/docs/ru/operations/system-tables/trace_log.md +++ b/docs/ru/operations/system-tables/trace_log.md @@ -12,7 +12,7 @@ ClickHouse создает эту таблицу когда утсановлен - `event_time`([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время в момент снятия экземпляра стэка адресов вызова. -- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время в момент снятия экземпляра стэка адресов вызова с точностью до микросекунд. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — дата и время в момент снятия экземпляра стэка адресов вызова с точностью до микросекунд. - `revision`([UInt32](../../sql-reference/data-types/int-uint.md)) — ревизия сборки сервера ClickHouse. 
@@ -50,4 +50,3 @@ trace: [371912858,371912789,371798468,371799717,371801313,3717 size: 5244400 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/trace_log) diff --git a/docs/ru/operations/system-tables/users.md b/docs/ru/operations/system-tables/users.md index c12b91f445f..2a523ae4a9a 100644 --- a/docs/ru/operations/system-tables/users.md +++ b/docs/ru/operations/system-tables/users.md @@ -31,4 +31,3 @@ - [SHOW USERS](../../sql-reference/statements/show.md#show-users-statement) -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/users) diff --git a/docs/ru/operations/system-tables/zookeeper.md b/docs/ru/operations/system-tables/zookeeper.md index 9a2b781d8f3..a6ce62a9d4e 100644 --- a/docs/ru/operations/system-tables/zookeeper.md +++ b/docs/ru/operations/system-tables/zookeeper.md @@ -69,4 +69,3 @@ pzxid: 987021252247 path: /clickhouse/tables/01-08/visits/replicas ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/zookeeper) diff --git a/docs/ru/operations/tips.md b/docs/ru/operations/tips.md index 40035309c03..4535767e8e0 100644 --- a/docs/ru/operations/tips.md +++ b/docs/ru/operations/tips.md @@ -1,6 +1,6 @@ --- toc_priority: 58 -toc_title: "\u0421\u043e\u0432\u0435\u0442\u044b\u0020\u043f\u043e\u0020\u044d\u043a\u0441\u043f\u043b\u0443\u0430\u0442\u0430\u0446\u0438\u0438" +toc_title: "Советы по эксплуатации" --- # Советы по эксплуатации {#sovety-po-ekspluatatsii} @@ -246,4 +246,3 @@ script end script ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/tips/) diff --git a/docs/ru/operations/troubleshooting.md b/docs/ru/operations/troubleshooting.md index 3df2a1dd46c..5882bc36f9e 100644 --- a/docs/ru/operations/troubleshooting.md +++ b/docs/ru/operations/troubleshooting.md @@ -1,6 +1,6 @@ --- toc_priority: 46 -toc_title: "\u0423\u0441\u0442\u0440\u0430\u043d\u0435\u043d\u0438\u0435\u0020\u043d\u0435\u0438\u0441\u043f\u0440\u0430\u0432\u043d\u043e\u0441\u0442\u0435\u0439" +toc_title: "Устранение неисправностей" --- # Устранение неисправностей {#ustranenie-neispravnostei} diff --git a/docs/ru/operations/update.md b/docs/ru/operations/update.md index c74b28b3fd7..a3e87b52ede 100644 --- a/docs/ru/operations/update.md +++ b/docs/ru/operations/update.md @@ -1,9 +1,9 @@ --- toc_priority: 47 -toc_title: "\u041e\u0431\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u0435\u0020\u0043\u006c\u0069\u0063\u006b\u0048\u006f\u0075\u0073\u0065" +toc_title: "Обновление ClickHouse" --- -# Обновление ClickHouse {#obnovlenie-clickhouse} +# Обновление ClickHouse {#clickhouse-upgrade} Если ClickHouse установлен с помощью deb-пакетов, выполните следующие команды на сервере: @@ -15,4 +15,17 @@ $ sudo service clickhouse-server restart Если ClickHouse установлен не из рекомендуемых deb-пакетов, используйте соответствующий метод обновления. -ClickHouse не поддерживает распределенное обновление. Операция должна выполняться последовательно на каждом отдельном сервере. Не обновляйте все серверы в кластере одновременно, иначе кластер становится недоступным в течение некоторого времени. +!!! note "Примечание" + Вы можете обновить сразу несколько серверов, кроме случая, когда все реплики одного шарда отключены. + +Обновление ClickHouse до определенной версии: + +**Пример** + +`xx.yy.a.b` — это номер текущей стабильной версии. 
Последнюю стабильную версию можно узнать [здесь](https://github.com/ClickHouse/ClickHouse/releases) + +```bash +$ sudo apt-get update +$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b +$ sudo service clickhouse-server restart +``` diff --git a/docs/ru/operations/utilities/clickhouse-benchmark.md b/docs/ru/operations/utilities/clickhouse-benchmark.md index 2a883cf3bb5..b4769b17818 100644 --- a/docs/ru/operations/utilities/clickhouse-benchmark.md +++ b/docs/ru/operations/utilities/clickhouse-benchmark.md @@ -160,4 +160,3 @@ localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, resu 99.990% 0.172 sec. ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utilities/clickhouse-benchmark.md) diff --git a/docs/ru/operations/utilities/clickhouse-copier.md b/docs/ru/operations/utilities/clickhouse-copier.md index 243ad7f379b..aa4fd68f8e8 100644 --- a/docs/ru/operations/utilities/clickhouse-copier.md +++ b/docs/ru/operations/utilities/clickhouse-copier.md @@ -181,4 +181,3 @@ $ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --bas `clickhouse-copier` отслеживает изменения `/task/path/description` и применяет их «на лету». Если вы поменяете, например, значение `max_workers`, то количество процессов, выполняющих задания, также изменится. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utils/clickhouse-copier/) diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index 2b5c9b119e2..682dc0b5ace 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -14,14 +14,15 @@ toc_title: clickhouse-local !!! warning "Warning" Мы не рекомендуем подключать серверную конфигурацию к `clickhouse-local`, поскольку данные можно легко повредить неосторожными действиями. -Для временных данных по умолчанию создается специальный каталог. Если вы хотите обойти это действие, каталог данных можно указать с помощью опции `-- --path`. +Для временных данных по умолчанию создается специальный каталог. -## Вызов программы {#vyzov-programmy} +## Вызов программы {#usage} Основной формат вызова: ``` bash -$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" +$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" \ + --query "query" ``` Ключи команды: @@ -30,15 +31,23 @@ $ clickhouse-local --structure "table_structure" --input-format "format_of_incom - `-if`, `--input-format` — формат входящих данных. По умолчанию — `TSV`. - `-f`, `--file` — путь к файлу с данными. По умолчанию — `stdin`. - `-q`, `--query` — запросы на выполнение. Разделитель запросов — `;`. +- `-qf`, `--queries-file` - путь к файлу с запросами для выполнения. Необходимо задать либо параметр `query`, либо `queries-file`. - `-N`, `--table` — имя таблицы, в которую будут помещены входящие данные. По умолчанию - `table`. - `-of`, `--format`, `--output-format` — формат выходных данных. По умолчанию — `TSV`. +- `-d`, `--database` — база данных по умолчанию. Если не указано, используется значение `_local`. - `--stacktrace` — вывод отладочной информации при исключениях. +- `--echo` — перед выполнением запрос выводится в консоль. - `--verbose` — подробный вывод при выполнении запроса. -- `-s` — отключает вывод системных логов в `stderr`. -- `--config-file` — путь к файлу конфигурации. 
По умолчанию `clickhouse-local` запускается с пустой конфигурацией. Конфигурационный файл имеет тот же формат, что и для сервера ClickHouse и в нём можно использовать все конфигурационные параметры сервера. Обычно подключение конфигурации не требуется, если требуется установить отдельный параметр, то это можно сделать ключом с именем параметра. +- `--logger.console` — логирование действий в консоль. +- `--logger.log` — логирование действий в файл с указанным именем. +- `--logger.level` — уровень логирования. +- `--ignore-error` — не прекращать обработку если запрос выдал ошибку. +- `-c`, `--config-file` — путь к файлу конфигурации. По умолчанию `clickhouse-local` запускается с пустой конфигурацией. Конфигурационный файл имеет тот же формат, что и для сервера ClickHouse, и в нём можно использовать все конфигурационные параметры сервера. Обычно подключение конфигурации не требуется; если требуется установить отдельный параметр, то это можно сделать ключом с именем параметра. +- `--no-system-tables` — запуск без использования системных таблиц. - `--help` — вывод справочной информации о `clickhouse-local`. +- `-V`, `--version` — вывод текущей версии и выход. -## Примеры вызова {#primery-vyzova} +## Примеры вызова {#examples} ``` bash $ echo -e "1,2\n3,4" | clickhouse-local --structure "a Int64, b Int64" \ @@ -76,7 +85,9 @@ $ clickhouse-local --query " 1 2 ``` -А теперь давайте выведем на экран объём оперативной памяти, занимаемой пользователями (Unix): +Объём оперативной памяти, занимаемой процессами, которые запустил пользователь (Unix): + +Запрос: ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ @@ -85,6 +96,8 @@ $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` +Результат: + ``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ @@ -97,4 +110,3 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ... ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utils/clickhouse-local/) diff --git a/docs/ru/operations/utilities/index.md b/docs/ru/operations/utilities/index.md index 5b55ebd798d..fa257fb4b1a 100644 --- a/docs/ru/operations/utilities/index.md +++ b/docs/ru/operations/utilities/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0423\u0442\u0438\u043b\u0438\u0442\u044b" +toc_folder_title: "Утилиты" toc_priority: 56 -toc_title: "\u041e\u0431\u0437\u043e\u0440" +toc_title: "Обзор" --- # Утилиты ClickHouse {#utility-clickhouse} @@ -9,4 +9,3 @@ toc_title: "\u041e\u0431\u0437\u043e\u0440" - [clickhouse-local](clickhouse-local.md) - [clickhouse-copier](clickhouse-copier.md) - копирует (и перешардирует) данные с одного кластера на другой. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utils/) diff --git a/docs/ru/sql-reference/aggregate-functions/combinators.md b/docs/ru/sql-reference/aggregate-functions/combinators.md index 592c61f87ff..74f9d1c1c05 100644 --- a/docs/ru/sql-reference/aggregate-functions/combinators.md +++ b/docs/ru/sql-reference/aggregate-functions/combinators.md @@ -1,6 +1,6 @@ --- toc_priority: 37 -toc_title: "\u041a\u043e\u043c\u0431\u0438\u043d\u0430\u0442\u043e\u0440\u044b\u0020\u0430\u0433\u0440\u0435\u0433\u0430\u0442\u043d\u044b\u0445\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0439" +toc_title: "Комбинаторы агрегатных функций" --- @@ -27,6 +27,40 @@ toc_title: "\u041a\u043e\u043c\u0431\u0438\u043d\u0430\u0442\u043e\u0440\u044b\u Комбинаторы -If и -Array можно сочетать. При этом, должен сначала идти Array, а потом If. Примеры: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Из-за такого порядка получается, что аргумент cond не должен быть массивом. +## -SimpleState {#agg-functions-combinator-simplestate} + +При использовании этого комбинатора агрегатная функция возвращает то же значение, но типа [SimpleAggregateFunction(...)](../../sql-reference/data-types/simpleaggregatefunction.md). Текущее значение функции может храниться в таблице для последующей работы с таблицами семейства [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md). + +**Синтаксис** + +``` sql +SimpleState(x) +``` + +**Аргументы** + +- `x` — параметры агрегатной функции. + +**Возвращаемое значение** + +Значение агрегатной функции типа `SimpleAggregateFunction(...)`. + +**Пример** + +Запрос: + +``` sql +WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); +``` + +Результат: + +``` text +┌─toTypeName(c)────────────────────────┬─c─┐ +│ SimpleAggregateFunction(any, UInt64) │ 0 │ +└──────────────────────────────────────┴───┘ +``` + ## -State {#state} В случае применения этого комбинатора, агрегатная функция возвращает не готовое значение (например, в случае функции [uniq](reference/uniq.md#agg_function-uniq) — количество уникальных значений), а промежуточное состояние агрегации (например, в случае функции `uniq` — хэш-таблицу для расчёта количества уникальных значений), которое имеет тип `AggregateFunction(...)` и может использоваться для дальнейшей обработки или может быть сохранено в таблицу для последующей доагрегации. @@ -70,9 +104,9 @@ toc_title: "\u041a\u043e\u043c\u0431\u0438\u043d\u0430\u0442\u043e\u0440\u044b\u OrDefault(x) ``` -**Параметры** +**Аргументы** -- `x` — Параметры агрегатной функции. +- `x` — аргументы агрегатной функции. **Возращаемые зачения** @@ -131,14 +165,14 @@ FROM OrNull(x) ``` -**Параметры** +**Аргументы** -- `x` — Параметры агрегатной функции. +- `x` — аргументы агрегатной функции. **Возвращаемые значения** -- Результат агрегатной функции, преобразованный в тип данных `Nullable`. -- `NULL`, если у агрегатной функции нет входных данных. +- Результат агрегатной функции, преобразованный в тип данных `Nullable`. +- `NULL`, если у агрегатной функции нет входных данных. Тип: `Nullable(aggregate function return type)`. @@ -188,7 +222,7 @@ FROM Resample(start, end, step)(, resampling_key) ``` -**Параметры** +**Аргументы** - `start` — начальное значение для интервала значений `resampling_key`. - `stop` — конечное значение для интервала значений `resampling_key`. Интервал не включает значение `stop` (`[start, stop)`). 
@@ -247,5 +281,3 @@ FROM people │ [3,2] │ [11.5,12.949999809265137] │ └────────┴───────────────────────────┘ ``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/agg_functions/combinators/) diff --git a/docs/ru/sql-reference/aggregate-functions/index.md b/docs/ru/sql-reference/aggregate-functions/index.md index 4a7768f587f..7afb6a374a7 100644 --- a/docs/ru/sql-reference/aggregate-functions/index.md +++ b/docs/ru/sql-reference/aggregate-functions/index.md @@ -1,8 +1,7 @@ --- -toc_folder_title: "\u0410\u0433\u0440\u0435\u0433\u0430\u0442\u043D\u044B\u0435 \u0444\ - \u0443\u043D\u043A\u0446\u0438\u0438" +toc_folder_title: "Агрегатные функции" toc_priority: 33 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" +toc_title: "Введение" --- # Агрегатные функции {#aggregate-functions} @@ -58,4 +57,3 @@ SELECT groupArray(y) FROM t_null_big `groupArray` не включает `NULL` в результирующий массив. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/) diff --git a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md index f20acaa45c3..e5162b63b88 100644 --- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 38 -toc_title: "\u041f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u0430\u0433\u0440\u0435\u0433\u0430\u0442\u043d\u044b\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Параметрические агрегатные функции" --- # Параметрические агрегатные функции {#aggregate_functions_parametric} @@ -11,14 +11,19 @@ toc_title: "\u041f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0438\u0447\u0435\u Рассчитывает адаптивную гистограмму. Не гарантирует точного результата. - histogram(number_of_bins)(values) +``` sql +histogram(number_of_bins)(values) +``` Функция использует [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). Границы столбцов устанавливаются по мере поступления новых данных в функцию. В общем случае столбцы имею разную ширину. +**Аргументы** + +`values` — [выражение](../syntax.md#syntax-expressions), предоставляющее входные значения. + **Параметры** `number_of_bins` — максимальное количество корзин в гистограмме. Функция автоматически вычисляет количество корзин. Она пытается получить указанное количество корзин, но если не получилось, то в результате корзин будет меньше. -`values` — [выражение](../syntax.md#syntax-expressions), предоставляющее входные значения. **Возвращаемые значения** @@ -87,14 +92,16 @@ sequenceMatch(pattern)(timestamp, cond1, cond2, ...) !!! warning "Предупреждение" События, произошедшие в одну и ту же секунду, располагаются в последовательности в неопределенном порядке, что может повлиять на результат работы функции. -**Параметры** - -- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). +**Аргументы** - `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. Также можно использовать любой из поддержанных типов данных [UInt](../../sql-reference/aggregate-functions/parametric-functions.md). - `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. 
Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. +**Параметры** + +- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). + **Возвращаемые значения** - 1, если цепочка событий, соответствующая шаблону найдена. @@ -174,14 +181,16 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` -**Параметры** - -- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). +**Аргументы** - `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. Также можно использовать любой из поддержанных типов данных [UInt](../../sql-reference/aggregate-functions/parametric-functions.md). - `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. +**Параметры** + +- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). + **Возвращаемое значение** - Число непересекающихся цепочек событий, соответствущих шаблону. @@ -234,15 +243,21 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **Синтаксис** ``` sql -windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) +windowFunnel(window, [mode, [mode, ... ]])(timestamp, cond1, cond2, ..., condN) ``` +**Аргументы** + +- `timestamp` — имя столбца, содержащего временные отметки. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. +- `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md). + **Параметры** -- `window` — ширина скользящего окна по времени в секундах. [UInt](../../sql-reference/aggregate-functions/parametric-functions.md). -- `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений. -- `timestamp` — имя столбца, содержащего временные отметки. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. -- `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md). +- `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Должно соблюдаться условие `timestamp события cond2 <= timestamp события cond1 + window`. +- `mode` — необязательный параметр. Может быть установленно несколько значений одновременно. + - `'strict'` — не учитывать подряд идущие повторяющиеся события. + - `'strict_order'` — запрещает посторонние события в искомой последовательности. 
Например, при поиске цепочки `A->B->C` в `A->B->D->C` поиск будет остановлен на `D` и функция вернет 2. + - `'strict_increase'` — условия прменяются только для событий со строго возрастающими временными метками. **Возвращаемое значение** @@ -306,7 +321,7 @@ ORDER BY level ASC Функция принимает набор (от 1 до 32) логических условий, как в [WHERE](../../sql-reference/statements/select/where.md#select-where), и применяет их к заданному набору данных. -Условия, кроме первого, применяются попарно: результат второго будет истинным, если истинно первое и второе, третьего - если истинно первое и третье и т. д. +Условия, кроме первого, применяются попарно: результат второго будет истинным, если истинно первое и второе, третьего - если истинно первое и третье и т.д. **Синтаксис** @@ -314,7 +329,7 @@ ORDER BY level ASC retention(cond1, cond2, ..., cond32) ``` -**Параметры** +**Аргументы** - `cond` — вычисляемое условие или выражение, которое возвращает `UInt8` результат (1/0). @@ -481,4 +496,3 @@ FROM Решение: пишем в запросе GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/agg_functions/parametric_functions/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/any.md b/docs/ru/sql-reference/aggregate-functions/reference/any.md index 38c412813ab..6142b9a2092 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/any.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/any.md @@ -12,4 +12,3 @@ toc_priority: 6 При наличии в запросе `SELECT` секции `GROUP BY` или хотя бы одной агрегатной функции, ClickHouse (в отличие от, например, MySQL) требует, чтобы все выражения в секциях `SELECT`, `HAVING`, `ORDER BY` вычислялись из ключей или из агрегатных функций. То есть, каждый выбираемый из таблицы столбец, должен использоваться либо в ключах, либо внутри агрегатных функций. Чтобы получить поведение, как в MySQL, вы можете поместить остальные столбцы в агрегатную функцию `any`. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/any/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/ru/sql-reference/aggregate-functions/reference/anyheavy.md index 19fda7f64b7..bb7a01a47f3 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/anyheavy.md @@ -29,4 +29,3 @@ FROM ontime └───────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/anyheavy/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/anylast.md b/docs/ru/sql-reference/aggregate-functions/reference/anylast.md index da68c926d43..7be380461f7 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/anylast.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/anylast.md @@ -7,4 +7,3 @@ toc_priority: 104 Выбирает последнее попавшееся значение. Результат так же недетерминирован, как и для функции [any](../../../sql-reference/aggregate-functions/reference/any.md). 
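Иллюстративный пример (при параллельной обработке данных результат может отличаться):

``` sql
-- при последовательной обработке небольшого блока вернёт последнее значение — 3
SELECT anyLast(x) FROM values('x Int8', 1, 2, 3);
```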
-[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/anylast/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/argmax.md b/docs/ru/sql-reference/aggregate-functions/reference/argmax.md index f44e65831a9..edad26ee232 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/argmax.md @@ -20,20 +20,20 @@ argMax(arg, val) argMax(tuple(arg, val)) ``` -**Параметры** +**Аргументы** - `arg` — аргумент. - `val` — значение. **Возвращаемое значение** -- Значение `arg`, соответствующее максимальному значению `val`. +- значение `arg`, соответствующее максимальному значению `val`. Тип: соответствует типу `arg`. Если передан кортеж: -- Кортеж `(arg, val)` c максимальным значением `val` и соответствующим ему `arg`. +- кортеж `(arg, val)` c максимальным значением `val` и соответствующим ему `arg`. Тип: [Tuple](../../../sql-reference/data-types/tuple.md). @@ -52,15 +52,14 @@ argMax(tuple(arg, val)) Запрос: ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary; +SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary; ``` Результат: ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ -│ director │ ('director',5000) │ -└──────────────────────┴─────────────────────────────┘ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ ('director',5000) │ +└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/argmax/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/argmin.md b/docs/ru/sql-reference/aggregate-functions/reference/argmin.md index 8c25b79f92a..dc54c424fb3 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/argmin.md @@ -20,7 +20,7 @@ argMin(arg, val) argMin(tuple(arg, val)) ``` -**Параметры** +**Аргументы** - `arg` — аргумент. - `val` — значение. @@ -63,4 +63,3 @@ SELECT argMin(user, salary), argMin(tuple(user, salary)) FROM salary; └──────────────────────┴─────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/argmin/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/avg.md b/docs/ru/sql-reference/aggregate-functions/reference/avg.md index b0bee64ec66..c5e1dec14e0 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/avg.md @@ -4,8 +4,60 @@ toc_priority: 5 # avg {#agg_function-avg} -Вычисляет среднее. -Работает только для чисел. -Результат всегда Float64. +Вычисляет среднее арифметическое. + +**Синтаксис** + +``` sql +avg(x) +``` + +**Аргументы** + +- `x` — входное значение типа [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) или [Decimal](../../../sql-reference/data-types/decimal.md). + +**Возвращаемое значение** + +- среднее арифметическое, всегда типа [Float64](../../../sql-reference/data-types/float.md). +- `NaN`, если входное значение `x` — пустое. 
+ +**Пример** + +Запрос: + +``` sql +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); +``` + +Результат: + +``` text +┌─avg(x)─┐ +│ 2.5 │ +└────────┘ +``` + +**Пример** + +Создайте временную таблицу: + +Запрос: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +``` + +Выполните запрос: + +``` sql +SELECT avg(t) FROM test; +``` + +Результат: + +``` text +┌─avg(x)─┐ +│ nan │ +└────────┘ +``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/avg/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/ru/sql-reference/aggregate-functions/reference/avgweighted.md index 72e6ca5c88c..291abbfa3fb 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/avgweighted.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/avgweighted.md @@ -12,10 +12,10 @@ toc_priority: 107 avgWeighted(x, weight) ``` -**Параметры** +**Аргументы** -- `x` — Значения. [Целые числа](../../../sql-reference/data-types/int-uint.md) или [числа с плавающей запятой](../../../sql-reference/data-types/float.md). -- `weight` — Веса отдельных значений. [Целые числа](../../../sql-reference/data-types/int-uint.md) или [числа с плавающей запятой](../../../sql-reference/data-types/float.md). +- `x` — значения. [Целые числа](../../../sql-reference/data-types/int-uint.md) или [числа с плавающей запятой](../../../sql-reference/data-types/float.md). +- `weight` — веса отдельных значений. [Целые числа](../../../sql-reference/data-types/int-uint.md) или [числа с плавающей запятой](../../../sql-reference/data-types/float.md). Типы параметров должны совпадать. @@ -43,4 +43,3 @@ FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) └────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/avgweighted/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/corr.md b/docs/ru/sql-reference/aggregate-functions/reference/corr.md index 6d631241f6a..7522dcebd0b 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/corr.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/corr.md @@ -11,4 +11,3 @@ toc_priority: 107 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `corrStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/corr/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/count.md b/docs/ru/sql-reference/aggregate-functions/reference/count.md index d99c3b2aeb2..06cf66bd8bd 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/count.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/count.md @@ -4,14 +4,14 @@ toc_priority: 1 # count {#agg_function-count} -Вычисляет количество строк или не NULL значений . +Вычисляет количество строк или не NULL значений. ClickHouse поддерживает следующие виды синтаксиса для `count`: - `count(expr)` или `COUNT(DISTINCT expr)`. - `count()` или `COUNT(*)`. Синтаксис `count()` специфичен для ClickHouse. -**Параметры** +**Аргументы** Функция может принимать: @@ -21,7 +21,7 @@ ClickHouse поддерживает следующие виды синтакси **Возвращаемое значение** - Если функция вызывается без параметров, она вычисляет количество строк. 
-- Если передаётся [выражение](../../syntax.md#syntax-expressions) , то функция вычисляет количество раз, когда выражение возвращает не NULL. Если выражение возвращает значение типа [Nullable](../../../sql-reference/data-types/nullable.md), то результат `count` не становится `Nullable`. Функция возвращает 0, если выражение возвращает `NULL` для всех строк. +- Если передаётся [выражение](../../syntax.md#syntax-expressions), то функция подсчитывает количество раз, когда выражение не равно NULL. Если выражение имеет тип [Nullable](../../../sql-reference/data-types/nullable.md), то результат `count` не становится `Nullable`. Функция возвращает 0, если выражение равно `NULL` для всех строк. В обоих случаях тип возвращаемого значения [UInt64](../../../sql-reference/data-types/int-uint.md). @@ -69,4 +69,3 @@ SELECT count(DISTINCT num) FROM t Этот пример показывает, что `count(DISTINCT num)` выполняется с помощью функции `uniqExact` в соответствии со значением настройки `count_distinct_implementation`. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/count/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/covarpop.md b/docs/ru/sql-reference/aggregate-functions/reference/covarpop.md index e30b19924f9..1438fefbd8e 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/covarpop.md @@ -11,4 +11,3 @@ toc_priority: 36 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `covarPopStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/covarpop/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/ru/sql-reference/aggregate-functions/reference/covarsamp.md index 7fa9a1d3f2c..b4cea16f4c0 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/covarsamp.md @@ -13,4 +13,3 @@ toc_priority: 37 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `covarSampStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/covarsamp/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md b/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md new file mode 100644 index 00000000000..b025a248f3c --- /dev/null +++ b/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md @@ -0,0 +1,69 @@ +--- +toc_priority: 141 +--- + +# deltaSum {#agg_functions-deltasum} + +Суммирует арифметическую разницу между последовательными строками. Если разница отрицательна — она будет проигнорирована. + +**Синтаксис** + +``` sql +deltaSum(value) +``` + +**Аргументы** + +- `value` — входные значения, должны быть типа [Integer](../../data-types/int-uint.md) или [Float](../../data-types/float.md). + +**Возвращаемое значение** + +- накопленная арифметическая разница, типа `Integer` или `Float`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3])); +``` + +Результат: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3]))─┐ +│ 2 │ +└────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); +``` + +Результат: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3]))─┐ +│ 7 │ +└───────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); +``` + +Результат: + +``` text +┌─deltaSum(arrayJoin([2.25, 3, 4.5]))─┐ +│ 2.25 │ +└─────────────────────────────────────┘ +``` + +## Смотрите также {#see-also} + +- [runningDifference](../../functions/other-functions.md#runningdifferencex) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/grouparray.md b/docs/ru/sql-reference/aggregate-functions/reference/grouparray.md index 7640795fc51..370190dbb3c 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/grouparray.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/grouparray.md @@ -14,4 +14,3 @@ toc_priority: 110 В некоторых случаях, вы всё же можете рассчитывать на порядок выполнения запроса. Это — случаи, когда `SELECT` идёт из подзапроса, в котором используется `ORDER BY`. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/grouparray/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/ru/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index 5c73bccc2bb..f91d4f19675 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -9,24 +9,24 @@ toc_priority: 112 **Синтаксис** ```sql -groupArrayInsertAt(default_x, size)(x, pos); +groupArrayInsertAt(default_x, size)(x, pos) ``` Если запрос вставляет вставляется несколько значений в одну и ту же позицию, то функция ведет себя следующим образом: -- Если запрос выполняется в одном потоке, то используется первое из вставляемых значений. -- Если запрос выполняется в нескольких потоках, то в результирующем массиве может оказаться любое из вставляемых значений. +- Если запрос выполняется в одном потоке, то используется первое из вставляемых значений. +- Если запрос выполняется в нескольких потоках, то в результирующем массиве может оказаться любое из вставляемых значений. -**Параметры** +**Аргументы** -- `x` — Значение, которое будет вставлено. [Выражение](../../syntax.md#syntax-expressions), возвращающее значение одного из [поддерживаемых типов данных](../../../sql-reference/data-types/index.md#data_types). -- `pos` — Позиция, в которую вставляется заданный элемент `x`. Нумерация индексов в массиве начинается с нуля. [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64). -- `default_x` — Значение по умолчанию для подстановки на пустые позиции. Опциональный параметр. [Выражение](../../syntax.md#syntax-expressions), возвращающее значение с типом параметра `x`. Если `default_x` не определен, используются [значения по умолчанию](../../../sql-reference/statements/create/table.md#create-default-values). -- `size`— Длина результирующего массива. Опциональный параметр. При использовании этого параметра должно быть указано значение по умолчанию `default_x`. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `x` — значение, которое будет вставлено. 
[Выражение](../../syntax.md#syntax-expressions), возвращающее значение одного из [поддерживаемых типов данных](../../../sql-reference/data-types/index.md#data_types). +- `pos` — позиция, в которую вставляется заданный элемент `x`. Нумерация индексов в массиве начинается с нуля. [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64). +- `default_x` — значение по умолчанию для подстановки на пустые позиции. Опциональный параметр. [Выражение](../../syntax.md#syntax-expressions), возвращающее значение с типом параметра `x`. Если `default_x` не определен, используются [значения по умолчанию](../../../sql-reference/statements/create/table.md#create-default-values). +- `size` — длина результирующего массива. Опциональный параметр. При использовании этого параметра должно быть указано значение по умолчанию `default_x`. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). **Возвращаемое значение** -- Массив со вставленными значениями. +- Массив со вставленными значениями. Тип: [Array](../../../sql-reference/data-types/array.md#data-type-array). @@ -90,4 +90,3 @@ SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size └───────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingavg.md index 6307189c440..5930e8b8484 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingavg.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -6,12 +6,14 @@ toc_priority: 114 Вычисляет скользящее среднее для входных значений. - groupArrayMovingAvg(numbers_for_summing) - groupArrayMovingAvg(window_size)(numbers_for_summing) +``` sql +groupArrayMovingAvg(numbers_for_summing) +groupArrayMovingAvg(window_size)(numbers_for_summing) +``` Функция может принимать размер окна в качестве параметра. Если окно не указано, то функция использует размер окна, равный количеству строк в столбце. -**Параметры** +**Аргументы** - `numbers_for_summing` — [выражение](../../syntax.md#syntax-expressions), возвращающее значение числового типа. - `window_size` — размер окна. @@ -75,4 +77,3 @@ FROM t └───────────┴──────────────────────────────────┴───────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingsum.md index c95f1b0b0eb..feaef8e79d8 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingsum.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -13,7 +13,7 @@ groupArrayMovingSum(window_size)(numbers_for_summing) Функция может принимать размер окна в качестве параметра. Если окно не указано, то функция использует размер окна, равный количеству строк в столбце. -**Параметры** +**Аргументы** - `numbers_for_summing` — [выражение](../../syntax.md#syntax-expressions), возвращающее значение числового типа. - `window_size` — размер окна. 
@@ -75,4 +75,3 @@ FROM t └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/ru/sql-reference/aggregate-functions/reference/grouparraysample.md index 4c2dafe1a3c..1d58b3397ab 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/grouparraysample.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -12,7 +12,7 @@ toc_priority: 114 groupArraySample(max_size[, seed])(x) ``` -**Параметры** +**Аргументы** - `max_size` — максимальное количество элементов в возвращаемом массиве. [UInt64](../../data-types/int-uint.md). - `seed` — состояние генератора случайных чисел. Необязательный параметр. [UInt64](../../data-types/int-uint.md). Значение по умолчанию: `123456`. diff --git a/docs/ru/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/ru/sql-reference/aggregate-functions/reference/groupbitand.md index 03aff64fecf..b4b862d5716 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/groupbitand.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/groupbitand.md @@ -10,7 +10,7 @@ toc_priority: 125 groupBitAnd(expr) ``` -**Параметры** +**Аргументы** `expr` – выражение, результат которого имеет тип данных `UInt*`. @@ -45,4 +45,3 @@ binary decimal 00000100 = 4 ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/groupbitand/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/ru/sql-reference/aggregate-functions/reference/groupbitmap.md index a4be18b75ec..4012d3e052e 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/groupbitmap.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/groupbitmap.md @@ -10,7 +10,7 @@ Bitmap или агрегатные вычисления для столбца с groupBitmap(expr) ``` -**Параметры** +**Аргументы** `expr` – выражение, результат которого имеет тип данных `UInt*`. @@ -43,4 +43,3 @@ num 3 ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/groupbitmap/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/ru/sql-reference/aggregate-functions/reference/groupbitor.md index e1afced014f..6967b26e722 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/groupbitor.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/groupbitor.md @@ -10,7 +10,7 @@ toc_priority: 126 groupBitOr(expr) ``` -**Параметры** +**Аргументы** `expr` – выражение, результат которого имеет тип данных `UInt*`. @@ -45,4 +45,3 @@ binary decimal 01111101 = 125 ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/groupbitor/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/ru/sql-reference/aggregate-functions/reference/groupbitxor.md index a80f86b2a5f..ca565d5a027 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/groupbitxor.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -10,7 +10,7 @@ toc_priority: 127 groupBitXor(expr) ``` -**Параметры** +**Аргументы** `expr` – выражение, результат которого имеет тип данных `UInt*`. 
@@ -45,4 +45,3 @@ binary decimal 01101000 = 104 ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/groupbitxor/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/ru/sql-reference/aggregate-functions/reference/groupuniqarray.md index cecc63aef22..7d64b13a203 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/groupuniqarray.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -10,4 +10,3 @@ toc_priority: 111 Функция `groupUniqArray(max_size)(x)` ограничивает размер результирующего массива до `max_size` элементов. Например, `groupUniqArray(1)(x)` равнозначно `[any(x)]`. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/index.md b/docs/ru/sql-reference/aggregate-functions/reference/index.md index 4c0060581fd..1af07623ade 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/index.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0421\u043f\u0440\u0430\u0432\u043e\u0447\u043d\u0438\u043a" +toc_folder_title: "Справочник" toc_priority: 36 toc_hidden: true --- @@ -65,4 +65,3 @@ toc_hidden: true - [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md) - [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/ru/sql-reference/aggregate-functions/reference/initializeAggregation.md index a2e3764193e..3565115d8de 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/initializeAggregation.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/initializeAggregation.md @@ -10,10 +10,10 @@ toc_priority: 150 **Синтаксис** ``` sql -initializeAggregation (aggregate_function, column_1, column_2); +initializeAggregation (aggregate_function, column_1, column_2) ``` -**Параметры** +**Аргументы** - `aggregate_function` — название функции агрегации, состояние которой нужно создать. [String](../../../sql-reference/data-types/string.md#string). - `column_n` — столбец, который передается в функцию агрегации как аргумент. [String](../../../sql-reference/data-types/string.md#string). diff --git a/docs/ru/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/ru/sql-reference/aggregate-functions/reference/kurtpop.md index a00dae51ed6..1a1198b2beb 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/kurtpop.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/kurtpop.md @@ -10,9 +10,9 @@ toc_priority: 153 kurtPop(expr) ``` -**Параметры** +**Аргументы** -`expr` — [Выражение](../../syntax.md#syntax-expressions), возвращающее число. +`expr` — [выражение](../../syntax.md#syntax-expressions), возвращающее число. 
**Возвращаемое значение** @@ -21,7 +21,6 @@ kurtPop(expr) **Пример** ``` sql -SELECT kurtPop(value) FROM series_with_value_column +SELECT kurtPop(value) FROM series_with_value_column; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/kurtpop/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/ru/sql-reference/aggregate-functions/reference/kurtsamp.md index 379d74ec0c3..50b48d11b18 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/kurtsamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -12,9 +12,9 @@ toc_priority: 154 kurtSamp(expr) ``` -**Параметры** +**Аргументы** -`expr` — [Выражение](../../syntax.md#syntax-expressions), возвращающее число. +`expr` — [выражение](../../syntax.md#syntax-expressions), возвращающее число. **Возвращаемое значение** @@ -23,7 +23,6 @@ kurtSamp(expr) **Пример** ``` sql -SELECT kurtSamp(value) FROM series_with_value_column +SELECT kurtSamp(value) FROM series_with_value_column; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/kurtsamp/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index a4647ecfb34..9d02bee8622 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -17,16 +17,18 @@ mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_ind Проверяется нулевая гипотеза, что генеральные совокупности стохастически равны. Наряду с двусторонней гипотезой могут быть проверены и односторонние. Для применения U-критерия Манна — Уитни закон распределения генеральных совокупностей не обязан быть нормальным. +**Аргументы** + +- `sample_data` — данные выборок. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) или [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — индексы выборок. [Integer](../../../sql-reference/data-types/int-uint.md). + **Параметры** - `alternative` — альтернативная гипотеза. (Необязательный параметр, по умолчанию: `'two-sided'`.) [String](../../../sql-reference/data-types/string.md). - `'two-sided'`; - `'greater'`; - `'less'`. -- `continuity_correction` - если не 0, то при вычислении p-значения применяется коррекция непрерывности. (Необязательный параметр, по умолчанию: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md). -- `sample_data` — данные выборок. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). -- `sample_index` — индексы выборок. [Integer](../../../sql-reference/data-types/int-uint.md). - +- `continuity_correction` — если не 0, то при вычислении p-значения применяется коррекция непрерывности. (Необязательный параметр, по умолчанию: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md). 
**Возвращаемые значения** @@ -69,4 +71,3 @@ SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest; - [U-критерий Манна — Уитни](https://ru.wikipedia.org/wiki/U-%D0%BA%D1%80%D0%B8%D1%82%D0%B5%D1%80%D0%B8%D0%B9_%D0%9C%D0%B0%D0%BD%D0%BD%D0%B0_%E2%80%94_%D0%A3%D0%B8%D1%82%D0%BD%D0%B8) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/max.md b/docs/ru/sql-reference/aggregate-functions/reference/max.md index 4ee577471ea..4f61ecd051d 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/max.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/max.md @@ -6,4 +6,3 @@ toc_priority: 3 Вычисляет максимум. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/max/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/median.md b/docs/ru/sql-reference/aggregate-functions/reference/median.md index 803b2309665..a208c21dd21 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/median.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/median.md @@ -40,4 +40,3 @@ SELECT medianDeterministic(val, 1) FROM t └─────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/median/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/min.md b/docs/ru/sql-reference/aggregate-functions/reference/min.md index 7b56de3aed4..16dd577e790 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/min.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/min.md @@ -6,4 +6,3 @@ toc_priority: 2 Вычисляет минимум. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/min/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantile.md b/docs/ru/sql-reference/aggregate-functions/reference/quantile.md index 10fec16ab94..10862e38e00 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantile.md @@ -18,10 +18,10 @@ quantile(level)(expr) Алиас: `median`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). 
**Возвращаемое значение** @@ -65,4 +65,3 @@ SELECT quantile(val) FROM t - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantile/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiledeterministic.md index fdbcda821f6..ec308ea239b 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiledeterministic.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -18,11 +18,11 @@ quantileDeterministic(level)(expr, determinator) Алиас: `medianDeterministic`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). -- `determinator` — Число, хэш которого используется при сэмплировании в алгоритме reservoir sampling, чтобы сделать результат детерминированным. В качестве детерминатора можно использовать любое определённое положительное число, например, идентификатор пользователя или события. Если одно и то же значение детерминатора попадается в выборке слишком часто, то функция выдаёт некорректный результат. +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). +- `determinator` — число, хэш которого используется при сэмплировании в алгоритме «Reservoir sampling», чтобы сделать результат детерминированным. В качестве значения можно использовать любое определённое положительное число, например, идентификатор пользователя или события. Если одно и то же значение попадается в выборке слишком часто, то функция выдаёт некорректный результат. **Возвращаемое значение** @@ -65,4 +65,3 @@ SELECT quantileDeterministic(val, 1) FROM t - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/qurntiledeterministic/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/ru/sql-reference/aggregate-functions/reference/quantileexact.md index 4ee815a94fb..82ebae1c14e 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantileexact.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantileexact.md @@ -18,10 +18,11 @@ quantileExact(level)(expr) Алиас: `medianExact`. 
-**Параметры** +**Аргументы** + +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). **Возвращаемое значение** @@ -77,10 +78,11 @@ quantileExact(level)(expr) Алиас: `medianExactLow`. -**Параметры** +**Аргументы** + +- `level` — уровень квантили. Опциональный параметр. Константное занчение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://en.wikipedia.org/wiki/Median). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) или [DateTime](../../../sql-reference/data-types/datetime.md). -- `level` — Уровень квантили. Опциональный параметр. Константное занчение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://en.wikipedia.org/wiki/Median). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) или [DateTime](../../../sql-reference/data-types/datetime.md). **Возвращаемое значение** @@ -127,10 +129,11 @@ quantileExactHigh(level)(expr) Алиас: `medianExactHigh`. -**Параметры** +**Аргументы** + +- `level` — уровень квантили. Опциональный параметр. Константное занчение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://en.wikipedia.org/wiki/Median). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) или [DateTime](../../../sql-reference/data-types/datetime.md). -- `level` — Уровень квантили. Опциональный параметр. Константное занчение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://en.wikipedia.org/wiki/Median). 
-- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) или [DateTime](../../../sql-reference/data-types/datetime.md). **Возвращаемое значение** @@ -163,4 +166,3 @@ SELECT quantileExactHigh(number) FROM numbers(10) - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantileexact/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/ru/sql-reference/aggregate-functions/reference/quantileexactweighted.md index f6982d4566f..3746c328470 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantileexactweighted.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -18,11 +18,11 @@ quantileExactWeighted(level)(expr, weight) Алиас: `medianExactWeighted`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Возвращаемое значение** @@ -66,4 +66,3 @@ SELECT quantileExactWeighted(n, val) FROM t - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantileexactweited/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md index 82e806b67fa..671cbc1fc4d 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md @@ -8,4 +8,3 @@ Syntax: `quantiles(level1, level2, …)(x)` All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. 
These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantiles/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigest.md index f372e308e73..130ff7566ba 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -20,10 +20,10 @@ quantileTDigest(level)(expr) Алиас: `medianTDigest`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). **Возвращаемое значение** @@ -56,4 +56,3 @@ SELECT quantileTDigest(number) FROM numbers(10) - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/qurntiledigest/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md index b6dd846967b..f7239be0ba5 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -20,11 +20,11 @@ quantileTDigestWeighted(level)(expr, weight) Алиас: `medianTDigest`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. 
Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — выражение, зависящее от значений столбцов, возвращающее данные [числовых типов](../../../sql-reference/data-types/index.md#data_types) или типов [Date](../../../sql-reference/data-types/date.md), [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Возвращаемое значение** @@ -57,4 +57,3 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantiledigestweighted/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiletiming.md index 32e5e6ce31b..03d448a5d63 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -18,11 +18,11 @@ quantileTiming(level)(expr) Алиас: `medianTiming`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — [Выражение](../../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float\*](../../../sql-reference/data-types/float.md). +- `expr` — [выражение](../../syntax.md#syntax-expressions), зависящее от значений столбцов, возвращающее данные типа [Float\*](../../../sql-reference/data-types/float.md). - Если в функцию передать отрицательные значения, то её поведение не определено. - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. @@ -85,4 +85,3 @@ SELECT quantileTiming(response_time) FROM t - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantiletiming/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 4a7fcc666d5..a50e09668ab 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -18,16 +18,16 @@ quantileTimingWeighted(level)(expr, weight) Алиас: `medianTimingWeighted`. -**Параметры** +**Аргументы** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. 
При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `level` — уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — [Выражение](../../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float\*](../../../sql-reference/data-types/float.md). +- `expr` — [выражение](../../syntax.md#syntax-expressions), зависящее от значений столбцов, возвращающее данные типа [Float\*](../../../sql-reference/data-types/float.md). - Если в функцию передать отрицательные значения, то её поведение не определено. - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. +- `weight` — столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Точность** @@ -84,4 +84,3 @@ SELECT quantileTimingWeighted(response_time, weight) FROM t - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/quantiletiming weighted/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md index 48a19e87c52..c98e7b88bcf 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md @@ -8,10 +8,10 @@ rankCorr(x, y) ``` -**Параметры** +**Аргументы** -- `x` — Произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). -- `y` — Произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). +- `x` — произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). +- `y` — произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). 
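A small sketch of a call; the casts to `Float64` are there only to match the documented argument types:

``` sql
-- the two sequences are perfectly monotonic, so the rank correlation is 1
SELECT rankCorr(toFloat64(number), toFloat64(number * number)) FROM numbers(100);
```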
**Возвращаемое значение** diff --git a/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md index 370b1bde8d2..f634e553738 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -41,4 +41,3 @@ SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) └───────────────────────────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/skewpop.md b/docs/ru/sql-reference/aggregate-functions/reference/skewpop.md index a6dee5dc5ef..ed4a95696f2 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/skewpop.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/skewpop.md @@ -10,9 +10,9 @@ toc_priority: 150 skewPop(expr) ``` -**Параметры** +**Аргументы** -`expr` — [Выражение](../../syntax.md#syntax-expressions), возвращающее число. +`expr` — [выражение](../../syntax.md#syntax-expressions), возвращающее число. **Возвращаемое значение** @@ -21,7 +21,6 @@ skewPop(expr) **Пример** ``` sql -SELECT skewPop(value) FROM series_with_value_column +SELECT skewPop(value) FROM series_with_value_column; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/skewpop/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/ru/sql-reference/aggregate-functions/reference/skewsamp.md index 171eb5e304a..213d26e4647 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/skewsamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/skewsamp.md @@ -12,9 +12,9 @@ toc_priority: 151 skewSamp(expr) ``` -**Параметры** +**Аргументы** -`expr` — [Выражение](../../syntax.md#syntax-expressions), возвращающее число. +`expr` — [выражение](../../syntax.md#syntax-expressions), возвращающее число. **Возвращаемое значение** @@ -23,7 +23,6 @@ skewSamp(expr) **Пример** ``` sql -SELECT skewSamp(value) FROM series_with_value_column +SELECT skewSamp(value) FROM series_with_value_column; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/skewsamp/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/ru/sql-reference/aggregate-functions/reference/stddevpop.md index ada8b8884cd..66d63147586 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/stddevpop.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/stddevpop.md @@ -9,4 +9,3 @@ toc_priority: 30 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `stddevPopStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. 
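A quick way to compare the two variants on the same data; the `numbers` table function is used here just for illustration:

``` sql
-- population standard deviation with the fast and the numerically stable algorithm
SELECT stddevPop(number), stddevPopStable(number) FROM numbers(1000);
```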
-[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/stddevpop/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/ru/sql-reference/aggregate-functions/reference/stddevsamp.md index 952b6bcde68..5fbf438e894 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/stddevsamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -9,4 +9,3 @@ toc_priority: 31 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `stddevSampStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/stddevsamp/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/ru/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index 0b268e9ea1b..6da0f6caacd 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -86,4 +86,3 @@ evalMLMethod(model, param1, param2) FROM test_data - [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlogisticregression) - [Отличие линейной от логистической регрессии.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/ru/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 01d3a0797bd..67454aa2c1b 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -54,4 +54,3 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') - [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) - [Отличие линейной от логистической регрессии](https://moredez.ru/q/51225972/) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md index 77378de95d1..16daddfbecf 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md @@ -16,7 +16,7 @@ studentTTest(sample_data, sample_index) Значения выборок берутся из столбца `sample_data`. Если `sample_index` равно 0, то значение из этой строки принадлежит первой выборке. Во всех остальных случаях значение принадлежит второй выборке. Проверяется нулевая гипотеза, что средние значения генеральных совокупностей совпадают. Для применения t-критерия Стьюдента распределение в генеральных совокупностях должно быть нормальным и дисперсии должны совпадать. -**Параметры** +**Аргументы** - `sample_data` — данные выборок. 
[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). - `sample_index` — индексы выборок. [Integer](../../../sql-reference/data-types/int-uint.md). @@ -63,4 +63,3 @@ SELECT studentTTest(sample_data, sample_index) FROM student_ttest; - [t-критерий Стьюдента](https://ru.wikipedia.org/wiki/T-%D0%BA%D1%80%D0%B8%D1%82%D0%B5%D1%80%D0%B8%D0%B9_%D0%A1%D1%82%D1%8C%D1%8E%D0%B4%D0%B5%D0%BD%D1%82%D0%B0) - [welchTTest](welchttest.md#welchttest) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/studentttest/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/sum.md b/docs/ru/sql-reference/aggregate-functions/reference/sum.md index 5fa769f3479..487313c006b 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/sum.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/sum.md @@ -7,4 +7,3 @@ toc_priority: 4 Вычисляет сумму. Работает только для чисел. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/sum/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/summap.md b/docs/ru/sql-reference/aggregate-functions/reference/summap.md index 460fc078893..3cfe4c26fcc 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/summap.md @@ -42,4 +42,3 @@ GROUP BY timeslot └─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/summap/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/ru/sql-reference/aggregate-functions/reference/sumwithoverflow.md index 845adc510f2..1e1962babbe 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/sumwithoverflow.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -8,4 +8,3 @@ toc_priority: 140 Работает только для чисел. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/topk.md b/docs/ru/sql-reference/aggregate-functions/reference/topk.md index 6aefd38bf34..4d6a8b46c2c 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/topk.md @@ -18,8 +18,8 @@ topK(N)(column) **Аргументы** -- ‘N’ - Количество значений. -- ‘x’ – Столбец. +- `N` – количество значений. +- `x` – столбец. **Пример** @@ -36,4 +36,3 @@ FROM ontime └─────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/topk/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/ru/sql-reference/aggregate-functions/reference/topkweighted.md index 20bd3ee85ff..840f9c553f5 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/topkweighted.md @@ -12,13 +12,13 @@ toc_priority: 109 topKWeighted(N)(x, weight) ``` -**Параметры** +**Аргументы** -- `N` — Количество элементов для выдачи. +- `N` — количество элементов для выдачи. **Аргументы** -- `x` – значение. +- `x` — значение. - `weight` — вес. [UInt8](../../../sql-reference/data-types/int-uint.md). 
**Возвращаемое значение** @@ -41,4 +41,3 @@ SELECT topKWeighted(10)(number, number) FROM numbers(1000) └───────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/topkweighted/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/uniq.md b/docs/ru/sql-reference/aggregate-functions/reference/uniq.md index f5f3f198139..01bb8bea45a 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/uniq.md @@ -10,7 +10,7 @@ toc_priority: 190 uniq(x[, ...]) ``` -**Параметры** +**Аргументы** Функция принимает переменное число входных параметров. Параметры могут быть числовых типов, а также `Tuple`, `Array`, `Date`, `DateTime`, `String`. @@ -39,4 +39,3 @@ uniq(x[, ...]) - [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) - [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/uniq/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined.md index 751dc1a8c98..3009beb994b 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -12,7 +12,7 @@ uniqCombined(HLL_precision)(x[, ...]) Функция `uniqCombined` — это хороший выбор для вычисления количества различных значений. -**Параметры** +**Аргументы** Функция принимает переменное число входных параметров. Параметры могут быть числовых типов, а также `Tuple`, `Array`, `Date`, `DateTime`, `String`. @@ -50,4 +50,3 @@ uniqCombined(HLL_precision)(x[, ...]) - [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) - [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/uniqcombined/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined64.md index 5db27fb301d..6fde16b4b0c 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined64.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -6,4 +6,3 @@ toc_priority: 193 Использует 64-битный хэш для всех типов, в отличие от [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined). -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/ru/sql-reference/aggregate-functions/reference/uniqexact.md index 3dd22b2b4bc..613558ba887 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/uniqexact.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/uniqexact.md @@ -14,7 +14,7 @@ uniqExact(x[, ...]) Функция `uniqExact` расходует больше оперативной памяти, чем функция `uniq`, так как размер состояния неограниченно растёт по мере роста количества различных значений. -**Параметры** +**Аргументы** Функция принимает переменное число входных параметров. 
Параметры могут быть числовых типов, а также `Tuple`, `Array`, `Date`, `DateTime`, `String`. @@ -24,4 +24,3 @@ uniqExact(x[, ...]) - [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined) - [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqhll12) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/uniqexact/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/ru/sql-reference/aggregate-functions/reference/uniqhll12.md index 09e52ac6833..7a421d419ae 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/uniqhll12.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -10,7 +10,7 @@ toc_priority: 194 uniqHLL12(x[, ...]) ``` -**Параметры** +**Аргументы** Функция принимает переменное число входных параметров. Параметры могут быть числовых типов, а также `Tuple`, `Array`, `Date`, `DateTime`, `String`. @@ -26,7 +26,7 @@ uniqHLL12(x[, ...]) - Использует алгоритм HyperLogLog для аппроксимации числа различных значений аргументов. - Используется 212 5-битовых ячеек. Размер состояния чуть больше 2.5 КБ. Результат не точный (ошибка до ~10%) для небольших множеств (<10K элементов). Однако для множеств большой кардинальности (10K - 100M) результат довольно точен (ошибка до ~1.6%). Начиная с 100M ошибка оценки будет только расти и для множеств огромной кардинальности (1B+ элементов) функция возвращает результат с очень большой неточностью. + Используется 2^12 5-битовых ячеек. Размер состояния чуть больше 2.5 КБ. Результат не точный (ошибка до ~10%) для небольших множеств (<10K элементов). Однако для множеств большой кардинальности (10K - 100M) результат довольно точен (ошибка до ~1.6%). Начиная с 100M ошибка оценки будет только расти и для множеств огромной кардинальности (1B+ элементов) функция возвращает результат с очень большой неточностью. - Результат детерминирован (не зависит от порядка выполнения запроса). @@ -38,4 +38,3 @@ uniqHLL12(x[, ...]) - [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/uniqhll12/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/varpop.md b/docs/ru/sql-reference/aggregate-functions/reference/varpop.md index 9615e03673b..0a78b3cbb76 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/varpop.md @@ -11,4 +11,3 @@ toc_priority: 32 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `varPopStable`. Она работает медленнее, но обеспечивает меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/varpop/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/varsamp.md b/docs/ru/sql-reference/aggregate-functions/reference/varsamp.md index 31aaac68e7b..e18b858b7e2 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/varsamp.md @@ -13,4 +13,3 @@ toc_priority: 33 !!! note "Примечание" Функция использует вычислительно неустойчивый алгоритм. 
Если для ваших расчётов необходима [вычислительная устойчивость](https://ru.wikipedia.org/wiki/Вычислительная_устойчивость), используйте функцию `varSampStable`. Она работает медленнее, но обеспечиват меньшую вычислительную ошибку. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/vasamp/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md index 16c122d1b49..594a609d89e 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md @@ -16,7 +16,7 @@ welchTTest(sample_data, sample_index) Значения выборок берутся из столбца `sample_data`. Если `sample_index` равно 0, то значение из этой строки принадлежит первой выборке. Во всех остальных случаях значение принадлежит второй выборке. Проверяется нулевая гипотеза, что средние значения генеральных совокупностей совпадают. Для применения t-критерия Уэлча распределение в генеральных совокупностях должно быть нормальным. Дисперсии могут не совпадать. -**Параметры** +**Аргументы** - `sample_data` — данные выборок. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). - `sample_index` — индексы выборок. [Integer](../../../sql-reference/data-types/int-uint.md). @@ -63,4 +63,3 @@ SELECT welchTTest(sample_data, sample_index) FROM welch_ttest; - [t-критерий Уэлча](https://ru.wikipedia.org/wiki/T-%D0%BA%D1%80%D0%B8%D1%82%D0%B5%D1%80%D0%B8%D0%B9_%D0%A3%D1%8D%D0%BB%D1%87%D0%B0) - [studentTTest](studentttest.md#studentttest) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/welchTTest/) diff --git a/docs/ru/sql-reference/data-types/aggregatefunction.md b/docs/ru/sql-reference/data-types/aggregatefunction.md index 018d38d825e..6ca6879cf6c 100644 --- a/docs/ru/sql-reference/data-types/aggregatefunction.md +++ b/docs/ru/sql-reference/data-types/aggregatefunction.md @@ -65,4 +65,3 @@ SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP Смотрите в описании движка [AggregatingMergeTree](../../sql-reference/data-types/aggregatefunction.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/ru/sql-reference/data-types/array.md b/docs/ru/sql-reference/data-types/array.md index 86a23ed041b..30952d6e126 100644 --- a/docs/ru/sql-reference/data-types/array.md +++ b/docs/ru/sql-reference/data-types/array.md @@ -76,4 +76,3 @@ Received exception from server (version 1.1.54388): Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/array/) diff --git a/docs/ru/sql-reference/data-types/boolean.md b/docs/ru/sql-reference/data-types/boolean.md index bb0cd50c739..dff35777ff9 100644 --- a/docs/ru/sql-reference/data-types/boolean.md +++ b/docs/ru/sql-reference/data-types/boolean.md @@ -1,10 +1,9 @@ --- toc_priority: 43 -toc_title: "\u0411\u0443\u043b\u0435\u0432\u044b\u0020\u0437\u043d\u0430\u0447\u0435\u043d\u0438\u044f" +toc_title: "Булевы значения" --- # Булевы значения {#bulevy-znacheniia} Отдельного типа для булевых значений нет. 
Для них используется тип UInt8, в котором используются только значения 0 и 1. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/boolean/) diff --git a/docs/ru/sql-reference/data-types/date.md b/docs/ru/sql-reference/data-types/date.md index 490bc5c28b4..50508de96a3 100644 --- a/docs/ru/sql-reference/data-types/date.md +++ b/docs/ru/sql-reference/data-types/date.md @@ -44,4 +44,3 @@ SELECT * FROM dt; - [Тип данных `DateTime`](../../sql-reference/data-types/datetime.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/date/) diff --git a/docs/ru/sql-reference/data-types/datetime.md b/docs/ru/sql-reference/data-types/datetime.md index ffdf83e5bd0..ebd780d0d7d 100644 --- a/docs/ru/sql-reference/data-types/datetime.md +++ b/docs/ru/sql-reference/data-types/datetime.md @@ -126,4 +126,3 @@ FROM dt - [Тип данных `Date`](date.md) - [Тип данных `DateTime64`](datetime64.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/datetime/) diff --git a/docs/ru/sql-reference/data-types/datetime64.md b/docs/ru/sql-reference/data-types/datetime64.md index 6576bf9dc0d..3a08da75bb7 100644 --- a/docs/ru/sql-reference/data-types/datetime64.md +++ b/docs/ru/sql-reference/data-types/datetime64.md @@ -7,9 +7,9 @@ toc_title: DateTime64 Позволяет хранить момент времени, который может быть представлен как календарная дата и время, с заданной суб-секундной точностью. -Размер тика/точность: 10-precision секунд, где precision - целочисленный параметр типа. +Размер тика (точность, precision): 10-precision секунд, где precision - целочисленный параметр. -Синтаксис: +**Синтаксис:** ``` sql DateTime64(precision, [timezone]) @@ -17,9 +17,11 @@ DateTime64(precision, [timezone]) Данные хранятся в виде количества ‘тиков’, прошедших с момента начала эпохи (1970-01-01 00:00:00 UTC), в Int64. Размер тика определяется параметром precision. Дополнительно, тип `DateTime64` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime64` в текстовом виде и как будут парситься значения заданные в виде строк (‘2020-01-01 05:00:01.000’). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. Подробнее см. [DateTime](datetime.md). -## Пример {#primer} +Поддерживаются значения от 1 января 1925 г. и до 31 декабря 2283 г. -**1.** Создание таблицы с столбцом типа `DateTime64` и вставка данных в неё: +## Примеры {#examples} + +1. Создание таблицы со столбцом типа `DateTime64` и вставка данных в неё: ``` sql CREATE TABLE dt @@ -27,15 +29,15 @@ CREATE TABLE dt `timestamp` DateTime64(3, 'Europe/Moscow'), `event_id` UInt8 ) -ENGINE = TinyLog +ENGINE = TinyLog; ``` ``` sql -INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) +INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2); ``` ``` sql -SELECT * FROM dt +SELECT * FROM dt; ``` ``` text @@ -46,12 +48,12 @@ SELECT * FROM dt ``` - При вставке даты-времени как числа (аналогично ‘Unix timestamp’), время трактуется как UTC. Unix timestamp `1546300800` в часовом поясе `Europe/London (UTC+0)` представляет время `'2019-01-01 00:00:00'`. Однако, столбец `timestamp` имеет тип `DateTime('Europe/Moscow (UTC+3)')`, так что при выводе в виде строки время отобразится как `2019-01-01 03:00:00`. -- При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. 
`'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `'2018-12-31 21:00:00'` в виде Unix Timestamp) +- При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. `'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `'2018-12-31 21:00:00'` в виде Unix Timestamp). -**2.** Фильтрация по значениям даты-времени +2. Фильтрация по значениям даты и времени ``` sql -SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') +SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow'); ``` ``` text @@ -60,12 +62,12 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ └─────────────────────────┴──────────┘ ``` -В отличие от типа `DateTime`, `DateTime64` не конвертируется из строк автоматически +В отличие от типа `DateTime`, `DateTime64` не конвертируется из строк автоматически. -**3.** Получение часового пояса для значения типа `DateTime64`: +3. Получение часового пояса для значения типа `DateTime64`: ``` sql -SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x +SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x; ``` ``` text @@ -74,13 +76,13 @@ SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS └─────────────────────────┴────────────────────────────────┘ ``` -**4.** Конвертация часовых поясов +4. Конвертация часовых поясов ``` sql SELECT toDateTime64(timestamp, 3, 'Europe/London') as lon_time, toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time -FROM dt +FROM dt; ``` ``` text @@ -90,7 +92,7 @@ FROM dt └─────────────────────────┴─────────────────────────┘ ``` -## See Also {#see-also} +**See Also** - [Функции преобразования типов](../../sql-reference/functions/type-conversion-functions.md) - [Функции для работы с датой и временем](../../sql-reference/functions/date-time-functions.md) diff --git a/docs/ru/sql-reference/data-types/decimal.md b/docs/ru/sql-reference/data-types/decimal.md index bdcd3c767b9..8524e8ea132 100644 --- a/docs/ru/sql-reference/data-types/decimal.md +++ b/docs/ru/sql-reference/data-types/decimal.md @@ -112,4 +112,3 @@ DB::Exception: Can't compare. - [countDigits](../../sql-reference/functions/other-functions.md#count-digits) -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/decimal/) diff --git a/docs/ru/sql-reference/data-types/domains/index.md b/docs/ru/sql-reference/data-types/domains/index.md index 4449469b1bc..35f8149112f 100644 --- a/docs/ru/sql-reference/data-types/domains/index.md +++ b/docs/ru/sql-reference/data-types/domains/index.md @@ -1,6 +1,6 @@ --- -toc_folder_title: "\u0414\u043e\u043c\u0435\u043d\u044b" -toc_title_title: "\u041e\u0431\u0437\u043e\u0440" +toc_folder_title: "Домены" +toc_title_title: "Обзор" toc_priority: 56 --- @@ -30,4 +30,3 @@ toc_priority: 56 - Невозможно неявно преобразовывать строковые значение в значения с доменным типом данных при вставке данных из другого столбца или таблицы. - Домен не добавляет ограничения на хранимые значения. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/domains/overview) diff --git a/docs/ru/sql-reference/data-types/domains/ipv4.md b/docs/ru/sql-reference/data-types/domains/ipv4.md index 57d6f12ab17..af5f8261fae 100644 --- a/docs/ru/sql-reference/data-types/domains/ipv4.md +++ b/docs/ru/sql-reference/data-types/domains/ipv4.md @@ -81,4 +81,3 @@ SELECT toTypeName(i), CAST(from AS UInt32) AS i FROM hits LIMIT 1; └──────────────────────────────────┴────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/domains/ipv4) diff --git a/docs/ru/sql-reference/data-types/domains/ipv6.md b/docs/ru/sql-reference/data-types/domains/ipv6.md index 04c5fd0d491..5b3c17feceb 100644 --- a/docs/ru/sql-reference/data-types/domains/ipv6.md +++ b/docs/ru/sql-reference/data-types/domains/ipv6.md @@ -81,4 +81,3 @@ SELECT toTypeName(i), CAST(from AS FixedString(16)) AS i FROM hits LIMIT 1; └───────────────────────────────────────────┴─────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/domains/ipv6) diff --git a/docs/ru/sql-reference/data-types/enum.md b/docs/ru/sql-reference/data-types/enum.md index b86d15c19a8..95c053bed2c 100644 --- a/docs/ru/sql-reference/data-types/enum.md +++ b/docs/ru/sql-reference/data-types/enum.md @@ -126,4 +126,3 @@ INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) При ALTER, есть возможность поменять Enum8 на Enum16 и обратно - так же, как можно поменять Int8 на Int16. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/enum/) diff --git a/docs/ru/sql-reference/data-types/fixedstring.md b/docs/ru/sql-reference/data-types/fixedstring.md index 21115418e30..ef73dadaddf 100644 --- a/docs/ru/sql-reference/data-types/fixedstring.md +++ b/docs/ru/sql-reference/data-types/fixedstring.md @@ -58,4 +58,3 @@ WHERE a = 'b\0' Обратите внимание, что длина значения `FixedString(N)` постоянна. Функция [length](../../sql-reference/data-types/fixedstring.md#array_functions-length) возвращает `N` даже если значение `FixedString(N)` заполнено только нулевыми байтами, однако функция [empty](../../sql-reference/data-types/fixedstring.md#empty) в этом же случае возвращает `1`. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/fixedstring/) diff --git a/docs/ru/sql-reference/data-types/float.md b/docs/ru/sql-reference/data-types/float.md index 0e861f170b7..89ac00ab62f 100644 --- a/docs/ru/sql-reference/data-types/float.md +++ b/docs/ru/sql-reference/data-types/float.md @@ -89,4 +89,3 @@ SELECT 0 / 0 Смотрите правила сортировки `NaN` в разделе [Секция ORDER BY ](../../sql-reference/statements/select/order-by.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/float/) diff --git a/docs/ru/sql-reference/data-types/geo.md b/docs/ru/sql-reference/data-types/geo.md new file mode 100644 index 00000000000..23b47f38d05 --- /dev/null +++ b/docs/ru/sql-reference/data-types/geo.md @@ -0,0 +1,105 @@ +--- +toc_priority: 62 +toc_title: Географические структуры +--- + +# Типы данных для работы с географическими структурами {#geo-data-types} + +ClickHouse поддерживает типы данных для отображения географических объектов — точек (местоположений), территорий и т.п. + +!!! warning "Предупреждение" + Сейчас использование типов данных для работы с географическими структурами является экспериментальной возможностью. Чтобы использовать эти типы данных, включите настройку `allow_experimental_geo_types = 1`. + +**См. также** +- [Хранение географических структур данных](https://ru.wikipedia.org/wiki/GeoJSON). 
+- Настройка [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types). + +## Point {#point-data-type} + +Тип `Point` (точка) определяется парой координат X и Y и хранится в виде кортежа [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)). + +**Пример** + +Запрос: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_point (p Point) ENGINE = Memory(); +INSERT INTO geo_point VALUES((10, 10)); +SELECT p, toTypeName(p) FROM geo_point; +``` +Результат: + +``` text +┌─p─────┬─toTypeName(p)─┐ +│ (10,10) │ Point │ +└───────┴───────────────┘ +``` + +## Ring {#ring-data-type} + +Тип `Ring` описывает простой многоугольник без внутренних областей (дыр) и хранится в виде массива точек: [Array](array.md)([Point](#point-data-type)). + +**Пример** + +Запрос: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_ring (r Ring) ENGINE = Memory(); +INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]); +SELECT r, toTypeName(r) FROM geo_ring; +``` +Результат: + +``` text +┌─r─────────────────────────────┬─toTypeName(r)─┐ +│ [(0,0),(10,0),(10,10),(0,10)] │ Ring │ +└───────────────────────────────┴───────────────┘ +``` + +## Polygon {#polygon-data-type} + +Тип `Polygon` описывает многоугольник с внутренними областями (дырами) и хранится в виде массива: [Array](array.md)([Ring](#ring-data-type)). Первый элемент массива описывает внешний многоугольник (контур), а остальные элементы описывают дыры. + +**Пример** + +Запись в этой таблице описывает многоугольник с одной дырой: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory(); +INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]); +SELECT pg, toTypeName(pg) FROM geo_polygon; +``` + +Результат: + +``` text +┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐ +│ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon │ +└───────────────────────────────────────────────────────────────┴────────────────┘ +``` + +## MultiPolygon {#multipolygon-data-type} + +Тип `MultiPolygon` описывает элемент, состоящий из нескольких простых многоугольников (полигональную сетку). Он хранится в виде массива многоугольников: [Array](array.md)([Polygon](#polygon-data-type)). 
+ +**Пример** + +Запись в этой таблице описывает элемент, состоящий из двух многоугольников — первый без дыр, а второй с одной дырой: + +```sql +SET allow_experimental_geo_types = 1; +CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory(); +INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]); +SELECT mpg, toTypeName(mpg) FROM geo_multipolygon; +``` +Result: + +``` text +┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐ +│ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon │ +└─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘ +``` + diff --git a/docs/ru/sql-reference/data-types/index.md b/docs/ru/sql-reference/data-types/index.md index 7a5618f4c5d..2b29ee1bc19 100644 --- a/docs/ru/sql-reference/data-types/index.md +++ b/docs/ru/sql-reference/data-types/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0422\u0438\u043F\u044B \u0434\u0430\u043D\u043D\u044B\u0445" +toc_folder_title: "Типы данных" toc_priority: 37 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" +toc_title: "Введение" --- # Типы данных {#data_types} @@ -11,4 +11,3 @@ ClickHouse может сохранять в ячейках таблиц данн Зависимость имен типов данных от регистра можно проверить в системной таблице [system.data_type_families](../../operations/system-tables/data_type_families.md#system_tables-data_type_families). Раздел содержит описания поддерживаемых типов данных и специфику их использования и/или реализации, если таковые имеются. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/) diff --git a/docs/ru/sql-reference/data-types/int-uint.md b/docs/ru/sql-reference/data-types/int-uint.md index d3c342e467a..c026f5fc4a5 100644 --- a/docs/ru/sql-reference/data-types/int-uint.md +++ b/docs/ru/sql-reference/data-types/int-uint.md @@ -35,4 +35,3 @@ toc_title: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 `UInt128` пока не реализован. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/int_uint/) diff --git a/docs/ru/sql-reference/data-types/lowcardinality.md b/docs/ru/sql-reference/data-types/lowcardinality.md index d94cedd29ce..fe9118b1e14 100644 --- a/docs/ru/sql-reference/data-types/lowcardinality.md +++ b/docs/ru/sql-reference/data-types/lowcardinality.md @@ -23,7 +23,7 @@ LowCardinality(data_type) Эффективность использования типа данных `LowCarditality` зависит от разнообразия данных. Если словарь содержит менее 10 000 различных значений, ClickHouse в основном показывает более высокую эффективность чтения и хранения данных. Если же словарь содержит более 100 000 различных значений, ClickHouse может работать хуже, чем при использовании обычных типов данных. -При работе со строками, использование `LowCardinality` вместо [Enum](enum.md). `LowCardinality` обеспечивает большую гибкость в использовании и часто показывает такую же или более высокую эффективность. +При работе со строками, использование `LowCardinality` вместо [Enum](enum.md) обеспечивает большую гибкость в использовании и часто показывает такую же или более высокую эффективность. 
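A minimal sketch of such a string column; the table name and engine are illustrative:

``` sql
-- the string values are stored dictionary-encoded
CREATE TABLE lc_demo (id UInt64, s LowCardinality(String)) ENGINE = MergeTree ORDER BY id;
```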
## Пример @@ -58,4 +58,3 @@ ORDER BY id - [Reducing Clickhouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/). - [String Optimization (video presentation in Russian)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [Slides in English](https://github.com/yandex/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf). -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/data-types/lowcardinality/) diff --git a/docs/ru/sql-reference/data-types/multiword-types.md b/docs/ru/sql-reference/data-types/multiword-types.md index 559755ef989..0a8afff448d 100644 --- a/docs/ru/sql-reference/data-types/multiword-types.md +++ b/docs/ru/sql-reference/data-types/multiword-types.md @@ -26,4 +26,3 @@ toc_title: Составные типы | BINARY LARGE OBJECT | [String](../../sql-reference/data-types/string.md) | | BINARY VARYING | [String](../../sql-reference/data-types/string.md) | -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/data-types/multiword-types/) diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/index.md b/docs/ru/sql-reference/data-types/nested-data-structures/index.md index d53cabc6652..78262347bac 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/index.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0412\u043b\u043e\u0436\u0435\u043d\u043d\u044b\u0435\u0020\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440\u044b\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_folder_title: "Вложенные структуры данных" toc_hidden: true toc_priority: 54 toc_title: hidden @@ -7,4 +7,3 @@ toc_title: hidden # Вложенные структуры данных {#vlozhennye-struktury-dannykh} -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nested_data_structures/) diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md index 0e43383b283..199d141a191 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md @@ -96,4 +96,3 @@ LIMIT 10 Работоспособность запроса ALTER для элементов вложенных структур данных, является сильно ограниченной. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nested_data_structures/nested/) diff --git a/docs/ru/sql-reference/data-types/nullable.md b/docs/ru/sql-reference/data-types/nullable.md index 71e1f7a37a0..3f33c4b2540 100644 --- a/docs/ru/sql-reference/data-types/nullable.md +++ b/docs/ru/sql-reference/data-types/nullable.md @@ -48,4 +48,3 @@ SELECT x + y from t_null └────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nullable/) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 52f0412a177..7b81c577762 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,8 +1,11 @@ -# SimpleAggregateFunction {#data-type-simpleaggregatefunction} +# SimpleAggregateFunction(func, type) {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we don’t have to store and process any extra data. +Хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, которые обладают следующим свойством: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, +а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому хранить и обрабатывать какие-либо дополнительные данные не требуется. -The following aggregate functions are supported: +Чтобы получить промежуточное значение, обычно используются агрегатные функции с суффиксом [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate). 
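Минимальная иллюстрация (набросок): комбинатор `-SimpleState` возвращает значение с типом `SimpleAggregateFunction`.

``` sql
SELECT anySimpleState(number) AS x, toTypeName(x) AS t
FROM numbers(5);
-- ожидается t = 'SimpleAggregateFunction(any, UInt64)'
```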
+ +Поддерживаются следующие агрегатные функции: - [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) - [`anyLast`](../../sql-reference/aggregate-functions/reference/anylast.md#anylastx) @@ -14,23 +17,26 @@ The following aggregate functions are supported: - [`groupBitOr`](../../sql-reference/aggregate-functions/reference/groupbitor.md#groupbitor) - [`groupBitXor`](../../sql-reference/aggregate-functions/reference/groupbitxor.md#groupbitxor) - [`groupArrayArray`](../../sql-reference/aggregate-functions/reference/grouparray.md#agg_function-grouparray) -- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference/groupuniqarray.md#groupuniqarray) +- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [`sumMap`](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap) +- [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap) +- [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) +- [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md) +- [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md) -Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. +!!! note "Примечание" + Значения `SimpleAggregateFunction(func, Type)` отображаются и хранятся так же, как и `Type`, поэтому комбинаторы [-Merge](../../sql-reference/aggregate-functions/combinators.md#aggregate_functions_combinators-merge) и [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state) не требуются. -**Parameters** + `SimpleAggregateFunction` имеет лучшую производительность, чем `AggregateFunction` с той же агрегатной функцией. -- Name of the aggregate function. -- Types of the aggregate function arguments. +**Параметры** -**Example** +- `func` — имя агрегатной функции. +- `type` — типы аргументов агрегатной функции. + +**Пример** ``` sql -CREATE TABLE t -( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... +CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` -[Original article](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) diff --git a/docs/ru/sql-reference/data-types/special-data-types/expression.md b/docs/ru/sql-reference/data-types/special-data-types/expression.md index 718fcc886a6..f11f66a40c7 100644 --- a/docs/ru/sql-reference/data-types/special-data-types/expression.md +++ b/docs/ru/sql-reference/data-types/special-data-types/expression.md @@ -7,4 +7,3 @@ toc_title: Expression Используется для представления лямбда-выражений в функциях высшего порядка. 
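Небольшая иллюстрация: лямбда-выражение передаётся в функцию высшего порядка; тип `Expression` существует только как промежуточный и в результат не попадает.

``` sql
-- лямбда x -> x * 2 имеет служебный тип Expression во время разбора запроса
SELECT arrayMap(x -> x * 2, [1, 2, 3]) AS doubled;
```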
-[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/special_data_types/expression/) diff --git a/docs/ru/sql-reference/data-types/special-data-types/index.md b/docs/ru/sql-reference/data-types/special-data-types/index.md index 29c057472ea..823a84e2e43 100644 --- a/docs/ru/sql-reference/data-types/special-data-types/index.md +++ b/docs/ru/sql-reference/data-types/special-data-types/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0421\u043b\u0443\u0436\u0435\u0431\u043d\u044b\u0435\u0020\u0442\u0438\u043f\u044b\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_folder_title: "Служебные типы данных" toc_hidden: true toc_priority: 55 toc_title: hidden @@ -9,4 +9,3 @@ toc_title: hidden Значения служебных типов данных не могут сохраняться в таблицу и выводиться в качестве результата, а возникают как промежуточный результат выполнения запроса. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/special_data_types/) diff --git a/docs/ru/sql-reference/data-types/special-data-types/nothing.md b/docs/ru/sql-reference/data-types/special-data-types/nothing.md index c6a9cb868d8..30d425461e1 100644 --- a/docs/ru/sql-reference/data-types/special-data-types/nothing.md +++ b/docs/ru/sql-reference/data-types/special-data-types/nothing.md @@ -19,4 +19,3 @@ SELECT toTypeName(Array()) └─────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/special_data_types/nothing/) diff --git a/docs/ru/sql-reference/data-types/special-data-types/set.md b/docs/ru/sql-reference/data-types/special-data-types/set.md index 4c2f4ed2c66..5867df3c947 100644 --- a/docs/ru/sql-reference/data-types/special-data-types/set.md +++ b/docs/ru/sql-reference/data-types/special-data-types/set.md @@ -7,4 +7,3 @@ toc_title: Set Используется для представления правой части выражения IN. -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/special_data_types/set/) diff --git a/docs/ru/sql-reference/data-types/string.md b/docs/ru/sql-reference/data-types/string.md index 6a07f7e51de..9470f523629 100644 --- a/docs/ru/sql-reference/data-types/string.md +++ b/docs/ru/sql-reference/data-types/string.md @@ -17,4 +17,3 @@ toc_title: String Также, некоторые функции по работе со строками, имеют отдельные варианты, которые работают при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Например, функция length вычисляет длину строки в байтах, а функция lengthUTF8 - длину строки в кодовых точках Unicode, при допущении, что значение в кодировке UTF-8. 
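Короткий пример для иллюстрации разницы между байтами и кодовыми точками:

``` sql
SELECT length('привет') AS bytes, lengthUTF8('привет') AS code_points;
-- bytes = 12, code_points = 6: каждая кириллическая буква занимает два байта в UTF-8
```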
-[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/string/) diff --git a/docs/ru/sql-reference/data-types/tuple.md b/docs/ru/sql-reference/data-types/tuple.md index e2a1450b47f..702b5962f7b 100644 --- a/docs/ru/sql-reference/data-types/tuple.md +++ b/docs/ru/sql-reference/data-types/tuple.md @@ -47,4 +47,3 @@ SELECT tuple(1,NULL) AS x, toTypeName(x) └──────────┴─────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/tuple/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md index 350e391dbed..da8492e7cc0 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md @@ -1,6 +1,6 @@ --- toc_priority: 45 -toc_title: "\u0418\u0435\u0440\u0430\u0440\u0445\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0438" +toc_title: "Иерархические словари" --- # Иерархические словари {#ierarkhicheskie-slovari} @@ -65,4 +65,3 @@ ClickHouse поддерживает свойство [hierarchical](external-dic ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_hierarchical/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index f6b8b670563..285982565c2 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -1,6 +1,6 @@ --- toc_priority: 41 -toc_title: "\u0425\u0440\u0430\u043d\u0435\u043d\u0438\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0435\u0439\u0020\u0432\u0020\u043f\u0430\u043c\u044f\u0442\u0438" +toc_title: "Хранение словарей в памяти" --- # Хранение словарей в памяти {#dicts-external-dicts-dict-layout} @@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) или ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} @@ -443,4 +441,3 @@ dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) Данные должны полностью помещаться в оперативной памяти. 
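Предположительный набросок DDL-запроса для словаря с размещением `ip_trie`; таблица-источник `my_ip_addresses` и набор атрибутов здесь условные.

``` sql
CREATE DICTIONARY my_ip_trie_dictionary
(
    prefix String,
    asn UInt32,
    cca2 String DEFAULT '??'
)
PRIMARY KEY prefix
SOURCE(CLICKHOUSE(table 'my_ip_addresses'))
LAYOUT(IP_TRIE)
LIFETIME(3600);

-- Ключ при запросе передаётся кортежем, например:
-- SELECT dictGetUInt32('my_ip_trie_dictionary', 'asn', tuple(IPv4StringToNum('202.79.32.10')));
```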
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_layout/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index ec0fb8e0ee5..9589353649d 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -1,6 +1,6 @@ --- toc_priority: 42 -toc_title: "\u041e\u0431\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0435\u0439" +toc_title: "Обновление словарей" --- # Обновление словарей {#obnovlenie-slovarei} @@ -28,7 +28,7 @@ LIFETIME(300) ... ``` -Настройка `0` запрещает обновление словарей. +Настройка `0` (`LIFETIME(0)`) запрещает обновление словарей. Можно задать интервал, внутри которого ClickHouse равномерно-случайно выберет время для обновления. Это необходимо для распределения нагрузки на источник словаря при обновлении на большом количестве серверов. @@ -51,16 +51,19 @@ LIFETIME(300) LIFETIME(MIN 300 MAX 360) ``` +Если `0` и `0`, ClickHouse не перегружает словарь по истечению времени. +В этм случае, ClickHouse может перезагрузить данные словаря если изменился XML файл с конфигурацией словаря или если была выполнена команда `SYSTEM RELOAD DICTIONARY`. + При обновлении словарей сервер ClickHouse применяет различную логику в зависимости от типа [источника](external-dicts-dict-sources.md): -> - У текстового файла проверяется время модификации. Если время изменилось по отношению к запомненному ранее, то словарь обновляется. -> - Для MySQL источника, время модификации проверяется запросом `SHOW TABLE STATUS` (для MySQL 8 необходимо отключить кеширование мета-информации в MySQL `set global information_schema_stats_expiry=0`. -> - Словари из других источников по умолчанию обновляются каждый раз. +- У текстового файла проверяется время модификации. Если время изменилось по отношению к запомненному ранее, то словарь обновляется. +- Для MySQL источника, время модификации проверяется запросом `SHOW TABLE STATUS` (для MySQL 8 необходимо отключить кеширование мета-информации в MySQL `set global information_schema_stats_expiry=0`. +- Словари из других источников по умолчанию обновляются каждый раз. -Для других источников (ODBC, ClickHouse и т.д.) можно настроить запрос, который позволит обновлять словари только в случае их фактического изменения, а не каждый раз. Чтобы это сделать необходимо выполнить следующие условия/действия: +Для других источников (ODBC, PostgreSQL, ClickHouse и т.д.) можно настроить запрос, который позволит обновлять словари только в случае их фактического изменения, а не каждый раз. Чтобы это сделать необходимо выполнить следующие условия/действия: -> - В таблице словаря должно быть поле, которое гарантированно изменяется при обновлении данных в источнике. -> - В настройках источника указывается запрос, который получает изменяющееся поле. Результат запроса сервер ClickHouse интерпретирует как строку и если эта строка изменилась по отношению к предыдущему состоянию, то словарь обновляется. Запрос следует указывать в поле `` настроек [источника](external-dicts-dict-sources.md). +- В таблице словаря должно быть поле, которое гарантированно изменяется при обновлении данных в источнике. +- В настройках источника указывается запрос, который получает изменяющееся поле. 
Результат запроса сервер ClickHouse интерпретирует как строку и если эта строка изменилась по отношению к предыдущему состоянию, то словарь обновляется. Запрос следует указывать в поле `` настроек [источника](external-dicts-dict-sources.md). Пример настройки: @@ -83,4 +86,3 @@ SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source wher ... ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 3bb11b638b2..a7999470330 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -1,6 +1,6 @@ --- toc_priority: 43 -toc_title: "\u0418\u0441\u0442\u043e\u0447\u043d\u0438\u043a\u0438\u0020\u0432\u043d\u0435\u0448\u043d\u0438\u0445\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0435\u0439" +toc_title: "Источники внешних словарей" --- # Источники внешних словарей {#dicts-external-dicts-dict-sources} @@ -65,9 +65,11 @@ SETTINGS(format_csv_allow_single_quotes = 0) - СУБД: - [ODBC](#dicts-external_dicts_dict_sources-odbc) - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) + - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) ## Локальный файл {#dicts-external_dicts_dict_sources-local_file} @@ -313,6 +315,7 @@ PRIMARY KEY id SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) LAYOUT(HASHED()) LIFETIME(MIN 300 MAX 360) +``` Может понадобиться в `odbc.ini` указать полный путь до библиотеки с драйвером `DRIVER=/usr/local/lib/psqlodbcw.so`. @@ -320,15 +323,15 @@ LIFETIME(MIN 300 MAX 360) ОС Ubuntu. -Установка драйвера: : +Установка драйвера: ```bash $ sudo apt-get install tdsodbc freetds-bin sqsh ``` -Настройка драйвера: : +Настройка драйвера: -``` bash +```bash $ cat /etc/freetds/freetds.conf ... @@ -338,8 +341,11 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh tds version = 7.0 client charset = UTF-8 + # тестирование TDS соединения + $ sqsh -S MSSQL -D database -U user -P password + + $ cat /etc/odbcinst.ini - ... [FreeTDS] Description = FreeTDS @@ -348,8 +354,8 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh FileUsage = 1 UsageCount = 5 - $ cat ~/.odbc.ini - ... 
+ $ cat /etc/odbc.ini + # $ cat ~/.odbc.ini # если вы вошли из под пользователя из под которого запущен ClickHouse [MSSQL] Description = FreeTDS @@ -359,8 +365,15 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh UID = test PWD = test Port = 1433 + + + # (не обязательно) тест ODBC соединения (используйте isql поставляемый вместе с [unixodbc](https://packages.debian.org/sid/unixodbc)-package) + $ isql -v MSSQL "user" "password" ``` +Примечание: +- чтобы определить самую раннюю версию TDS, которая поддерживается определенной версией SQL Server, обратитесь к документации продукта или посмотрите на [MS-TDS Product Behavior](https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-tds/135d0ebe-5c4c-4a94-99bf-1811eccb9f4a) + Настройка словаря в ClickHouse: ``` xml @@ -572,7 +585,7 @@ SOURCE(CLICKHOUSE( или ``` sql -SOURCE(MONGO( +SOURCE(MONGODB( host 'localhost' port 27017 user '' @@ -624,4 +637,92 @@ SOURCE(REDIS( - `storage_type` – способ хранения ключей. Необходимо использовать `simple` для источников с одним столбцом ключей, `hash_map` – для источников с двумя столбцами ключей. Источники с более, чем двумя столбцами ключей, не поддерживаются. Может отсутствовать, значение по умолчанию `simple`. - `db_index` – номер базы данных. Может отсутствовать, значение по умолчанию 0. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_sources/) +### Cassandra {#dicts-external_dicts_dict_sources-cassandra} + +Пример настройки: + +``` xml + + + localhost + 9042 + username + qwerty123 + database_name + table_name + 1 + 1 + One + "SomeColumn" = 42 + 8 + + +``` + +Поля настройки: +- `host` – Имя хоста с установленной Cassandra или разделенный через запятую список хостов. +- `port` – Порт на серверах Cassandra. Если не указан, используется значение по умолчанию 9042. +- `user` – Имя пользователя для соединения с Cassandra. +- `password` – Пароль для соединения с Cassandra. +- `keyspace` – Имя keyspace (база данных). +- `column_family` – Имя семейства столбцов (таблица). +- `allow_filering` – Флаг, разрешающий или не разрешающий потенциально дорогостоящие условия на кластеризации ключевых столбцов. Значение по умолчанию 1. +- `partition_key_prefix` – Количество партиций ключевых столбцов в первичном ключе таблицы Cassandra. +Необходимо для составления ключей словаря. Порядок ключевых столбцов в определении словеря должен быть таким же как в Cassandra. +Значение по умолчанию 1 (первый ключевой столбец это ключ партицирования, остальные ключевые столбцы - ключи кластеризации). +- `consistency` – Уровень консистентности. Возмодные значения: `One`, `Two`, `Three`, + `All`, `EachQuorum`, `Quorum`, `LocalQuorum`, `LocalOne`, `Serial`, `LocalSerial`. Значение по умолчанию `One`. +- `where` – Опциональный критерий выборки. +- `max_threads` – Максимальное кол-во тредов для загрузки данных из нескольких партиций в словарь. + +### PosgreSQL {#dicts-external_dicts_dict_sources-postgresql} + +Пример настройки: + +``` xml + + + 5432 + clickhouse + qwerty + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` + +или + +``` sql +SOURCE(POSTGRESQL( + port 5432 + host 'postgresql-hostname' + user 'postgres_user' + password 'postgres_password' + db 'db_name' + table 'table_name' + replica(host 'example01-1' port 5432 priority 1) + replica(host 'example01-2' port 5432 priority 2) + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + +Setting fields: + +- `host` – Хост для соединения с PostgreSQL. Вы можете указать его для всех реплик или задать индивидуально для каждой релпики (внутри ``). +- `port` – Порт для соединения с PostgreSQL. Вы можете указать его для всех реплик или задать индивидуально для каждой релпики (внутри ``). +- `user` – Имя пользователя для соединения с PostgreSQL. Вы можете указать его для всех реплик или задать индивидуально для каждой релпики (внутри ``). +- `password` – Пароль для пользователя PostgreSQL. +- `replica` – Section of replica configurations. There can be multiple sections. + - `replica/host` – хост PostgreSQL. + - `replica/port` – порт PostgreSQL . + - `replica/priority` – Приоритет реплики. Во время попытки соединения, ClickHouse будет перебирать реплики в порядке приоритет. Меньшее значение означает более высокий приоритет. +- `db` – Имя базы данных. +- `table` – Имя таблицы. +- `where` – Условие выборки. Синтаксис для условий такой же как для `WHERE` выражения в PostgreSQL, для примера, `id > 10 AND id < 20`. Необязательный параметр. +- `invalidate_query` – Запрос для проверки условия загрузки словаря. Необязательный параметр. Читайте больше в разделе [Обновление словарей](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). + + diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index bf87ce61b9e..609ee225ce2 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -1,9 +1,9 @@ --- toc_priority: 44 -toc_title: "\u041a\u043b\u044e\u0447\u0020\u0438\u0020\u043f\u043e\u043b\u044f\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u044f" +toc_title: "Ключ и поля словаря" --- -# Ключ и поля словаря {#kliuch-i-polia-slovaria} +# Ключ и поля словаря {#dictionary-key-and-fields} Секция `` описывает ключ словаря и поля, доступные для запросов. @@ -88,7 +88,7 @@ PRIMARY KEY Id - `PRIMARY KEY` – имя столбца с ключами. -### Составной ключ {#sostavnoi-kliuch} +### Составной ключ {#composite-key} Ключом может быть кортеж (`tuple`) из полей произвольных типов. В этом случае [layout](external-dicts-dict-layout.md) должен быть `complex_key_hashed` или `complex_key_cache`. @@ -159,14 +159,12 @@ CREATE DICTIONARY somename ( | Тег | Описание | Обязательный | |------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------| | `name` | Имя столбца. | Да | -| `type` | Тип данных ClickHouse.
ClickHouse пытается привести значение из словаря к заданному типу данных. Например, в случае MySQL, в таблице-источнике поле может быть `TEXT`, `VARCHAR`, `BLOB`, но загружено может быть как `String`. [Nullable](../../../sql-reference/data-types/nullable.md) не поддерживается. | Да | -| `null_value` | Значение по умолчанию для несуществующего элемента.
В примере это пустая строка. Нельзя указать значение `NULL`. | Да | +| `type` | Тип данных ClickHouse.
ClickHouse пытается привести значение из словаря к заданному типу данных. Например, в случае MySQL, в таблице-источнике поле может быть `TEXT`, `VARCHAR`, `BLOB`, но загружено может быть как `String`.
[Nullable](../../../sql-reference/data-types/nullable.md) в настоящее время поддерживается для словарей [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md). Для словарей [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) и [IPTrie](external-dicts-dict-layout.md#ip-trie) `Nullable`-типы не поддерживаются. | Да | +| `null_value` | Значение по умолчанию для несуществующего элемента.
В примере это пустая строка. Значение [NULL](../../syntax.md#null-literal) можно указывать только для типов `Nullable` (см. предыдущую строку с описанием типов). | Да | | `expression` | [Выражение](../../syntax.md#syntax-expressions), которое ClickHouse выполняет со значением.
Выражением может быть имя столбца в удаленной SQL базе. Таким образом, вы можете использовать его для создания псевдонима удаленного столбца.

Значение по умолчанию: нет выражения. | Нет | -| `hierarchical` | Если `true`, то атрибут содержит ключ предка для текущего элемента. Смотрите [Иерархические словари](external-dicts-dict-hierarchical.md).

Default value: `false`. | No | +| `hierarchical` | Если `true`, то атрибут содержит ключ предка для текущего элемента. Смотрите [Иерархические словари](external-dicts-dict-hierarchical.md).

Значение по умолчанию: `false`. | Нет | | `is_object_id` | Признак того, что запрос выполняется к документу MongoDB по `ObjectID`.

Значение по умолчанию: `false`. | Нет | -## Смотрите также {#smotrite-takzhe} +**Смотрите также** - [Функции для работы с внешними словарями](../../../sql-reference/functions/ext-dict-functions.md). - -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md index ff18f906926..4dc74200093 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md @@ -1,6 +1,6 @@ --- toc_priority: 40 -toc_title: "\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0430\u0020\u0432\u043d\u0435\u0448\u043d\u0435\u0433\u043e\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u044f" +toc_title: "Настройка внешнего словаря" --- # Настройка внешнего словаря {#dicts-external-dicts-dict} @@ -48,4 +48,3 @@ LIFETIME(...) -- Lifetime of dictionary in memory - [structure](external-dicts-dict-structure.md) — Структура словаря. Ключ и атрибуты, которые можно получить по ключу. - [lifetime](external-dicts-dict-lifetime.md) — Периодичность обновления словарей. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts.md index c18af68c15e..04ef24b68c5 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -1,6 +1,6 @@ --- toc_priority: 39 -toc_title: "\u0412\u043d\u0435\u0448\u043d\u0438\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0438" +toc_title: "Внешние словари" --- @@ -61,4 +61,3 @@ ClickHouse: - [Ключ и поля словаря](external-dicts-dict-structure.md) - [Функции для работы с внешними словарями](../../../sql-reference/functions/ext-dict-functions.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts/) diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/index.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/index.md index b448858b1fa..c0d954d6976 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/index.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0412\u043d\u0435\u0448\u043d\u0438\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0438" +toc_folder_title: "Внешние словари" toc_priority: 37 --- diff --git a/docs/ru/sql-reference/dictionaries/index.md b/docs/ru/sql-reference/dictionaries/index.md index 5a4119b4dd5..59c7518d0c5 100644 --- a/docs/ru/sql-reference/dictionaries/index.md +++ b/docs/ru/sql-reference/dictionaries/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0421\u043b\u043e\u0432\u0430\u0440\u0438" +toc_folder_title: "Словари" toc_priority: 35 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- # Словари {#slovari} @@ -10,11 +10,8 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" ClickHouse поддерживает специальные функции для работы со словарями, которые можно использовать в запросах. Проще и эффективнее использовать словари с помощью функций, чем `JOIN` с таблицами-справочниками. 
-В словаре нельзя хранить значения [NULL](../../sql-reference/syntax.md#null-literal). - ClickHouse поддерживает: - [Встроенные словари](internal-dicts.md#internal_dicts) со специфическим [набором функций](../../sql-reference/dictionaries/external-dictionaries/index.md). - [Подключаемые (внешние) словари](external-dictionaries/external-dicts.md#dicts-external-dicts) с [набором функций](../../sql-reference/dictionaries/external-dictionaries/index.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/) diff --git a/docs/ru/sql-reference/dictionaries/internal-dicts.md b/docs/ru/sql-reference/dictionaries/internal-dicts.md index d8103efa6ae..34e407ceacd 100644 --- a/docs/ru/sql-reference/dictionaries/internal-dicts.md +++ b/docs/ru/sql-reference/dictionaries/internal-dicts.md @@ -1,6 +1,6 @@ --- toc_priority: 39 -toc_title: "\u0412\u0441\u0442\u0440\u043e\u0435\u043d\u043d\u044b\u0435\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u0438" +toc_title: "Встроенные словари" --- # Встроенные словари {#internal_dicts} @@ -50,4 +50,3 @@ ClickHouse содержит встроенную возможность рабо Также имеются функции для работы с идентификаторами операционных систем и поисковых систем Яндекс.Метрики, пользоваться которыми не нужно. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/internal_dicts/) diff --git a/docs/ru/sql-reference/distributed-ddl.md b/docs/ru/sql-reference/distributed-ddl.md index 275709320f6..e03ecb893bc 100644 --- a/docs/ru/sql-reference/distributed-ddl.md +++ b/docs/ru/sql-reference/distributed-ddl.md @@ -1,6 +1,6 @@ --- toc_priority: 32 -toc_title: "\u0420\u0430\u0441\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u043d\u044b\u0435\u0020\u0044\u0044\u004c\u0020\u0437\u0430\u043f\u0440\u043e\u0441\u044b\u000a" +toc_title: "Распределенные DDL запросы" --- # Распределенные DDL запросы (секция ON CLUSTER) {#raspredelennye-ddl-zaprosy-sektsiia-on-cluster} @@ -15,5 +15,4 @@ CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE Для корректного выполнения таких запросов необходимо на каждом хосте иметь одинаковое определение кластера (для упрощения синхронизации конфигов можете использовать подстановки из ZooKeeper). Также необходимо подключение к ZooKeeper серверам. Локальная версия запроса в конечном итоге будет выполнена на каждом хосте кластера, даже если некоторые хосты в данный момент не доступны. Гарантируется упорядоченность выполнения запросов в рамках одного хоста. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/distributed-ddl) \ No newline at end of file diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index 16c3e8fd8f0..f587b7b5b5d 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 34 -toc_title: "\u0410\u0440\u0438\u0444\u043c\u0435\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Арифметические функции" --- # Арифметические функции {#arifmeticheskie-funktsii} @@ -83,4 +83,3 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 Вычисляет наименьшее общее кратное чисел. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. 
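Короткий пример для наглядности (значения произвольные):

``` sql
SELECT gcd(12, 18) AS g, lcm(12, 18) AS l;
-- g = 6, l = 36
```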
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/arithmetic_functions/) diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 80057e6f0e0..560795506a0 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 35 -toc_title: "\u041c\u0430\u0441\u0441\u0438\u0432\u044b" +toc_title: "Массивы" --- # Массивы {#functions-for-working-with-arrays} @@ -58,7 +58,7 @@ toc_title: "\u041c\u0430\u0441\u0441\u0438\u0432\u044b" arrayConcat(arrays) ``` -**Параметры** +**Аргументы** - `arrays` – произвольное количество элементов типа [Array](../../sql-reference/functions/array-functions.md) **Пример** @@ -108,7 +108,7 @@ SELECT has([1, 2, NULL], NULL) hasAll(set, subset) ``` -**Параметры** +**Аргументы** - `set` – массив любого типа с набором элементов. - `subset` – массив любого типа со значениями, которые проверяются на вхождение в `set`. @@ -146,7 +146,7 @@ hasAll(set, subset) hasAny(array1, array2) ``` -**Параметры** +**Аргументы** - `array1` – массив любого типа с набором элементов. - `array2` – массив любого типа с набором элементов. @@ -320,21 +320,21 @@ SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res arrayPopBack(array) ``` -**Параметры** +**Аргументы** -- `array` - Массив. +- `array` – массив. **Пример** ``` sql -SELECT arrayPopBack([1, 2, 3]) AS res +SELECT arrayPopBack([1, 2, 3]) AS res; ``` -text - - ┌─res───┐ - │ [1,2] │ - └───────┘ +``` text +┌─res───┐ +│ [1,2] │ +└───────┘ +``` ## arrayPopFront {#arraypopfront} @@ -344,14 +344,14 @@ text arrayPopFront(array) ``` -**Параметры** +**Аргументы** -- `array` - Массив. +- `array` – массив. **Пример** ``` sql -SELECT arrayPopFront([1, 2, 3]) AS res +SELECT arrayPopFront([1, 2, 3]) AS res; ``` ``` text @@ -368,15 +368,15 @@ SELECT arrayPopFront([1, 2, 3]) AS res arrayPushBack(array, single_value) ``` -**Параметры** +**Аргументы** -- `array` - Массив. -- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `array` – массив. +- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** ``` sql -SELECT arrayPushBack(['a'], 'b') AS res +SELECT arrayPushBack(['a'], 'b') AS res; ``` ``` text @@ -393,15 +393,15 @@ SELECT arrayPushBack(['a'], 'b') AS res arrayPushFront(array, single_value) ``` -**Параметры** +**Аргументы** -- `array` - Массив. -- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. 
Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `array` – массив. +- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** ``` sql -SELECT arrayPushFront(['b'], 'a') AS res +SELECT arrayPushFront(['b'], 'a') AS res; ``` ``` text @@ -418,7 +418,7 @@ SELECT arrayPushFront(['b'], 'a') AS res arrayResize(array, size[, extender]) ``` -**Параметры** +**Аргументы** - `array` — массив. - `size` — необходимая длина массива. @@ -433,7 +433,7 @@ arrayResize(array, size[, extender]) **Примеры вызовов** ``` sql -SELECT arrayResize([1], 3) +SELECT arrayResize([1], 3); ``` ``` text @@ -443,7 +443,7 @@ SELECT arrayResize([1], 3) ``` ``` sql -SELECT arrayResize([1], 3, NULL) +SELECT arrayResize([1], 3, NULL); ``` ``` text @@ -460,16 +460,16 @@ SELECT arrayResize([1], 3, NULL) arraySlice(array, offset[, length]) ``` -**Параметры** +**Аргументы** -- `array` - Массив данных. -- `offset` - Отступ от края массива. Положительное значение - отступ слева, отрицательное значение - отступ справа. Отсчет элементов массива начинается с 1. -- `length` - Длина необходимого среза. Если указать отрицательное значение, то функция вернёт открытый срез `[offset, array_length - length)`. Если не указать значение, то функция вернёт срез `[offset, the_end_of_array]`. +- `array` – массив данных. +- `offset` – отступ от края массива. Положительное значение - отступ слева, отрицательное значение - отступ справа. Отсчет элементов массива начинается с 1. +- `length` – длина необходимого среза. Если указать отрицательное значение, то функция вернёт открытый срез `[offset, array_length - length)`. Если не указать значение, то функция вернёт срез `[offset, the_end_of_array]`. **Пример** ``` sql -SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res +SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res; ``` ``` text @@ -702,9 +702,9 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; arrayDifference(array) ``` -**Параметры** +**Аргументы** -- `array` – [Массив](https://clickhouse.tech/docs/ru/data_types/array/). +- `array` – [массив](https://clickhouse.tech/docs/ru/data_types/array/). **Возвращаемое значение** @@ -715,10 +715,10 @@ arrayDifference(array) Запрос: ``` sql -SELECT arrayDifference([1, 2, 3, 4]) +SELECT arrayDifference([1, 2, 3, 4]); ``` -Ответ: +Результат: ``` text ┌─arrayDifference([1, 2, 3, 4])─┐ @@ -731,10 +731,10 @@ SELECT arrayDifference([1, 2, 3, 4]) Запрос: ``` sql -SELECT arrayDifference([0, 10000000000000000000]) +SELECT arrayDifference([0, 10000000000000000000]); ``` -Ответ: +Результат: ``` text ┌─arrayDifference([0, 10000000000000000000])─┐ @@ -752,9 +752,9 @@ SELECT arrayDifference([0, 10000000000000000000]) arrayDistinct(array) ``` -**Параметры** +**Аргументы** -- `array` – [Массив](https://clickhouse.tech/docs/ru/data_types/array/). +- `array` – [массив](https://clickhouse.tech/docs/ru/data_types/array/). 
**Возвращаемое значение** @@ -765,7 +765,7 @@ arrayDistinct(array) Запрос: ``` sql -SELECT arrayDistinct([1, 2, 2, 3, 1]) +SELECT arrayDistinct([1, 2, 2, 3, 1]); ``` Ответ: @@ -820,7 +820,7 @@ SELECT arrayReduce(agg_func, arr1, arr2, ..., arrN) ``` -**Параметры** +**Аргументы** - `agg_func` — Имя агрегатной функции, которая должна быть константой [string](../../sql-reference/data-types/string.md). - `arr` — Любое количество столбцов типа [array](../../sql-reference/data-types/array.md) в качестве параметров агрегатной функции. @@ -832,10 +832,10 @@ arrayReduce(agg_func, arr1, arr2, ..., arrN) Запрос: ```sql -SELECT arrayReduce('max', [1, 2, 3]) +SELECT arrayReduce('max', [1, 2, 3]); ``` -Ответ: +Результат: ```text ┌─arrayReduce('max', [1, 2, 3])─┐ @@ -850,10 +850,10 @@ SELECT arrayReduce('max', [1, 2, 3]) Запрос: ```sql -SELECT arrayReduce('maxIf', [3, 5], [1, 0]) +SELECT arrayReduce('maxIf', [3, 5], [1, 0]); ``` -Ответ: +Результат: ```text ┌─arrayReduce('maxIf', [3, 5], [1, 0])─┐ @@ -866,10 +866,10 @@ SELECT arrayReduce('maxIf', [3, 5], [1, 0]) Запрос: ```sql -SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); ``` -Ответ: +Результат: ```text ┌─arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])─┐ @@ -887,15 +887,15 @@ SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) arrayReduceInRanges(agg_func, ranges, arr1, arr2, ..., arrN) ``` -**Параметры** +**Аргументы** -- `agg_func` — Имя агрегатной функции, которая должна быть [строковой](../../sql-reference/data-types/string.md) константой. -- `ranges` — Диапазоны для агрегирования, которые должны быть [массивом](../../sql-reference/data-types/array.md) of [кортежей](../../sql-reference/data-types/tuple.md) который содержит индекс и длину каждого диапазона. -- `arr` — Любое количество столбцов типа [Array](../../sql-reference/data-types/array.md) в качестве параметров агрегатной функции. +- `agg_func` — имя агрегатной функции, которая должна быть [строковой](../../sql-reference/data-types/string.md) константой. +- `ranges` — диапазоны для агрегирования, которые должны быть [массивом](../../sql-reference/data-types/array.md) of [кортежей](../../sql-reference/data-types/tuple.md) содержащих индекс и длину каждого диапазона. +- `arr` — любое количество столбцов типа [Array](../../sql-reference/data-types/array.md) в качестве параметров агрегатной функции. **Возвращаемое значение** -- Массив, содержащий результаты агрегатной функции для указанных диапазонов. +- Массив, содержащий результаты агрегатной функции для указанных диапазонов. Тип: [Array](../../sql-reference/data-types/array.md). @@ -911,7 +911,7 @@ SELECT arrayReduceInRanges( ) AS res ``` -Ответ: +Результат: ```text ┌─res─────────────────────────┐ @@ -958,14 +958,14 @@ flatten(array_of_arrays) Синоним: `flatten`. -**Параметры** +**Аргументы** -- `array_of_arrays` — [Массив](../../sql-reference/functions/array-functions.md) массивов. Например, `[[1,2,3], [4,5]]`. +- `array_of_arrays` — [массив](../../sql-reference/functions/array-functions.md) массивов. Например, `[[1,2,3], [4,5]]`. **Примеры** ``` sql -SELECT flatten([[[1]], [[2], [3]]]) +SELECT flatten([[[1]], [[2], [3]]]); ``` ``` text @@ -984,9 +984,9 @@ SELECT flatten([[[1]], [[2], [3]]]) arrayCompact(arr) ``` -**Параметры** +**Аргументы** -`arr` — [Массив](../../sql-reference/functions/array-functions.md) для обхода. +`arr` — [массив](../../sql-reference/functions/array-functions.md) для обхода. 
**Возвращаемое значение** @@ -999,10 +999,10 @@ arrayCompact(arr) Запрос: ``` sql -SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) +SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]); ``` -Ответ: +Результат: ``` text ┌─arrayCompact([1, 1, nan, nan, 2, 3, 3, 3])─┐ @@ -1020,9 +1020,9 @@ SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) arrayZip(arr1, arr2, ..., arrN) ``` -**Параметры** +**Аргументы** -- `arrN` — [Массив](../data-types/array.md). +- `arrN` — [массив](../data-types/array.md). Функция принимает любое количество массивов, которые могут быть различных типов. Все массивы должны иметь одинаковую длину. @@ -1037,10 +1037,10 @@ arrayZip(arr1, arr2, ..., arrN) Запрос: ``` sql -SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]) +SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]); ``` -Ответ: +Результат: ``` text ┌─arrayZip(['a', 'b', 'c'], [5, 2, 1])─┐ @@ -1067,7 +1067,7 @@ SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; Следующий пример показывает, как создать кортежи из элементов разных массивов: ``` sql -SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res +SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res; ``` ``` text @@ -1111,6 +1111,78 @@ SELECT Функция `arrayFilter` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. +## arrayFill(func, arr1, …) {#array-fill} + +Перебирает `arr1` от первого элемента к последнему и заменяет `arr1[i]` на `arr1[i - 1]`, если `func` вернула 0. Первый элемент `arr1` остаётся неизменным. + +Примеры: + +``` sql +SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res──────────────────────────────┐ +│ [1,1,3,11,12,12,12,5,6,14,14,14] │ +└──────────────────────────────────┘ +``` + +Функция `arrayFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + +## arrayReverseFill(func, arr1, …) {#array-reverse-fill} + +Перебирает `arr1` от последнего элемента к первому и заменяет `arr1[i]` на `arr1[i + 1]`, если `func` вернула 0. Последний элемент `arr1` остаётся неизменным. + +Примеры: + +``` sql +SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res────────────────────────────────┐ +│ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │ +└────────────────────────────────────┘ +``` + +Функция `arrayReverseFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + +## arraySplit(func, arr1, …) {#array-split} + +Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в левую часть. Массив не разбивается по первому элементу. + +Примеры: + +``` sql +SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res─────────────┐ +│ [[1,2,3],[4,5]] │ +└─────────────────┘ +``` + +Функция `arraySplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + +## arrayReverseSplit(func, arr1, …) {#array-reverse-split} + +Разделяет массив `arr1` на несколько. 
Если `func` возвращает не 0, то массив разделяется, а элемент помещается в правую часть. Массив не разбивается по последнему элементу. + +Примеры: + +``` sql +SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res───────────────┐ +│ [[1],[2,3,4],[5]] │ +└───────────────────┘ +``` + +Функция `arrayReverseSplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. + ## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} Возвращает 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе возвращает 0. @@ -1137,7 +1209,7 @@ SELECT ## arrayMin {#array-min} -Возвращает значение минимального элемента в исходном массиве. +Возвращает значение минимального элемента в исходном массиве. Если передана функция `func`, возвращается минимум из элементов массива, преобразованных этой функцией. @@ -1149,7 +1221,7 @@ SELECT arrayMin([func,] arr) ``` -**Параметры** +**Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — массив. [Array](../../sql-reference/data-types/array.md). @@ -1192,7 +1264,7 @@ SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ## arrayMax {#array-max} -Возвращает значение максимального элемента в исходном массиве. +Возвращает значение максимального элемента в исходном массиве. Если передана функция `func`, возвращается максимум из элементов массива, преобразованных этой функцией. @@ -1204,7 +1276,7 @@ SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; arrayMax([func,] arr) ``` -**Параметры** +**Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — массив. [Array](../../sql-reference/data-types/array.md). @@ -1247,7 +1319,7 @@ SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ## arraySum {#array-sum} -Возвращает сумму элементов в исходном массиве. +Возвращает сумму элементов в исходном массиве. Если передана функция `func`, возвращается сумма элементов массива, преобразованных этой функцией. @@ -1259,10 +1331,10 @@ SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; arraySum([func,] arr) ``` -**Параметры** +**Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — массив. [Array](../../sql-reference/data-types/array.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). **Возвращаемое значение** @@ -1302,7 +1374,7 @@ SELECT arraySum(x -> x*x, [2, 3]) AS res; ## arrayAvg {#array-avg} -Возвращает среднее значение элементов в исходном массиве. +Возвращает среднее значение элементов в исходном массиве. Если передана функция `func`, возвращается среднее значение элементов массива, преобразованных этой функцией. @@ -1314,10 +1386,10 @@ SELECT arraySum(x -> x*x, [2, 3]) AS res; arrayAvg([func,] arr) ``` -**Параметры** +**Аргументы** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — массив. [Array](../../sql-reference/data-types/array.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). **Возвращаемое значение** @@ -1355,6 +1427,52 @@ SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; └─────┘ ``` +**Синтаксис** + +``` sql +arraySum(arr) +``` + +**Возвращаемое значение** + +- Число. 
+ +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Аргументы** + +- `arr` — [массив](../../sql-reference/data-types/array.md). + +**Примеры** + +Запрос: + +```sql +SELECT arraySum([2,3]) AS res; +``` + +Результат: + +``` text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arraySum(x -> x*x, [2, 3]) AS res; +``` + +Результат: + +``` text +┌─res─┐ +│ 13 │ +└─────┘ +``` + ## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. @@ -1383,7 +1501,8 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res arrayAUC(arr_scores, arr_labels) ``` -**Параметры** +**Аргументы** + - `arr_scores` — оценка, которую дает модель предсказания. - `arr_labels` — ярлыки выборок, обычно 1 для содержательных выборок и 0 для бессодержательных выборок. @@ -1398,10 +1517,10 @@ arrayAUC(arr_scores, arr_labels) Запрос: ``` sql -select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]) +SELECT arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]); ``` -Ответ: +Результат: ``` text ┌─arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1])─┐ @@ -1409,4 +1528,3 @@ select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]) └────────────────────────────────────────---──┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/array_functions/) diff --git a/docs/ru/sql-reference/functions/array-join.md b/docs/ru/sql-reference/functions/array-join.md index 2ed3d25fa92..3e3cf5c4011 100644 --- a/docs/ru/sql-reference/functions/array-join.md +++ b/docs/ru/sql-reference/functions/array-join.md @@ -1,6 +1,6 @@ --- toc_priority: 61 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u044f\u0020\u0041\u0072\u0072\u0061\u0079\u004a\u006f\u0069\u006e" +toc_title: "Функция ArrayJoin" --- # Функция ArrayJoin {#functions_arrayjoin} @@ -32,4 +32,3 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src └─────┴───────────┴─────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/array_join/) diff --git a/docs/ru/sql-reference/functions/bit-functions.md b/docs/ru/sql-reference/functions/bit-functions.md index 8c7808437a5..a5124e67235 100644 --- a/docs/ru/sql-reference/functions/bit-functions.md +++ b/docs/ru/sql-reference/functions/bit-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 48 -toc_title: "\u0411\u0438\u0442\u043e\u0432\u044b\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Битовые функции" --- # Битовые функции {#bitovye-funktsii} @@ -31,10 +31,10 @@ toc_title: "\u0411\u0438\u0442\u043e\u0432\u044b\u0435\u0020\u0444\u0443\u043d\u SELECT bitTest(number, index) ``` -**Параметры** +**Аргументы** - `number` – целое число. -- `index` – position of bit. +- `index` – позиция бита. **Возвращаемое значение** @@ -49,10 +49,10 @@ SELECT bitTest(number, index) Запрос: ``` sql -SELECT bitTest(43, 1) +SELECT bitTest(43, 1); ``` -Ответ: +Результат: ``` text ┌─bitTest(43, 1)─┐ @@ -65,10 +65,10 @@ SELECT bitTest(43, 1) Запрос: ``` sql -SELECT bitTest(43, 2) +SELECT bitTest(43, 2); ``` -Ответ: +Результат: ``` text ┌─bitTest(43, 2)─┐ @@ -93,7 +93,7 @@ SELECT bitTest(43, 2) SELECT bitTestAll(number, index1, index2, index3, index4, ...) ``` -**Параметры** +**Аргументы** - `number` – целое число. - `index1`, `index2`, `index3`, `index4` – позиция бита. 
Например, конъюнкция для набора позиций `index1`, `index2`, `index3`, `index4` является истинной, если все его позиции истинны `index1` ⋀ `index2` ⋀ `index3` ⋀ `index4`. @@ -111,10 +111,10 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) Запрос: ``` sql -SELECT bitTestAll(43, 0, 1, 3, 5) +SELECT bitTestAll(43, 0, 1, 3, 5); ``` -Ответ: +Результат: ``` text ┌─bitTestAll(43, 0, 1, 3, 5)─┐ @@ -127,10 +127,10 @@ SELECT bitTestAll(43, 0, 1, 3, 5) Запрос: ``` sql -SELECT bitTestAll(43, 0, 1, 3, 5, 2) +SELECT bitTestAll(43, 0, 1, 3, 5, 2); ``` -Ответ: +Результат: ``` text ┌─bitTestAll(43, 0, 1, 3, 5, 2)─┐ @@ -155,7 +155,7 @@ SELECT bitTestAll(43, 0, 1, 3, 5, 2) SELECT bitTestAny(number, index1, index2, index3, index4, ...) ``` -**Параметры** +**Аргументы** - `number` – целое число. - `index1`, `index2`, `index3`, `index4` – позиции бита. @@ -173,10 +173,10 @@ SELECT bitTestAny(number, index1, index2, index3, index4, ...) Запрос: ``` sql -SELECT bitTestAny(43, 0, 2) +SELECT bitTestAny(43, 0, 2); ``` -Ответ: +Результат: ``` text ┌─bitTestAny(43, 0, 2)─┐ @@ -189,10 +189,10 @@ SELECT bitTestAny(43, 0, 2) Запрос: ``` sql -SELECT bitTestAny(43, 4, 2) +SELECT bitTestAny(43, 4, 2); ``` -Ответ: +Результат: ``` text ┌─bitTestAny(43, 4, 2)─┐ @@ -210,9 +210,9 @@ SELECT bitTestAny(43, 4, 2) bitCount(x) ``` -**Параметры** +**Аргументы** -- `x` — [Целое число](../../sql-reference/functions/bit-functions.md) или [число с плавающей запятой](../../sql-reference/functions/bit-functions.md). Функция использует представление числа в памяти, что позволяет поддержать числа с плавающей запятой. +- `x` — [целое число](../../sql-reference/functions/bit-functions.md) или [число с плавающей запятой](../../sql-reference/functions/bit-functions.md). Функция использует представление числа в памяти, что позволяет поддержать числа с плавающей запятой. **Возвращаемое значение** @@ -229,7 +229,7 @@ bitCount(x) Запрос: ``` sql -SELECT bitCount(333) +SELECT bitCount(333); ``` Результат: @@ -240,4 +240,53 @@ SELECT bitCount(333) └───────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/bit_functions/) +## bitHammingDistance {#bithammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между битовыми представлениями двух целых чисел. Может быть использовано с функциями [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash) для проверки двух строк на схожесть. Чем меньше расстояние, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +bitHammingDistance(int1, int2) +``` + +**Аргументы** + +- `int1` — первое целое число. [Int64](../../sql-reference/data-types/int-uint.md). +- `int2` — второе целое число. [Int64](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Расстояние Хэмминга. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT bitHammingDistance(111, 121); +``` + +Результат: + +``` text +┌─bitHammingDistance(111, 121)─┐ +│ 3 │ +└──────────────────────────────┘ +``` + +Используя [SimHash](../../sql-reference/functions/hash-functions.md#ngramsimhash): + +``` sql +SELECT bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat')); +``` + +Результат: + +``` text +┌─bitHammingDistance(ngramSimHash('cat ate rat'), ngramSimHash('rat ate cat'))─┐ +│ 5 │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/bitmap-functions.md b/docs/ru/sql-reference/functions/bitmap-functions.md index b21ddea94e4..3da729664d0 100644 --- a/docs/ru/sql-reference/functions/bitmap-functions.md +++ b/docs/ru/sql-reference/functions/bitmap-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 49 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0431\u0438\u0442\u043c\u0430\u043f\u043e\u0432" +toc_title: "Функции для битмапов" --- # Функции для битовых масок {#bitmap-functions} @@ -13,19 +13,19 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u bitmapBuild(array) ``` -**Параметры** +**Аргументы** - `array` – массив типа `UInt*`. **Пример** ``` sql -SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) +SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res); ``` ``` text ┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ -│  │ AggregateFunction(groupBitmap, UInt8) │ +│ │ AggregateFunction(groupBitmap, UInt8) │ └─────┴──────────────────────────────────────────────┘ ``` @@ -37,14 +37,14 @@ SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) bitmapToArray(bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` ``` text @@ -63,11 +63,11 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res bitmapSubsetLimit(bitmap, range_start, cardinality_limit) ``` -**Параметры** +**Аргументы** -- `bitmap` – Битмап. [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – битмап. [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – Начальная точка подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions). +- `range_start` – начальная точка подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions). - `cardinality_limit` – Верхний предел подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions). **Возвращаемое значение** @@ -81,10 +81,10 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) Запрос: ``` sql -SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res; ``` -Ответ: +Результат: ``` text ┌─res───────────────────────┐ @@ -100,12 +100,11 @@ SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12 bitmapContains(haystack, needle) ``` -**Параметры** +**Аргументы** - `haystack` – [объект Bitmap](#bitmap_functions-bitmapbuild), в котором функция ищет значение. - `needle` – значение, которое функция ищет. 
Тип — [UInt32](../../sql-reference/data-types/int-uint.md). - **Возвращаемые значения** - 0 — если в `haystack` нет `needle`. @@ -116,7 +115,7 @@ bitmapContains(haystack, needle) **Пример** ``` sql -SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res +SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res; ``` ``` text @@ -135,7 +134,7 @@ bitmapHasAny(bitmap1, bitmap2) Если вы уверены, что `bitmap2` содержит строго один элемент, используйте функцию [bitmapContains](#bitmap_functions-bitmapcontains). Она работает эффективнее. -**Параметры** +**Аргументы** - `bitmap*` – массив любого типа с набором элементов. @@ -147,7 +146,7 @@ bitmapHasAny(bitmap1, bitmap2) **Пример** ``` sql -SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` ``` text @@ -165,14 +164,14 @@ SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res bitmapHasAll(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` ``` text @@ -189,14 +188,14 @@ SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res bitmapAnd(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -213,14 +212,14 @@ SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re bitmapOr(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -237,14 +236,14 @@ SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res bitmapXor(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -261,14 +260,14 @@ SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re bitmapAndnot(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res; ``` ``` text @@ -285,14 +284,14 @@ SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS bitmapCardinality(bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. **Пример** ``` sql -SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res +SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res; ``` ``` text @@ -309,7 +308,7 @@ SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res bitmapAndCardinality(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. @@ -333,7 +332,7 @@ SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; bitmapOrCardinality(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. 
@@ -357,7 +356,7 @@ SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; bitmapXorCardinality(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. @@ -381,7 +380,7 @@ SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; bitmapAndnotCardinality(bitmap,bitmap) ``` -**Параметры** +**Аргументы** - `bitmap` – битовый массив. @@ -397,4 +396,3 @@ SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res └─────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/bitmap_functions/) diff --git a/docs/ru/sql-reference/functions/comparison-functions.md b/docs/ru/sql-reference/functions/comparison-functions.md index a98c97ec96c..b7301bde275 100644 --- a/docs/ru/sql-reference/functions/comparison-functions.md +++ b/docs/ru/sql-reference/functions/comparison-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 36 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0441\u0440\u0430\u0432\u043d\u0435\u043d\u0438\u044f" +toc_title: "Функции сравнения" --- # Функции сравнения {#funktsii-sravneniia} @@ -34,4 +34,3 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0441\u0440\u0430\u ## greaterOrEquals, оператор `>=` {#function-greaterorequals} -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/comparison_functions/) diff --git a/docs/ru/sql-reference/functions/conditional-functions.md b/docs/ru/sql-reference/functions/conditional-functions.md index 83268b68959..b191937df51 100644 --- a/docs/ru/sql-reference/functions/conditional-functions.md +++ b/docs/ru/sql-reference/functions/conditional-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 43 -toc_title: "\u0423\u0441\u043b\u043e\u0432\u043d\u044b\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Условные функции" --- # Условные функции {#uslovnye-funktsii} @@ -17,11 +17,11 @@ SELECT if(cond, then, else) Если условие `cond` не равно нулю, то возвращается результат выражения `then`. Если условие `cond` равно нулю или является NULL, то результат выражения `then` пропускается и возвращается результат выражения `else`. -**Параметры** +**Аргументы** -- `cond` – Условие, которое может быть равно 0 или нет. Может быть [UInt8](../../sql-reference/functions/conditional-functions.md) или `NULL`. -- `then` - Возвращается результат выражения, если условие `cond` истинно. -- `else` - Возвращается результат выражения, если условие `cond` ложно. +- `cond` – проверяемое условие. Может быть [UInt8](../../sql-reference/functions/conditional-functions.md) или `NULL`. +- `then` – возвращается результат выражения, если условие `cond` истинно. +- `else` – возвращается результат выражения, если условие `cond` ложно. **Возвращаемые значения** @@ -32,10 +32,10 @@ SELECT if(cond, then, else) Запрос: ``` sql -SELECT if(1, plus(2, 2), plus(2, 6)) +SELECT if(1, plus(2, 2), plus(2, 6)); ``` -Ответ: +Результат: ``` text ┌─plus(2, 2)─┐ @@ -46,10 +46,10 @@ SELECT if(1, plus(2, 2), plus(2, 6)) Запрос: ``` sql -SELECT if(0, plus(2, 2), plus(2, 6)) +SELECT if(0, plus(2, 2), plus(2, 6)); ``` -Ответ: +Результат: ``` text ┌─plus(2, 6)─┐ @@ -79,11 +79,11 @@ SELECT if(0, plus(2, 2), plus(2, 6)) multiIf(cond_1, then_1, cond_2, then_2...else) -**Параметры** +**Аргументы** -- `cond_N` — Условие, при выполнении которого функция вернёт `then_N`. -- `then_N` — Результат функции при выполнении. -- `else` — Результат функции, если ни одно из условий не выполнено. 
+- `cond_N` — условие, при выполнении которого функция вернёт `then_N`. +- `then_N` — результат функции при выполнении. +- `else` — результат функции, если ни одно из условий не выполнено. Функция принимает `2N+1` параметров. @@ -111,4 +111,3 @@ SELECT if(0, plus(2, 2), plus(2, 6)) └────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/conditional_functions/) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 31482cde77f..0adccbe888b 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 39 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0434\u0430\u0442\u0430\u043c\u0438\u0020\u0438\u0020\u0432\u0440\u0435\u043c\u0435\u043d\u0435\u043c" +toc_title: "Функции для работы с датами и временем" --- # Функции для работы с датами и временем {#funktsii-dlia-raboty-s-datami-i-vremenem} @@ -23,8 +23,6 @@ SELECT └─────────────────────┴────────────┴────────────┴─────────────────────┘ ``` -Поддерживаются только часовые пояса, отличающиеся от UTC на целое число часов. - ## toTimeZone {#totimezone} Переводит дату или дату-с-временем в указанный часовой пояс. Часовой пояс (таймзона) это атрибут типов Date/DateTime, внутреннее значение (количество секунд) поля таблицы или колонки результата не изменяется, изменяется тип поля и автоматически его текстовое отображение. @@ -63,40 +61,58 @@ int32samoa: 1546300800 Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). +Синоним: `YEAR`. + ## toQuarter {#toquarter} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер квартала. +Синоним: `QUARTER`. + ## toMonth {#tomonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер месяца (1-12). +Синоним: `MONTH`. + ## toDayOfYear {#todayofyear} Переводит дату или дату-с-временем в число типа UInt16, содержащее номер дня года (1-366). +Синоним: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в месяце (1-31). +Синонимы: `DAYOFMONTH`, `DAY`. + ## toDayOfWeek {#todayofweek} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в неделе (понедельник - 1, воскресенье - 7). +Синоним: `DAYOFWEEK`. + ## toHour {#tohour} Переводит дату-с-временем в число типа UInt8, содержащее номер часа в сутках (0-23). Функция исходит из допущения, что перевод стрелок вперёд, если осуществляется, то на час, в два часа ночи, а перевод стрелок назад, если осуществляется, то на час, в три часа ночи (что, в общем, не верно - даже в Москве два раза перевод стрелок был осуществлён в другое время). +Синоним: `HOUR`. + ## toMinute {#tominute} Переводит дату-с-временем в число типа UInt8, содержащее номер минуты в часе (0-59). +Синоним: `MINUTE`. + ## toSecond {#tosecond} Переводит дату-с-временем в число типа UInt8, содержащее номер секунды в минуте (0-59). Секунды координации не учитываются. +Синоним: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} Переводит дату-с-временем в число типа UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). 
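For the plain `DateTime` overload described above, a minimal sketch (the explicit `UTC` time zone is an assumption made only to get a reproducible result; the string-parsing form is shown in the example below):

``` sql
-- The internal DateTime representation is already a Unix timestamp,
-- so this simply returns it as UInt32.
SELECT toUnixTimestamp(toDateTime('2017-11-05 08:07:47', 'UTC')) AS unix_timestamp;
```

With `UTC` the expected value is `1509869267`; interpreting the same wall-clock time in `Asia/Tokyo`, as in the example that follows, gives a smaller timestamp because that moment occurs nine hours earlier.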
@@ -120,7 +136,7 @@ toUnixTimestamp(str, [timezone]) Запрос: ``` sql -SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp +SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp; ``` Результат: @@ -146,6 +162,7 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp ```sql SELECT toStartOfISOYear(toDate('2017-01-01')) AS ISOYear20170101; ``` + ```text ┌─ISOYear20170101─┐ │ 2016-01-04 │ @@ -199,14 +216,14 @@ SELECT toStartOfISOYear(toDate('2017-01-01')) AS ISOYear20170101; toStartOfSecond(value[, timezone]) ``` -**Параметры** +**Аргументы** -- `value` — Дата и время. [DateTime64](../data-types/datetime64.md). -- `timezone` — [Часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../data-types/string.md). +- `value` — дата и время. [DateTime64](../data-types/datetime64.md). +- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../data-types/string.md). **Возвращаемое значение** -- Входное значение с отсеченными долями секунды. +- Входное значение с отсеченными долями секунды. Тип: [DateTime64](../data-types/datetime64.md). @@ -240,9 +257,9 @@ WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64 SELECT toStartOfSecond(d └────────────────────────────────────────┘ ``` -**См. также** +**Смотрите также** -- Часовая зона сервера, конфигурационный параметр [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). +- Часовая зона сервера, конфигурационный параметр [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). ## toStartOfFiveMinute {#tostartoffiveminute} @@ -305,7 +322,9 @@ WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64 SELECT toStartOfSecond(d Переводит дату-с-временем или дату в число типа UInt16, содержащее номер ISO года. ISO год отличается от обычного года, потому что в соответствии с [ISO 8601:1988](https://en.wikipedia.org/wiki/ISO_8601) ISO год начинается необязательно первого января. -Пример: +**Пример** + +Запрос: ```sql SELECT @@ -313,6 +332,9 @@ SELECT toYear(date), toISOYear(date) ``` + +Результат: + ```text ┌───────date─┬─toYear(toDate('2017-01-01'))─┬─toISOYear(toDate('2017-01-01'))─┐ │ 2017-01-01 │ 2017 │ 2016 │ @@ -326,12 +348,18 @@ SELECT 1 Января 2017 г. - воскресение, т.е. первая ISO неделя 2017 года началась в понедельник 2 января, поэтому 1 января 2017 это последняя неделя 2016 года. 
+**Пример** + +Запрос: + ```sql SELECT toISOWeek(toDate('2017-01-01')) AS ISOWeek20170101, toISOWeek(toDate('2017-01-02')) AS ISOWeek20170102 ``` +Результат: + ```text ┌─ISOWeek20170101─┬─ISOWeek20170102─┐ │ 52 │ 1 │ @@ -368,10 +396,14 @@ SELECT **Пример** +Запрос: + ```sql SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; ``` +Результат: + ```text ┌───────date─┬─week0─┬─week1─┬─week9─┐ │ 2016-12-27 │ 52 │ 52 │ 1 │ @@ -387,10 +419,14 @@ SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS we **Пример** +Запрос: + ```sql SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; ``` +Результат: + ```text ┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ │ 2016-12-27 │ 201652 │ 201652 │ 201701 │ @@ -409,9 +445,9 @@ date_trunc(unit, value[, timezone]) Синоним: `dateTrunc`. -**Параметры** +**Аргументы** -- `unit` — Название части даты или времени. [String](../syntax.md#syntax-string-literal). +- `unit` — единица измерения времени, в которой задана отсекаемая часть. [String Literal](../syntax.md#syntax-string-literal). Возможные значения: - `second` @@ -423,8 +459,8 @@ date_trunc(unit, value[, timezone]) - `quarter` - `year` -- `value` — Дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — [Часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md) +- `value` — дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). +- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md) **Возвращаемое значение** @@ -462,10 +498,267 @@ SELECT now(), date_trunc('hour', now(), 'Europe/Moscow'); └─────────────────────┴────────────────────────────────────────────┘ ``` -**См. также** +**Смотрите также** - [toStartOfInterval](#tostartofintervaltime-or-data-interval-x-unit-time-zone) +## date\_add {#date_add} + +Добавляет интервал времени или даты к указанной дате или дате со временем. + +**Синтаксис** + +``` sql +date_add(unit, value, date) +``` + +Синонимы: `dateAdd`, `DATE_ADD`. + +**Аргументы** + +- `unit` — единица измерения времени, в которой задан интервал для добавления. [String](../../sql-reference/data-types/string.md). + Возможные значения: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `value` — значение интервала для добавления. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — дата или дата со временем, к которой добавляется `value`. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Возвращаемое значение** + +Дата или дата со временем, полученная в результате добавления `value`, выраженного в `unit`, к `date`. + +Тип: [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). 
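Note that, as the result header in the example below suggests (`plus(toDate(...), toIntervalYear(3))`), the call is interpreted as interval addition. A hedged sketch of the equivalent operator form, assuming ClickHouse's `INTERVAL` arithmetic:

``` sql
-- Both columns are expected to return 2021-01-01.
SELECT
    date_add(YEAR, 3, toDate('2018-01-01')) AS via_date_add,
    toDate('2018-01-01') + INTERVAL 3 YEAR  AS via_interval;
```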
+ +**Пример** + +Запрос: + +```sql +SELECT date_add(YEAR, 3, toDate('2018-01-01')); +``` + +Результат: + +```text +┌─plus(toDate('2018-01-01'), toIntervalYear(3))─┐ +│ 2021-01-01 │ +└───────────────────────────────────────────────┘ +``` + +## date\_diff {#date_diff} + +Вычисляет разницу между двумя значениями дат или дат со временем. + +**Синтаксис** + +``` sql +date_diff('unit', startdate, enddate, [timezone]) +``` + +Синонимы: `dateDiff`, `DATE_DIFF`. + +**Аргументы** + +- `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md). + Возможные значения: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `startdate` — первая дата или дата со временем, которая вычитается из `enddate`. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +- `enddate` — вторая дата или дата со временем, из которой вычитается `startdate`. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (необязательно). Если этот аргумент указан, то он применяется как для `startdate`, так и для `enddate`. Если этот аргумент не указан, то используются часовые пояса аргументов `startdate` и `enddate`. Если часовые пояса аргументов `startdate` и `enddate` не совпадают, то результат не определен. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +Разница между `enddate` и `startdate`, выраженная в `unit`. + +Тип: [Int](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); +``` + +Результат: + +``` text +┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 25 │ +└────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## date\_sub {#date_sub} + +Вычитает интервал времени или даты из указанной даты или даты со временем. + +**Синтаксис** + +``` sql +date_sub(unit, value, date) +``` + +Синонимы: `dateSub`, `DATE_SUB`. + +**Аргументы** + +- `unit` — единица измерения времени, в которой задан интервал для вычитания. [String](../../sql-reference/data-types/string.md). + Возможные значения: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `value` — значение интервала для вычитания. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — дата или дата со временем, из которой вычитается `value`. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Возвращаемое значение** + +Дата или дата со временем, полученная в результате вычитания `value`, выраженного в `unit`, из `date`. + +Тип: [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Пример** + +Запрос: + +``` sql +SELECT date_sub(YEAR, 3, toDate('2018-01-01')); +``` + +Результат: + +``` text +┌─minus(toDate('2018-01-01'), toIntervalYear(3))─┐ +│ 2015-01-01 │ +└────────────────────────────────────────────────┘ +``` + +## timestamp\_add {#timestamp_add} + +Добавляет интервал времени к указанной дате или дате со временем. 
+ +**Синтаксис** + +``` sql +timestamp_add(date, INTERVAL value unit) +``` + +Синонимы: `timeStampAdd`, `TIMESTAMP_ADD`. + +**Аргументы** + +- `date` — дата или дата со временем. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). +- `value` — значение интервала для добавления. [Int](../../sql-reference/data-types/int-uint.md). +- `unit` — единица измерения времени, в которой задан интервал для добавления. [String](../../sql-reference/data-types/string.md). + Возможные значения: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +**Возвращаемое значение** + +Дата или дата со временем, полученная в результате добавления `value`, выраженного в `unit`, к `date`. + +Тип: [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Пример** + +Запрос: + +```sql +select timestamp_add(toDate('2018-01-01'), INTERVAL 3 MONTH); +``` + +Результат: + +```text +┌─plus(toDate('2018-01-01'), toIntervalMonth(3))─┐ +│ 2018-04-01 │ +└────────────────────────────────────────────────┘ +``` + +## timestamp\_sub {#timestamp_sub} + +Вычитает интервал времени из указанной даты или даты со временем. + +**Синтакис** + +``` sql +timestamp_sub(unit, value, date) +``` + +Синонимы: `timeStampSub`, `TIMESTAMP_SUB`. + +**Аргументы** + +- `unit` — единица измерения времени, в которой задан интервал для вычитания. [String](../../sql-reference/data-types/string.md). + Возможные значения: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `value` — значение интервала для вычитания. [Int](../../sql-reference/data-types/int-uint.md). +- `date` — дата или дата со временем. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Возвращаемое значение** + +Дата или дата со временем, полученная в результате вычитания `value`, выраженного в `unit`, из `date`. + +Тип: [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md). + +**Пример** + +Запрос: + +```sql +select timestamp_sub(MONTH, 5, toDateTime('2018-12-18 01:02:03')); +``` + +Результат: + +```text +┌─minus(toDateTime('2018-12-18 01:02:03'), toIntervalMonth(5))─┐ +│ 2018-07-18 01:02:03 │ +└──────────────────────────────────────────────────────────────┘ +``` + ## now {#now} Возвращает текущую дату и время. @@ -478,7 +771,7 @@ now([timezone]) **Параметры** -- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). [String](../../sql-reference/data-types/string.md) +- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). [String](../../sql-reference/data-types/string.md). **Возвращаемое значение** @@ -518,68 +811,13 @@ SELECT now('Europe/Moscow'); ## today {#today} -Принимает ноль аргументов и возвращает текущую дату на один из моментов выполнения запроса. +Возвращает текущую дату на момент выполнения запроса. Функция не требует аргументов. То же самое, что toDate(now()) ## yesterday {#yesterday} -Принимает ноль аргументов и возвращает вчерашнюю дату на один из моментов выполнения запроса. -Делает то же самое, что today() - 1. - -## dateDiff {#datediff} - -Вычисляет разницу между двумя значениями дат с временем. 
- -**Синтаксис** - -``` sql -dateDiff('unit', startdate, enddate, [timezone]) -``` - -**Параметры** - -- `unit` — Единица измерения времени, в которой будет вычислена разница между `startdate` и `enddate`. [String](../syntax.md#syntax-string-literal). - - Поддерживаемые значения: - - | unit | - | ------ | - |second | - |minute | - |hour | - |day | - |week | - |month | - |quarter | - |year | - -- `startdate` — Первая дата. [Date](../../sql-reference/functions/date-time-functions.md) или [DateTime](../../sql-reference/functions/date-time-functions.md). - -- `enddate` — Вторая дата. [Date](../../sql-reference/functions/date-time-functions.md) или [DateTime](../../sql-reference/functions/date-time-functions.md). - -- `timezone` — Опциональный параметр. Если определен, применяется к обоим значениям: `startdate` и `enddate`. Если не определен, используются часовые пояса `startdate` и `enddate`. Если часовые пояса не совпадают, вернется неожидаемый результат. - -**Возвращаемое значение** - -Разница между `startdate` и `enddate`, выраженная в `unit`. - -Тип: `int`. - -**Пример** - -Запрос: - -``` sql -SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); -``` - -Ответ: - -``` text -┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ -│ 25 │ -└────────────────────────────────────────────────────────────────────────────────────────┘ -``` +Возвращает вчерашнюю дату на момент выполнения запроса. +Делает то же самое, что today() - 1. Функция не требует аргументов. ## timeSlot {#timeslot} @@ -620,6 +858,7 @@ formatDateTime(Time, Format\[, Timezone\]) Возвращает значение времени и даты в определенном вами формате. **Поля подстановки** + Используйте поля подстановки для того, чтобы определить шаблон для выводимой строки. В колонке «Пример» результат работы функции для времени `2018-01-02 22:33:44`. | Поле | Описание | Пример | @@ -627,7 +866,7 @@ formatDateTime(Time, Format\[, Timezone\]) | %C | номер года, поделённый на 100 (00-99) | 20 | | %d | день месяца, с ведущим нулём (01-31) | 02 | | %D | короткая запись %m/%d/%y | 01/02/18 | -| %e | день месяца, с ведущим пробелом ( 1-31) | 2 | +| %e | день месяца, с ведущим пробелом ( 1-31) |   2 | | %F | короткая запись %Y-%m-%d | 2018-01-02 | | %G | четырехзначный формат вывода ISO-года, который основывается на особом подсчете номера недели согласно [стандарту ISO 8601](https://ru.wikipedia.org/wiki/ISO_8601), обычно используется вместе с %V | 2018 | | %g | двузначный формат вывода года по стандарту ISO 8601 | 18 | @@ -638,6 +877,7 @@ formatDateTime(Time, Format\[, Timezone\]) | %M | минуты, с ведущим нулём (00-59) | 33 | | %n | символ переноса строки (‘’) | | | %p | обозначения AM или PM | PM | +| %Q | квартал (1-4) | 1 | | %R | короткая запись %H:%M | 22:33 | | %S | секунды, с ведущими нулями (00-59) | 44 | | %t | символ табуляции (’) | | @@ -654,10 +894,10 @@ formatDateTime(Time, Format\[, Timezone\]) Запрос: ``` sql -SELECT formatDateTime(toDate('2010-01-04'), '%g') +SELECT formatDateTime(toDate('2010-01-04'), '%g'); ``` -Ответ: +Результат: ``` ┌─formatDateTime(toDate('2010-01-04'), '%g')─┐ @@ -665,4 +905,42 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') └────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/date_time_functions/) +## FROM\_UNIXTIME {#fromunixtime} + +Функция преобразует Unix timestamp в календарную дату и время. 
+ +**Примеры** + +Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). + +Запрос: + +```sql +SELECT FROM_UNIXTIME(423543535); +``` + +Результат: + +```text +┌─FROM_UNIXTIME(423543535)─┐ +│ 1983-06-04 10:58:55 │ +└──────────────────────────┘ +``` + +В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает также, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). + + +Запрос: + +```sql +SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime; +``` + +Результат: + +```text +┌─DateTime────────────┐ +│ 2009-02-11 14:42:23 │ +└─────────────────────┘ +``` + diff --git a/docs/ru/sql-reference/functions/encoding-functions.md b/docs/ru/sql-reference/functions/encoding-functions.md index 6f1c2aad6cb..f4fa21ba46a 100644 --- a/docs/ru/sql-reference/functions/encoding-functions.md +++ b/docs/ru/sql-reference/functions/encoding-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 52 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043a\u043e\u0434\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f" +toc_title: "Функции кодирования" --- # Функции кодирования {#funktsii-kodirovaniia} @@ -15,13 +15,13 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043a\u043e\u0434\u char(number_1, [number_2, ..., number_n]); ``` -**Параметры** +**Аргументы** -- `number_1, number_2, ..., number_n` — Числовые аргументы, которые интерпретируются как целые числа. Типы: [Int](../../sql-reference/functions/encoding-functions.md), [Float](../../sql-reference/functions/encoding-functions.md). +- `number_1, number_2, ..., number_n` — числовые аргументы, которые интерпретируются как целые числа. Типы: [Int](../../sql-reference/functions/encoding-functions.md), [Float](../../sql-reference/functions/encoding-functions.md). **Возвращаемое значение** -- строка из соответствующих байт. +- Строка из соответствующих байт. Тип: `String`. @@ -30,10 +30,10 @@ char(number_1, [number_2, ..., number_n]); Запрос: ``` sql -SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello +SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello; ``` -Ответ: +Результат: ``` text ┌─hello─┐ @@ -49,7 +49,7 @@ SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; ``` -Ответ: +Результат: ``` text ┌─hello──┐ @@ -63,7 +63,7 @@ SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; ``` -Ответ: +Результат: ``` text ┌─hello─┐ @@ -75,6 +75,8 @@ SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; Returns a string containing the argument’s hexadecimal representation. +Синоним: `HEX`. + **Syntax** ``` sql @@ -170,4 +172,3 @@ If you want to convert the result to a number, you can use the ‘reverse’ and Принимает целое число. Возвращает массив чисел типа UInt64, содержащий степени двойки, в сумме дающих исходное число; числа в массиве идут по возрастанию. 
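A short sketch of the power-of-two decomposition just described; the function name `bitmaskToArray` is assumed from the surrounding encoding-functions reference:

``` sql
-- 50 = 2 + 16 + 32, so the expected result is [2,16,32], in ascending order.
SELECT bitmaskToArray(50) AS res;
```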
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/encoding_functions/) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index f1f6516d453..844f9cc3197 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 67 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448\u0438\u0444\u0440\u043e\u0432\u0430\u043d\u0438\u044f" +toc_title: "Функции для шифрования" --- # Функции шифрования {#encryption-functions} @@ -11,7 +11,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448 Длина инициализирующего вектора всегда 16 байт (лишнии байты игнорируются). -Обратите внимание, что эти функции работают медленно. +Обратите внимание, что до версии Clickhouse 21.1 эти функции работали медленно. ## encrypt {#encrypt} @@ -31,7 +31,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448 encrypt('mode', 'plaintext', 'key' [, iv, aad]) ``` -**Параметры** +**Аргументы** - `mode` — режим шифрования. [String](../../sql-reference/data-types/string.md#string). - `plaintext` — текст, который будет зашифрован. [String](../../sql-reference/data-types/string.md#string). @@ -41,7 +41,7 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad]) **Возвращаемое значение** -- Зашифрованная строка. [String](../../sql-reference/data-types/string.md#string). +- Бинарная зашифрованная строка. [String](../../sql-reference/data-types/string.md#string). **Примеры** @@ -52,57 +52,38 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad]) ``` sql CREATE TABLE encryption_test ( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; + `comment` String, + `secret` String +) +ENGINE = Memory; ``` -Вставим эти данные: +Вставим некоторые данные (замечание: не храните ключи или инициализирующие векторы в базе данных, так как это компрометирует всю концепцию шифрования), также хранение "подсказок" небезопасно и используется только для наглядности: Запрос: ``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); +INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ +('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` -Пример без `iv`: - Запрос: ``` sql -SELECT 'aes-128-ecb' AS mode, hex(encrypt(mode, input, key16)) FROM encryption_test; +SELECT comment, hex(secret) FROM encryption_test; ``` Результат: ``` text -┌─mode────────┬─hex(encrypt('aes-128-ecb', input, key16))────────────────────────┐ -│ aes-128-ecb │ 4603E6862B0D94BBEC68E0B0DF51D60F │ -│ aes-128-ecb │ 3004851B86D3F3950672DE7085D27C03 │ -│ aes-128-ecb │ E807F8C8D40A11F65076361AFC7D8B68D8658C5FAA6457985CAA380F16B3F7E4 │ 
-└─────────────┴──────────────────────────────────────────────────────────────────┘ -``` - -Пример с `iv`: - -Запрос: - -``` sql -SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test; -``` - -Результат: - -``` text -┌─mode────────┬─hex(encrypt('aes-256-ctr', input, key32, iv))─┐ -│ aes-256-ctr │ │ -│ aes-256-ctr │ 7FB039F7 │ -│ aes-256-ctr │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2B325949 │ -└─────────────┴───────────────────────────────────────────────┘ +┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-cfb128 no IV │ B4972BDC4459 │ +│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ +│ aes-256-cfb128 with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└─────────────────────────────────────┴──────────────────────────────────┘ ``` Пример в режиме `-gcm`: @@ -110,41 +91,27 @@ SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encrypti Запрос: ``` sql -SELECT 'aes-256-gcm' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test; +INSERT INTO encryption_test VALUES('aes-256-gcm', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')), \ +('aes-256-gcm with AAD', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv', 'aad')); + +SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%'; ``` Результат: ``` text -┌─mode────────┬─hex(encrypt('aes-256-gcm', input, key32, iv))──────────────────────────┐ -│ aes-256-gcm │ E99DBEBC01F021758352D7FBD9039EFA │ -│ aes-256-gcm │ 8742CE3A7B0595B281C712600D274CA881F47414 │ -│ aes-256-gcm │ A44FD73ACEB1A64BDE2D03808A2576EDBB60764CC6982DB9AF2C33C893D91B00C60DC5 │ -└─────────────┴────────────────────────────────────────────────────────────────────────┘ -``` - -Пример в режиме `-gcm` и с `aad`: - -Запрос: - -``` sql -SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM encryption_test; -``` - -Результат: - -``` text -┌─mode────────┬─hex(encrypt('aes-192-gcm', input, key24, iv, 'AAD'))───────────────────┐ -│ aes-192-gcm │ 04C13E4B1D62481ED22B3644595CB5DB │ -│ aes-192-gcm │ 9A6CF0FD2B329B04EAD18301818F016DF8F77447 │ -│ aes-192-gcm │ B961E9FD9B940EBAD7ADDA75C9F198A40797A5EA1722D542890CC976E21113BBB8A7AA │ -└─────────────┴────────────────────────────────────────────────────────────────────────┘ +┌─comment──────────────┬─hex(secret)──────────────────────────────────┐ +│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ +│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ +└──────────────────────┴──────────────────────────────────────────────┘ ``` ## aes_encrypt_mysql {#aes_encrypt_mysql} Совместима с шифрованием myqsl, результат может быть расшифрован функцией [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt). +При одинаковых входящих значениях зашифрованный текст будет совпадать с результатом, возвращаемым функцией `encrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_encrypt_mysql` будет работать аналогично функции `aes_encrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`. + Функция поддерживает шифрофание данных следующими режимами: - aes-128-ecb, aes-192-ecb, aes-256-ecb @@ -156,86 +123,104 @@ SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM e **Синтаксис** -```sql +``` sql aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) ``` -**Параметры** +**Аргументы** - `mode` — режим шифрования. 
[String](../../sql-reference/data-types/string.md#string). - `plaintext` — текст, который будет зашифрован. [String](../../sql-reference/data-types/string.md#string). -- `key` — ключ шифрования. [String](../../sql-reference/data-types/string.md#string). -- `iv` — инициализирующий вектор. Необязателен. [String](../../sql-reference/data-types/string.md#string). +- `key` — ключ шифрования. Если ключ длиннее, чем требует режим шифрования, производится специфичная для MySQL свертка ключа. [String](../../sql-reference/data-types/string.md#string). +- `iv` — инициализирующий вектор. Необязателен, учитываются только первые 16 байтов. [String](../../sql-reference/data-types/string.md#string). **Возвращаемое значение** -- Зашифрованная строка. [String](../../sql-reference/data-types/string.md#string). +- Бинарная зашифрованная строка. [String](../../sql-reference/data-types/string.md#string). **Примеры** -Создадим такую таблицу: +При одинаковых входящих значениях результаты шифрования у функций `encrypt` и `aes_encrypt_mysql` совпадают. Запрос: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; -``` - -Вставим эти данные: - -Запрос: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); -``` - -Пример без `iv`: - -Запрос: - -``` sql -SELECT 'aes-128-cbc' AS mode, hex(aes_encrypt_mysql(mode, input, key32)) FROM encryption_test; +SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` Результат: ``` text -┌─mode────────┬─hex(aes_encrypt_mysql('aes-128-cbc', input, key32))──────────────┐ -│ aes-128-cbc │ FEA8CFDE6EE2C6E7A2CC6ADDC9F62C83 │ -│ aes-128-cbc │ 78B16CD4BE107660156124C5FEE6454A │ -│ aes-128-cbc │ 67C0B119D96F18E2823968D42871B3D179221B1E7EE642D628341C2B29BA2E18 │ -└─────────────┴──────────────────────────────────────────────────────────────────┘ +┌─ciphertexts_equal─┐ +│ 1 │ +└───────────────────┘ ``` -Пример с `iv`: +Функция `encrypt` генерирует исключение, если `key` или `iv` длиннее чем нужно: Запрос: ``` sql -SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv)) FROM encryption_test; +SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Результат: ``` text -┌─mode───────────┬─hex(aes_encrypt_mysql('aes-256-cfb128', input, key32, iv))─┐ -│ aes-256-cfb128 │ │ -│ aes-256-cfb128 │ 7FB039F7 │ -│ aes-256-cfb128 │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2BB5174F │ -└────────────────┴────────────────────────────────────────────────────────────┘ +Received exception from server (version 21.1.2): +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). 
+``` + +Однако функция `aes_encrypt_mysql` в аналогичном случае возвращает результат, который может быть обработан MySQL: + +Запрос: + +``` sql +SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +``` + +Результат: + +```text +┌─ciphertext───┐ +│ 24E9E4966469 │ +└──────────────┘ +``` + +Если передать `iv` еще длиннее, результат останется таким же: + +Запрос: + +``` sql +SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext +``` + +Результат: + +``` text +┌─ciphertext───┐ +│ 24E9E4966469 │ +└──────────────┘ +``` + +Это совпадает с результатом, возвращаемым MySQL при таких же входящих значениях: + +``` sql +mysql> SET block_encryption_mode='aes-256-cfb128'; +Query OK, 0 rows affected (0.00 sec) + +mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; ++------------------------+ +| ciphertext | ++------------------------+ +| 0x24E9E4966469 | ++------------------------+ +1 row in set (0.00 sec) ``` ## decrypt {#decrypt} -Функция поддерживает расшифровку данных следующими режимами: +Функция расшифровывает зашифрованный текст и может работать в следующих режимах: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc @@ -247,17 +232,17 @@ SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv)) **Синтаксис** -```sql +``` sql decrypt('mode', 'ciphertext', 'key' [, iv, aad]) ``` -**Параметры** +**Аргументы** - `mode` — режим шифрования. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — зашифрованный текст, который будет расшифрован. [String](../../sql-reference/data-types/string.md#string). - `key` — ключ шифрования. [String](../../sql-reference/data-types/string.md#string). - `iv` — инициализирующий вектор. Обязателен для `-gcm` режимов, для остальных режимов опциональный. [String](../../sql-reference/data-types/string.md#string). -- `aad` — дополнительные аутентифицированные данные. Текст не будет расшифрован, если это значение неверно. Работает только с `-gcm` режимами. Для остальных вызовет исключение. [String](../../sql-reference/data-types/string.md#string). +- `aad` — дополнительные аутентифицированные данные. Текст не будет расшифрован, если это значение неверно. Работает только с `-gcm` режимами. Для остальных вызовет исключение. [String](../../sql-reference/data-types/string.md#string). **Возвращаемое значение** @@ -265,52 +250,58 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) **Примеры** -Создадим такую таблицу: +Рассмотрим таблицу из примера для функции [encrypt](#encrypt). 
Запрос: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; -``` - -Вставим эти данные: - -Запрос: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); -``` - -Запрос: - -``` sql - -SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16) FROM encryption_test; +SELECT comment, hex(secret) FROM encryption_test; ``` Результат: -```text -┌─mode────────┬─decrypt('aes-128-ecb', encrypt('aes-128-ecb', input, key16), key16)─┐ -│ aes-128-ecb │ │ -│ aes-128-ecb │ text │ -│ aes-128-ecb │ What Is ClickHouse? │ -└─────────────┴─────────────────────────────────────────────────────────────────────┘ +``` text +┌─comment──────────────┬─hex(secret)──────────────────────────────────┐ +│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ +│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ +└──────────────────────┴──────────────────────────────────────────────┘ +┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-cfb128 no IV │ B4972BDC4459 │ +│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ +│ aes-256-cfb128 with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└─────────────────────────────────────┴──────────────────────────────────┘ ``` +Теперь попытаемся расшифровать эти данные: + +Запрос: + +``` sql +SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; +``` + +Результат: + +``` text +┌─comment─────────────────────────────┬─plaintext─┐ +│ aes-256-cfb128 no IV │ Secret │ +│ aes-256-cfb128 no IV, different key │ �4� + � │ +│ aes-256-cfb128 with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└─────────────────────────────────────┴───────────┘ +``` + +Обратите внимание, что только часть данных была расшифрована верно. Оставшаяся часть расшифрована некорректно, так как при шифровании использовались другие значения `mode`, `key`, или `iv`. + ## aes_decrypt_mysql {#aes_decrypt_mysql} Совместима с шифрованием myqsl и может расшифровать данные, зашифрованные функцией [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt). -Функция поддерживает расшифровку данных следующими режимами: +При одинаковых входящих значениях расшифрованный текст будет совпадать с результатом, возвращаемым функцией `decrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_decrypt_mysql` будет работать аналогично функции `aes_decrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`. + +Функция поддерживает расшифровку данных в следующих режимах: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc @@ -321,62 +312,50 @@ SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16) **Синтаксис** -```sql +``` sql aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) ``` -**Параметры** +**Аргументы** - `mode` — режим шифрования. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — зашифрованный текст, который будет расшифрован. [String](../../sql-reference/data-types/string.md#string). 
- `key` — ключ шифрования. [String](../../sql-reference/data-types/string.md#string). - `iv` — инициализирующий вектор. Необязателен. [String](../../sql-reference/data-types/string.md#string). - **Возвращаемое значение** - Расшифрованная строка. [String](../../sql-reference/data-types/string.md#string). **Примеры** -Создадим такую таблицу: +Расшифруем данные, которые до этого были зашифрованы в MySQL: -Запрос: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; -``` +mysql> SET block_encryption_mode='aes-256-cfb128'; +Query OK, 0 rows affected (0.00 sec) -Вставим эти данные: - -Запрос: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); +mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; ++------------------------+ +| ciphertext | ++------------------------+ +| 0x24E9E4966469 | ++------------------------+ +1 row in set (0.00 sec) ``` Запрос: ``` sql -SELECT 'aes-128-cbc' AS mode, aes_decrypt_mysql(mode, aes_encrypt_mysql(mode, input, key), key) FROM encryption_test; +SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; ``` Результат: ``` text -┌─mode────────┬─aes_decrypt_mysql('aes-128-cbc', aes_encrypt_mysql('aes-128-cbc', input, key), key)─┐ -│ aes-128-cbc │ │ -│ aes-128-cbc │ text │ -│ aes-128-cbc │ What Is ClickHouse? │ -└─────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +┌─plaintext─┐ +│ Secret │ +└───────────┘ ``` - [Original article](https://clickhouse.tech/docs/ru/sql-reference/functions/encryption_functions/) diff --git a/docs/ru/sql-reference/functions/ext-dict-functions.md b/docs/ru/sql-reference/functions/ext-dict-functions.md index 6054ed141d4..919f8ebe276 100644 --- a/docs/ru/sql-reference/functions/ext-dict-functions.md +++ b/docs/ru/sql-reference/functions/ext-dict-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 58 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0432\u043d\u0435\u0448\u043d\u0438\u043c\u0438\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u044f\u043c\u0438" +toc_title: "Функции для работы с внешними словарями" --- # Функции для работы с внешними словарями {#ext_dict_functions} @@ -16,7 +16,7 @@ dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` -**Параметры** +**Аргументы** - `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). @@ -105,7 +105,7 @@ LIMIT 3 dictHas('dict_name', id) ``` -**Параметры** +**Аргументы** - `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `id_expr` — значение ключа словаря. 
[Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../sql-reference/functions/ext-dict-functions.md) или [Tuple](../../sql-reference/functions/ext-dict-functions.md) в зависимости от конфигурации словаря. @@ -127,7 +127,7 @@ dictHas('dict_name', id) dictGetHierarchy('dict_name', key) ``` -**Параметры** +**Аргументы** - `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `key` — значение ключа. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../sql-reference/functions/ext-dict-functions.md). @@ -144,7 +144,7 @@ Type: [Array(UInt64)](../../sql-reference/functions/ext-dict-functions.md). `dictIsIn ('dict_name', child_id_expr, ancestor_id_expr)` -**Параметры** +**Аргументы** - `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `child_id_expr` — ключ для проверки. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../sql-reference/functions/ext-dict-functions.md). @@ -180,12 +180,12 @@ dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` -**Параметры** +**Аргументы** - `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). - `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../sql-reference/functions/ext-dict-functions.md) или [Tuple](../../sql-reference/functions/ext-dict-functions.md) в зависимости от конфигурации словаря. -- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions) возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. +- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions), возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. **Возвращаемое значение** @@ -198,4 +198,3 @@ dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) Если значение атрибута не удалось обработать или оно не соответствует типу данных атрибута, то ClickHouse генерирует исключение. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/ext_dict_functions/) diff --git a/docs/ru/sql-reference/functions/files.md b/docs/ru/sql-reference/functions/files.md new file mode 100644 index 00000000000..9cb659375b9 --- /dev/null +++ b/docs/ru/sql-reference/functions/files.md @@ -0,0 +1,33 @@ +--- +toc_priority: 43 +toc_title: "Функции для работы с файлами" +--- + +# Функции для работы с файлами {#funktsii-dlia-raboty-s-failami} + +## file {#file} + +Читает файл как строку. Содержимое файла не разбирается (не парсится) и записывается в указанную колонку в виде единой строки. + +**Синтаксис** + +``` sql +file(path) +``` + +**Аргументы** + +- `path` — относительный путь до файла от [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Путь к файлу может включать следующие символы подстановки и шаблоны: `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки. 
+ +**Примеры** + +Вставка данных из файлов a.txt и b.txt в таблицу в виде строк: + +``` sql +INSERT INTO table SELECT file('a.txt'), file('b.txt'); +``` + +**Смотрите также** + +- [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path) +- [file](../table-functions/file.md) diff --git a/docs/ru/sql-reference/functions/functions-for-nulls.md b/docs/ru/sql-reference/functions/functions-for-nulls.md index 17da1ea9194..365dba75da7 100644 --- a/docs/ru/sql-reference/functions/functions-for-nulls.md +++ b/docs/ru/sql-reference/functions/functions-for-nulls.md @@ -1,6 +1,6 @@ --- toc_priority: 63 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u004e\u0075\u006c\u006c\u0061\u0062\u006c\u0065\u002d\u0430\u0440\u0433\u0443\u043c\u0435\u043d\u0442\u0430\u043c\u0438" +toc_title: "Функции для работы с Nullable-аргументами" --- # Функции для работы с Nullable-аргументами {#funktsii-dlia-raboty-s-nullable-argumentami} @@ -13,7 +13,9 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u isNull(x) ``` -**Параметры** +Синоним: `ISNULL`. + +**Аргументы** - `x` — значение с не составным типом данных. @@ -36,7 +38,7 @@ isNull(x) Запрос ``` sql -SELECT x FROM t_null WHERE isNull(y) +SELECT x FROM t_null WHERE isNull(y); ``` ``` text @@ -53,7 +55,7 @@ SELECT x FROM t_null WHERE isNull(y) isNotNull(x) ``` -**Параметры** +**Аргументы** - `x` — значение с не составным типом данных. @@ -76,7 +78,7 @@ isNotNull(x) Запрос ``` sql -SELECT x FROM t_null WHERE isNotNull(y) +SELECT x FROM t_null WHERE isNotNull(y); ``` ``` text @@ -93,7 +95,7 @@ SELECT x FROM t_null WHERE isNotNull(y) coalesce(x,...) ``` -**Параметры** +**Аргументы** - Произвольное количество параметров не составного типа. Все параметры должны быть совместимы по типу данных. @@ -118,7 +120,7 @@ coalesce(x,...) Получим из адресной книги первый доступный способ связаться с клиентом: ``` sql -SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook +SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook; ``` ``` text @@ -136,7 +138,7 @@ SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook ifNull(x,alt) ``` -**Параметры** +**Аргументы** - `x` — значение для проверки на `NULL`, - `alt` — значение, которое функция вернёт, если `x` — `NULL`. @@ -149,7 +151,7 @@ ifNull(x,alt) **Пример** ``` sql -SELECT ifNull('a', 'b') +SELECT ifNull('a', 'b'); ``` ``` text @@ -159,7 +161,7 @@ SELECT ifNull('a', 'b') ``` ``` sql -SELECT ifNull(NULL, 'b') +SELECT ifNull(NULL, 'b'); ``` ``` text @@ -176,7 +178,7 @@ SELECT ifNull(NULL, 'b') nullIf(x, y) ``` -**Параметры** +**Аргументы** `x`, `y` — значения для сравнивания. Они должны быть совместимых типов, иначе ClickHouse сгенерирует исключение. @@ -188,7 +190,7 @@ nullIf(x, y) **Пример** ``` sql -SELECT nullIf(1, 1) +SELECT nullIf(1, 1); ``` ``` text @@ -198,7 +200,7 @@ SELECT nullIf(1, 1) ``` ``` sql -SELECT nullIf(1, 2) +SELECT nullIf(1, 2); ``` ``` text @@ -215,7 +217,7 @@ SELECT nullIf(1, 2) assumeNotNull(x) ``` -**Параметры** +**Аргументы** - `x` — исходное значение. @@ -229,7 +231,7 @@ assumeNotNull(x) Рассмотрим таблицу `t_null`. ``` sql -SHOW CREATE TABLE t_null +SHOW CREATE TABLE t_null; ``` ``` text @@ -248,7 +250,7 @@ SHOW CREATE TABLE t_null Применим функцию `assumeNotNull` к столбцу `y`. 
``` sql -SELECT assumeNotNull(y) FROM t_null +SELECT assumeNotNull(y) FROM t_null; ``` ``` text @@ -259,7 +261,7 @@ SELECT assumeNotNull(y) FROM t_null ``` ``` sql -SELECT toTypeName(assumeNotNull(y)) FROM t_null +SELECT toTypeName(assumeNotNull(y)) FROM t_null; ``` ``` text @@ -277,7 +279,7 @@ SELECT toTypeName(assumeNotNull(y)) FROM t_null toNullable(x) ``` -**Параметры** +**Аргументы** - `x` — значение произвольного не составного типа. @@ -288,7 +290,7 @@ toNullable(x) **Пример** ``` sql -SELECT toTypeName(10) +SELECT toTypeName(10); ``` ``` text @@ -298,7 +300,7 @@ SELECT toTypeName(10) ``` ``` sql -SELECT toTypeName(toNullable(10)) +SELECT toTypeName(toNullable(10)); ``` ``` text @@ -307,4 +309,3 @@ SELECT toTypeName(toNullable(10)) └────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/functions_for_nulls/) diff --git a/docs/ru/sql-reference/functions/geo/coordinates.md b/docs/ru/sql-reference/functions/geo/coordinates.md index 1931a9b932f..2605dc7a82f 100644 --- a/docs/ru/sql-reference/functions/geo/coordinates.md +++ b/docs/ru/sql-reference/functions/geo/coordinates.md @@ -1,5 +1,5 @@ --- -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0433\u0435\u043e\u0433\u0440\u0430\u0444\u0438\u0447\u0435\u0441\u043a\u0438\u043c\u0438\u0020\u043a\u043e\u043e\u0440\u0434\u0438\u043d\u0430\u0442\u0430\u043c\u0438" +toc_title: "Функции для работы с географическими координатами" toc_priority: 62 --- @@ -133,4 +133,3 @@ SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res └─────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/geo/coordinates) diff --git a/docs/ru/sql-reference/functions/geo/geohash.md b/docs/ru/sql-reference/functions/geo/geohash.md index 38c64f11b10..0992b620e60 100644 --- a/docs/ru/sql-reference/functions/geo/geohash.md +++ b/docs/ru/sql-reference/functions/geo/geohash.md @@ -1,5 +1,5 @@ --- -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0441\u0438\u0441\u0442\u0435\u043c\u043e\u0439\u0020\u0047\u0065\u006f\u0068\u0061\u0073\u0068" +toc_title: "Функции для работы с системой Geohash" --- # Функции для работы с системой Geohash {#geohash} @@ -29,7 +29,7 @@ geohashEncode(longitude, latitude, [precision]) **Пример** ``` sql -SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res +SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res; ``` ``` text @@ -57,7 +57,7 @@ geohashDecode(geohash_string) **Пример** ``` sql -SELECT geohashDecode('ezs42') AS res +SELECT geohashDecode('ezs42') AS res; ``` ``` text @@ -76,13 +76,13 @@ SELECT geohashDecode('ezs42') AS res geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision) ``` -**Параметры** +**Аргументы** - `longitude_min` — минимальная долгота. Диапазон возможных значений: `[-180°, 180°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md)). -- `latitude_min` - минимальная широта. Диапазон возможных значений: `[-90°, 90°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). -- `longitude_max` - максимальная долгота. Диапазон возможных значений: `[-180°, 180°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). -- `latitude_max` - максимальная широта. Диапазон возможных значений: `[-90°, 90°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). 
-- `precision` - точность geohash. Диапазон возможных значений: `[1, 12]`. Тип данных: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `latitude_min` — минимальная широта. Диапазон возможных значений: `[-90°, 90°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). +- `longitude_max` — максимальная долгота. Диапазон возможных значений: `[-180°, 180°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). +- `latitude_max` — максимальная широта. Диапазон возможных значений: `[-90°, 90°]`. Тип данных: [Float](../../../sql-reference/data-types/float.md). +- `precision` — точность geohash. Диапазон возможных значений: `[1, 12]`. Тип данных: [UInt8](../../../sql-reference/data-types/int-uint.md). !!! info "Замечание" Все передаваемые координаты должны быть одного и того же типа: либо `Float32`, либо `Float64`. @@ -102,8 +102,9 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi Запрос: ``` sql -SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos +SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos; ``` + Результат: ``` text @@ -112,4 +113,3 @@ SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos └─────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/geo/geohash) diff --git a/docs/ru/sql-reference/functions/geo/h3.md b/docs/ru/sql-reference/functions/geo/h3.md index 69d06b5dfa6..27a512a9931 100644 --- a/docs/ru/sql-reference/functions/geo/h3.md +++ b/docs/ru/sql-reference/functions/geo/h3.md @@ -1,5 +1,5 @@ --- -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0438\u043d\u0434\u0435\u043a\u0441\u0430\u043c\u0438\u0020\u0048\u0033" +toc_title: "Функции для работы с индексами H3" --- # Функции для работы с индексами H3 {#h3index} @@ -38,8 +38,9 @@ h3IsValid(h3index) Запрос: ``` sql -SELECT h3IsValid(630814730351855103) as h3IsValid +SELECT h3IsValid(630814730351855103) as h3IsValid; ``` + Результат: ``` text @@ -74,8 +75,9 @@ h3GetResolution(h3index) Запрос: ``` sql -SELECT h3GetResolution(639821929606596015) as resolution +SELECT h3GetResolution(639821929606596015) as resolution; ``` + Результат: ``` text @@ -107,8 +109,9 @@ h3EdgeAngle(resolution) Запрос: ``` sql -SELECT h3EdgeAngle(10) as edgeAngle +SELECT h3EdgeAngle(10) as edgeAngle; ``` + Результат: ``` text @@ -140,8 +143,9 @@ h3EdgeLengthM(resolution) Запрос: ``` sql -SELECT h3EdgeLengthM(15) as edgeLengthM +SELECT h3EdgeLengthM(15) as edgeLengthM; ``` + Результат: ``` text @@ -160,7 +164,7 @@ SELECT h3EdgeLengthM(15) as edgeLengthM geoToH3(lon, lat, resolution) ``` -**Параметры** +**Аргументы** - `lon` — географическая долгота. Тип данных — [Float64](../../../sql-reference/data-types/float.md). - `lat` — географическая широта. Тип данных — [Float64](../../../sql-reference/data-types/float.md). @@ -178,10 +182,10 @@ geoToH3(lon, lat, resolution) Запрос: ``` sql -SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index +SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index; ``` -Ответ: +Результат: ``` text ┌────────────h3Index─┐ @@ -199,7 +203,7 @@ SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index h3kRing(h3index, k) ``` -**Параметры** +**Аргументы** - `h3index` — идентификатор шестигранника. Тип данных: [UInt64](../../../sql-reference/data-types/int-uint.md). - `k` — радиус. 
Тип данных: [целое число](../../../sql-reference/data-types/int-uint.md) @@ -215,8 +219,9 @@ h3kRing(h3index, k) Запрос: ``` sql -SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index +SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index; ``` + Результат: ``` text @@ -311,7 +316,7 @@ SELECT h3HexAreaM2(13) as area; h3IndexesAreNeighbors(index1, index2) ``` -**Параметры** +**Аргументы** - `index1` — индекс шестиугольной ячейки. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). - `index2` — индекс шестиугольной ячейки. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). @@ -349,7 +354,7 @@ SELECT h3IndexesAreNeighbors(617420388351344639, 617420388352655359) AS n; h3ToChildren(index, resolution) ``` -**Параметры** +**Аргументы** - `index` — индекс шестиугольной ячейки. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). - `resolution` — разрешение. Диапазон: `[0, 15]`. Тип: [UInt8](../../../sql-reference/data-types/int-uint.md). @@ -386,7 +391,7 @@ SELECT h3ToChildren(599405990164561919, 6) AS children; h3ToParent(index, resolution) ``` -**Параметры** +**Аргументы** - `index` — индекс шестиугольной ячейки. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md). - `resolution` — разрешение. Диапазон: `[0, 15]`. Тип: [UInt8](../../../sql-reference/data-types/int-uint.md). @@ -520,4 +525,3 @@ SELECT h3GetResolution(617420388352917503) as res; └─────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/geo/h3) diff --git a/docs/ru/sql-reference/functions/geo/index.md b/docs/ru/sql-reference/functions/geo/index.md index cedaafaa31d..4d3bdfcd468 100644 --- a/docs/ru/sql-reference/functions/geo/index.md +++ b/docs/ru/sql-reference/functions/geo/index.md @@ -1,8 +1,7 @@ --- toc_priority: 62 -toc_folder_title: "\u0413\u0435\u043e\u002d\u0434\u0430\u043d\u043d\u044b\u0435" +toc_folder_title: "Гео-данные" toc_title: hidden --- -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/geo/) diff --git a/docs/ru/sql-reference/functions/hash-functions.md b/docs/ru/sql-reference/functions/hash-functions.md index f7820889ea9..07c741e0588 100644 --- a/docs/ru/sql-reference/functions/hash-functions.md +++ b/docs/ru/sql-reference/functions/hash-functions.md @@ -1,12 +1,14 @@ --- toc_priority: 50 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0445\u044d\u0448\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u044f" +toc_title: "Функции хэширования" --- # Функции хэширования {#funktsii-kheshirovaniia} Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов. +Simhash – это хеш-функция, которая для близких значений возвращает близкий хеш. + ## halfMD5 {#hash-functions-halfmd5} [Интерпретирует](../../sql-reference/functions/hash-functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов. @@ -18,9 +20,9 @@ halfMD5(par1, ...) Функция относительно медленная (5 миллионов коротких строк в секунду на ядро процессора). По возможности, используйте функцию [sipHash64](#hash_functions-siphash64) вместо неё. -**Параметры** +**Аргументы** -Функция принимает переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). 
+Функция принимает переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -29,7 +31,7 @@ halfMD5(par1, ...) **Пример** ``` sql -SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type +SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type; ``` ``` text @@ -61,9 +63,9 @@ sipHash64(par1,...) 3. Затем функция принимает хэш-значение, вычисленное на предыдущем шаге, и третий элемент исходного хэш-массива, и вычисляет хэш для массива из них. 4. Предыдущий шаг повторяется для всех остальных элементов исходного хэш-массива. -**Параметры** +**Аргументы** -Функция принимает переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Функция принимает переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -72,7 +74,7 @@ sipHash64(par1,...) **Пример** ``` sql -SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type +SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type; ``` ``` text @@ -97,9 +99,9 @@ cityHash64(par1,...) Это не криптографическая хэш-функция. Она использует CityHash алгоритм для строковых параметров и зависящую от реализации быструю некриптографическую хэш-функцию для параметров с другими типами данных. Функция использует комбинатор CityHash для получения конечных результатов. -**Параметры** +**Аргументы** -Функция принимает переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Функция принимает переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -110,7 +112,7 @@ cityHash64(par1,...) Пример вызова: ``` sql -SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type +SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type; ``` ``` text @@ -166,9 +168,9 @@ farmHash64(par1, ...) Эти функции используют методы `Fingerprint64` и `Hash64` из всех [доступных методов](https://github.com/google/farmhash/blob/master/src/farmhash.h). -**Параметры** +**Аргументы** -Функция принимает переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Функция принимает переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -177,7 +179,7 @@ farmHash64(par1, ...) 
**Пример** ``` sql -SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type +SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type; ``` ``` text @@ -191,7 +193,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. `JavaHash` не отличается ни скоростью, ни качеством, поэтому эту функцию следует считать устаревшей. Используйте эту функцию, если вам необходимо получить значение хэша по такому же алгоритму. ``` sql -SELECT javaHash(''); +SELECT javaHash('') ``` **Возвращаемое значение** @@ -208,7 +210,7 @@ SELECT javaHash(''); SELECT javaHash('Hello, world!'); ``` -Ответ: +Результат: ``` text ┌─javaHash('Hello, world!')─┐ @@ -226,7 +228,7 @@ SELECT javaHash('Hello, world!'); javaHashUTF16LE(stringUtf16le) ``` -**Параметры** +**Аргументы** - `stringUtf16le` — строка в `UTF-16LE`. @@ -243,10 +245,10 @@ javaHashUTF16LE(stringUtf16le) Запрос: ``` sql -SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) +SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')); ``` -Ответ: +Результат: ``` text ┌─javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le'))─┐ @@ -259,7 +261,7 @@ SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) Вычисляет `HiveHash` от строки. ``` sql -SELECT hiveHash(''); +SELECT hiveHash('') ``` `HiveHash` — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. @@ -278,7 +280,7 @@ SELECT hiveHash(''); SELECT hiveHash('Hello, world!'); ``` -Ответ: +Результат: ``` text ┌─hiveHash('Hello, world!')─┐ @@ -294,9 +296,9 @@ SELECT hiveHash('Hello, world!'); metroHash64(par1, ...) ``` -**Параметры** +**Аргументы** -Функция принимает переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Функция принимает переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -305,7 +307,7 @@ metroHash64(par1, ...) **Пример** ``` sql -SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type +SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type; ``` ``` text @@ -329,9 +331,9 @@ murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` -**Параметры** +**Аргументы** -Обе функции принимают переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Обе функции принимают переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -341,7 +343,7 @@ murmurHash2_64(par1, ...) 
**Пример** ``` sql -SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type +SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type; ``` ``` text @@ -360,9 +362,9 @@ SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: gccMurmurHash(par1, ...); ``` -**Параметры** +**Аргументы** -- `par1, ...` — Переменное число параметров. Каждый параметр может быть любого из [поддерживаемых типов данных](../../sql-reference/data-types/index.md). +- `par1, ...` — переменное число параметров. Каждый параметр может быть любого из [поддерживаемых типов данных](../../sql-reference/data-types/index.md). **Возвращаемое значение** @@ -397,9 +399,9 @@ murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` -**Параметры** +**Аргументы** -Обе функции принимают переменное число входных параметров. Параметры могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). +Обе функции принимают переменное число входных параметров. Аргументы могут быть любого [поддерживаемого типа данных](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -409,7 +411,7 @@ murmurHash3_64(par1, ...) **Пример** ``` sql -SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type +SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type; ``` ``` text @@ -426,9 +428,9 @@ SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: murmurHash3_128( expr ) ``` -**Параметры** +**Аргументы** -- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее значение типа[String](../../sql-reference/functions/hash-functions.md). +- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../sql-reference/functions/hash-functions.md). **Возвращаемое значение** @@ -437,13 +439,13 @@ murmurHash3_128( expr ) **Пример** ``` sql -SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type +SELECT hex(murmurHash3_128('example_string')) AS MurmurHash3, toTypeName(MurmurHash3) AS type; ``` ``` text -┌─MurmurHash3──────┬─type────────────┐ -│ 6�1�4"S5KT�~~q │ FixedString(16) │ -└──────────────────┴─────────────────┘ +┌─MurmurHash3──────────────────────┬─type───┐ +│ 368A1A311CB7342253354B548E7E7E71 │ String │ +└──────────────────────────────────┴────────┘ ``` ## xxHash32, xxHash64 {#hash-functions-xxhash32-xxhash64} @@ -451,11 +453,11 @@ SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) Вычисляет `xxHash` от строки. Предлагается в двух вариантах: 32 и 64 бита. ``` sql -SELECT xxHash32(''); +SELECT xxHash32('') OR -SELECT xxHash64(''); +SELECT xxHash64('') ``` **Возвращаемое значение** @@ -472,7 +474,7 @@ SELECT xxHash64(''); SELECT xxHash32('Hello, world!'); ``` -Ответ: +Результат: ``` text ┌─xxHash32('Hello, world!')─┐ @@ -484,4 +486,937 @@ SELECT xxHash32('Hello, world!'); - [xxHash](http://cyan4973.github.io/xxHash/). -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/hash_functions/) +## ngramSimHash {#ngramsimhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. 
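+
+Например, для близких строк значения `simhash` будут близкими (иллюстративный запрос; конкретные значения хешей здесь не приводятся и зависят от версии ClickHouse):
+
+``` sql
+SELECT ngramSimHash('ClickHouse') AS h1, ngramSimHash('ClickHouse!') AS h2;
+```
+
+Полученные числа, как правило, различаются лишь в нескольких битах.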
+ +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHash(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHash('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1627567969 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitive {#ngramsimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitive(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitive('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌──────Hash─┐ +│ 562180645 │ +└───────────┘ +``` + +## ngramSimHashUTF8 {#ngramsimhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1628157797 │ +└────────────┘ +``` + +## ngramSimHashCaseInsensitiveUTF8 {#ngramsimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммовый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT ngramSimHashCaseInsensitiveUTF8('ClickHouse') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 1636742693 │ +└────────────┘ +``` + +## wordShingleSimHash {#wordshinglesimhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHash(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitive {#wordshinglesimhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. 
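+
+Например, схожесть двух строк без учета регистра можно оценить так (иллюстративный набросок; конкретное значение расстояния зависит от входных данных):
+
+``` sql
+SELECT bitHammingDistance(
+    wordShingleSimHashCaseInsensitive('ClickHouse is a column-oriented DBMS'),
+    wordShingleSimHashCaseInsensitive('ClickHouse is a column-oriented database management system')
+) AS HammingDistance;
+```
+
+Чем меньше полученное значение, тем более похожи строки.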
+ +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitive(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## wordShingleSimHashUTF8 {#wordshinglesimhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2328277067 │ +└────────────┘ +``` + +## wordShingleSimHashCaseInsensitiveUTF8 {#wordshinglesimhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шингловый `simhash`. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [bitHammingDistance](../../sql-reference/functions/bit-functions.md#bithammingdistance). Чем меньше [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между результатом вычисления `simhash` двух строк, тем больше вероятность, что строки совпадают. + +**Синтаксис** + +``` sql +wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Значение хеш-функции от строки. + +Тип: [UInt64](../../sql-reference/data-types/int-uint.md). 
+ +**Пример** + +Запрос: + +``` sql +SELECT wordShingleSimHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Hash; +``` + +Результат: + +``` text +┌───────Hash─┐ +│ 2194812424 │ +└────────────┘ +``` + +## ngramMinHash {#ngramminhash} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHash(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHash('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,9054248444481805918) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitive {#ngramminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. 
+ +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (2106263556442004574,13203602793651726206) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashUTF8 {#ngramminhashutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** +``` sql +ngramMinHashUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (18333312859352735453,6742163577938632877) │ +└────────────────────────────────────────────┘ +``` + +## ngramMinHashCaseInsensitiveUTF8 {#ngramminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и вычисляет хеш для каждой n-граммы. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple───────────────────────────────────────┐ +│ (12493625717655877135,13203602793651726206) │ +└─────────────────────────────────────────────┘ +``` + +## ngramMinHashArg {#ngramminhasharg} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHash](#ngramminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArg(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArg('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('Hou','lic','ick','ous','ckH','Cli')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitive {#ngramminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitive](#ngramminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitive('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','kHo','use','Cli'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgUTF8 {#ngramminhashargutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashUTF8](#ngramminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ous','ick','lic','Hou','kHo','use'),('kHo','Hou','lic','ick','ous','ckH')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## ngramMinHashArgCaseInsensitiveUTF8 {#ngramminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (n-граммы) размером `ngramsize` символов и возвращает n-граммы с минимальным и максимальным хешами, вычисленными функцией [ngramMinHashCaseInsensitiveUTF8](#ngramminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `ngramsize` — размер n-грамм. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` n-грамм. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). 
+ +**Пример** + +Запрос: + +``` sql +SELECT ngramMinHashArgCaseInsensitiveUTF8('ClickHouse') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────────────┐ +│ (('ckH','ous','ick','lic','kHo','use'),('kHo','lic','ick','ous','ckH','Hou')) │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHash {#wordshingleminhash} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHash(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHash('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitive {#wordshingleminhashcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. 
[UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashUTF8 {#wordshingleminhashutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистрозависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────┐ +│ (16452112859864147620,5844417301642981317) │ +└────────────────────────────────────────────┘ +``` + +## wordShingleMinHashCaseInsensitiveUTF8 {#wordshingleminhashcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и вычисляет хеш для каждого шингла. Использует `hashnum` минимальных хешей, чтобы вычислить минимальный хеш, и `hashnum` максимальных хешей, чтобы вычислить максимальный хеш. Возвращает кортеж из этих хешей. Функция регистро**не**зависимая. + +Может быть использована для проверки двух строк на схожесть вместе с функцией [tupleHammingDistance](../../sql-reference/functions/tuple-functions.md#tuplehammingdistance). Если для двух строк минимальные или максимальные хеши одинаковы, мы считаем, что эти строки совпадают. + +**Синтаксис** + +``` sql +wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. 
Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж с двумя хешами — минимальным и максимальным. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([UInt64](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md)). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).') AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────┐ +│ (3065874883688416519,1634050779997673240) │ +└───────────────────────────────────────────┘ +``` + +## wordShingleMinHashArg {#wordshingleminhasharg} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordshingleMinHash](#wordshingleminhash) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArg(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArg('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitive {#wordshingleminhashargcaseinsensitive} + +Выделяет из ASCII строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitive](#wordshingleminhashcaseinsensitive) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). 
+- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitive('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgUTF8 {#wordshingleminhashargutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashUTF8](#wordshingleminhashutf8) с теми же входными данными. Функция регистрозависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple─────────────────────────────────────────────────────────────────┐ +│ (('OLAP','database','analytical'),('online','oriented','processing')) │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +## wordShingleMinHashArgCaseInsensitiveUTF8 {#wordshingleminhashargcaseinsensitiveutf8} + +Выделяет из UTF-8 строки отрезки (шинглы) из `shinglesize` слов и возвращает шинглы с минимальным и максимальным хешами, вычисленными функцией [wordShingleMinHashCaseInsensitiveUTF8](#wordshingleminhashcaseinsensitiveutf8) с теми же входными данными. Функция регистро**не**зависимая. + +**Синтаксис** + +``` sql +wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) +``` + +**Аргументы** + +- `string` — строка. [String](../../sql-reference/data-types/string.md). +- `shinglesize` — размер словесных шинглов. Необязательный. 
Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `3`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `hashnum` — количество минимальных и максимальных хешей, которое используется при вычислении результата. Необязательный. Возможные значения: любое число от `1` до `25`. Значение по умолчанию: `6`. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +- Кортеж из двух кортежей, каждый из которых состоит из `hashnum` шинглов. + +Тип: [Tuple](../../sql-reference/data-types/tuple.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md)), [Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md))). + +**Пример** + +Запрос: + +``` sql +SELECT wordShingleMinHashArgCaseInsensitiveUTF8('ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).', 1, 3) AS Tuple; +``` + +Результат: + +``` text +┌─Tuple──────────────────────────────────────────────────────────────────┐ +│ (('queries','database','analytical'),('oriented','processing','DBMS')) │ +└────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/in-functions.md b/docs/ru/sql-reference/functions/in-functions.md index b732f67303b..2bdb71d5f93 100644 --- a/docs/ru/sql-reference/functions/in-functions.md +++ b/docs/ru/sql-reference/functions/in-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 60 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u0438\u0020\u043e\u043f\u0435\u0440\u0430\u0442\u043e\u0440\u0430\u0020\u0049\u004e" +toc_title: "Функции для реализации оператора IN" --- # Функции для реализации оператора IN {#funktsii-dlia-realizatsii-operatora-in} @@ -9,4 +9,3 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Смотрите раздел [Операторы IN](../operators/in.md#select-in-operators). -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/in_functions/) diff --git a/docs/ru/sql-reference/functions/index.md b/docs/ru/sql-reference/functions/index.md index 25d3b6de067..1eefd4d9f73 100644 --- a/docs/ru/sql-reference/functions/index.md +++ b/docs/ru/sql-reference/functions/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438" +toc_folder_title: "Функции" toc_priority: 32 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043d\u0438\u0435" +toc_title: "Введение" --- # Функции {#funktsii} @@ -82,4 +82,3 @@ str -> str != Referer Если функция в запросе выполняется на сервере-инициаторе запроса, а вам нужно, чтобы она выполнялась на удалённых серверах, вы можете обернуть её в агрегатную функцию any или добавить в ключ в `GROUP BY`. 
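Например (схематичный запрос; распределённая таблица `distributed_table` здесь условная), чтобы `hostName()` вычислялась на удалённых серверах, а не на сервере-инициаторе, её можно обернуть в `any` или добавить в ключ `GROUP BY`:

``` sql
-- hostName() выполнится на каждом удалённом сервере,
-- потому что обёрнута в агрегатную функцию any
SELECT any(hostName()) FROM distributed_table;

-- тот же эффект достигается добавлением выражения в ключ GROUP BY
SELECT hostName() AS h, count() FROM distributed_table GROUP BY h;
```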
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/) diff --git a/docs/ru/sql-reference/functions/introspection.md b/docs/ru/sql-reference/functions/introspection.md index 00dd660bc16..cb2bcdb787f 100644 --- a/docs/ru/sql-reference/functions/introspection.md +++ b/docs/ru/sql-reference/functions/introspection.md @@ -1,6 +1,6 @@ --- toc_priority: 65 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0438\u043d\u0442\u0440\u043e\u0441\u043f\u0435\u043a\u0446\u0438\u0438" +toc_title: "Функции интроспекции" --- # Функции интроспекции {#introspection-functions} @@ -32,7 +32,7 @@ ClickHouse сохраняет отчеты профилировщика в [жу addressToLine(address_of_binary_instruction) ``` -**Параметры** +**Аргументы** - `address_of_binary_instruction` ([Тип UInt64](../../sql-reference/functions/introspection.md))- Адрес инструкции в запущенном процессе. @@ -53,13 +53,13 @@ addressToLine(address_of_binary_instruction) Включение функций самоанализа: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Выбор первой строки из списка `trace_log` системная таблица: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -79,7 +79,7 @@ trace: [140658411141617,94784174532828,94784076370703,94784076 Получение имени файла исходного кода и номера строки для одного адреса: ``` sql -SELECT addressToLine(94784076370703) \G +SELECT addressToLine(94784076370703) \G; ``` ``` text @@ -123,9 +123,9 @@ trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so addressToSymbol(address_of_binary_instruction) ``` -**Параметры** +**Аргументы** -- `address_of_binary_instruction` ([Тип uint64](../../sql-reference/functions/introspection.md)) — Адрес инструкции в запущенном процессе. +- `address_of_binary_instruction` ([Тип uint64](../../sql-reference/functions/introspection.md)) — адрес инструкции в запущенном процессе. **Возвращаемое значение** @@ -139,13 +139,13 @@ addressToSymbol(address_of_binary_instruction) Включение функций самоанализа: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Выбор первой строки из списка `trace_log` системная таблица: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -165,7 +165,7 @@ trace: [94138803686098,94138815010911,94138815096522,94138815101224,9413 Получение символа для одного адреса: ``` sql -SELECT addressToSymbol(94138803686098) \G +SELECT addressToSymbol(94138803686098) \G; ``` ``` text @@ -220,9 +220,9 @@ clone demangle(symbol) ``` -**Параметры** +**Аргументы** -- `symbol` ([Строка](../../sql-reference/functions/introspection.md)) - Символ из объектного файла. +- `symbol` ([Строка](../../sql-reference/functions/introspection.md)) - символ из объектного файла. 
**Возвращаемое значение** @@ -236,13 +236,13 @@ demangle(symbol) Включение функций самоанализа: ``` sql -SET allow_introspection_functions=1 +SET allow_introspection_functions=1; ``` Выбор первой строки из списка `trace_log` системная таблица: ``` sql -SELECT * FROM system.trace_log LIMIT 1 \G +SELECT * FROM system.trace_log LIMIT 1 \G; ``` ``` text @@ -262,7 +262,7 @@ trace: [94138803686098,94138815010911,94138815096522,94138815101224,9413 Получение имени функции для одного адреса: ``` sql -SELECT demangle(addressToSymbol(94138803686098)) \G +SELECT demangle(addressToSymbol(94138803686098)) \G; ``` ``` text @@ -336,6 +336,7 @@ SELECT tid(); │ 3878 │ └───────┘ ``` + ## logTrace {#logtrace} Выводит сообщение в лог сервера для каждого [Block](https://clickhouse.tech/docs/ru/development/architecture/#block). @@ -346,7 +347,7 @@ SELECT tid(); logTrace('message') ``` -**Параметры** +**Аргументы** - `message` — сообщение, которое отправляется в серверный лог. [String](../../sql-reference/data-types/string.md#string). @@ -354,7 +355,7 @@ logTrace('message') - Всегда возвращает 0. -**Example** +**Пример** Запрос: @@ -370,4 +371,4 @@ SELECT logTrace('logTrace message'); └──────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/introspection/) \ No newline at end of file +[Original article](https://clickhouse.tech/docs/en/query_language/functions/introspection/) diff --git a/docs/ru/sql-reference/functions/ip-address-functions.md b/docs/ru/sql-reference/functions/ip-address-functions.md index 52f0a92bc9f..d7f6d2f7618 100644 --- a/docs/ru/sql-reference/functions/ip-address-functions.md +++ b/docs/ru/sql-reference/functions/ip-address-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 55 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0049\u0050\u002d\u0430\u0434\u0440\u0435\u0441\u0430\u043c\u0438" +toc_title: "Функции для работы с IP-адресами" --- # Функции для работы с IP-адресами {#funktsii-dlia-raboty-s-ip-adresami} @@ -9,10 +9,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Принимает число типа UInt32. Интерпретирует его, как IPv4-адрес в big endian. Возвращает строку, содержащую соответствующий IPv4-адрес в формате A.B.C.D (числа в десятичной форме через точки). +Синоним: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} Функция, обратная к IPv4NumToString. Если IPv4 адрес в неправильном формате, то возвращает 0. +Синоним: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Похоже на IPv4NumToString, но вместо последнего октета используется xxx. @@ -49,7 +53,11 @@ LIMIT 10 ### IPv6NumToString(x) {#ipv6numtostringx} Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую этот адрес в текстовом виде. -IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. Примеры: +IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. + +Примеры: `INET6_NTOA`. + +Примеры: ``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr @@ -113,17 +121,60 @@ LIMIT 10 └────────────────────────────┴────────┘ ``` -## IPv6StringToNum(s) {#ipv6stringtonums} +## IPv6StringToNum {#ipv6stringtonums} + +Функция, обратная к [IPv6NumToString](#ipv6numtostringx). Если IPv6 адрес передан в неправильном формате, то возвращает строку из нулевых байт. 
+ +Если IP адрес является корректным IPv4 адресом, функция возвращает его IPv6 эквивалент. -Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт. HEX может быть в любом регистре. +Синоним: `INET6_ATON`. + +**Синтаксис** + +``` sql +IPv6StringToNum(string) +``` + +**Аргумент** + +- `string` — IP адрес. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- Адрес IPv6 в двоичном представлении. + +Тип: [FixedString(16)](../../sql-reference/data-types/fixedstring.md). + +**Пример** + +Запрос: + +``` sql +SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr; +``` + +Результат: + +``` text +┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐ +│ notaddress │ :: │ +│ 127.0.0.1 │ ::ffff:127.0.0.1 │ +│ 1111::ffff │ 1111::ffff │ +└────────────┴──────────────────────────────────────┘ +``` + +**Смотрите также** + +- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4). + ## IPv4ToIPv6(x) {#ipv4toipv6x} Принимает число типа `UInt32`. Интерпретирует его, как IPv4-адрес в [big endian](https://en.wikipedia.org/wiki/Endianness). Возвращает значение `FixedString(16)`, содержащее адрес IPv6 в двоичном формате. Примеры: ``` sql -SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr +SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr; ``` ``` text @@ -156,7 +207,7 @@ SELECT Принимает на вход IPv4 и значение `UInt8`, содержащее [CIDR](https://ru.wikipedia.org/wiki/Бесклассовая_адресация). Возвращает кортеж с двумя IPv4, содержащими нижний и более высокий диапазон подсети. ``` sql -SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) +SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16); ``` ``` text @@ -170,7 +221,7 @@ SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) Принимает на вход IPv6 и значение `UInt8`, содержащее CIDR. Возвращает кортеж с двумя IPv6, содержащими нижний и более высокий диапазон подсети. ``` sql -SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32) +SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ``` ``` text @@ -211,38 +262,62 @@ SELECT └───────────────────────────────────┴──────────────────────────┘ ``` -## toIPv6(string) {#toipv6string} +## toIPv6 {#toipv6string} -Псевдоним функции `IPv6StringToNum()` которая принимает строку с адресом IPv6 и возвращает значение типа [IPv6](../../sql-reference/functions/ip-address-functions.md), которое равно значению, возвращаемому функцией `IPv6StringToNum()`. +Приводит строку с адресом в формате IPv6 к типу [IPv6](../../sql-reference/data-types/domains/ipv6.md). Возвращает пустое значение, если входящая строка не является корректным IP адресом. +Похоже на функцию [IPv6StringToNum](#ipv6stringtonums), которая представляет адрес IPv6 в двоичном виде. -``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string -SELECT - toTypeName(IPv6StringToNum(IPv6_string)), - toTypeName(toIPv6(IPv6_string)) +Если входящая строка содержит корректный IPv4 адрес, функция возвращает его IPv6 эквивалент. + +**Синтаксис** + +```sql +toIPv6(string) ``` -``` text -┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ -│ FixedString(16) │ IPv6 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` +**Аргумент** + +- `string` — IP адрес. [String](../../sql-reference/data-types/string.md) + +**Возвращаемое значение** + +- IP адрес. 
+ +Тип: [IPv6](../../sql-reference/data-types/domains/ipv6.md). + +**Примеры** + +Запрос: ``` sql -WITH - '2001:438:ffff::407d:1bc1' as IPv6_string +WITH '2001:438:ffff::407d:1bc1' AS IPv6_string SELECT hex(IPv6StringToNum(IPv6_string)), - hex(toIPv6(IPv6_string)) + hex(toIPv6(IPv6_string)); ``` +Результат: + ``` text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ ``` +Запрос: + +``` sql +SELECT toIPv6('127.0.0.1'); +``` + +Результат: + +``` text +┌─toIPv6('127.0.0.1')─┐ +│ ::ffff:127.0.0.1 │ +└─────────────────────┘ +``` + ## isIPv4String {#isipv4string} Определяет, является ли строка адресом IPv4 или нет. Также вернет `0`, если `string` — адрес IPv6. @@ -253,7 +328,7 @@ SELECT isIPv4String(string) ``` -**Параметры** +**Аргументы** - `string` — IP адрес. [String](../../sql-reference/data-types/string.md). @@ -268,7 +343,7 @@ isIPv4String(string) Запрос: ```sql -SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr +SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr; ``` Результат: @@ -291,7 +366,7 @@ SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:1 isIPv6String(string) ``` -**Параметры** +**Аргументы** - `string` — IP адрес. [String](../../sql-reference/data-types/string.md). @@ -306,7 +381,7 @@ isIPv6String(string) Запрос: ``` sql -SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr +SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr; ``` Результат: @@ -320,4 +395,54 @@ SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0 └──────────────────┴────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/ip_address_functions/) +## isIPAddressInRange {#isipaddressinrange} + +Проверяет попадает ли IP адрес в интервал, заданный в [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) нотации. + +**Syntax** + +``` sql +isIPAddressInRange(address, prefix) +``` +Функция принимает IPv4 или IPv6 адрес виде строки. Возвращает `0`, если версия адреса и интервала не совпадают. + +**Аргументы** + +- `address` — IPv4 или IPv6 адрес. [String](../../sql-reference/data-types/string.md). +- `prefix` — IPv4 или IPv6 подсеть, заданная в CIDR нотации. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- `1` или `0`. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
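Функцию удобно использовать в секции `WHERE` для отбора адресов из заданной подсети. Ниже приведён схематичный пример; таблица `access_log` и столбец `client_ip` выбраны условно:

``` sql
-- количество запросов, пришедших из подсети 10.0.0.0/8
SELECT count()
FROM access_log
WHERE isIPAddressInRange(client_ip, '10.0.0.0/8');
```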
+ +**Примеры** + +Запрос: + +``` sql +SELECT isIPAddressInRange('127.0.0.1', '127.0.0.0/8') +``` + +Результат: + +``` text +┌─isIPAddressInRange('127.0.0.1', '127.0.0.0/8')─┐ +│ 1 │ +└────────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT isIPAddressInRange('127.0.0.1', 'ffff::/16') +``` + +Результат: + +``` text +┌─isIPAddressInRange('127.0.0.1', 'ffff::/16')─┐ +│ 0 │ +└──────────────────────────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/json-functions.md b/docs/ru/sql-reference/functions/json-functions.md index 69b8f8f98f5..4de487c03ad 100644 --- a/docs/ru/sql-reference/functions/json-functions.md +++ b/docs/ru/sql-reference/functions/json-functions.md @@ -16,51 +16,65 @@ toc_title: JSON ## visitParamHas(params, name) {#visitparamhasparams-name} -Проверить наличие поля с именем name. +Проверяет наличие поля с именем `name`. + +Алиас: `simpleJSONHas`. ## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -Распарсить UInt64 из значения поля с именем name. Если поле строковое - попытаться распарсить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то вернуть 0. +Пытается выделить число типа UInt64 из значения поля с именем `name`. Если поле строковое, пытается выделить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то возвращает 0. + +Алиас: `simpleJSONExtractUInt`. ## visitParamExtractInt(params, name) {#visitparamextractintparams-name} Аналогично для Int64. +Алиас: `simpleJSONExtractInt`. + ## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} Аналогично для Float64. +Алиас: `simpleJSONExtractFloat`. + ## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} -Распарсить значение true/false. Результат - UInt8. +Пытается выделить значение true/false. Результат — UInt8. + +Алиас: `simpleJSONExtractBool`. ## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} -Вернуть значение поля, включая разделители. +Возвращает значение поля, включая разделители. + +Алиас: `simpleJSONExtractRaw`. Примеры: ``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' +visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'; +visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'; ``` ## visitParamExtractString(params, name) {#visitparamextractstringparams-name} -Распарсить строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. +Разбирает строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. + +Алиас: `simpleJSONExtractString`. Примеры: ``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' +visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'; +visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'; +visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''; +visitParamExtractString('{"abc":"hello}', 'abc') = ''; ``` -На данный момент, не поддерживаются записанные в формате `\uXXXX\uYYYY` кодовые точки не из basic multilingual plane (они переводятся не в UTF-8, а в CESU-8). 
+На данный момент не поддерживаются записанные в формате `\uXXXX\uYYYY` кодовые точки не из basic multilingual plane (они переводятся не в UTF-8, а в CESU-8). -Следующие функции используют [simdjson](https://github.com/lemire/simdjson) который разработан под более сложные требования для разбора JSON. Упомянутое выше предположение 2 по-прежнему применимо. +Следующие функции используют [simdjson](https://github.com/lemire/simdjson), который разработан под более сложные требования для разбора JSON. Упомянутое выше допущение 2 по-прежнему применимо. ## isValidJSON(json) {#isvalidjsonjson} @@ -211,7 +225,7 @@ SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') Пример: ``` sql -SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' +SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` ## JSONExtractArrayRaw(json\[, indices_or_keys\]…) {#jsonextractarrayrawjson-indices-or-keys} @@ -223,7 +237,7 @@ SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, Пример: ``` sql -SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' +SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']'; ``` ## JSONExtractKeysAndValuesRaw {#json-extract-keys-and-values-raw} @@ -236,29 +250,28 @@ SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = JSONExtractKeysAndValuesRaw(json[, p, a, t, h]) ``` -**Параметры** +**Аргументы** -- `json` — [Строка](../data-types/string.md), содержащая валидный JSON. -- `p, a, t, h` — Индексы или ключи, разделенные запятыми, которые указывают путь к внутреннему полю во вложенном объекте JSON. Каждый аргумент может быть либо [строкой](../data-types/string.md) для получения поля по ключу, либо [целым числом](../data-types/int-uint.md) для получения N-го поля (индексирование начинается с 1, отрицательные числа используются для отсчета с конца). Если параметр не задан, весь JSON парсится как объект верхнего уровня. Необязательный параметр. +- `json` — [строка](../data-types/string.md), содержащая валидный JSON. +- `p, a, t, h` — индексы или ключи, разделенные запятыми, которые указывают путь к внутреннему полю во вложенном объекте JSON. Каждый аргумент может быть либо [строкой](../data-types/string.md) для получения поля по ключу, либо [целым числом](../data-types/int-uint.md) для получения N-го поля (индексирование начинается с 1, отрицательные числа используются для отсчета с конца). Если параметр не задан, весь JSON парсится как объект верхнего уровня. Необязательный параметр. **Возвращаемые значения** -- Массив с кортежами `('key', 'value')`. Члены кортежа — строки. +- Массив с кортежами `('key', 'value')`. Члены кортежа — строки. -- Пустой массив, если заданный объект не существует или входные данные не валидный JSON. +- Пустой массив, если заданный объект не существует или входные данные не валидный JSON. -Тип: Type: [Array](../data-types/array.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md), [String](../data-types/string.md)). -. +Тип: [Array](../data-types/array.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md), [String](../data-types/string.md)). 
**Примеры** Запрос: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}'); ``` -Ответ: +Результат: ``` text ┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}')─┐ @@ -269,10 +282,10 @@ SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello" Запрос: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b'); ``` -Ответ: +Результат: ``` text ┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', 'b')─┐ @@ -283,15 +296,13 @@ SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello" Запрос: ``` sql -SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c') +SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c'); ``` -Ответ: +Результат: ``` text ┌─JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello", "f": "world"}}}', -1, 'c')─┐ │ [('d','"hello"'),('f','"world"')] │ └───────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/json_functions/) diff --git a/docs/ru/sql-reference/functions/logical-functions.md b/docs/ru/sql-reference/functions/logical-functions.md index 9b1ee6a66a7..8566657d2eb 100644 --- a/docs/ru/sql-reference/functions/logical-functions.md +++ b/docs/ru/sql-reference/functions/logical-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 37 -toc_title: "\u041b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Логические функции" --- # Логические функции {#logicheskie-funktsii} @@ -17,4 +17,3 @@ toc_title: "\u041b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u ## xor {#xor} -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/logical_functions/) diff --git a/docs/ru/sql-reference/functions/machine-learning-functions.md b/docs/ru/sql-reference/functions/machine-learning-functions.md index 2ffdfd05613..a1716eed6c2 100644 --- a/docs/ru/sql-reference/functions/machine-learning-functions.md +++ b/docs/ru/sql-reference/functions/machine-learning-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 64 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043c\u0430\u0448\u0438\u043d\u043d\u043e\u0433\u043e\u0020\u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f" +toc_title: "Функции машинного обучения" --- # Функции машинного обучения {#funktsii-mashinnogo-obucheniia} @@ -27,7 +27,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043c\u0430\u0448\u bayesAB(distribution_name, higher_is_better, variant_names, x, y) ``` -**Параметры** +**Аргументы** - `distribution_name` — вероятностное распределение. [String](../../sql-reference/data-types/string.md). Возможные значения: @@ -36,14 +36,14 @@ bayesAB(distribution_name, higher_is_better, variant_names, x, y) - `higher_is_better` — способ определения предпочтений. [Boolean](../../sql-reference/data-types/boolean.md). 
Возможные значения: - - `0` - чем меньше значение, тем лучше - - `1` - чем больше значение, тем лучше + - `0` — чем меньше значение, тем лучше + - `1` — чем больше значение, тем лучше -- `variant_names` - массив, содержащий названия вариантов. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- `variant_names` — массив, содержащий названия вариантов. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -- `x` - массив, содержащий число проведенных тестов (испытаний) для каждого варианта. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). +- `x` — массив, содержащий число проведенных тестов (испытаний) для каждого варианта. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). -- `y` - массив, содержащий число успешных тестов (испытаний) для каждого варианта. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). +- `y` — массив, содержащий число успешных тестов (испытаний) для каждого варианта. [Array](../../sql-reference/data-types/array.md)([Float64](../../sql-reference/data-types/float.md)). !!! note "Замечание" Все три массива должны иметь одинаковый размер. Все значения `x` и `y` должны быть неотрицательными числами (константами). Значение `y` не может превышать соответствующее значение `x`. @@ -51,8 +51,8 @@ bayesAB(distribution_name, higher_is_better, variant_names, x, y) **Возвращаемые значения** Для каждого варианта рассчитываются: -- `beats_control` - вероятность, что данный вариант превосходит контрольный в долгосрочной перспективе -- `to_be_best` - вероятность, что данный вариант является лучшим в долгосрочной перспективе +- `beats_control` — вероятность, что данный вариант превосходит контрольный в долгосрочной перспективе +- `to_be_best` — вероятность, что данный вариант является лучшим в долгосрочной перспективе Тип: JSON. diff --git a/docs/ru/sql-reference/functions/math-functions.md b/docs/ru/sql-reference/functions/math-functions.md index 2e57aca6a0a..da075e922cd 100644 --- a/docs/ru/sql-reference/functions/math-functions.md +++ b/docs/ru/sql-reference/functions/math-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 44 -toc_title: "\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Математические функции" --- # Математические функции {#matematicheskie-funktsii} @@ -54,7 +54,7 @@ toc_title: "\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u Пример (правило трёх сигм): ``` sql -SELECT erf(3 / sqrt(2)) +SELECT erf(3 / sqrt(2)); ``` ``` text @@ -113,7 +113,7 @@ SELECT erf(3 / sqrt(2)) cosh(x) ``` -**Параметры** +**Аргументы** - `x` — угол в радианах. Значения из интервала: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -149,7 +149,7 @@ SELECT cosh(0); acosh(x) ``` -**Параметры** +**Аргументы** - `x` — гиперболический косинус угла. Значения из интервала: `1 <= x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -189,7 +189,7 @@ SELECT acosh(1); sinh(x) ``` -**Параметры** +**Аргументы** - `x` — угол в радианах. Значения из интервала: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -225,7 +225,7 @@ SELECT sinh(0); asinh(x) ``` -**Параметры** +**Аргументы** - `x` — гиперболический синус угла. 
Значения из интервала: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -265,7 +265,7 @@ SELECT asinh(0); atanh(x) ``` -**Параметры** +**Аргументы** - `x` — гиперболический тангенс угла. Значения из интервала: `–1 < x < 1`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -301,7 +301,7 @@ SELECT atanh(0); atan2(y, x) ``` -**Параметры** +**Аргументы** - `y` — координата y точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64). - `x` — координата х точки, в которую проведена линия. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -338,7 +338,7 @@ SELECT atan2(1, 1); hypot(x, y) ``` -**Параметры** +**Аргументы** - `x` — первый катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64). - `y` — второй катет прямоугольного треугольника. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -375,7 +375,7 @@ SELECT hypot(1, 1); log1p(x) ``` -**Параметры** +**Аргументы** - `x` — значения из интервала: `-1 < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -405,4 +405,66 @@ SELECT log1p(0); - [log(x)](../../sql-reference/functions/math-functions.md#logx) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/math_functions/) +## sign(x) {#signx} + +Возвращает знак действительного числа. + +**Синтаксис** + +``` sql +sign(x) +``` + +**Аргумент** + +- `x` — Значения от `-∞` до `+∞`. Любой числовой тип, поддерживаемый ClickHouse. + +**Возвращаемое значение** + +- -1 если `x < 0` +- 0 если `x = 0` +- 1 если `x > 0` + +**Примеры** + +Результат sign() для нуля: + +``` sql +SELECT sign(0); +``` +Результат: + +``` text +┌─sign(0)─┐ +│ 0 │ +└─────────┘ +``` + +Результат sign() для положительного аргумента: + +``` sql +SELECT sign(1); +``` + +Результат: + +``` text +┌─sign(1)─┐ +│ 1 │ +└─────────┘ +``` + +Результат sign() для отрицательного аргумента: + +``` sql +SELECT sign(-1); +``` + +Результат: + +``` text +┌─sign(-1)─┐ +│ -1 │ +└──────────┘ +``` + diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index a738ba755b1..84bbc6af968 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 66 -toc_title: "\u041f\u0440\u043e\u0447\u0438\u0435\u0020\u0444\u0443\u043d\u043a\u0446\u0438\u0438" +toc_title: "Прочие функции" --- # Прочие функции {#other-functions} @@ -16,16 +16,16 @@ toc_title: "\u041f\u0440\u043e\u0447\u0438\u0435\u0020\u0444\u0443\u043d\u043a\u **Синтаксис** ```sql -getMacro(name); +getMacro(name) ``` -**Параметры** +**Аргументы** -- `name` — Имя, которое необходимо получить из секции `macros`. [String](../../sql-reference/data-types/string.md#string). +- `name` — имя, которое необходимо получить из секции `macros`. [String](../../sql-reference/data-types/string.md#string). **Возвращаемое значение** -- Значение по указанному имени. +- Значение по указанному имени. Тип: [String](../../sql-reference/data-types/string.md). @@ -66,7 +66,6 @@ WHERE macro = 'test' └───────┴──────────────┘ ``` - ## FQDN {#fqdn} Возвращает полное имя домена. @@ -74,7 +73,7 @@ WHERE macro = 'test' **Синтаксис** ``` sql -fqdn(); +fqdn() ``` Эта функция регистронезависимая. 
@@ -93,7 +92,7 @@ fqdn(); SELECT FQDN(); ``` -Ответ: +Результат: ``` text ┌─FQDN()──────────────────────────┐ @@ -109,9 +108,9 @@ SELECT FQDN(); basename( expr ) ``` -**Параметры** +**Аргументы** -- `expr` — Выражение, возвращающее значение типа [String](../../sql-reference/functions/other-functions.md). В результирующем значении все бэкслэши должны быть экранированы. +- `expr` — выражение, возвращающее значение типа [String](../../sql-reference/functions/other-functions.md). В результирующем значении все бэкслэши должны быть экранированы. **Возвращаемое значение** @@ -126,7 +125,7 @@ basename( expr ) **Пример** ``` sql -SELECT 'some/long/path/to/file' AS a, basename(a) +SELECT 'some/long/path/to/file' AS a, basename(a); ``` ``` text @@ -136,7 +135,7 @@ SELECT 'some/long/path/to/file' AS a, basename(a) ``` ``` sql -SELECT 'some\\long\\path\\to\\file' AS a, basename(a) +SELECT 'some\\long\\path\\to\\file' AS a, basename(a); ``` ``` text @@ -146,7 +145,7 @@ SELECT 'some\\long\\path\\to\\file' AS a, basename(a) ``` ``` sql -SELECT 'some-file-name' AS a, basename(a) +SELECT 'some-file-name' AS a, basename(a); ``` ``` text @@ -193,7 +192,7 @@ SELECT visibleWidth(NULL) byteSize(argument [, ...]) ``` -**Параметры** +**Аргументы** - `argument` — значение. @@ -246,7 +245,7 @@ INSERT INTO test VALUES(1, 8, 16, 32, 64, -8, -16, -32, -64, 32.32, 64.64); SELECT key, byteSize(u8) AS `byteSize(UInt8)`, byteSize(u16) AS `byteSize(UInt16)`, byteSize(u32) AS `byteSize(UInt32)`, byteSize(u64) AS `byteSize(UInt64)`, byteSize(i8) AS `byteSize(Int8)`, byteSize(i16) AS `byteSize(Int16)`, byteSize(i32) AS `byteSize(Int32)`, byteSize(i64) AS `byteSize(Int64)`, byteSize(f32) AS `byteSize(Float32)`, byteSize(f64) AS `byteSize(Float64)` FROM test ORDER BY key ASC FORMAT Vertical; ``` -Result: +Результат: ``` text Row 1: @@ -324,7 +323,7 @@ SELECT currentUser(); SELECT currentUser(); ``` -Ответ: +Результат: ``` text ┌─currentUser()─┐ @@ -346,14 +345,14 @@ SELECT currentUser(); isConstant(x) ``` -**Параметры** +**Аргументы** -- `x` — Выражение для проверки. +- `x` — выражение для проверки. **Возвращаемые значения** -- `1` — Выражение `x` является константным. -- `0` — Выражение `x` не является константным. +- `1` — выражение `x` является константным. +- `0` — выражение `x` не является константным. Тип: [UInt8](../data-types/int-uint.md). @@ -362,7 +361,7 @@ isConstant(x) Запрос: ```sql -SELECT isConstant(x + 1) FROM (SELECT 43 AS x) +SELECT isConstant(x + 1) FROM (SELECT 43 AS x); ``` Результат: @@ -376,7 +375,7 @@ SELECT isConstant(x + 1) FROM (SELECT 43 AS x) Запрос: ```sql -WITH 3.14 AS pi SELECT isConstant(cos(pi)) +WITH 3.14 AS pi SELECT isConstant(cos(pi)); ``` Результат: @@ -413,10 +412,10 @@ SELECT isConstant(number) FROM numbers(1) ifNotFinite(x,y) -**Параметры** +**Аргументы** -- `x` — Значение, которое нужно проверить на бесконечность. Тип: [Float\*](../../sql-reference/functions/other-functions.md). -- `y` — Запасное значение. Тип: [Float\*](../../sql-reference/functions/other-functions.md). +- `x` — значение, которое нужно проверить на бесконечность. Тип: [Float\*](../../sql-reference/functions/other-functions.md). +- `y` — запасное значение. Тип: [Float\*](../../sql-reference/functions/other-functions.md). **Возвращаемые значения** @@ -458,7 +457,7 @@ SELECT isConstant(number) FROM numbers(1) `bar(x, min, max, width)` рисует полосу ширины пропорциональной `(x - min)` и равной `width` символов при `x = max`. -Параметры: +Аргументы: - `x` — Величина для отображения. 
- `min, max` — Целочисленные константы, значение должно помещаться в `Int64`. @@ -659,7 +658,7 @@ SELECT ## neighbor {#neighbor} -Функция позволяет получить доступ к значению в колонке `column`, находящемуся на смещении `offset` относительно текущей строки. Является частичной реализацией [оконных функций](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` и `LAG()`. +Функция позволяет получить доступ к значению в столбце `column`, находящемуся на смещении `offset` относительно текущей строки. Является частичной реализацией [оконных функций](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` и `LAG()`. **Синтаксис** @@ -667,13 +666,19 @@ SELECT neighbor(column, offset[, default_value]) ``` -Результат функции зависит от затронутых блоков данных и порядка данных в блоке. Если сделать подзапрос с ORDER BY и вызывать функцию извне подзапроса, можно будет получить ожидаемый результат. +Результат функции зависит от затронутых блоков данных и порядка данных в блоке. -**Параметры** +!!! warning "Предупреждение" + Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных. -- `column` — Имя столбца или скалярное выражение. -- `offset` - Смещение от текущей строки `column`. [Int64](../../sql-reference/functions/other-functions.md). -- `default_value` - Опциональный параметр. Значение, которое будет возвращено, если смещение выходит за пределы блока данных. +Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю. +Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса. + +**Аргументы** + +- `column` — имя столбца или скалярное выражение. +- `offset` — смещение от текущей строки `column`. [Int64](../../sql-reference/functions/other-functions.md). +- `default_value` — опциональный параметр. Значение, которое будет возвращено, если смещение выходит за пределы блока данных. **Возвращаемое значение** @@ -690,7 +695,7 @@ neighbor(column, offset[, default_value]) SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; ``` -Ответ: +Результат: ``` text ┌─number─┬─neighbor(number, 2)─┐ @@ -713,7 +718,7 @@ SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; ``` -Ответ: +Результат: ``` text ┌─number─┬─neighbor(number, 2, 999)─┐ @@ -744,7 +749,7 @@ SELECT FROM numbers(16) ``` -Ответ: +Результат: ``` text ┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ @@ -767,13 +772,18 @@ FROM numbers(16) └────────────┴───────┴───────────┴────────────────┘ ``` -## runningDifference(x) {#runningdifferencex} +## runningDifference(x) {#other_functions-runningdifference} Считает разницу между последовательными значениями строк в блоке данных. Возвращает 0 для первой строки и разницу с предыдущей строкой для каждой последующей строки. +!!! warning "Предупреждение" + Функция может взять значение предыдущей строки только внутри текущего обработанного блока данных. + Результат функции зависит от затронутых блоков данных и порядка данных в блоке. -Если сделать подзапрос с ORDER BY и вызывать функцию извне подзапроса, можно будет получить ожидаемый результат. + +Порядок строк, используемый при вычислении функции `runningDifference`, может отличаться от порядка строк, возвращаемых пользователю. 
+Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса. Пример: @@ -839,7 +849,64 @@ WHERE diff != 1 ## runningDifferenceStartingWithFirstValue {#runningdifferencestartingwithfirstvalue} -То же, что и \[runningDifference\] (./other_functions.md # other_functions-runningdifference), но в первой строке возвращается значение первой строки, а не ноль. +То же, что и [runningDifference](./other-functions.md#other_functions-runningdifference), но в первой строке возвращается значение первой строки, а не ноль. + +## runningConcurrency {#runningconcurrency} + +Подсчитывает количество одновременно идущих событий. +У каждого события есть время начала и время окончания. Считается, что время начала включено в событие, а время окончания исключено из него. Столбцы со временем начала и окончания событий должны иметь одинаковый тип данных. +Функция подсчитывает количество событий, происходящих одновременно на момент начала каждого из событий в выборке. + +!!! warning "Предупреждение" + События должны быть отсортированы по возрастанию времени начала. Если это требование нарушено, то функция вызывает исключение. + Каждый блок данных обрабатывается независимо. Если события из разных блоков данных накладываются по времени, они не могут быть корректно обработаны. + +**Синтаксис** + +``` sql +runningConcurrency(start, end) +``` + +**Аргументы** + +- `start` — Столбец с временем начала событий. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). +- `end` — Столбец с временем окончания событий. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md). + +**Возвращаемое значение** + +- Количество одновременно идущих событий на момент начала каждого события. + +Тип: [UInt32](../../sql-reference/data-types/int-uint.md) + +**Пример** + +Рассмотрим таблицу: + +``` text +┌──────start─┬────────end─┐ +│ 2021-03-03 │ 2021-03-11 │ +│ 2021-03-06 │ 2021-03-12 │ +│ 2021-03-07 │ 2021-03-08 │ +│ 2021-03-11 │ 2021-03-12 │ +└────────────┴────────────┘ +``` + +Запрос: + +``` sql +SELECT start, runningConcurrency(start, end) FROM example_table; +``` + +Результат: + +``` text +┌──────start─┬─runningConcurrency(start, end)─┐ +│ 2021-03-03 │ 1 │ +│ 2021-03-06 │ 2 │ +│ 2021-03-07 │ 3 │ +│ 2021-03-11 │ 2 │ +└────────────┴────────────────────────────────┘ +``` ## MACNumToString(num) {#macnumtostringnum} @@ -861,9 +928,9 @@ WHERE diff != 1 getSizeOfEnumType(value) ``` -**Параметры** +**Аргументы** -- `value` — Значение типа `Enum`. +- `value` — значение типа `Enum`. **Возвращаемые значения** @@ -890,9 +957,9 @@ SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x blockSerializedSize(value[, value[, ...]]) ``` -**Параметры** +**Аргументы** -- `value` — Значение произвольного типа. +- `value` — значение произвольного типа. **Возвращаемые значения** @@ -922,9 +989,9 @@ SELECT blockSerializedSize(maxState(1)) as x toColumnTypeName(value) ``` -**Параметры** +**Аргументы** -- `value` — Значение произвольного типа. +- `value` — значение произвольного типа. **Возвращаемые значения** @@ -962,9 +1029,9 @@ SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) dumpColumnStructure(value) ``` -**Параметры** +**Аргументы** -- `value` — Значение произвольного типа. +- `value` — значение произвольного типа. 
**Возвращаемые значения** @@ -992,9 +1059,9 @@ SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) defaultValueOfArgumentType(expression) ``` -**Параметры** +**Аргументы** -- `expression` — Значение произвольного типа или выражение, результатом которого является значение произвольного типа. +- `expression` — значение произвольного типа или выражение, результатом которого является значение произвольного типа. **Возвращаемые значения** @@ -1034,7 +1101,7 @@ SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) defaultValueOfTypeName(type) ``` -**Параметры:** +**Аргументы** - `type` — тип данных. @@ -1066,6 +1133,111 @@ SELECT defaultValueOfTypeName('Nullable(Int8)') └──────────────────────────────────────────┘ ``` +## indexHint {#indexhint} +Возвращает все данные из диапазона, в который попадают данные, соответствующие указанному выражению. +Переданное выражение не будет вычислено. Выбор диапазона производится по индексу. +Индекс в ClickHouse разреженный, при чтении диапазона в ответ попадают «лишние» соседние данные. + +**Синтаксис** + +```sql +SELECT * FROM table WHERE indexHint() +``` + +**Возвращаемое значение** + +Возвращает диапазон индекса, в котором выполняется заданное условие. + +Тип: [Uint8](https://clickhouse.yandex/docs/ru/data_types/int_uint/#diapazony-uint). + +**Пример** + +Рассмотрим пример с использованием тестовых данных таблицы [ontime](../../getting-started/example-datasets/ontime.md). + +Исходная таблица: + +```sql +SELECT count() FROM ontime +``` + +```text +┌─count()─┐ +│ 4276457 │ +└─────────┘ +``` + +В таблице есть индексы по полям `(FlightDate, (Year, FlightDate))`. + +Выполним выборку по дате, где индекс не используется. + +Запрос: + +```sql +SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k +``` + +ClickHouse обработал всю таблицу (`Processed 4.28 million rows`). + +Результат: + +```text +┌──────────k─┬─count()─┐ +│ 2017-01-01 │ 13970 │ +│ 2017-01-02 │ 15882 │ +........................ +│ 2017-09-28 │ 16411 │ +│ 2017-09-29 │ 16384 │ +│ 2017-09-30 │ 12520 │ +└────────────┴─────────┘ +``` + +Для подключения индекса выбираем конкретную дату. + +Запрос: + +```sql +SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k +``` + +При использовании индекса ClickHouse обработал значительно меньшее количество строк (`Processed 32.74 thousand rows`). + +Результат: + +```text +┌──────────k─┬─count()─┐ +│ 2017-09-15 │ 16428 │ +└────────────┴─────────┘ +``` + +Передадим в функцию `indexHint` выражение `k = '2017-09-15'`. + +Запрос: + +```sql +SELECT + FlightDate AS k, + count() +FROM ontime +WHERE indexHint(k = '2017-09-15') +GROUP BY k +ORDER BY k ASC +``` + +ClickHouse применил индекс по аналогии с примером выше (`Processed 32.74 thousand rows`). +Выражение `k = '2017-09-15'` не используется при формировании результата. +Функция `indexHint` позволяет увидеть соседние данные. + +Результат: + +```text +┌──────────k─┬─count()─┐ +│ 2017-09-14 │ 7071 │ +│ 2017-09-15 │ 16428 │ +│ 2017-09-16 │ 1077 │ +│ 2017-09-30 │ 8167 │ +└────────────┴─────────┘ +``` + ## replicate {#other-functions-replicate} Создает массив, заполненный одним значением. @@ -1076,10 +1248,10 @@ SELECT defaultValueOfTypeName('Nullable(Int8)') SELECT replicate(x, arr); ``` -**Параметры** +**Аргументы** -- `arr` — Исходный массив. ClickHouse создаёт новый массив такой же длины как исходный и заполняет его значением `x`. -- `x` — Значение, которым будет заполнен результирующий массив. +- `arr` — исходный массив. 
ClickHouse создаёт новый массив такой же длины как исходный и заполняет его значением `x`. +- `x` — значение, которым будет заполнен результирующий массив. **Возвращаемое значение** @@ -1159,7 +1331,7 @@ filesystemFree() SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; ``` -Ответ: +Результат: ``` text ┌─Free space─┬─Type───┐ @@ -1191,7 +1363,7 @@ filesystemCapacity() SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" ``` -Ответ: +Результат: ``` text ┌─Capacity──┬─Type───┐ @@ -1209,7 +1381,7 @@ SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesy finalizeAggregation(state) ``` -**Параметры** +**Аргументы** - `state` — состояние агрегатной функции. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). @@ -1310,17 +1482,17 @@ FROM numbers(10); **Синтаксис** ```sql -runningAccumulate(agg_state[, grouping]); +runningAccumulate(agg_state[, grouping]) ``` -**Параметры** +**Аргументы** -- `agg_state` — Состояние агрегатной функции. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). -- `grouping` — Ключ группировки. Опциональный параметр. Состояние функции обнуляется, если значение `grouping` меняется. Параметр может быть любого [поддерживаемого типа данных](../../sql-reference/data-types/index.md), для которого определен оператор равенства. +- `agg_state` — состояние агрегатной функции. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). +- `grouping` — ключ группировки. Опциональный параметр. Состояние функции обнуляется, если значение `grouping` меняется. Параметр может быть любого [поддерживаемого типа данных](../../sql-reference/data-types/index.md), для которого определен оператор равенства. **Возвращаемое значение** -- Каждая результирующая строка содержит результат агрегатной функции, накопленный для всех входных строк от 0 до текущей позиции. `runningAccumulate` обнуляет состояния для каждого нового блока данных или при изменении значения `grouping`. +- Каждая результирующая строка содержит результат агрегатной функции, накопленный для всех входных строк от 0 до текущей позиции. `runningAccumulate` обнуляет состояния для каждого нового блока данных или при изменении значения `grouping`. Тип зависит от используемой агрегатной функции. @@ -1419,7 +1591,7 @@ FROM joinGet(join_storage_table_name, `value_column`, join_keys) ``` -**Параметры** +**Аргументы** - `join_storage_table_name` — [идентификатор](../syntax.md#syntax-identifiers), который указывает, откуда производится выборка данных. Поиск по идентификатору осуществляется в базе данных по умолчанию (см. конфигурацию `default_database`). Чтобы переопределить базу данных по умолчанию, используйте команду `USE db_name`, или укажите базу данных и таблицу через разделитель `db_name.db_table`, см. пример. - `value_column` — столбец, из которого нужно произвести выборку данных. @@ -1524,9 +1696,9 @@ SELECT identity(42) randomPrintableASCII(length) ``` -**Параметры** +**Аргументы** -- `length` — Длина результирующей строки. Положительное целое число. +- `length` — длина результирующей строки. Положительное целое число. Если передать `length < 0`, то поведение функции не определено. @@ -1560,7 +1732,7 @@ SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers randomString(length) ``` -**Параметры** +**Аргументы** - `length` — длина строки. 
Положительное целое число. @@ -1608,11 +1780,11 @@ len: 30 randomFixedString(length); ``` -**Параметры** +**Аргументы** -- `length` — Длина строки в байтах. [UInt64](../../sql-reference/data-types/int-uint.md). +- `length` — длина строки в байтах. [UInt64](../../sql-reference/data-types/int-uint.md). -**Returned value(s)** +**Возвращаемое значение** - Строка, заполненная случайными байтами. @@ -1642,12 +1814,12 @@ SELECT randomFixedString(13) as rnd, toTypeName(rnd) **Синтаксис** ``` sql -randomStringUTF8(length); +randomStringUTF8(length) ``` -**Параметры** +**Аргументы** -- `length` — Длина итоговой строки в кодовых точках. [UInt64](../../sql-reference/data-types/int-uint.md). +- `length` — длина итоговой строки в кодовых точках. [UInt64](../../sql-reference/data-types/int-uint.md). **Возвращаемое значение** @@ -1679,7 +1851,7 @@ SELECT randomStringUTF8(13) **Синтаксис** ```sql -getSetting('custom_setting'); +getSetting('custom_setting') ``` **Параметр** @@ -1717,7 +1889,7 @@ SELECT getSetting('custom_a'); isDecimalOverflow(d, [p]) ``` -**Параметры** +**Аргументы** - `d` — число. [Decimal](../../sql-reference/data-types/decimal.md). - `p` — точность. Необязательный параметр. Если опущен, используется исходная точность первого аргумента. Использование этого параметра может быть полезно для извлечения данных в другую СУБД или файл. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). @@ -1754,7 +1926,7 @@ SELECT isDecimalOverflow(toDecimal32(1000000000, 0), 9), countDigits(x) ``` -**Параметры** +**Аргументы** - `x` — [целое](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) или [дробное](../../sql-reference/data-types/decimal.md) число. @@ -1813,7 +1985,7 @@ UNSUPPORTED_METHOD tcpPort() ``` -**Параметры** +**Аргументы** - Нет. @@ -1843,4 +2015,3 @@ SELECT tcpPort(); - [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/other_functions/) diff --git a/docs/ru/sql-reference/functions/random-functions.md b/docs/ru/sql-reference/functions/random-functions.md index f3889504fa6..bbf0affb081 100644 --- a/docs/ru/sql-reference/functions/random-functions.md +++ b/docs/ru/sql-reference/functions/random-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 51 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0433\u0435\u043d\u0435\u0440\u0430\u0446\u0438\u0438\u0020\u043f\u0441\u0435\u0432\u0434\u043e\u0441\u043b\u0443\u0447\u0430\u0439\u043d\u044b\u0445\u0020\u0447\u0438\u0441\u0435\u043b" +toc_title: "Функции генерации псевдослучайных чисел" --- # Функции генерации псевдослучайных чисел {#functions-for-generating-pseudo-random-numbers} @@ -31,9 +31,9 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0433\u0435\u043d\u randConstant([x]) ``` -**Параметры** +**Аргументы** -- `x` — [Выражение](../syntax.md#syntax-expressions), возвращающее значение одного из [поддерживаемых типов данных](../data-types/index.md#data_types). Значение используется, чтобы избежать [склейки одинаковых выражений](index.md#common-subexpression-elimination), если функция вызывается несколько раз в одном запросе. Необязательный параметр. +- `x` — [выражение](../syntax.md#syntax-expressions), возвращающее значение одного из [поддерживаемых типов данных](../data-types/index.md#data_types). 
Значение используется, чтобы избежать [склейки одинаковых выражений](index.md#common-subexpression-elimination), если функция вызывается несколько раз в одном запросе. Необязательный параметр. **Возвращаемое значение** @@ -79,7 +79,7 @@ fuzzBits([s], [prob]) ``` Инвертирует каждый бит `s` с вероятностью `prob`. -**Параметры** +**Аргументы** - `s` — `String` or `FixedString` - `prob` — constant `Float32/64` @@ -107,4 +107,3 @@ FROM numbers(3) └───────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/random_functions/) diff --git a/docs/ru/sql-reference/functions/rounding-functions.md b/docs/ru/sql-reference/functions/rounding-functions.md index 78033160396..276f85bf6b7 100644 --- a/docs/ru/sql-reference/functions/rounding-functions.md +++ b/docs/ru/sql-reference/functions/rounding-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 45 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043e\u043a\u0440\u0443\u0433\u043b\u0435\u043d\u0438\u044f" +toc_title: "Функции округления" --- # Функции округления {#funktsii-okrugleniia} @@ -33,10 +33,10 @@ N может быть отрицательным. round(expression [, decimal_places]) ``` -**Параметры:** +**Аргументы** -- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../sql-reference/functions/rounding-functions.md#data_types). -- `decimal-places` — Целое значение. +- `expression` — число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../sql-reference/functions/rounding-functions.md#data_types). +- `decimal-places` — целое значение. - Если `decimal-places > 0`, то функция округляет значение справа от запятой. - Если `decimal-places < 0` то функция округляет значение слева от запятой. - Если `decimal-places = 0`, то функция округляет значение до целого. В этом случае аргумент можно опустить. @@ -112,13 +112,13 @@ round(3.65, 1) = 3.6 roundBankers(expression [, decimal_places]) ``` -**Параметры** +**Аргументы** -- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../sql-reference/functions/rounding-functions.md#data_types). -- `decimal-places` — Десятичный разряд. Целое число. - - `decimal-places > 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции справа от запятой. Например, `roundBankers(3.55, 1) = 3.6`. - - `decimal-places < 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции слева от запятой. Например, `roundBankers(24.55, -1) = 20`. - - `decimal-places = 0` — Функция округляет значение до целого. В этом случае аргумент можно не передавать. Например, `roundBankers(2.5) = 2`. +- `expression` — число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../sql-reference/functions/rounding-functions.md#data_types). +- `decimal-places` — десятичный разряд. Целое число. + - `decimal-places > 0` — функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции справа от запятой. Например, `roundBankers(3.55, 1) = 3.6`. + - `decimal-places < 0` — функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции слева от запятой. Например, `roundBankers(24.55, -1) = 20`. + - `decimal-places = 0` — функция округляет значение до целого. 
В этом случае аргумент можно не передавать. Например, `roundBankers(2.5) = 2`. **Возвращаемое значение** @@ -177,4 +177,3 @@ roundBankers(10.755, 2) = 11,76 Принимает число. Если число меньше 18 - возвращает 0. Иначе округляет число вниз до чисел из набора: 18, 25, 35, 45, 55. Эта функция специфична для Яндекс.Метрики и предназначена для реализации отчёта по возрасту посетителей. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/rounding_functions/) diff --git a/docs/ru/sql-reference/functions/splitting-merging-functions.md b/docs/ru/sql-reference/functions/splitting-merging-functions.md index d451eabc407..b8d04982b91 100644 --- a/docs/ru/sql-reference/functions/splitting-merging-functions.md +++ b/docs/ru/sql-reference/functions/splitting-merging-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 47 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0440\u0430\u0437\u0431\u0438\u0435\u043d\u0438\u044f\u0020\u0438\u0020\u0441\u043b\u0438\u044f\u043d\u0438\u044f\u0020\u0441\u0442\u0440\u043e\u043a\u0020\u0438\u0020\u043c\u0430\u0441\u0441\u0438\u0432\u043e\u0432" +toc_title: "Функции разбиения и слияния строк и массивов" --- # Функции разбиения и слияния строк и массивов {#funktsii-razbieniia-i-sliianiia-strok-i-massivov} @@ -17,10 +17,10 @@ separator должен быть константной строкой из ро splitByChar(, ) ``` -**Параметры** +**Аргументы** -- `separator` — Разделитель, состоящий из одного символа. [String](../../sql-reference/data-types/string.md). -- `s` — Разбиваемая строка. [String](../../sql-reference/data-types/string.md). +- `separator` — разделитель, состоящий из одного символа. [String](../../sql-reference/data-types/string.md). +- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md). **Возвращаемые значения** @@ -54,10 +54,10 @@ SELECT splitByChar(',', '1,2,3,abcde') splitByString(separator, s) ``` -**Параметры** +**Аргументы** -- `separator` — Разделитель. [String](../../sql-reference/data-types/string.md). -- `s` — Разбиваемая строка. [String](../../sql-reference/data-types/string.md). +- `separator` — разделитель. [String](../../sql-reference/data-types/string.md). +- `s` — разбиваемая строка. [String](../../sql-reference/data-types/string.md). **Возвращаемые значения** @@ -67,7 +67,7 @@ splitByString(separator, s) - Задано несколько последовательных разделителей; - Исходная строка `s` пуста. -Type: [Array](../../sql-reference/data-types/array.md) of [String](../../sql-reference/data-types/string.md). +Тип: [Array](../../sql-reference/data-types/array.md) of [String](../../sql-reference/data-types/string.md). 
**Примеры** @@ -115,4 +115,3 @@ SELECT alphaTokens('abca1abc') └─────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/splitting_merging_functions/) diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index 236583c211a..04af599c09a 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 40 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u043e\u0020\u0441\u0442\u0440\u043e\u043a\u0430\u043c\u0438" +toc_title: "Функции для работы со строками" --- # Функции для работы со строками {#funktsii-dlia-raboty-so-strokami} @@ -70,19 +70,19 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Заменяет некорректные символы UTF-8 на символ `�` (U+FFFD). Все идущие подряд некорректные символы схлопываются в один заменяющий символ. ``` sql -toValidUTF8( input_string ) +toValidUTF8(input_string) ``` -Параметры: +**Аргументы** -- input_string — произвольный набор байтов, представленный как объект типа [String](../../sql-reference/functions/string-functions.md). +- `input_string` — произвольный набор байтов, представленный как объект типа [String](../../sql-reference/functions/string-functions.md). Возвращаемое значение: Корректная строка UTF-8. **Пример** ``` sql -SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') +SELECT toValidUTF8('\x61\xF0\x80\x80\x80b'); ``` ``` text @@ -95,16 +95,18 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Повторяет строку определенное количество раз и объединяет повторяемые значения в одну строку. +Синоним: `REPEAT`. + **Синтаксис** ``` sql repeat(s, n) ``` -**Параметры** +**Аргументы** -- `s` — Строка для повторения. [String](../../sql-reference/functions/string-functions.md). -- `n` — Количество повторов. [UInt](../../sql-reference/functions/string-functions.md). +- `s` — строка для повторения. [String](../../sql-reference/functions/string-functions.md). +- `n` — количество повторов. [UInt](../../sql-reference/functions/string-functions.md). **Возвращаемое значение** @@ -117,10 +119,10 @@ repeat(s, n) Запрос: ``` sql -SELECT repeat('abc', 10) +SELECT repeat('abc', 10); ``` -Ответ: +Результат: ``` text ┌─repeat('abc', 10)──────────────┐ @@ -170,7 +172,7 @@ SELECT format('{} {}', 'Hello', 'World') concat(s1, s2, ...) ``` -**Параметры** +**Аргументы** Значения типа String или FixedString. @@ -185,10 +187,10 @@ concat(s1, s2, ...) Запрос: ``` sql -SELECT concat('Hello, ', 'World!') +SELECT concat('Hello, ', 'World!'); ``` -Ответ: +Результат: ``` text ┌─concat('Hello, ', 'World!')─┐ @@ -208,7 +210,7 @@ SELECT concat('Hello, ', 'World!') concatAssumeInjective(s1, s2, ...) ``` -**Параметры** +**Аргументы** Значения типа String или FixedString. @@ -240,10 +242,10 @@ SELECT * from key_val Запрос: ``` sql -SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) +SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2); ``` -Ответ: +Результат: ``` text ┌─concat(key1, key2)─┬─sum(value)─┐ @@ -273,10 +275,14 @@ SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) Производит кодирование строки s в base64-представление. +Синоним: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение +Синоним: `FROM_BASE64`. 
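+
+Небольшой пример-набросок, показывающий, как `base64Encode` и `base64Decode` дополняют друг друга. Ожидаемый результат приведён ориентировочно (`SGVsbG8=` — base64-представление строки `Hello`):
+
+``` sql
+-- Кодируем строку в base64 и сразу декодируем её обратно.
+SELECT base64Encode('Hello') AS encoded, base64Decode(base64Encode('Hello')) AS decoded;
+```
+
+``` text
+┌─encoded──┬─decoded─┐
+│ SGVsbG8= │ Hello   │
+└──────────┴─────────┘
+```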
+ ## tryBase64Decode(s) {#trybase64decode} Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. @@ -306,7 +312,7 @@ SELECT startsWith('Spider-Man', 'Spi'); SELECT startsWith('Hello, world!', 'He'); ``` -Ответ: +Результат: ``` text ┌─startsWith('Hello, world!', 'He')─┐ @@ -325,7 +331,7 @@ SELECT startsWith('Hello, world!', 'He'); trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) ``` -**Параметры** +**Аргументы** - `trim_character` — один или несколько символов, подлежащие удалению. [String](../../sql-reference/functions/string-functions.md). - `input_string` — строка для обрезки. [String](../../sql-reference/functions/string-functions.md). @@ -341,10 +347,10 @@ trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) Запрос: ``` sql -SELECT trim(BOTH ' ()' FROM '( Hello, world! )') +SELECT trim(BOTH ' ()' FROM '( Hello, world! )'); ``` -Ответ: +Результат: ``` text ┌─trim(BOTH ' ()' FROM '( Hello, world! )')─┐ @@ -364,7 +370,7 @@ trimLeft(input_string) Алиас: `ltrim(input_string)`. -**Параметры** +**Аргументы** - `input_string` — строка для обрезки. [String](../../sql-reference/functions/string-functions.md). @@ -379,10 +385,10 @@ trimLeft(input_string) Запрос: ``` sql -SELECT trimLeft(' Hello, world! ') +SELECT trimLeft(' Hello, world! '); ``` -Ответ: +Результат: ``` text ┌─trimLeft(' Hello, world! ')─┐ @@ -402,7 +408,7 @@ trimRight(input_string) Алиас: `rtrim(input_string)`. -**Параметры** +**Аргументы** - `input_string` — строка для обрезки. [String](../../sql-reference/functions/string-functions.md). @@ -417,10 +423,10 @@ trimRight(input_string) Запрос: ``` sql -SELECT trimRight(' Hello, world! ') +SELECT trimRight(' Hello, world! '); ``` -Ответ: +Результат: ``` text ┌─trimRight(' Hello, world! ')─┐ @@ -440,7 +446,7 @@ trimBoth(input_string) Алиас: `trim(input_string)`. -**Параметры** +**Аргументы** - `input_string` — строка для обрезки. [String](../../sql-reference/functions/string-functions.md). @@ -455,10 +461,10 @@ trimBoth(input_string) Запрос: ``` sql -SELECT trimBoth(' Hello, world! ') +SELECT trimBoth(' Hello, world! '); ``` -Ответ: +Результат: ``` text ┌─trimBoth(' Hello, world! ')─┐ @@ -488,14 +494,15 @@ SELECT trimBoth(' Hello, world! ') Заменяет литералы, последовательности литералов и сложные псевдонимы заполнителями. -**Синтаксис** +**Синтаксис** + ``` sql normalizeQuery(x) ``` -**Параметры** +**Аргументы** -- `x` — Последовательность символов. [String](../../sql-reference/data-types/string.md). +- `x` — последовательность символов. [String](../../sql-reference/data-types/string.md). **Возвращаемое значение** @@ -529,9 +536,9 @@ SELECT normalizeQuery('[1, 2, 3, x]') AS query; normalizedQueryHash(x) ``` -**Параметры** +**Аргументы** -- `x` — Последовательность символов. [String](../../sql-reference/data-types/string.md). +- `x` — последовательность символов. [String](../../sql-reference/data-types/string.md). **Возвращаемое значение** @@ -567,7 +574,7 @@ SELECT normalizedQueryHash('SELECT 1 AS `xyz`') != normalizedQueryHash('SELECT 1 encodeXMLComponent(x) ``` -**Параметры** +**Аргументы** - `x` — последовательность символов. [String](../../sql-reference/data-types/string.md). @@ -597,7 +604,6 @@ Hello, "world"! 'foo' ``` - ## decodeXMLComponent {#decode-xml-component} Заменяет символами предопределенные мнемоники XML: `"` `&` `'` `>` `<` @@ -609,7 +615,7 @@ Hello, "world"! decodeXMLComponent(x) ``` -**Параметры** +**Аргументы** - `x` — последовательность символов. 
[String](../../sql-reference/data-types/string.md). @@ -639,4 +645,66 @@ SELECT decodeXMLComponent('< Σ >'); - [Мнемоники в HTML](https://ru.wikipedia.org/wiki/%D0%9C%D0%BD%D0%B5%D0%BC%D0%BE%D0%BD%D0%B8%D0%BA%D0%B8_%D0%B2_HTML) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_functions/) + + +## extractTextFromHTML {#extracttextfromhtml} + +Функция для извлечения текста из HTML или XHTML. +Она не соответствует всем HTML, XML или XHTML стандартам на 100%, но ее реализация достаточно точная и быстрая. Правила обработки следующие: + +1. Комментарии удаляются. Пример: ``. Комментарий должен оканчиваться символами `-->`. Вложенные комментарии недопустимы. +Примечание: конструкции наподобие `` и `` не являются допустимыми комментариями в HTML, но они будут удалены согласно другим правилам. +2. Содержимое CDATA вставляется дословно. Примечание: формат CDATA специфичен для XML/XHTML. Но он обрабатывается всегда по принципу "наилучшего возможного результата". +3. Элементы `script` и `style` удаляются вместе со всем содержимым. Примечание: предполагается, что закрывающий тег не может появиться внутри содержимого. Например, в JS строковый литерал должен быть экранирован как `"<\/script>"`. +Примечание: комментарии и CDATA возможны внутри `script` или `style` - тогда закрывающие теги не ищутся внутри CDATA. Пример: `]]>`. Но они ищутся внутри комментариев. Иногда возникают сложные случаи: ` var y = "-->"; alert(x + y);` +Примечание: `script` и `style` могут быть названиями пространств имен XML - тогда они не обрабатываются как обычные элементы `script` или `style`. Пример: `Hello`. +Примечание: пробелы возможны после имени закрывающего тега: ``, но не перед ним: `< / script>`. +4. Другие теги или элементы, подобные тегам, удаляются, а их внутреннее содержимое остается. Пример: `.` +Примечание: ожидается, что такой HTML является недопустимым: `` +Примечание: функция также удаляет подобные тегам элементы: `<>`, ``, и т. д. +Примечание: если встречается тег без завершающего символа `>`, то удаляется этот тег и весь следующий за ним текст: `world`, `Helloworld` — в HTML нет пробелов, но функция вставляет их. Также следует учитывать такие варианты написания: `Hello
<p>world</p>`, `Hello<br>world`. Подобные результаты выполнения функции могут использоваться для анализа данных, например, для преобразования HTML-текста в набор используемых слов.
+7. Также обратите внимание, что правильная обработка пробелов требует поддержки `<pre></pre>` и свойств CSS `display` и `white-space`.
+
+**Синтаксис**
+
+``` sql
+extractTextFromHTML(x)
+```
+
+**Аргументы**
+
+-   `x` — текст для обработки. [String](../../sql-reference/data-types/string.md). 
+
+**Возвращаемое значение**
+
+-   Извлеченный текст.
+
+Тип: [String](../../sql-reference/data-types/string.md).
+
+**Примеры**
+
+Первый пример содержит несколько тегов и комментарий. На этом примере также видно, как обрабатываются пробелы.
+Второй пример показывает обработку `CDATA` и тега `script`.
+В третьем примере текст выделяется из полного HTML-ответа, полученного с помощью функции [url](../../sql-reference/table-functions/url.md).
+
+Запрос:
+
+``` sql
+SELECT extractTextFromHTML(' <p> A text with<b>tags</b>. <!-- comments --> </p>
'); +SELECT extractTextFromHTML('CDATA]]> '); +SELECT extractTextFromHTML(html) FROM url('http://www.donothingfor2minutes.com/', RawBLOB, 'html String'); +``` + +Результат: + +``` text +A text with tags . +The content within CDATA +Do Nothing for 2 Minutes 2:00   +``` diff --git a/docs/ru/sql-reference/functions/string-replace-functions.md b/docs/ru/sql-reference/functions/string-replace-functions.md index f334d6804f9..9426e8685b0 100644 --- a/docs/ru/sql-reference/functions/string-replace-functions.md +++ b/docs/ru/sql-reference/functions/string-replace-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 42 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u043e\u0438\u0441\u043a\u0430\u0020\u0438\u0020\u0437\u0430\u043c\u0435\u043d\u044b\u0020\u0432\u0020\u0441\u0442\u0440\u043e\u043a\u0430\u0445" +toc_title: "Функции поиска и замены в строках" --- # Функции поиска и замены в строках {#funktsii-poiska-i-zameny-v-strokakh} @@ -83,4 +83,3 @@ SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res └─────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_replace_functions/) diff --git a/docs/ru/sql-reference/functions/string-search-functions.md b/docs/ru/sql-reference/functions/string-search-functions.md index b7193da6f33..2417a1c6ffd 100644 --- a/docs/ru/sql-reference/functions/string-search-functions.md +++ b/docs/ru/sql-reference/functions/string-search-functions.md @@ -1,13 +1,13 @@ --- toc_priority: 41 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u043e\u0438\u0441\u043a\u0430\u0020\u0432\u0020\u0441\u0442\u0440\u043e\u043a\u0430\u0445" +toc_title: "Функции поиска в строках" --- # Функции поиска в строках {#funktsii-poiska-v-strokakh} Во всех функциях, поиск регистрозависимый по умолчанию. Существуют варианты функций для регистронезависимого поиска. -## position(haystack, needle) {#position} +## position(haystack, needle), locate(haystack, needle) {#position} Поиск подстроки `needle` в строке `haystack`. @@ -21,13 +21,20 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u043e\u0438\u position(haystack, needle[, start_pos]) ``` +``` sql +position(needle IN haystack) +``` + Алиас: `locate(haystack, needle[, start_pos])`. -**Параметры** +!!! note "Примечание" + Синтаксис `position(needle IN haystack)` обеспечивает совместимость с SQL, функция работает так же, как `position(haystack, needle)`. + +**Аргументы** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). -- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) +- `start_pos` — опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md). 
**Возвращаемые значения** @@ -43,10 +50,10 @@ position(haystack, needle[, start_pos]) Запрос: ``` sql -SELECT position('Hello, world!', '!') +SELECT position('Hello, world!', '!'); ``` -Ответ: +Результат: ``` text ┌─position('Hello, world!', '!')─┐ @@ -59,10 +66,10 @@ SELECT position('Hello, world!', '!') Запрос: ``` sql -SELECT position('Привет, мир!', '!') +SELECT position('Привет, мир!', '!'); ``` -Ответ: +Результат: ``` text ┌─position('Привет, мир!', '!')─┐ @@ -70,6 +77,36 @@ SELECT position('Привет, мир!', '!') └───────────────────────────────┘ ``` +**Примеры работы функции с синтаксисом POSITION(needle IN haystack)** + +Запрос: + +```sql +SELECT 1 = position('абв' IN 'абв'); +``` + +Результат: + +```text +┌─equals(1, position('абв', 'абв'))─┐ +│ 1 │ +└───────────────────────────────────┘ +``` + +Запрос: + +```sql +SELECT 0 = position('абв' IN ''); +``` + +Результат: + +```text +┌─equals(0, position('', 'абв'))─┐ +│ 1 │ +└────────────────────────────────┘ +``` + ## positionCaseInsensitive {#positioncaseinsensitive} Такая же, как и [position](#position), но работает без учета регистра. Возвращает позицию в байтах найденной подстроки в строке, начиная с 1. @@ -82,11 +119,11 @@ SELECT position('Привет, мир!', '!') positionCaseInsensitive(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). -- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) +- `start_pos` — опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -100,10 +137,10 @@ positionCaseInsensitive(haystack, needle[, start_pos]) Запрос: ``` sql -SELECT positionCaseInsensitive('Hello, world!', 'hello') +SELECT positionCaseInsensitive('Hello, world!', 'hello'); ``` -Ответ: +Результат: ``` text ┌─positionCaseInsensitive('Hello, world!', 'hello')─┐ @@ -125,11 +162,11 @@ SELECT positionCaseInsensitive('Hello, world!', 'hello') positionUTF8(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). -- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) +- `start_pos` — опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md). 
**Возвращаемые значения** @@ -145,10 +182,10 @@ positionUTF8(haystack, needle[, start_pos]) Запрос: ``` sql -SELECT positionUTF8('Привет, мир!', '!') +SELECT positionUTF8('Привет, мир!', '!'); ``` -Ответ: +Результат: ``` text ┌─positionUTF8('Привет, мир!', '!')─┐ @@ -161,7 +198,7 @@ SELECT positionUTF8('Привет, мир!', '!') Запрос для символа `é`, который представлен одной кодовой точкой `U+00E9`: ``` sql -SELECT positionUTF8('Salut, étudiante!', '!') +SELECT positionUTF8('Salut, étudiante!', '!'); ``` Result: @@ -175,10 +212,10 @@ Result: Запрос для символа `é`, который представлен двумя кодовыми точками `U+0065U+0301`: ``` sql -SELECT positionUTF8('Salut, étudiante!', '!') +SELECT positionUTF8('Salut, étudiante!', '!'); ``` -Ответ: +Результат: ``` text ┌─positionUTF8('Salut, étudiante!', '!')─┐ @@ -198,11 +235,11 @@ SELECT positionUTF8('Salut, étudiante!', '!') positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). -- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) +- `start_pos` — опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -216,10 +253,10 @@ positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) Запрос: ``` sql -SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') +SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир'); ``` -Ответ: +Результат: ``` text ┌─positionCaseInsensitiveUTF8('Привет, мир!', 'Мир')─┐ @@ -257,7 +294,7 @@ multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) Query: ``` sql -SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) +SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']); ``` Result: @@ -357,7 +394,7 @@ Result: extractAllGroupsHorizontal(haystack, pattern) ``` -**Параметры** +**Аргументы** - `haystack` — строка для разбора. Тип: [String](../../sql-reference/data-types/string.md). - `pattern` — регулярное выражение, построенное по синтаксическим правилам [re2](https://github.com/google/re2/wiki/Syntax). Выражение должно содержать группы, заключенные в круглые скобки. Если выражение не содержит групп, генерируется исключение. Тип: [String](../../sql-reference/data-types/string.md). @@ -373,7 +410,7 @@ extractAllGroupsHorizontal(haystack, pattern) Запрос: ``` sql -SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') +SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); ``` Результат: @@ -384,8 +421,9 @@ SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=( └──────────────────────────────────────────────────────────────────────────────────────────┘ ``` -**См. также** -- функция [extractAllGroupsVertical](#extractallgroups-vertical) +**Смотрите также** + +- Функция [extractAllGroupsVertical](#extractallgroups-vertical) ## extractAllGroupsVertical {#extractallgroups-vertical} @@ -397,7 +435,7 @@ SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=( extractAllGroupsVertical(haystack, pattern) ``` -**Параметры** +**Аргументы** - `haystack` — строка для разбора. Тип: [String](../../sql-reference/data-types/string.md). 
- `pattern` — регулярное выражение, построенное по синтаксическим правилам [re2](https://github.com/google/re2/wiki/Syntax). Выражение должно содержать группы, заключенные в круглые скобки. Если выражение не содержит групп, генерируется исключение. Тип: [String](../../sql-reference/data-types/string.md). @@ -413,7 +451,7 @@ extractAllGroupsVertical(haystack, pattern) Запрос: ``` sql -SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') +SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); ``` Результат: @@ -424,8 +462,9 @@ SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[ └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -**См. также** -- функция [extractAllGroupsHorizontal](#extractallgroups-horizontal) +**Смотрите также** + +- Функция [extractAllGroupsHorizontal](#extractallgroups-horizontal) ## like(haystack, pattern), оператор haystack LIKE pattern {#function-like} @@ -455,10 +494,10 @@ SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[ ilike(haystack, pattern) ``` -**Параметры** +**Аргументы** -- `haystack` — Входная строка. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `pattern` — Если `pattern` не содержит процента или нижнего подчеркивания, тогда `pattern` представляет саму строку. Нижнее подчеркивание (`_`) в `pattern` обозначает любой отдельный символ. Знак процента (`%`) соответствует последовательности из любого количества символов: от нуля и более. +- `haystack` — входная строка. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `pattern` — если `pattern` не содержит процента или нижнего подчеркивания, тогда `pattern` представляет саму строку. Нижнее подчеркивание (`_`) в `pattern` обозначает любой отдельный символ. Знак процента (`%`) соответствует последовательности из любого количества символов: от нуля и более. Некоторые примеры `pattern`: @@ -490,7 +529,7 @@ ilike(haystack, pattern) Запрос: ``` sql -SELECT * FROM Months WHERE ilike(name, '%j%') +SELECT * FROM Months WHERE ilike(name, '%j%'); ``` Результат: @@ -530,7 +569,7 @@ SELECT * FROM Months WHERE ilike(name, '%j%') countMatches(haystack, pattern) ``` -**Параметры** +**Аргументы** - `haystack` — строка, по которой выполняется поиск. [String](../../sql-reference/syntax.md#syntax-string-literal). - `pattern` — регулярное выражение, построенное по синтаксическим правилам [re2](https://github.com/google/re2/wiki/Syntax). [String](../../sql-reference/data-types/string.md). @@ -583,11 +622,11 @@ SELECT countMatches('aaaa', 'aa'); countSubstrings(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, в которой ведется поиск. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — искомая подстрока. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). +- `start_pos` — позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -649,11 +688,11 @@ SELECT countSubstrings('abc___abc', 'abc', 4); countSubstringsCaseInsensitive(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, в которой ведется поиск. 
[String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — искомая подстрока. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). +- `start_pos` — позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -715,11 +754,11 @@ SELECT countSubstringsCaseInsensitive('abC___abC', 'aBc', 2); SELECT countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` -**Параметры** +**Аргументы** - `haystack` — строка, в которой ведется поиск. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — искомая подстрока. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). +- `start_pos` — позиция первого символа в строке, с которого начнется поиск. Необязательный параметр. [UInt](../../sql-reference/data-types/int-uint.md). **Возвращаемые значения** @@ -756,5 +795,3 @@ SELECT countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв'); │ 3 │ └────────────────────────────────────────────────────────────┘ ``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_search_functions/) diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md index f88886ec6f1..381743a450b 100644 --- a/docs/ru/sql-reference/functions/tuple-functions.md +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -45,9 +45,9 @@ untuple(x) Чтобы пропустить некоторые столбцы в результате запроса, вы можете использовать выражение `EXCEPT`. -**Параметры** +**Аргументы** -- `x` - функция `tuple`, столбец или кортеж элементов. [Tuple](../../sql-reference/data-types/tuple.md). +- `x` — функция `tuple`, столбец или кортеж элементов. [Tuple](../../sql-reference/data-types/tuple.md). **Возвращаемое значение** @@ -111,4 +111,55 @@ SELECT untuple((* EXCEPT (v2, v3),)) FROM kv; - [Tuple](../../sql-reference/data-types/tuple.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/tuple-functions/) +## tupleHammingDistance {#tuplehammingdistance} + +Возвращает [расстояние Хэмминга](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D1%81%D1%82%D0%BE%D1%8F%D0%BD%D0%B8%D0%B5_%D0%A5%D1%8D%D0%BC%D0%BC%D0%B8%D0%BD%D0%B3%D0%B0) между двумя кортежами одинакового размера. + +**Синтаксис** + +``` sql +tupleHammingDistance(tuple1, tuple2) +``` + +**Аргументы** + +- `tuple1` — первый кортеж. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple2` — второй кортеж. [Tuple](../../sql-reference/data-types/tuple.md). + +Кортежи должны иметь одинаковый размер и тип элементов. + +**Возвращаемое значение** + +- Расстояние Хэмминга. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT tupleHammingDistance((1, 2, 3), (3, 2, 1)) AS HammingDistance; +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` + +Может быть использовано с функциями [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) для проверки строк на совпадение: + +``` sql +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'Clickhouse is a column-oriented database management system for online analytical processing of queries.' AS string); +``` + +Результат: + +``` text +┌─HammingDistance─┐ +│ 2 │ +└─────────────────┘ +``` diff --git a/docs/ru/sql-reference/functions/tuple-map-functions.md b/docs/ru/sql-reference/functions/tuple-map-functions.md index a36613280a1..c385dbd8f87 100644 --- a/docs/ru/sql-reference/functions/tuple-map-functions.md +++ b/docs/ru/sql-reference/functions/tuple-map-functions.md @@ -15,7 +15,7 @@ toc_title: Работа с контейнерами map map(key1, value1[, key2, value2, ...]) ``` -**Параметры** +**Аргументы** - `key` — ключ. [String](../../sql-reference/data-types/string.md) или [Integer](../../sql-reference/data-types/int-uint.md). - `value` — значение. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) или [Array](../../sql-reference/data-types/array.md). @@ -62,9 +62,10 @@ SELECT a['key2'] FROM table_map; └─────────────────────────┘ ``` -**См. также** +**Смотрите также** - тип данных [Map(key, value)](../../sql-reference/data-types/map.md) + ## mapAdd {#function-mapadd} Собирает все ключи и суммирует соответствующие значения. @@ -75,7 +76,7 @@ SELECT a['key2'] FROM table_map; mapAdd(Tuple(Array, Array), Tuple(Array, Array) [, ...]) ``` -**Параметры** +**Аргументы** Аргументами являются [кортежи](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array), где элементы в первом массиве представляют ключи, а второй массив содержит значения для каждого ключа. Все массивы ключей должны иметь один и тот же тип, а все массивы значений должны содержать элементы, которые можно приводить к одному типу ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) или [Float64](../../sql-reference/data-types/float.md#float32-float64)). @@ -111,7 +112,7 @@ SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTy mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...]) ``` -**Параметры** +**Аргументы** Аргументами являются [кортежи](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array), где элементы в первом массиве представляют ключи, а второй массив содержит значения для каждого ключа. Все массивы ключей должны иметь один и тот же тип, а все массивы значений должны содержать элементы, которые можно приводить к одному типу ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) или [Float64](../../sql-reference/data-types/float.md#float32-float64)). @@ -151,10 +152,10 @@ mapPopulateSeries(keys, values[, max]) Количество элементов в `keys` и `values` должно быть одинаковым для каждой строки. 
-**Параметры** +**Аргументы** -- `keys` — Массив ключей [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)). -- `values` — Массив значений. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)). +- `keys` — массив ключей [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)). +- `values` — массив значений. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)). **Возвращаемое значение** @@ -176,4 +177,128 @@ select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type └──────────────────────────────┴───────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) +## mapContains {#mapcontains} + +Определяет, содержит ли контейнер `map` ключ `key`. + +**Синтаксис** + +``` sql +mapContains(map, key) +``` + +**Аргументы** + +- `map` — контейнер Map. [Map](../../sql-reference/data-types/map.md). +- `key` — ключ. Тип соответстует типу ключей параметра `map`. + +**Возвращаемое значение** + +- `1` если `map` включает `key`, иначе `0`. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapContains(a, 'name') FROM test; + +``` + +Результат: + +```text +┌─mapContains(a, 'name')─┐ +│ 1 │ +│ 0 │ +└────────────────────────┘ +``` + +## mapKeys {#mapkeys} + +Возвращает все ключи контейнера `map`. + +**Синтаксис** + +```sql +mapKeys(map) +``` + +**Аргументы** + +- `map` — контейнер Map. [Map](../../sql-reference/data-types/map.md). + +**Возвращаемое значение** + +- Массив со всеми ключами контейнера `map`. + +Тип: [Array](../../sql-reference/data-types/array.md). + +**Пример** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapKeys(a) FROM test; +``` + +Результат: + +```text +┌─mapKeys(a)────────────┐ +│ ['name','age'] │ +│ ['number','position'] │ +└───────────────────────┘ +``` + +## mapValues {#mapvalues} + +Возвращает все значения контейнера `map`. + +**Синтаксис** + +```sql +mapKeys(map) +``` + +**Аргументы** + +- `map` — контейнер Map. [Map](../../sql-reference/data-types/map.md). + +**Возвращаемое значение** + +- Массив со всеми значениями контейнера `map`. + +Тип: [Array](../../sql-reference/data-types/array.md). 
+ +**Примеры** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapValues(a) FROM test; +``` + +Результат: + +```text +┌─mapValues(a)─────┐ +│ ['eleven','11'] │ +│ ['twelve','6.0'] │ +└──────────────────┘ +``` + diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..fc1dd15f8e3 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 38 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f\u0020\u0442\u0438\u043f\u043e\u0432" +toc_title: "Функции преобразования типов" --- # Функции преобразования типов {#funktsii-preobrazovaniia-tipov} @@ -22,7 +22,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u - `toInt128(expr)` — возвращает значение типа `Int128`. - `toInt256(expr)` — возвращает значение типа `Int256`. -**Параметры** +**Аргументы** - `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. @@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** +Запрос: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Результат: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) **Пример** +Запрос: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') **Пример** +Запрос: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -88,7 +100,7 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') - `toUInt64(expr)` — возвращает значение типа `UInt64`. - `toUInt256(expr)` — возвращает значение типа `UInt256`. -**Параметры** +**Аргументы** - `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. @@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** +Запрос: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Результат: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -124,6 +140,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Cиноним: `DATE`. 
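+
+Примерный набросок использования `toDate` (результат показан ориентировочно):
+
+``` sql
+-- Преобразуем строку и значение DateTime к типу Date.
+SELECT toDate('2021-04-12') AS from_string, toDate(toDateTime('2021-04-12 11:22:33')) AS from_datetime;
+```
+
+``` text
+┌─from_string─┬─from_datetime─┐
+│  2021-04-12 │    2021-04-12 │
+└─────────────┴───────────────┘
+```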
+ ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} @@ -154,7 +172,7 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) Эти функции следует использовать вместо функций `toDecimal*()`, если при ошибке обработки входного значения вы хотите получать `NULL` вместо исключения. -**Параметры** +**Аргументы** - `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../sql-reference/functions/type-conversion-functions.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. - `S` — количество десятичных знаков в результирующем значении. @@ -168,20 +186,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) **Примеры** +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -199,7 +225,7 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) Эти функции следует использовать вместо функций `toDecimal*()`, если при ошибке обработки входного значения вы хотите получать `0` вместо исключения. -**Параметры** +**Аргументы** - `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../sql-reference/functions/type-conversion-functions.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. - `S` — количество десятичных знаков в результирующем значении. @@ -213,20 +239,28 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) **Пример** +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +292,18 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. +**Пример** + +Запрос: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Результат: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +321,30 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. 
-Пример: +**Примеры** + +Запрос: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Запрос: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -329,7 +377,7 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut reinterpretAsUUID(fixed_string) ``` -**Параметры** +**Аргументы** - `fixed_string` — cтрока с big-endian порядком байтов. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). @@ -344,7 +392,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Результат: @@ -375,12 +423,53 @@ SELECT uuid = uuid2; └─────────────────────┘ ``` +## reinterpret(x, T) {#type_conversion_function-reinterpret} + +Использует туже самую исходную последовательность байт в памяти для значения `x` и переинтерпретирует ее как конечный тип данных + +Запрос: +```sql +SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint, + reinterpret(toInt8(1), 'Float32') as int_to_float, + reinterpret('1', 'UInt32') as string_to_int; +``` + +Результат: + +``` +┌─int_to_uint─┬─int_to_float─┬─string_to_int─┐ +│ 255 │ 1e-45 │ 49 │ +└─────────────┴──────────────┴───────────────┘ +``` + ## CAST(x, T) {#type_conversion_function-cast} -Преобразует x в тип данных t. -Поддерживается также синтаксис CAST(x AS t). +Преобразует входное значение `x` в указанный тип данных `T`. В отличии от функции `reinterpret` использует внешнее представление значения `x`. -Пример: +Поддерживается также синтаксис `CAST(x AS t)`. + +Обратите внимание, что если значение `x` не может быть преобразовано к типу `T`, возникает переполнение. Например, `CAST(-1, 'UInt8')` возвращает 255. + +**Примеры** + +Запрос: + +```sql +SELECT + cast(toInt8(-1), 'UInt8') AS cast_int_to_uint, + cast(toInt8(1), 'Float32') AS cast_int_to_float, + cast('1', 'UInt32') AS cast_string_to_int +``` + +Результат: + +``` +┌─cast_int_to_uint─┬─cast_int_to_float─┬─cast_string_to_int─┐ +│ 255 │ 1 │ 1 │ +└──────────────────┴───────────────────┴────────────────────┘ +``` + +Запрос: ``` sql SELECT @@ -388,9 +477,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Результат: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -399,12 +490,18 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. Пример: +Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
+ +**Примеры** + +Запрос: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -412,10 +509,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Запрос: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -423,10 +524,98 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null └─────────────────────────────────────────┘ ``` -**См. также** +**Смотрите также** - Настройка [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable) +## accurateCast(x, T) {#type_conversion_function-accurate-cast} + +Преобразует входное значение `x` в указанный тип данных `T`. + +В отличие от функции [cast(x, T)](#type_conversion_function-cast), `accurateCast` не допускает переполнения при преобразовании числовых типов. Например, `accurateCast(-1, 'UInt8')` вызовет исключение. + +**Примеры** + +Запрос: + +``` sql +SELECT cast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +┌─uint8─┐ +│ 255 │ +└───────┘ +``` + +Запрос: + +```sql +SELECT accurateCast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. +``` + +## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} + +Преобразует входное значение `x` в указанный тип данных `T`. + +Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md). Если исходное значение не может быть преобразовано к целевому типу, возвращает [NULL](../../sql-reference/syntax.md#null-literal). + +**Синтаксис** + +```sql +accurateCastOrNull(x, T) +``` + +**Аргументы** + +- `x` — входное значение. +- `T` — имя возвращаемого типа данных. + +**Возвращаемое значение** + +- Значение, преобразованное в указанный тип `T`. + +**Примеры** + +Запрос: + +``` sql +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); +``` + +Результат: + +``` text +┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ +│ Nullable(UInt8) │ +└────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; +``` + +Результат: + +``` text +┌─uint8─┬─int8─┬─fixed_string─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────┴──────┴──────────────┘ +``` + ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Приводит аргумент из числового типа данных к типу данных [IntervalType](../../sql-reference/data-types/special-data-types/interval.md). @@ -444,7 +633,7 @@ toIntervalQuarter(number) toIntervalYear(number) ``` -**Параметры** +**Аргументы** - `number` — длительность интервала. Положительное целое число. 
@@ -454,6 +643,8 @@ toIntervalYear(number) **Пример** +Запрос: + ``` sql WITH toDate('2019-01-01') AS date, @@ -461,9 +652,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Результат: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -471,6 +664,7 @@ SELECT ``` ## parseDateTimeBestEffort {#parsedatetimebesteffort} +## parseDateTime32BestEffort {#parsedatetime32besteffort} Преобразует дату и время в [строковом](../../sql-reference/functions/type-conversion-functions.md) представлении к типу данных [DateTime](../../sql-reference/functions/type-conversion-functions.md#data_type-datetime). @@ -479,10 +673,10 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]); +parseDateTimeBestEffort(time_string[, time_zone]) ``` -**Параметры** +**Аргументы** - `time_string` — строка, содержащая дату и время для преобразования. [String](../../sql-reference/functions/type-conversion-functions.md). - `time_zone` — часовой пояс. Функция анализирует `time_string` в соответствии с заданным часовым поясом. [String](../../sql-reference/functions/type-conversion-functions.md). @@ -522,7 +716,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -537,7 +731,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -552,7 +746,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -566,7 +760,7 @@ AS parseDateTimeBestEffort Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Результат: @@ -577,7 +771,7 @@ SELECT parseDateTimeBestEffort('10 20:19') └─────────────────────────────────────┘ ``` -**См. также** +**Смотрите также** - [Информация о формате ISO 8601 от @xkcd](https://xkcd.com/1179/) - [RFC 1123](https://tools.ietf.org/html/rfc1123) @@ -591,10 +785,10 @@ SELECT parseDateTimeBestEffort('10 20:19') **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` -**Параметры** +**Аргументы** - `time_string` — строка, содержащая дату и время для преобразования. [String](../../sql-reference/data-types/string.md). - `time_zone` — часовой пояс. Функция анализирует `time_string` в соответствии с часовым поясом. [String](../../sql-reference/data-types/string.md). 
@@ -620,7 +814,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -635,7 +829,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -650,7 +844,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -658,9 +852,282 @@ AS parseDateTimeBestEffortUS; └─────────────────────────——┘ ``` -## toUnixTimestamp64Milli -## toUnixTimestamp64Micro -## toUnixTimestamp64Nano +## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} +## parseDateTime32BestEffortOrNull {#parsedatetime32besteffortornull} + +Работает также как [parseDateTimeBestEffort](#parsedatetimebesteffort), но возвращает `NULL` когда получает формат даты который не может быть обработан. + +## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} +## parseDateTime32BestEffortOrZero {#parsedatetime32besteffortorzero} + +Работает также как [parseDateTimeBestEffort](#parsedatetimebesteffort), но возвращает нулевую дату или нулевую дату и время когда получает формат даты который не может быть обработан. + +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) +``` + +**Аргументы** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 цифр. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- `NULL`, если входная строка не может быть преобразована в тип данных `DateTime`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату (`1970-01-01`) или нулевую дату со временем (`1970-01-01 00:00:00`), если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) +``` + +**Аргументы** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 цифр. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- Нулевая дата или нулевая дата со временем, если входная строка не может быть преобразована в тип данных `DateTime`. 
+
+**Примеры**
+
+Запрос:
+
+``` sql
+SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero;
+```
+
+Результат:
+
+``` text
+┌─parseDateTimeBestEffortUSOrZero─┐
+│             2021-02-10 21:12:57 │
+└─────────────────────────────────┘
+```
+
+Запрос:
+
+``` sql
+SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero;
+```
+
+Результат:
+
+``` text
+┌─parseDateTimeBestEffortUSOrZero─┐
+│             2021-02-11 00:12:57 │
+└─────────────────────────────────┘
+```
+
+Запрос:
+
+``` sql
+SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero;
+```
+
+Результат:
+
+``` text
+┌─parseDateTimeBestEffortUSOrZero─┐
+│             2021-02-10 00:00:00 │
+└─────────────────────────────────┘
+```
+
+Запрос:
+
+``` sql
+SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero;
+```
+
+Результат:
+
+``` text
+┌─parseDateTimeBestEffortUSOrZero─┐
+│             1970-01-01 00:00:00 │
+└─────────────────────────────────┘
+```
+
+## parseDateTime64BestEffort {#parsedatetime64besteffort}
+
+Работает так же, как функция [parseDateTimeBestEffort](#parsedatetimebesteffort), но дополнительно понимает миллисекунды и микросекунды и возвращает тип данных `DateTime64(3)` или `DateTime64(6)` в зависимости от заданной точности.
+
+**Синтаксис**
+
+``` sql
+parseDateTime64BestEffort(time_string [, precision [, time_zone]])
+```
+
+**Аргументы**
+
+- `time_string` — строка, содержащая дату или дату со временем для преобразования. [String](../../sql-reference/data-types/string.md).
+- `precision` — требуемая точность: `3` — для миллисекунд, `6` — для микросекунд. По умолчанию — `3`. Опциональный параметр. [UInt8](../../sql-reference/data-types/int-uint.md).
+- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md).
+
+**Примеры**
+
+Запрос:
+
+```sql
+SELECT parseDateTime64BestEffort('2021-01-01') AS a, toTypeName(a) AS t
+UNION ALL
+SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346') AS a, toTypeName(a) AS t
+UNION ALL
+SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',6) AS a, toTypeName(a) AS t
+UNION ALL
+SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',3,'Europe/Moscow') AS a, toTypeName(a) AS t
+FORMAT PrettyCompactMonoBlock
+```
+
+Результат:
+
+``` text
+┌──────────────────────────a─┬─t──────────────────────────────┐
+│ 2021-01-01 01:01:00.123000 │ DateTime64(3)                  │
+│ 2021-01-01 00:00:00.000000 │ DateTime64(3)                  │
+│ 2021-01-01 01:01:00.123460 │ DateTime64(6)                  │
+│ 2020-12-31 22:01:00.123000 │ DateTime64(3, 'Europe/Moscow') │
+└────────────────────────────┴────────────────────────────────┘
+```
+
+## parseDateTime64BestEffortOrNull {#parsedatetime64besteffortornull}
+
+Работает так же, как функция [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает `NULL`, если встречает формат даты, который не может обработать.
+
+## parseDateTime64BestEffortOrZero {#parsedatetime64besteffortorzero}
+
+Работает так же, как функция [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает «нулевую» дату и время, если встречает формат даты, который не может обработать.
+
+## toLowCardinality {#tolowcardinality}
+
+Преобразует входные данные в версию [LowCardinality](../data-types/lowcardinality.md) того же типа данных.
+
+Чтобы преобразовать данные из типа `LowCardinality`, используйте функцию [CAST](#type_conversion_function-cast). 
Например, `CAST(x as String)`. + +**Синтаксис** + +```sql +toLowCardinality(expr) +``` + +**Аргументы** + +- `expr` — [выражение](../syntax.md#syntax-expressions), которое в результате преобразуется в один из [поддерживаемых типов данных](../data-types/index.md#data_types). + +**Возвращаемое значение** + +- Результат преобразования `expr`. + +Тип: `LowCardinality(expr_result_type)` + +**Пример** + +Запрос: + +```sql +SELECT toLowCardinality('1'); +``` + +Результат: + +```text +┌─toLowCardinality('1')─┐ +│ 1 │ +└───────────────────────┘ +``` + +## toUnixTimestamp64Milli {#tounixtimestamp64milli} + +## toUnixTimestamp64Micro {#tounixtimestamp64micro} + +## toUnixTimestamp64Nano {#tounixtimestamp64nano} Преобразует значение `DateTime64` в значение `Int64` с фиксированной точностью менее одной секунды. Входное значение округляется соответствующим образом вверх или вниз в зависимости от его точности. Обратите внимание, что возвращаемое значение - это временная метка в UTC, а не в часовом поясе `DateTime64`. @@ -671,7 +1138,7 @@ AS parseDateTimeBestEffortUS; toUnixTimestamp64Milli(value) ``` -**Параметры** +**Аргументы** - `value` — значение `DateTime64` с любой точностью. @@ -685,10 +1152,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -700,10 +1167,10 @@ SELECT toUnixTimestamp64Milli(dt64) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -711,9 +1178,11 @@ SELECT toUnixTimestamp64Nano(dt64) └─────────────────────────────┘ ``` -## fromUnixTimestamp64Milli -## fromUnixTimestamp64Micro -## fromUnixTimestamp64Nano +## fromUnixTimestamp64Milli {#fromunixtimestamp64milli} + +## fromUnixTimestamp64Micro {#fromunixtimestamp64micro} + +## fromUnixTimestamp64Nano {#fromunixtimestamp64nano} Преобразует значение `Int64` в значение `DateTime64` с фиксированной точностью менее одной секунды и дополнительным часовым поясом. Входное значение округляется соответствующим образом вверх или вниз в зависимости от его точности. Обратите внимание, что входное значение обрабатывается как метка времени UTC, а не метка времени в заданном (или неявном) часовом поясе. @@ -723,7 +1192,7 @@ SELECT toUnixTimestamp64Nano(dt64) fromUnixTimestamp64Milli(value [, ti]) ``` -**Параметры** +**Аргументы** - `value` — значение типы `Int64` с любой точностью. - `timezone` — (не обязательный параметр) часовой пояс в формате `String` для возвращаемого результата. @@ -738,10 +1207,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` -Ответ: +Результат: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -749,45 +1218,6 @@ SELECT fromUnixTimestamp64Milli(i64, 'UTC') └──────────────────────────────────────┘ ``` -## toLowCardinality {#tolowcardinality} - -Преобразует входные данные в версию [LowCardianlity](../data-types/lowcardinality.md) того же типа данных. - -Чтобы преобразовать данные из типа `LowCardinality`, используйте функцию [CAST](#type_conversion_function-cast). Например, `CAST(x as String)`. 
- -**Синтаксис** - -```sql -toLowCardinality(expr) -``` - -**Параметры** - -- `expr` — [Выражение](../syntax.md#syntax-expressions), которое в результате преобразуется в один из [поддерживаемых типов данных](../data-types/index.md#data_types). - - -**Возвращаемое значение** - -- Результат преобразования `expr`. - -Тип: `LowCardinality(expr_result_type)` - -**Example** - -Запрос: - -```sql -SELECT toLowCardinality('1') -``` - -Результат: - -```text -┌─toLowCardinality('1')─┐ -│ 1 │ -└───────────────────────┘ -``` - ## formatRow {#formatrow} Преобразует произвольные выражения в строку заданного формата. @@ -798,10 +1228,10 @@ SELECT toLowCardinality('1') formatRow(format, x, y, ...) ``` -**Параметры** +**Аргументы** -- `format` — Текстовый формат. Например, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). -- `x`,`y`, ... — Выражения. +- `format` — текстовый формат. Например, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). +- `x`,`y`, ... — выражения. **Возвращаемое значение** @@ -813,10 +1243,10 @@ formatRow(format, x, y, ...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -839,10 +1269,10 @@ FROM numbers(3) formatRowNoNewline(format, x, y, ...) ``` -**Параметры** +**Аргументы** -- `format` — Текстовый формат. Например, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). -- `x`,`y`, ... — Выражения. +- `format` — текстовый формат. Например, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). +- `x`,`y`, ... — выражения. **Возвращаемое значение** @@ -854,10 +1284,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ @@ -867,4 +1297,3 @@ FROM numbers(3) └───────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/type_conversion_functions/) diff --git a/docs/ru/sql-reference/functions/url-functions.md b/docs/ru/sql-reference/functions/url-functions.md index 7541e16bed4..bdf9beeabf5 100644 --- a/docs/ru/sql-reference/functions/url-functions.md +++ b/docs/ru/sql-reference/functions/url-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 54 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0055\u0052\u004c" +toc_title: "Функции для работы с URL" --- # Функции для работы с URL {#funktsii-dlia-raboty-s-url} @@ -23,7 +23,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u domain(url) ``` -**Параметры** +**Аргументы** - `url` — URL. Тип — [String](../../sql-reference/functions/url-functions.md). @@ -53,7 +53,7 @@ yandex.com **Пример** ``` sql -SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') +SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk'); ``` ``` text @@ -74,7 +74,7 @@ SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') topLevelDomain(url) ``` -**Параметры** +**Аргументы** - `url` — URL. Тип — [String](../../sql-reference/functions/url-functions.md). 
@@ -96,7 +96,7 @@ https://yandex.com/time/ **Пример** ``` sql -SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') +SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk'); ``` ``` text @@ -138,7 +138,7 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') cutToFirstSignificantSubdomain(URL, TLD) ``` -**Parameters** +**Аргументы** - `URL` — URL. [String](../../sql-reference/data-types/string.md). - `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md). @@ -192,7 +192,7 @@ SELECT cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', ' cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD) ``` -**Параметры** +**Аргументы** - `URL` — URL. [String](../../sql-reference/data-types/string.md). - `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md). @@ -246,7 +246,7 @@ SELECT cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_lis firstSignificantSubdomainCustom(URL, TLD) ``` -**Параметры** +**Аргументы** - `URL` — URL. [String](../../sql-reference/data-types/string.md). - `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md). @@ -355,7 +355,7 @@ SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS Decod netloc(URL) ``` -**Параметры** +**Аргументы** - `url` — URL. Тип — [String](../../sql-reference/data-types/string.md). @@ -405,4 +405,3 @@ SELECT netloc('http://paul@www.example.com:80/'); Удаляет параметр URL с именем name, если такой есть. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/url_functions/) diff --git a/docs/ru/sql-reference/functions/uuid-functions.md b/docs/ru/sql-reference/functions/uuid-functions.md index 6082fcaa712..f0017adbc8b 100644 --- a/docs/ru/sql-reference/functions/uuid-functions.md +++ b/docs/ru/sql-reference/functions/uuid-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 53 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u0020\u0055\u0055\u0049\u0044" +toc_title: "Функции для работы с UUID" --- # Функции для работы с UUID {#funktsii-dlia-raboty-s-uuid} diff --git a/docs/ru/sql-reference/functions/ym-dict-functions.md b/docs/ru/sql-reference/functions/ym-dict-functions.md index c3b04e4ab66..d4bbe2eb709 100644 --- a/docs/ru/sql-reference/functions/ym-dict-functions.md +++ b/docs/ru/sql-reference/functions/ym-dict-functions.md @@ -1,6 +1,6 @@ --- toc_priority: 59 -toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u0020\u0440\u0430\u0431\u043e\u0442\u044b\u0020\u0441\u043e\u0020\u0441\u043b\u043e\u0432\u0430\u0440\u044f\u043c\u0438\u0020\u042f\u043d\u0434\u0435\u043a\u0441\u002e\u041c\u0435\u0442\u0440\u0438\u043a\u0438" +toc_title: "Функции для работы со словарями Яндекс.Метрики" --- # Функции для работы со словарями Яндекс.Метрики {#ym-dict-functions} @@ -113,13 +113,13 @@ LIMIT 15 **Синтаксис** ``` sql -regionToTopContinent(id[, geobase]); +regionToTopContinent(id[, geobase]) ``` -**Параметры** +**Аргументы** -- `id` — Идентификатор региона из геобазы Яндекса. [UInt32](../../sql-reference/functions/ym-dict-functions.md). -- `geobase` — Ключ словаря. Смотрите [Множественные геобазы](#multiple-geobases). 
[String](../../sql-reference/functions/ym-dict-functions.md). Опциональный параметр. +- `id` — идентификатор региона из геобазы Яндекса. [UInt32](../../sql-reference/functions/ym-dict-functions.md). +- `geobase` — ключ словаря. Смотрите [Множественные геобазы](#multiple-geobases). [String](../../sql-reference/functions/ym-dict-functions.md). Опциональный параметр. **Возвращаемое значение** @@ -151,4 +151,3 @@ regionToTopContinent(id[, geobase]); `ua` и `uk` обозначают одно и то же - украинский язык. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/ym_dict_functions/) diff --git a/docs/ru/sql-reference/index.md b/docs/ru/sql-reference/index.md index f59232ee047..62d6a9cecde 100644 --- a/docs/ru/sql-reference/index.md +++ b/docs/ru/sql-reference/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0421\u043F\u0440\u0430\u0432\u043A\u0430 \u043F\u043E SQL" +toc_folder_title: "Справка по SQL" toc_hidden: true toc_priority: 28 toc_title: hidden @@ -13,4 +13,3 @@ toc_title: hidden - [ALTER](statements/alter/index.md#query_language_queries_alter) - [Прочие виды запросов](statements/misc.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/) diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index 4c1290df166..b092dd365bf 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... Если слева стоит один столбец, входящий в индекс, а справа - множество констант, то при выполнении запроса, система воспользуется индексом. -Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел «Внешние данные для обработки запроса»), и затем воспользоваться подзапросом. +Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел [Внешние данные для обработки запроса](../../engines/table-engines/special/external-data.md)), и затем воспользоваться подзапросом. В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. +Если типы данных в левой и правой частях подзапроса `IN` различаются, ClickHouse преобразует значение в левой части к типу данных из правой части. Преобразование выполняется по аналогии с функцией [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null), т.е. тип данных становится [Nullable](../../sql-reference/data-types/nullable.md), а если преобразование не может быть выполнено, возвращается значение [NULL](../../sql-reference/syntax.md#null-literal). + +**Пример** + +Запрос: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Результат: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + Если в качестве правой части оператора указано имя таблицы (например, `UserID IN users`), то это эквивалентно подзапросу `UserID IN (SELECT * FROM users)`. Это используется при работе с внешними данными, отправляемым вместе с запросом. Например, вместе с запросом может быть отправлено множество идентификаторов посетителей, загруженное во временную таблицу users, по которому следует выполнить фильтрацию. 
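+
+Например (имена таблиц `hits` и `users` здесь условные; `users` — временная таблица, созданная из внешних данных запроса), такой запрос может выглядеть так:
+
+``` sql
+-- эквивалентно запросу SELECT count() FROM hits WHERE UserID IN (SELECT * FROM users)
+SELECT count() FROM hits WHERE UserID IN users;
+```
+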
Если в качестве правой части оператора, указано имя таблицы, имеющий движок Set (подготовленное множество, постоянно находящееся в оперативке), то множество не будет создаваться заново при каждом запросе. @@ -197,3 +215,25 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL 5. Если в GLOBAL IN есть частая необходимость, то спланируйте размещение кластера ClickHouse таким образом, чтобы в каждом дата-центре была хотя бы одна реплика каждого шарда, и среди них была быстрая сеть - чтобы запрос целиком можно было бы выполнить, передавая данные в пределах одного дата-центра. В секции `GLOBAL IN` также имеет смысл указывать локальную таблицу - в случае, если эта локальная таблица есть только на сервере-инициаторе запроса, и вы хотите воспользоваться данными из неё на удалённых серверах. + +### Распределенные подзапросы и max_parallel_replicas {#max_parallel_replica-subqueries} + +Когда настройка max_parallel_replicas больше чем 1, распределенные запросы преобразуются. Например, следующий запрос: + +```sql +SELECT CounterID, count() FROM distributed_table_1 WHERE UserID IN (SELECT UserID FROM local_table_2 WHERE CounterID < 100) +SETTINGS max_parallel_replicas=3 +``` + +преобразуются на каждом сервере в + +```sql +SELECT CounterID, count() FROM local_table_1 WHERE UserID IN (SELECT UserID FROM local_table_2 WHERE CounterID < 100) +SETTINGS parallel_replicas_count=3, parallel_replicas_offset=M +``` + +где M значение между 1 и 3 зависящее от того на какой реплике выполняется локальный запрос. Эти параметры влияют на каждую таблицу семейства MergeTree в запросе и имеют тот же эффект, что и применение `SAMPLE 1/3 OFFSET (M-1)/3` для каждой таблицы. + +Поэтому применение настройки max_parallel_replicas даст корректные результаты если обе таблицы имеют одинаковую схему репликации и семплированы по UserID выражению от UserID. В частности, если local_table_2 не имеет семплирующего ключа, будут получены неверные результаты. Тоже правило применяется для JOIN. + +Один из способов избежать этого, если local_table_2 не удовлетворяет требованиям, использовать `GLOBAL IN` или `GLOBAL JOIN`. diff --git a/docs/ru/sql-reference/operators/index.md b/docs/ru/sql-reference/operators/index.md index 1eddfc4dcaf..b7cacaf7a03 100644 --- a/docs/ru/sql-reference/operators/index.md +++ b/docs/ru/sql-reference/operators/index.md @@ -1,6 +1,6 @@ --- toc_priority: 38 -toc_title: "\u041e\u043f\u0435\u0440\u0430\u0442\u043e\u0440\u044b" +toc_title: "Операторы" --- # Операторы {#operatory} @@ -297,4 +297,3 @@ SELECT * FROM t_null WHERE y IS NOT NULL └───┴───┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/operators/) diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md index 7a394e2f684..158ab2e7385 100644 --- a/docs/ru/sql-reference/statements/alter/column.md +++ b/docs/ru/sql-reference/statements/alter/column.md @@ -1,6 +1,6 @@ --- toc_priority: 37 -toc_title: "\u041c\u0430\u043d\u0438\u043f\u0443\u043b\u044f\u0446\u0438\u0438\u0020\u0441\u043e\u0020\u0441\u0442\u043e\u043b\u0431\u0446\u0430\u043c\u0438" +toc_title: "Манипуляции со столбцами" --- # Манипуляции со столбцами {#manipuliatsii-so-stolbtsami} @@ -13,6 +13,7 @@ toc_title: "\u041c\u0430\u043d\u0438\u043f\u0443\u043b\u044f\u0446\u0438\u0438\u - [COMMENT COLUMN](#alter_comment-column) — добавляет комментарий к столбцу; - [MODIFY COLUMN](#alter_modify-column) — изменяет тип столбца, выражение для значения по умолчанию и TTL. 
- [MODIFY COLUMN REMOVE](#modify-remove) — удаляет какое-либо из свойств столбца. +- [RENAME COLUMN](#alter_rename-column) — переименовывает существующий столбец. Подробное описание для каждого действия приведено ниже. @@ -62,6 +63,9 @@ DROP COLUMN [IF EXISTS] name Запрос удаляет данные из файловой системы. Так как это представляет собой удаление целых файлов, запрос выполняется почти мгновенно. +!!! warning "Предупреждение" + Вы не можете удалить столбец, используемый в [материализованном представлениии](../../../sql-reference/statements/create/view.md#materialized). В противном случае будет ошибка. + Пример: ``` sql @@ -116,7 +120,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] [AFTER name_after | F - TTL - Примеры изменения TTL столбца смотрите в разделе [TTL столбца](ttl.md#mergetree-column-ttl). + Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует. @@ -154,10 +158,26 @@ ALTER TABLE table_name MODIFY column_name REMOVE property; ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; ``` -## Смотрите также +**Смотрите также** - [REMOVE TTL](ttl.md). +## RENAME COLUMN {#alter_rename-column} + +Переименовывает существующий столбец. + +Синтаксис: + +```sql +ALTER TABLE table_name RENAME COLUMN column_name TO new_column_name +``` + +**Пример** + +```sql +ALTER TABLE table_with_ttl RENAME COLUMN column_ttl TO column_ttl_new; +``` + ## Ограничения запроса ALTER {#ogranicheniia-zaprosa-alter} Запрос `ALTER` позволяет создавать и удалять отдельные элементы (столбцы) вложенных структур данных, но не вложенные структуры данных целиком. Для добавления вложенной структуры данных, вы можете добавить столбцы с именем вида `name.nested_name` и типом `Array(T)` - вложенная структура данных полностью эквивалентна нескольким столбцам-массивам с именем, имеющим одинаковый префикс до точки. @@ -170,4 +190,3 @@ ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; Для таблиц, которые не хранят данные самостоятельно (типа [Merge](../../../sql-reference/statements/alter/index.md) и [Distributed](../../../sql-reference/statements/alter/index.md)), `ALTER` всего лишь меняет структуру таблицы, но не меняет структуру подчинённых таблиц. Для примера, при ALTER-е таблицы типа `Distributed`, вам также потребуется выполнить запрос `ALTER` для таблиц на всех удалённых серверах. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/column/) diff --git a/docs/ru/sql-reference/statements/alter/constraint.md b/docs/ru/sql-reference/statements/alter/constraint.md index e26db208493..452bf649415 100644 --- a/docs/ru/sql-reference/statements/alter/constraint.md +++ b/docs/ru/sql-reference/statements/alter/constraint.md @@ -1,6 +1,6 @@ --- toc_priority: 43 -toc_title: "\u041c\u0430\u043d\u0438\u043f\u0443\u043b\u044f\u0446\u0438\u0438\u0020\u0441\u0020\u043e\u0433\u0440\u0430\u043d\u0438\u0447\u0435\u043d\u0438\u044f\u043c\u0438" +toc_title: "Манипуляции с ограничениями" --- # Манипуляции с ограничениями (constraints) {#manipuliatsii-s-ogranicheniiami-constraints} @@ -20,4 +20,3 @@ ALTER TABLE [db].name DROP CONSTRAINT constraint_name; Запрос на изменение ограничений для Replicated таблиц реплицируется, сохраняя новые метаданные в ZooKeeper и применяя изменения на всех репликах. 
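+
+Для наглядности — примерная пара запросов (имя таблицы, имя ограничения и условие здесь условные): сначала ограничение добавляется, затем удаляется.
+
+``` sql
+ALTER TABLE visits ADD CONSTRAINT non_negative_duration CHECK Duration >= 0;
+ALTER TABLE visits DROP CONSTRAINT non_negative_duration;
+```
+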
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/constraint/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/delete.md b/docs/ru/sql-reference/statements/alter/delete.md index ee5f03d9d95..70a411dab83 100644 --- a/docs/ru/sql-reference/statements/alter/delete.md +++ b/docs/ru/sql-reference/statements/alter/delete.md @@ -26,4 +26,3 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr - [Синхронность запросов ALTER](../../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) - [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/delete/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/index.md b/docs/ru/sql-reference/statements/alter/index.md index 830c4a5745b..648fb7e7c5c 100644 --- a/docs/ru/sql-reference/statements/alter/index.md +++ b/docs/ru/sql-reference/statements/alter/index.md @@ -69,4 +69,3 @@ ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name Для запросов `ALTER TABLE ... UPDATE|DELETE` синхронность выполнения определяется настройкой [mutations_sync](../../../operations/settings/settings.md#mutations_sync). -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/index/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/index/index.md b/docs/ru/sql-reference/statements/alter/index/index.md index 2cadbbe065e..862def5cc04 100644 --- a/docs/ru/sql-reference/statements/alter/index/index.md +++ b/docs/ru/sql-reference/statements/alter/index/index.md @@ -1,7 +1,7 @@ --- toc_hidden_folder: true toc_priority: 42 -toc_title: "\u041c\u0430\u043d\u0438\u043f\u0443\u043b\u044f\u0446\u0438\u0438\u0020\u0441\u0020\u0438\u043d\u0434\u0435\u043a\u0441\u0430\u043c\u0438" +toc_title: "Манипуляции с индексами" --- # Манипуляции с индексами {#manipuliatsii-s-indeksami} @@ -21,4 +21,3 @@ ALTER TABLE [db].name DROP INDEX name Запрос на изменение индексов реплицируется, сохраняя новые метаданные в ZooKeeper и применяя изменения на всех репликах. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/index/index/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/order-by.md b/docs/ru/sql-reference/statements/alter/order-by.md index 32c0e382445..f0a9bfe3730 100644 --- a/docs/ru/sql-reference/statements/alter/order-by.md +++ b/docs/ru/sql-reference/statements/alter/order-by.md @@ -19,4 +19,3 @@ MODIFY ORDER BY new_expression сортировки, разрешено добавлять в ключ только новые столбцы (т.е. столбцы, добавляемые командой `ADD COLUMN` в том же запросе `ALTER`), у которых нет выражения по умолчанию. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/order-by/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/partition.md b/docs/ru/sql-reference/statements/alter/partition.md index 8776c70c89e..3e7b069b066 100644 --- a/docs/ru/sql-reference/statements/alter/partition.md +++ b/docs/ru/sql-reference/statements/alter/partition.md @@ -306,4 +306,3 @@ OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; Примеры запросов `ALTER ... 
PARTITION` можно посмотреть в тестах: [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_local.sql) и [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/partition/) diff --git a/docs/ru/sql-reference/statements/alter/quota.md b/docs/ru/sql-reference/statements/alter/quota.md index 0bdac1381da..2c73b8dace3 100644 --- a/docs/ru/sql-reference/statements/alter/quota.md +++ b/docs/ru/sql-reference/statements/alter/quota.md @@ -14,14 +14,14 @@ ALTER QUOTA [IF EXISTS] name [ON CLUSTER cluster_name] [RENAME TO new_name] [KEYED BY {user_name | ip_address | client_key | client_key,user_name | client_key,ip_address} | NOT KEYED] [FOR [RANDOMIZED] INTERVAL number {second | minute | hour | day | week | month | quarter | year} - {MAX { {queries | errors | result_rows | result_bytes | read_rows | read_bytes | execution_time} = number } [,...] | + {MAX { {queries | query_selects | query_inserts | errors | result_rows | result_bytes | read_rows | read_bytes | execution_time} = number } [,...] | NO LIMITS | TRACKING ONLY} [,...]] [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] ``` Ключи `user_name`, `ip_address`, `client_key`, `client_key, user_name` и `client_key, ip_address` соответствуют полям таблицы [system.quotas](../../../operations/system-tables/quotas.md). -Параметры `queries`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). +Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). В секции `ON CLUSTER` можно указать кластеры, на которых создается квота, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). @@ -37,6 +37,4 @@ ALTER QUOTA IF EXISTS qA FOR INTERVAL 15 month MAX queries = 123 TO CURRENT_USER ``` sql ALTER QUOTA IF EXISTS qB FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default; -``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/alter/quota/) +``` \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/role.md b/docs/ru/sql-reference/statements/alter/role.md index 69f7c5828c5..e9ce62c58d5 100644 --- a/docs/ru/sql-reference/statements/alter/role.md +++ b/docs/ru/sql-reference/statements/alter/role.md @@ -15,4 +15,3 @@ ALTER ROLE [IF EXISTS] name1 [ON CLUSTER cluster_name1] [RENAME TO new_name1] [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/alter/role/) diff --git a/docs/ru/sql-reference/statements/alter/row-policy.md b/docs/ru/sql-reference/statements/alter/row-policy.md index e2d23cda3ff..cff4d4e497a 100644 --- a/docs/ru/sql-reference/statements/alter/row-policy.md +++ b/docs/ru/sql-reference/statements/alter/row-policy.md @@ -18,4 +18,3 @@ ALTER [ROW] POLICY [IF EXISTS] name1 [ON CLUSTER cluster_name1] ON [database1.]t [TO {role [,...] 
| ALL | ALL EXCEPT role [,...]}] ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/alter/row-policy/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/settings-profile.md b/docs/ru/sql-reference/statements/alter/settings-profile.md index 54502901837..9b8646919ca 100644 --- a/docs/ru/sql-reference/statements/alter/settings-profile.md +++ b/docs/ru/sql-reference/statements/alter/settings-profile.md @@ -15,4 +15,3 @@ ALTER SETTINGS PROFILE [IF EXISTS] TO name1 [ON CLUSTER cluster_name1] [RENAME T [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | INHERIT 'profile_name'] [,...] ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/alter/settings-profile) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/ttl.md b/docs/ru/sql-reference/statements/alter/ttl.md index 5721ec9cf27..2a2d10b69de 100644 --- a/docs/ru/sql-reference/statements/alter/ttl.md +++ b/docs/ru/sql-reference/statements/alter/ttl.md @@ -18,7 +18,7 @@ ALTER TABLE table-name MODIFY TTL ttl-expression Удалить табличный TTL можно запросом следующего вида: ```sql -ALTER TABLE table_name REMOVE TTL +ALTER TABLE table_name REMOVE TTL ``` **Пример** @@ -64,7 +64,7 @@ ALTER TABLE table_with_ttl REMOVE TTL; Заново вставляем удаленную строку и снова принудительно запускаем очистку по `TTL` с помощью `OPTIMIZE`: -```sql +```sql INSERT INTO table_with_ttl VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); OPTIMIZE TABLE table_with_ttl FINAL; SELECT * FROM table_with_ttl; @@ -81,6 +81,5 @@ SELECT * FROM table_with_ttl; ### Смотрите также -- Подробнее о [свойстве TTL](../../../engines/table-engines/mergetree-family/mergetree#table_engine-mergetree-ttl). - -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/ttl/) +- Подробнее о [свойстве TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). +- Изменить столбец [с TTL](../../../sql-reference/statements/alter/column.md#alter_modify-column). \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/update.md b/docs/ru/sql-reference/statements/alter/update.md index e3d6725419a..206412d4be9 100644 --- a/docs/ru/sql-reference/statements/alter/update.md +++ b/docs/ru/sql-reference/statements/alter/update.md @@ -26,4 +26,3 @@ ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr - [Синхронность запросов ALTER](../../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) - [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/update/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/user.md b/docs/ru/sql-reference/statements/alter/user.md index 41574f74200..53d090f8eab 100644 --- a/docs/ru/sql-reference/statements/alter/user.md +++ b/docs/ru/sql-reference/statements/alter/user.md @@ -12,10 +12,10 @@ toc_title: USER ``` sql ALTER USER [IF EXISTS] name1 [ON CLUSTER cluster_name1] [RENAME TO new_name1] [, name2 [ON CLUSTER cluster_name2] [RENAME TO new_name2] ...] - [IDENTIFIED [WITH {PLAINTEXT_PASSWORD|SHA256_PASSWORD|DOUBLE_SHA1_PASSWORD}] BY {'password'|'hash'}] - [[ADD|DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] 
| ANY | NONE] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] + [[ADD | DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...] | ALL | ALL EXCEPT role [,...] ] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] + [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` Для выполнения `ALTER USER` необходима привилегия [ALTER USER](../grant.md#grant-access-management). @@ -44,4 +44,3 @@ ALTER USER user DEFAULT ROLE ALL ALTER USER user DEFAULT ROLE ALL EXCEPT role1, role2 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/user/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/attach.md b/docs/ru/sql-reference/statements/attach.md index 259ab893e63..b135507b818 100644 --- a/docs/ru/sql-reference/statements/attach.md +++ b/docs/ru/sql-reference/statements/attach.md @@ -5,19 +5,55 @@ toc_title: ATTACH # ATTACH Statement {#attach} -Запрос полностью аналогичен запросу `CREATE`, но: +Выполняет подключение таблицы, например, при перемещении базы данных на другой сервер. -- вместо слова `CREATE` используется слово `ATTACH`; -- запрос не создаёт данные на диске, а предполагает, что данные уже лежат в соответствующих местах, и всего лишь добавляет информацию о таблице на сервер. После выполнения запроса `ATTACH` сервер будет знать о существовании таблицы. +Запрос не создаёт данные на диске, а предполагает, что данные уже лежат в соответствующих местах, и всего лишь добавляет информацию о таблице на сервер. После выполнения запроса `ATTACH` сервер будет знать о существовании таблицы. -Если таблица перед этим была отсоединена (`DETACH`), т.е. её структура известна, можно использовать сокращенную форму записи без определения структуры. +Если таблица перед этим была отключена при помощи ([DETACH](../../sql-reference/statements/detach.md)), т.е. её структура известна, можно использовать сокращенную форму записи без определения структуры. + +## Варианты синтаксиса {#syntax-forms} +### Присоединение существующей таблицы {#attach-existing-table} ``` sql ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] ``` -Этот запрос используется при старте сервера. Сервер хранит метаданные таблиц в виде файлов с запросами `ATTACH`, которые он просто исполняет при запуске (за исключением системных таблиц, которые явно создаются на сервере). +Этот запрос используется при старте сервера. Сервер хранит метаданные таблиц в виде файлов с запросами `ATTACH`, которые он просто исполняет при запуске (за исключением некоторых системных таблиц, которые явно создаются на сервере). +Если таблица была отключена перманентно, она не будет подключена обратно во время старта сервера, так что нужно явно использовать запрос `ATTACH`, чтобы подключить ее. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/attach/) +### Создание новой таблицы и присоединение данных {#create-new-table-and-attach-data} +**С указанием пути к табличным данным** + +```sql +ATTACH TABLE name FROM 'path/to/data/' (col1 Type1, ...) +``` + +Cоздает новую таблицу с указанной структурой и присоединяет табличные данные из соответствующего каталога в `user_files`. 
+ +**Пример** + +Запрос: + +```sql +DROP TABLE IF EXISTS test; +INSERT INTO TABLE FUNCTION file('01188_attach/test/data.TSV', 'TSV', 's String, n UInt8') VALUES ('test', 42); +ATTACH TABLE test FROM '01188_attach/test' (s String, n UInt8) ENGINE = File(TSV); +SELECT * FROM test; +``` +Результат: + +```sql +┌─s────┬──n─┐ +│ test │ 42 │ +└──────┴────┘ +``` + +**С указанием UUID таблицы** (Только для баз данных `Atomic`) + +```sql +ATTACH TABLE name UUID '' (col1 Type1, ...) +``` + +Cоздает новую таблицу с указанной структурой и присоединяет данные из таблицы с указанным UUID. diff --git a/docs/ru/sql-reference/statements/check-table.md b/docs/ru/sql-reference/statements/check-table.md index 3dc135d87c6..9592c1a5bc2 100644 --- a/docs/ru/sql-reference/statements/check-table.md +++ b/docs/ru/sql-reference/statements/check-table.md @@ -29,9 +29,36 @@ CHECK TABLE [db.]name В движках `*Log` не предусмотрено автоматическое восстановление данных после сбоя. Используйте запрос `CHECK TABLE`, чтобы своевременно выявлять повреждение данных. -Для движков из семейства `MergeTree` запрос `CHECK TABLE` показывает статус проверки для каждого отдельного куска данных таблицы на локальном сервере. +## Проверка таблиц семейства MergeTree {#checking-mergetree-tables} -**Что делать, если данные повреждены** +Для таблиц семейства `MergeTree` если [check_query_single_value_result](../../operations/settings/settings.md#check_query_single_value_result) = 0, запрос `CHECK TABLE` возвращает статус каждого куска данных таблицы на локальном сервере. + +```sql +SET check_query_single_value_result = 0; +CHECK TABLE test_table; +``` + +```text +┌─part_path─┬─is_passed─┬─message─┐ +│ all_1_4_1 │ 1 │ │ +│ all_1_4_2 │ 1 │ │ +└───────────┴───────────┴─────────┘ +``` + +Если `check_query_single_value_result` = 0, запрос `CHECK TABLE` возвращает статус таблицы в целом. + +```sql +SET check_query_single_value_result = 1; +CHECK TABLE test_table; +``` + +```text +┌─result─┐ +│ 1 │ +└────────┘ +``` + +## Что делать, если данные повреждены {#if-data-is-corrupted} В этом случае можно скопировать оставшиеся неповрежденные данные в другую таблицу. Для этого: @@ -41,4 +68,3 @@ CHECK TABLE [db.]name 4. Перезапустите `clickhouse-client`, чтобы вернуть предыдущее значение параметра `max_threads`. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/check-table/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/database.md b/docs/ru/sql-reference/statements/create/database.md index e6c561f8e0b..7d19f3e8f17 100644 --- a/docs/ru/sql-reference/statements/create/database.md +++ b/docs/ru/sql-reference/statements/create/database.md @@ -1,6 +1,6 @@ --- toc_priority: 35 -toc_title: "\u0411\u0430\u0437\u0430\u0020\u0434\u0430\u043d\u043d\u044b\u0445" +toc_title: "База данных" --- # CREATE DATABASE {#query-language-create-database} @@ -31,5 +31,4 @@ CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(.. По умолчанию ClickHouse использует собственный движок баз данных. 
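+
+**Пример**
+
+Примерный запрос (имя базы данных и движок здесь условные; секция `ENGINE` не обязательна):
+
+``` sql
+CREATE DATABASE IF NOT EXISTS test_db ENGINE = Atomic;
+```
+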
-[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/database) diff --git a/docs/ru/sql-reference/statements/create/dictionary.md b/docs/ru/sql-reference/statements/create/dictionary.md index 3134a89483b..a41b2cb9ad5 100644 --- a/docs/ru/sql-reference/statements/create/dictionary.md +++ b/docs/ru/sql-reference/statements/create/dictionary.md @@ -1,6 +1,6 @@ --- toc_priority: 38 -toc_title: "\u0421\u043b\u043e\u0432\u0430\u0440\u044c" +toc_title: "Словарь" --- # CREATE DICTIONARY {#create-dictionary-query} @@ -27,5 +27,4 @@ LIFETIME({MIN min_val MAX max_val | max_val}) Смотрите [Внешние словари](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/dictionary) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/index.md b/docs/ru/sql-reference/statements/create/index.md index 28ddce2afe3..dfa5c28fff7 100644 --- a/docs/ru/sql-reference/statements/create/index.md +++ b/docs/ru/sql-reference/statements/create/index.md @@ -1,7 +1,7 @@ --- toc_folder_title: CREATE toc_priority: 34 -toc_title: "\u041e\u0431\u0437\u043e\u0440" +toc_title: "Обзор" --- # Запросы CREATE {#create-queries} @@ -18,4 +18,3 @@ toc_title: "\u041e\u0431\u0437\u043e\u0440" - [QUOTA](../../../sql-reference/statements/create/quota.md) - [SETTINGS PROFILE](../../../sql-reference/statements/create/settings-profile.md) -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/statements/create/) diff --git a/docs/ru/sql-reference/statements/create/quota.md b/docs/ru/sql-reference/statements/create/quota.md index 65762071ea2..38957ed8c6d 100644 --- a/docs/ru/sql-reference/statements/create/quota.md +++ b/docs/ru/sql-reference/statements/create/quota.md @@ -1,6 +1,6 @@ --- toc_priority: 42 -toc_title: "\u041a\u0432\u043e\u0442\u0430" +toc_title: "Квота" --- # CREATE QUOTA {#create-quota-statement} @@ -13,13 +13,13 @@ toc_title: "\u041a\u0432\u043e\u0442\u0430" CREATE QUOTA [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] [KEYED BY {user_name | ip_address | client_key | client_key, user_name | client_key, ip_address} | NOT KEYED] [FOR [RANDOMIZED] INTERVAL number {second | minute | hour | day | week | month | quarter | year} - {MAX { {queries | errors | result_rows | result_bytes | read_rows | read_bytes | execution_time} = number } [,...] | + {MAX { {queries | query_selects | query_inserts | errors | result_rows | result_bytes | read_rows | read_bytes | execution_time} = number } [,...] | NO LIMITS | TRACKING ONLY} [,...]] [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] ``` Ключи `user_name`, `ip_address`, `client_key`, `client_key, user_name` и `client_key, ip_address` соответствуют полям таблицы [system.quotas](../../../operations/system-tables/quotas.md). -Параметры `queries`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). +Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). В секции `ON CLUSTER` можно указать кластеры, на которых создается квота, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). 
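+
+Например, примерная квота с использованием новых параметров `query_selects` и `query_inserts` (имя квоты и значения лимитов здесь условные):
+
+``` sql
+CREATE QUOTA qC FOR INTERVAL 1 hour MAX query_selects = 100, query_inserts = 10 TO default;
+```
+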
@@ -35,7 +35,4 @@ CREATE QUOTA qA FOR INTERVAL 15 month MAX queries = 123 TO CURRENT_USER; ``` sql CREATE QUOTA qB FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default; -``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/quota) - \ No newline at end of file +``` \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/role.md b/docs/ru/sql-reference/statements/create/role.md index 521117c0e89..16450b41126 100644 --- a/docs/ru/sql-reference/statements/create/role.md +++ b/docs/ru/sql-reference/statements/create/role.md @@ -1,6 +1,6 @@ --- toc_priority: 40 -toc_title: "\u0420\u043e\u043b\u044c" +toc_title: "Роль" --- # CREATE ROLE {#create-role-statement} @@ -46,5 +46,4 @@ SET ROLE accountant; SELECT * FROM db.*; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/role) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/row-policy.md b/docs/ru/sql-reference/statements/create/row-policy.md index e79a19d4cbe..6fe1dc45815 100644 --- a/docs/ru/sql-reference/statements/create/row-policy.md +++ b/docs/ru/sql-reference/statements/create/row-policy.md @@ -1,11 +1,11 @@ --- toc_priority: 41 -toc_title: "\u041f\u043e\u043b\u0438\u0442\u0438\u043a\u0430\u0020\u0434\u043e\u0441\u0442\u0443\u043f\u0430" +toc_title: "Политика доступа" --- # CREATE ROW POLICY {#create-row-policy-statement} -Создает [фильтры для строк](../../../operations/access-rights.md#row-policy-management), которые пользователь может прочесть из таблицы. +Создает [политики доступа к строкам](../../../operations/access-rights.md#row-policy-management), т.е. фильтры, которые определяют, какие строки пользователь может читать из таблицы. Синтаксис: @@ -13,34 +13,74 @@ toc_title: "\u041f\u043e\u043b\u0438\u0442\u0438\u043a\u0430\u0020\u0434\u043e\u CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1 [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2 ...] [AS {PERMISSIVE | RESTRICTIVE}] - [FOR SELECT] - [USING condition] + [FOR SELECT] USING condition [TO {role [,...] | ALL | ALL EXCEPT role [,...]}] ``` -Секция `ON CLUSTER` позволяет создавать фильтры для строк на кластере, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). +## Секция USING {#create-row-policy-using} -## Секция AS {#create-row-policy-as} - -С помощью данной секции можно создать политику разрешения или ограничения. - -Политика разрешения предоставляет доступ к строкам. Разрешительные политики, которые применяются к одной таблице, объединяются с помощью логического оператора `OR`. Политики являются разрешительными по умолчанию. - -Политика ограничения запрещает доступ к строкам. Ограничительные политики, которые применяются к одной таблице, объединяются логическим оператором `AND`. - -Ограничительные политики применяются к строкам, прошедшим фильтр разрешительной политики. Если вы не зададите разрешительные политики, пользователь не сможет обращаться ни к каким строкам из таблицы. +Секция `USING` указывает условие для фильтрации строк. Пользователь может видеть строку, если это условие, вычисленное для строки, дает ненулевой результат. ## Секция TO {#create-row-policy-to} -В секции `TO` вы можете перечислить как роли, так и пользователей. Например, `CREATE ROW POLICY ... TO accountant, john@localhost`. +В секции `TO` перечисляются пользователи и роли, для которых должна действовать политика. 
Например, `CREATE ROW POLICY ... TO accountant, john@localhost`.

Ключевым словом `ALL` обозначаются все пользователи, включая текущего. Ключевые слова `ALL EXCEPT` позволяют исключить пользователей из списка всех пользователей. Например, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost`

+!!! note "Примечание"
+    Если для таблицы не задано ни одной политики доступа к строкам, то любой пользователь может выполнить команду SELECT и получить все строки таблицы. Если определить хотя бы одну политику для таблицы, то доступ к строкам будет управляться этими политиками, причем для всех пользователей (даже для тех, для кого политики не определялись). Например, следующая политика
+
+    `CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter`
+
+    запретит пользователям `mira` и `peter` видеть строки с `b != 1`, а также запретит всем остальным пользователям (например, пользователю `paul`) видеть какие-либо строки из таблицы `mydb.table1`.
+
+    Если это нежелательно, такое поведение можно исправить, определив дополнительную политику:
+
+    `CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter`
+
+## Секция AS {#create-row-policy-as}
+
+Для одной таблицы и одного пользователя одновременно может быть активно несколько политик, поэтому нужен способ их комбинировать.
+
+По умолчанию политики комбинируются с использованием логического оператора `OR`. Например, политики:
+
+``` sql
+CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter
+CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 TO peter, antonio
+```
+
+разрешат пользователю с именем `peter` видеть строки, для которых будет верно `b=1` или `c=2`.
+
+Секция `AS` указывает, как политики должны комбинироваться с другими политиками. Политики могут быть или разрешительными (`PERMISSIVE`), или ограничительными (`RESTRICTIVE`). По умолчанию политики создаются разрешительными (`PERMISSIVE`); такие политики комбинируются с использованием логического оператора `OR`.
+
+Ограничительные (`RESTRICTIVE`) политики комбинируются с использованием логического оператора `AND`.
+
+Общая формула выглядит так:
+
+```
+строка_видима = (одна или больше permissive-политик дала ненулевой результат проверки условия) И
+                (все restrictive-политики дали ненулевой результат проверки условия)
+```
+
+Например, политики
+
+``` sql
+CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter
+CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio
+```
+
+разрешат пользователю с именем `peter` видеть только те строки, для которых будет одновременно `b=1` и `c=2`.
+
+## Секция ON CLUSTER {#create-row-policy-on-cluster}
+
+Секция `ON CLUSTER` позволяет создавать политики на кластере, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). 
+ ## Примеры -`CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO accountant, john@localhost` +`CREATE ROW POLICY filter1 ON mydb.mytable USING a<1000 TO accountant, john@localhost` -`CREATE ROW POLICY filter ON mydb.mytable FOR SELECT USING a<1000 TO ALL EXCEPT mira` +`CREATE ROW POLICY filter2 ON mydb.mytable USING a<1000 AND b=5 TO ALL EXCEPT mira` + +`CREATE ROW POLICY filter3 ON mydb.mytable USING 1 TO admin` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/row-policy) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/settings-profile.md b/docs/ru/sql-reference/statements/create/settings-profile.md index 643f9d92eac..522caf04c80 100644 --- a/docs/ru/sql-reference/statements/create/settings-profile.md +++ b/docs/ru/sql-reference/statements/create/settings-profile.md @@ -1,6 +1,6 @@ --- toc_priority: 43 -toc_title: "\u041f\u0440\u043e\u0444\u0438\u043b\u044c\u0020\u043d\u0430\u0441\u0442\u0440\u043e\u0435\u043a" +toc_title: "Профиль настроек" --- # CREATE SETTINGS PROFILE {#create-settings-profile-statement} @@ -25,5 +25,4 @@ CREATE SETTINGS PROFILE [IF NOT EXISTS | OR REPLACE] TO name1 [ON CLUSTER cluste CREATE SETTINGS PROFILE max_memory_usage_profile SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/settings-profile) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/table.md b/docs/ru/sql-reference/statements/create/table.md index 9f582042a36..1ccd0a600f3 100644 --- a/docs/ru/sql-reference/statements/create/table.md +++ b/docs/ru/sql-reference/statements/create/table.md @@ -1,11 +1,15 @@ --- toc_priority: 36 -toc_title: "\u0422\u0430\u0431\u043b\u0438\u0446\u0430" +toc_title: "Таблица" --- # CREATE TABLE {#create-table-query} -Запрос `CREATE TABLE` может иметь несколько форм. +Запрос `CREATE TABLE` может иметь несколько форм, которые используются в зависимости от контекста и решаемых задач. + +По умолчанию таблицы создаются на текущем сервере. Распределенные DDL запросы создаются с помощью секции `ON CLUSTER`, которая [описана отдельно](../../../sql-reference/distributed-ddl.md). +## Варианты синтаксиса {#syntax-forms} +### С описанием структуры {#with-explicit-schema} ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -23,28 +27,51 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Также могут быть указаны выражения для значений по умолчанию - смотрите ниже. При необходимости можно указать [первичный ключ](#primary-key) с одним или несколькими ключевыми выражениями. + +### Со структурой, аналогичной другой таблице {#with-a-schema-similar-to-other-table} + ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] ``` Создаёт таблицу с такой же структурой, как другая таблица. Можно указать другой движок для таблицы. Если движок не указан, то будет выбран такой же движок, как у таблицы `db2.name2`. +### Из табличной функции {#from-a-table-function} + ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() ``` +Создаёт таблицу с такой же структурой и данными, как результат соответствующей табличной функции. Созданная таблица будет работать так же, как и указанная табличная функция. -Создаёт таблицу с такой же структурой и данными, как результат соответствующей табличной функцией. 
+### Из запроса SELECT {#from-select-query} ``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... +CREATE TABLE [IF NOT EXISTS] [db.]table_name[(name1 [type1], name2 [type2], ...)] ENGINE = engine AS SELECT ... ``` -Создаёт таблицу со структурой, как результат запроса `SELECT`, с движком engine, и заполняет её данными из SELECT-а. +Создаёт таблицу со структурой, как результат запроса `SELECT`, с движком `engine`, и заполняет её данными из `SELECT`. Также вы можете явно задать описание столбцов. -Во всех случаях, если указано `IF NOT EXISTS`, то запрос не будет возвращать ошибку, если таблица уже существует. В этом случае, запрос будет ничего не делать. +Если таблица уже существует и указано `IF NOT EXISTS`, то запрос ничего не делает. После секции `ENGINE` в запросе могут использоваться и другие секции в зависимости от движка. Подробную документацию по созданию таблиц смотрите в описаниях [движков таблиц](../../../engines/table-engines/index.md#table_engines). +**Пример** + +Запрос: + +``` sql +CREATE TABLE t1 (x String) ENGINE = Memory AS SELECT 1; +SELECT x, toTypeName(x) FROM t1; +``` + +Результат: + +```text +┌─x─┬─toTypeName(x)─┐ +│ 1 │ String │ +└───┴───────────────┘ +``` + ## Модификатор NULL или NOT NULL {#null-modifiers} Модификатор `NULL` или `NOT NULL`, указанный после типа данных в определении столбца, позволяет или не позволяет типу данных быть [Nullable](../../../sql-reference/data-types/nullable.md#data_type-nullable). @@ -53,7 +80,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... Смотрите также настройку [data_type_default_nullable](../../../operations/settings/settings.md#data_type_default_nullable). -### Значения по умолчанию {#create-default-values} +## Значения по умолчанию {#create-default-values} В описании столбца, может быть указано выражение для значения по умолчанию, одного из следующих видов: `DEFAULT expr`, `MATERIALIZED expr`, `ALIAS expr`. @@ -67,16 +94,22 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... В качестве выражения для умолчания, может быть указано произвольное выражение от констант и столбцов таблицы. При создании и изменении структуры таблицы, проверяется, что выражения не содержат циклов. При INSERT-е проверяется разрешимость выражений - что все столбцы, из которых их можно вычислить, переданы. +### DEFAULT {#default} + `DEFAULT expr` Обычное значение по умолчанию. Если в запросе INSERT не указан соответствующий столбец, то он будет заполнен путём вычисления соответствующего выражения. +### MATERIALIZED {#materialized} + `MATERIALIZED expr` Материализованное выражение. Такой столбец не может быть указан при INSERT, то есть, он всегда вычисляется. При INSERT без указания списка столбцов, такие столбцы не рассматриваются. Также этот столбец не подставляется при использовании звёздочки в запросе SELECT. Это необходимо, чтобы сохранить инвариант, что дамп, полученный путём `SELECT *`, можно вставить обратно в таблицу INSERT-ом без указания списка столбцов. +### ALIAS {#alias} + `ALIAS expr` Синоним. Такой столбец вообще не хранится в таблице. @@ -118,7 +151,7 @@ PRIMARY KEY(expr1[, expr2,...]); !!! warning "Предупреждение" Вы не можете сочетать оба способа в одном запросе. 
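+
+Небольшой условный пример, иллюстрирующий описанные выше выражения по умолчанию `DEFAULT`, `MATERIALIZED` и `ALIAS` (имена таблицы и столбцов выбраны для наглядности):
+
+``` sql
+CREATE TABLE defaults_sketch
+(
+    id UInt64,
+    ts DateTime DEFAULT now(),
+    raw String,
+    raw_length UInt64 MATERIALIZED length(raw),
+    day Date ALIAS toDate(ts)
+)
+ENGINE = MergeTree()
+ORDER BY id;
+```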
-### Ограничения (constraints) {#constraints} +## Ограничения {#constraints} Наряду с объявлением столбцов можно объявить ограничения на значения в столбцах таблицы: @@ -136,11 +169,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Добавление большого числа ограничений может негативно повлиять на производительность `INSERT` запросов. -### Выражение для TTL {#vyrazhenie-dlia-ttl} +## Выражение для TTL {#vyrazhenie-dlia-ttl} Определяет время хранения значений. Может быть указано только для таблиц семейства MergeTree. Подробнее смотрите в [TTL для столбцов и таблиц](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). -### Кодеки сжатия столбцов {#codecs} +## Кодеки сжатия столбцов {#codecs} По умолчанию, ClickHouse применяет к столбцу метод сжатия, определённый в [конфигурации сервера](../../../operations/server-configuration-parameters/settings.md). Кроме этого, можно задать метод сжатия для каждого отдельного столбца в запросе `CREATE TABLE`. @@ -182,7 +215,18 @@ ALTER TABLE codec_example MODIFY COLUMN float_value CODEC(Default); ClickHouse поддерживает кодеки общего назначения и специализированные кодеки. -#### Специализированные кодеки {#create-query-specialized-codecs} +### Кодеки общего назначения {#create-query-common-purpose-codecs} + +Кодеки: + +- `NONE` — без сжатия. +- `LZ4` — [алгоритм сжатия без потерь](https://github.com/lz4/lz4), используемый по умолчанию. Применяет быстрое сжатие LZ4. +- `LZ4HC[(level)]` — алгоритм LZ4 HC (high compression) с настраиваемым уровнем сжатия. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень сжатия по умолчанию. Возможные уровни сжатия: \[1, 12\]. Рекомендуемый диапазон уровней: \[4, 9\]. +- `ZSTD[(level)]` — [алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настраиваемым уровнем сжатия `level`. Возможные уровни сжатия: \[1, 22\]. Уровень сжатия по умолчанию: 1. + +Высокие уровни сжатия полезны для асимметричных сценариев, подобных «один раз сжал, много раз распаковал». Они подразумевают лучшее сжатие, но большее использование CPU. + +### Специализированные кодеки {#create-query-specialized-codecs} Эти кодеки разработаны для того, чтобы, используя особенности данных сделать сжатие более эффективным. Некоторые из этих кодеков не сжимают данные самостоятельно. Они готовят данные для кодеков общего назначения, которые сжимают подготовленные данные эффективнее, чем неподготовленные. @@ -203,19 +247,7 @@ CREATE TABLE codec_example ) ENGINE = MergeTree() ``` - -#### Кодеки общего назначения {#create-query-common-purpose-codecs} - -Кодеки: - -- `NONE` — без сжатия. -- `LZ4` — [алгоритм сжатия без потерь](https://github.com/lz4/lz4) используемый по умолчанию. Применяет быстрое сжатие LZ4. -- `LZ4HC[(level)]` — алгоритм LZ4 HC (high compression) с настраиваемым уровнем сжатия. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень сжания по умолчанию. Возможные уровни сжатия: \[1, 12\]. Рекомендуемый диапазон уровней: \[4, 9\]. -- `ZSTD[(level)]` — [алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настраиваемым уровнем сжатия `level`. Возможные уровни сжатия: \[1, 22\]. Уровень сжатия по умолчанию: 1. - -Высокие уровни сжатия полезны для ассимметричных сценариев, подобных «один раз сжал, много раз распаковал». Высокие уровни сжатия подразумеваю лучшее сжатие, но большее использование CPU.
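+
+Условный набросок, показывающий кодеки общего назначения с явным уровнем сжатия (имена таблицы и столбцов выбраны только для примера):
+
+``` sql
+CREATE TABLE codecs_sketch
+(
+    id UInt64 CODEC(LZ4),
+    payload String CODEC(ZSTD(10))
+)
+ENGINE = MergeTree()
+ORDER BY id;
+```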
- -## Временные таблицы {#vremennye-tablitsy} +## Временные таблицы {#temporary-tables} ClickHouse поддерживает временные таблицы со следующими характеристиками: @@ -241,7 +273,77 @@ CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name Вместо временных можно использовать обычные таблицы с [ENGINE = Memory](../../../engines/table-engines/special/memory.md). +## REPLACE TABLE {#replace-table-query} +Запрос `REPLACE` позволяет частично изменить таблицу (структуру или данные). + +!!!note "Замечание" + Такие запросы поддерживаются только движком БД [Atomic](../../../engines/database-engines/atomic.md). + +Чтобы удалить часть данных из таблицы, вы можете создать новую таблицу, добавить в нее данные из старой таблицы, которые вы хотите оставить (отобрав их с помощью запроса `SELECT`), затем удалить старую таблицу и переименовать новую таблицу так как старую: + +```sql +CREATE TABLE myNewTable AS myOldTable; +INSERT INTO myNewTable SELECT * FROM myOldTable WHERE CounterID <12345; +DROP TABLE myOldTable; +RENAME TABLE myNewTable TO myOldTable; +``` + +Вместо перечисленных выше операций можно использовать один запрос: + +```sql +REPLACE TABLE myOldTable SELECT * FROM myOldTable WHERE CounterID <12345; +``` + +### Синтаксис + +{CREATE [OR REPLACE]|REPLACE} TABLE [db.]table_name + +Для данного запроса можно использовать любые варианты синтаксиса запроса `CREATE`. Запрос `REPLACE` для несуществующей таблицы вызовет ошибку. + +### Примеры: + +Рассмотрим таблицу: + +```sql +CREATE DATABASE base ENGINE = Atomic; +CREATE OR REPLACE TABLE base.t1 (n UInt64, s String) ENGINE = MergeTree ORDER BY n; +INSERT INTO base.t1 VALUES (1, 'test'); +SELECT * FROM base.t1; +``` + +```text +┌─n─┬─s────┐ +│ 1 │ test │ +└───┴──────┘ +``` + +Используем запрос `REPLACE` для удаления всех данных: + +```sql +CREATE OR REPLACE TABLE base.t1 (n UInt64, s Nullable(String)) ENGINE = MergeTree ORDER BY n; +INSERT INTO base.t1 VALUES (2, null); +SELECT * FROM base.t1; +``` + +```text +┌─n─┬─s──┐ +│ 2 │ \N │ +└───┴────┘ +``` + +Используем запрос `REPLACE` для изменения структуры таблицы: + +```sql +REPLACE TABLE base.t1 (n UInt64) ENGINE = MergeTree ORDER BY n; +INSERT INTO base.t1 VALUES (3); +SELECT * FROM base.t1; +``` + +```text +┌─n─┐ +│ 3 │ +└───┘ +``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/table) diff --git a/docs/ru/sql-reference/statements/create/user.md b/docs/ru/sql-reference/statements/create/user.md index bcc9768eb43..a487d1ac593 100644 --- a/docs/ru/sql-reference/statements/create/user.md +++ b/docs/ru/sql-reference/statements/create/user.md @@ -1,6 +1,6 @@ --- toc_priority: 39 -toc_title: "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c" +toc_title: "Пользователь" --- # CREATE USER {#create-user-statement} @@ -9,15 +9,17 @@ toc_title: "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u Синтаксис: -```sql +``` sql CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...] - [IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH}] BY {'password'|'hash'}] + [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] 
| ANY | NONE] [DEFAULT ROLE role [,...]] - [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] + [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` +`ON CLUSTER` позволяет создавать пользователей в кластере, см. [Распределенные DDL](../../../sql-reference/distributed-ddl.md). + ## Идентификация Существует несколько способов идентификации пользователя: @@ -28,6 +30,8 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] - `IDENTIFIED WITH sha256_hash BY 'hash'` - `IDENTIFIED WITH double_sha1_password BY 'qwerty'` - `IDENTIFIED WITH double_sha1_hash BY 'hash'` +- `IDENTIFIED WITH ldap SERVER 'server_name'` +- `IDENTIFIED WITH kerberos` or `IDENTIFIED WITH kerberos REALM 'realm'` ## Пользовательский хост @@ -81,5 +85,4 @@ CREATE USER user DEFAULT ROLE ALL CREATE USER john DEFAULT ROLE ALL EXCEPT role1, role2 ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/user) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index f4b91b5ae17..4e34b5e3b6e 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -1,6 +1,6 @@ --- toc_priority: 37 -toc_title: "\u041f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435" +toc_title: "Представление" --- # CREATE VIEW {#create-view} @@ -13,7 +13,7 @@ toc_title: "\u041f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ... ``` -Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление - это не что иное, как сохраненный запрос. При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md). +Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление — это не что иное, как сохраненный запрос. При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md). Для примера, пусть вы создали представление: @@ -43,12 +43,12 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na При создании материализованного представления без использования `TO [db].[table]`, нужно обязательно указать `ENGINE` - движок таблицы для хранения данных. -При создании материализованного представления с испольованием `TO [db].[table]`, нельзя указывать `POPULATE` +При создании материализованного представления с испольованием `TO [db].[table]`, нельзя указывать `POPULATE`. Материализованное представление устроено следующим образом: при вставке данных в таблицу, указанную в SELECT-е, кусок вставляемых данных преобразуется этим запросом SELECT, и полученный результат вставляется в представление. !!! important "Важно" - Материализованные представлени в ClickHouse больше похожи на `after insert` триггеры. Если в запросе материализованного представления есть агрегирование, оно применяется только к вставляемому блоку записей. Любые изменения существующих данных исходной таблицы (например обновление, удаление, удаление раздела и т.д.) 
не изменяют материализованное представление. + Материализованные представления в ClickHouse больше похожи на `after insert` триггеры. Если в запросе материализованного представления есть агрегирование, оно применяется только к вставляемому блоку записей. Любые изменения существующих данных исходной таблицы (например обновление, удаление, удаление раздела и т.д.) не изменяют материализованное представление. Если указано `POPULATE`, то при создании представления, в него будут вставлены имеющиеся данные таблицы, как если бы был сделан запрос `CREATE TABLE ... AS SELECT ...` . Иначе, представление будет содержать только данные, вставляемые в таблицу после создания представления. Не рекомендуется использовать POPULATE, так как вставляемые в таблицу данные во время создания представления, не попадут в него. @@ -56,10 +56,177 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na Недоработано выполнение запросов `ALTER` над материализованными представлениями, поэтому они могут быть неудобными для использования. Если материализованное представление использует конструкцию `TO [db.]name`, то можно выполнить `DETACH` представления, `ALTER` для целевой таблицы и последующий `ATTACH` ранее отсоединенного (`DETACH`) представления. -Обратите внимание, что работа материлизованного представления находится под влиянием настройки [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert). Перед вставкой данных в таблицу происходит их слияние. +Обратите внимание, что работа материализованного представления находится под влиянием настройки [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert). Перед вставкой данных в таблицу происходит их слияние. Представления выглядят так же, как обычные таблицы. Например, они перечисляются в результате запроса `SHOW TABLES`. -Отсутствует отдельный запрос для удаления представлений. Чтобы удалить представление, следует использовать `DROP TABLE`. +Чтобы удалить представление, следует использовать [DROP VIEW](../../../sql-reference/statements/drop.md#drop-view). Впрочем, `DROP TABLE` тоже работает для представлений. + +## LIVE-представления {#live-view} + +!!! important "Важно" + Представления `LIVE VIEW` являются экспериментальной возможностью. Их использование может повлечь потерю совместимости в будущих версиях. + Чтобы использовать `LIVE VIEW` и запросы `WATCH`, включите настройку [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view). + +```sql +CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... +``` +`LIVE VIEW` хранит результат запроса [SELECT](../../../sql-reference/statements/select/index.md), указанного при создании, и обновляется сразу же при изменении этого результата. Конечный результат запроса и промежуточные данные, из которых формируется результат, хранятся в оперативной памяти, и это обеспечивает высокую скорость обработки для повторяющихся запросов. LIVE-представления могут отправлять push-уведомления при изменении результата исходного запроса `SELECT`. Для этого используйте запрос [WATCH](../../../sql-reference/statements/watch.md). + +Изменение `LIVE VIEW` запускается при вставке данных в таблицу, указанную в исходном запросе `SELECT`. + +LIVE-представления работают по тому же принципу, что и распределенные таблицы. Но вместо объединения отдельных частей данных с разных серверов, LIVE-представления объединяют уже имеющийся результат с новыми данными. 
Если в исходном запросе LIVE-представления есть вложенный подзапрос, его результаты не кешируются, в кеше хранится только результат основного запроса. + +!!! info "Ограничения" + - [Табличные функции](../../../sql-reference/table-functions/index.md) в основном запросе не поддерживаются. + - Таблицы, не поддерживающие изменение с помощью запроса `INSERT`, такие как [словари](../../../sql-reference/dictionaries/index.md) и [системные таблицы](../../../operations/system-tables/index.md), а также [нормальные представления](#normal) или [материализованные представления](#materialized), не запускают обновление LIVE-представления. + - В LIVE-представлениях могут использоваться только такие запросы, которые объединяют результаты по старым и новым данным. LIVE-представления не работают с запросами, требующими полного пересчета данных или агрегирования с сохранением состояния. + - `LIVE VIEW` не работает для реплицируемых и распределенных таблиц, добавление данных в которые происходит на разных узлах. + - `LIVE VIEW` не обновляется, если в исходном запросе используются несколько таблиц. + + В случаях, когда `LIVE VIEW` не обновляется автоматически, чтобы обновлять его принудительно с заданной периодичностью, используйте [WITH REFRESH](#live-view-with-refresh). + +### Отслеживание изменений {#live-view-monitoring} + +Для отслеживания изменений LIVE-представления используйте запрос [WATCH](../../../sql-reference/statements/watch.md). + + +**Пример:** + +```sql +CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; +CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt; +``` +Отслеживаем изменения LIVE-представления при вставке данных в исходную таблицу. + +```sql +WATCH lv; +``` + +```bash +┌─sum(x)─┬─_version─┐ +│ 1 │ 1 │ +└────────┴──────────┘ +┌─sum(x)─┬─_version─┐ +│ 2 │ 2 │ +└────────┴──────────┘ +┌─sum(x)─┬─_version─┐ +│ 6 │ 3 │ +└────────┴──────────┘ +... +``` + +```sql +INSERT INTO mt VALUES (1); +INSERT INTO mt VALUES (2); +INSERT INTO mt VALUES (3); +``` + +Для получения списка изменений используйте ключевое слово [EVENTS](../../../sql-reference/statements/watch.md#events-clause). + + +```sql +WATCH lv EVENTS; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +┌─version─┐ +│ 2 │ +└─────────┘ +┌─version─┐ +│ 3 │ +└─────────┘ +... +``` + +Для работы с LIVE-представлениями, как и с любыми другими, можно использовать запросы [SELECT](../../../sql-reference/statements/select/index.md). Если результат запроса кеширован, он будет возвращен немедленно, без обращения к исходным таблицам представления. + +```sql +SELECT * FROM [db.]live_view WHERE ... +``` + +### Принудительное обновление {#live-view-alter-refresh} + +Чтобы принудительно обновить LIVE-представление, используйте запрос `ALTER LIVE VIEW [db.]table_name REFRESH`. + +### Секция WITH TIMEOUT {#live-view-with-timeout} + +LIVE-представление, созданное с параметром `WITH TIMEOUT`, будет автоматически удалено через определенное количество секунд с момента предыдущего запроса [WATCH](../../../sql-reference/statements/watch.md), примененного к данному LIVE-представлению. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... +``` + +Если временной промежуток не указан, используется значение настройки [temporary_live_view_timeout](../../../operations/settings/settings.md#temporary-live-view-timeout). 
+ +**Пример:** + +```sql +CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x; +CREATE LIVE VIEW lv WITH TIMEOUT 15 AS SELECT sum(x) FROM mt; +``` + +### Секция WITH REFRESH {#live-view-with-refresh} + +LIVE-представление, созданное с параметром `WITH REFRESH`, будет автоматически обновляться через указанные промежутки времени, начиная с момента последнего обновления. + +```sql +CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... +``` + +Если значение временного промежутка не задано, используется значение [periodic_live_view_refresh](../../../operations/settings/settings.md#periodic-live-view-refresh). + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv; +``` + +```bash +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:05 │ 1 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:10 │ 2 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 08:47:15 │ 3 │ +└─────────────────────┴──────────┘ +``` + +Параметры `WITH TIMEOUT` и `WITH REFRESH` можно сочетать с помощью `AND`. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH TIMEOUT 15 AND REFRESH 5 AS SELECT now(); +``` + +По истечении 15 секунд представление будет автоматически удалено, если нет активного запроса `WATCH`. + +```sql +WATCH lv; +``` + +``` +Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist.. +``` + +### Использование {#live-view-usage} + +Наиболее частые случаи использования `LIVE-VIEW`: + +- Получение push-уведомлений об изменениях данных без дополнительных периодических запросов. +- Кеширование результатов часто используемых запросов для получения их без задержки. +- Отслеживание изменений таблицы для запуска других запросов `SELECT`. +- Отслеживание показателей из системных таблиц с помощью периодических обновлений. [Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/create/view) diff --git a/docs/ru/sql-reference/statements/describe-table.md b/docs/ru/sql-reference/statements/describe-table.md index 64ed61de232..c66dbb66521 100644 --- a/docs/ru/sql-reference/statements/describe-table.md +++ b/docs/ru/sql-reference/statements/describe-table.md @@ -21,4 +21,3 @@ DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] Вложенные структуры данных выводятся в «развёрнутом» виде. То есть, каждый столбец - по отдельности, с именем через точку. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/describe-table/) diff --git a/docs/ru/sql-reference/statements/detach.md b/docs/ru/sql-reference/statements/detach.md index 00d0a4b20c6..d707acd7ccf 100644 --- a/docs/ru/sql-reference/statements/detach.md +++ b/docs/ru/sql-reference/statements/detach.md @@ -5,15 +5,65 @@ toc_title: DETACH # DETACH {#detach-statement} -Удаляет из сервера информацию о таблице name. Сервер перестаёт знать о существовании таблицы. +Заставляет сервер "забыть" о существовании таблицы или материализованного представления. + +Синтаксис: ``` sql -DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +DETACH TABLE|VIEW [IF EXISTS] [db.]name [PERMANENTLY] [ON CLUSTER cluster] ``` -Но ни данные, ни метаданные таблицы не удаляются. При следующем запуске сервера, сервер прочитает метаданные и снова узнает о таблице. 
-Также, «отцепленную» таблицу можно прицепить заново запросом `ATTACH` (за исключением системных таблиц, для которых метаданные не хранятся). +Но ни данные, ни метаданные таблицы или материализованного представления не удаляются. При следующем запуске сервера, если не было использовано `PERMANENTLY`, сервер прочитает метаданные и снова узнает о таблице/представлении. Если таблица или представление были отключены перманентно, сервер не подключит их обратно автоматически. -Запроса `DETACH DATABASE` нет. +Независимо от того, каким способом таблица была отключена, ее можно подключить обратно с помощью запроса [ATTACH](../../sql-reference/statements/attach.md). Системные log таблицы также могут быть подключены обратно (к примеру `query_log`, `text_log` и др.) Другие системные таблицы не могут быть подключены обратно, но на следующем запуске сервер снова "вспомнит" об этих таблицах. + +`ATTACH MATERIALIZED VIEW` не может быть использован с кратким синтаксисом (без `SELECT`), но можно подключить представление с помощью запроса `ATTACH TABLE`. + +Обратите внимание, что нельзя перманентно отключить таблицу, которая уже временно отключена. Для этого ее сначала надо подключить обратно, а затем снова отключить перманентно. + +Также нельзя использовать [DROP](../../sql-reference/statements/drop.md#drop-table) с отключенной таблицей или создавать таблицу с помощью [CREATE TABLE](../../sql-reference/statements/create/table.md) с таким же именем, как у отключенной таблицы. Еще нельзя заменить отключенную таблицу другой с помощью запроса [RENAME TABLE](../../sql-reference/statements/rename.md). + +**Пример** + +Создание таблицы: + +Запрос: + +``` sql +CREATE TABLE test ENGINE = Log AS SELECT * FROM numbers(10); +SELECT * FROM test; +``` + +Результат: + +``` text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ +``` + +Отключение таблицы: + +Запрос: + +``` sql +DETACH TABLE test; +SELECT * FROM test; +``` + +Результат: + +``` text +Received exception from server (version 21.4.1): +Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.test doesn't exist. +``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/detach/) diff --git a/docs/ru/sql-reference/statements/drop.md b/docs/ru/sql-reference/statements/drop.md index 514a92db91f..118f8eb923a 100644 --- a/docs/ru/sql-reference/statements/drop.md +++ b/docs/ru/sql-reference/statements/drop.md @@ -97,4 +97,3 @@ DROP [SETTINGS] PROFILE [IF EXISTS] name [,...] [ON CLUSTER cluster_name] DROP VIEW [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/drop/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/exists.md b/docs/ru/sql-reference/statements/exists.md index 0b2fd69273c..d4f1f707e79 100644 --- a/docs/ru/sql-reference/statements/exists.md +++ b/docs/ru/sql-reference/statements/exists.md @@ -12,4 +12,3 @@ EXISTS [TEMPORARY] TABLE [db.]name [INTO OUTFILE filename] [FORMAT format] Возвращает один столбец типа `UInt8`, содержащий одно значение - `0`, если таблицы или БД не существует и `1`, если таблица в указанной БД существует. 
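+
+**Пример**
+
+Запрос к существующей системной таблице вернёт `1` (имя таблицы взято лишь для иллюстрации):
+
+``` sql
+EXISTS TABLE system.numbers;
+```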
-[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/exists/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/grant.md b/docs/ru/sql-reference/statements/grant.md index d38e2ea38a0..093e6eb3b93 100644 --- a/docs/ru/sql-reference/statements/grant.md +++ b/docs/ru/sql-reference/statements/grant.md @@ -93,7 +93,7 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `ALTER ADD CONSTRAINT` - `ALTER DROP CONSTRAINT` - `ALTER TTL` - - `ALTER MATERIALIZE TTL` + - `ALTER MATERIALIZE TTL` - `ALTER SETTINGS` - `ALTER MOVE PARTITION` - `ALTER FETCH PARTITION` @@ -104,9 +104,9 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - [CREATE](#grant-create) - `CREATE DATABASE` - `CREATE TABLE` + - `CREATE TEMPORARY TABLE` - `CREATE VIEW` - `CREATE DICTIONARY` - - `CREATE TEMPORARY TABLE` - [DROP](#grant-drop) - `DROP DATABASE` - `DROP TABLE` @@ -152,7 +152,7 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `SYSTEM RELOAD` - `SYSTEM RELOAD CONFIG` - `SYSTEM RELOAD DICTIONARY` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES` + - `SYSTEM RELOAD EMBEDDED DICTIONARIES` - `SYSTEM MERGES` - `SYSTEM TTL MERGES` - `SYSTEM FETCHES` @@ -279,7 +279,7 @@ GRANT INSERT(x,y) ON db.table TO john - `ALTER ADD CONSTRAINT`. Уровень: `TABLE`. Алиасы: `ADD CONSTRAINT` - `ALTER DROP CONSTRAINT`. Уровень: `TABLE`. Алиасы: `DROP CONSTRAINT` - `ALTER TTL`. Уровень: `TABLE`. Алиасы: `ALTER MODIFY TTL`, `MODIFY TTL` - - `ALTER MATERIALIZE TTL`. Уровень: `TABLE`. Алиасы: `MATERIALIZE TTL` + - `ALTER MATERIALIZE TTL`. Уровень: `TABLE`. Алиасы: `MATERIALIZE TTL` - `ALTER SETTINGS`. Уровень: `TABLE`. Алиасы: `ALTER SETTING`, `ALTER MODIFY SETTING`, `MODIFY SETTING` - `ALTER MOVE PARTITION`. Уровень: `TABLE`. Алиасы: `ALTER MOVE PART`, `MOVE PARTITION`, `MOVE PART` - `ALTER FETCH PARTITION`. Уровень: `TABLE`. Алиасы: `FETCH PARTITION` @@ -307,9 +307,9 @@ GRANT INSERT(x,y) ON db.table TO john - `CREATE`. Уровень: `GROUP` - `CREATE DATABASE`. Уровень: `DATABASE` - `CREATE TABLE`. Уровень: `TABLE` + - `CREATE TEMPORARY TABLE`. Уровень: `GLOBAL` - `CREATE VIEW`. Уровень: `VIEW` - `CREATE DICTIONARY`. Уровень: `DICTIONARY` - - `CREATE TEMPORARY TABLE`. Уровень: `GLOBAL` **Дополнительно** @@ -407,7 +407,7 @@ GRANT INSERT(x,y) ON db.table TO john - `SYSTEM RELOAD`. Уровень: `GROUP` - `SYSTEM RELOAD CONFIG`. Уровень: `GLOBAL`. Алиасы: `RELOAD CONFIG` - `SYSTEM RELOAD DICTIONARY`. Уровень: `GLOBAL`. Алиасы: `SYSTEM RELOAD DICTIONARIES`, `RELOAD DICTIONARY`, `RELOAD DICTIONARIES` - - `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Уровень: `GLOBAL`. Алиасы: `RELOAD EMBEDDED DICTIONARIES` + - `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Уровень: `GLOBAL`. Алиасы: `RELOAD EMBEDDED DICTIONARIES` - `SYSTEM MERGES`. Уровень: `TABLE`. Алиасы: `SYSTEM STOP MERGES`, `SYSTEM START MERGES`, `STOP MERGES`, `START MERGES` - `SYSTEM TTL MERGES`. Уровень: `TABLE`. Алиасы: `SYSTEM STOP TTL MERGES`, `SYSTEM START TTL MERGES`, `STOP TTL MERGES`, `START TTL MERGES` - `SYSTEM FETCHES`. Уровень: `TABLE`. Алиасы: `SYSTEM STOP FETCHES`, `SYSTEM START FETCHES`, `STOP FETCHES`, `START FETCHES` @@ -483,4 +483,3 @@ GRANT INSERT(x,y) ON db.table TO john Привилегия `ADMIN OPTION` разрешает пользователю назначать свои роли другому пользователю. 
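+
+Например, условный набросок (имена роли и пользователя выбраны только для примера):
+
+``` sql
+GRANT accountant_role TO john WITH ADMIN OPTION
+```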
-[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/grant/) diff --git a/docs/ru/sql-reference/statements/index.md b/docs/ru/sql-reference/statements/index.md index c7862015e64..5e72aa7cca0 100644 --- a/docs/ru/sql-reference/statements/index.md +++ b/docs/ru/sql-reference/statements/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: "\u0412\u044B\u0440\u0430\u0436\u0435\u043D\u0438\u044F" +toc_folder_title: "Выражения" toc_priority: 31 --- diff --git a/docs/ru/sql-reference/statements/insert-into.md b/docs/ru/sql-reference/statements/insert-into.md index 0ad85ed0166..bbd330962cf 100644 --- a/docs/ru/sql-reference/statements/insert-into.md +++ b/docs/ru/sql-reference/statements/insert-into.md @@ -119,4 +119,3 @@ INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... - Данные поступают в режиме реального времени. - Вы загружаете данные, которые как правило отсортированы по времени. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/insert_into/) diff --git a/docs/ru/sql-reference/statements/kill.md b/docs/ru/sql-reference/statements/kill.md index e2556a7f782..6981d630dd8 100644 --- a/docs/ru/sql-reference/statements/kill.md +++ b/docs/ru/sql-reference/statements/kill.md @@ -70,4 +70,3 @@ KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = ' Данные, уже изменённые мутацией, остаются в таблице (отката на старую версию данных не происходит). -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/kill/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/misc.md b/docs/ru/sql-reference/statements/misc.md index e9ceece8b2c..cedf52b7a34 100644 --- a/docs/ru/sql-reference/statements/misc.md +++ b/docs/ru/sql-reference/statements/misc.md @@ -19,4 +19,3 @@ toc_priority: 41 - [TRUNCATE](../../sql-reference/statements/truncate.md) - [USE](../../sql-reference/statements/use.md) -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/misc/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/optimize.md b/docs/ru/sql-reference/statements/optimize.md index 9b94c31a8f7..e1a9d613537 100644 --- a/docs/ru/sql-reference/statements/optimize.md +++ b/docs/ru/sql-reference/statements/optimize.md @@ -5,21 +5,83 @@ toc_title: OPTIMIZE # OPTIMIZE {#misc_operations-optimize} -``` sql -OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] -``` - -Запрос пытается запустить внеплановый мёрж кусков данных для таблиц семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). Другие движки таблиц не поддерживаются. - -Если `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md), ClickHouse создаёт задачу на мёрж и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). - -- Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop). -- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter/index.md#alter-how-to-specify-part-expr). -- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. 
-- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех колонках), имеет смысл только для движка MergeTree. +Запрос пытается запустить внеплановое слияние кусков данных для таблиц. !!! warning "Внимание" - Запрос `OPTIMIZE` не может устранить причину появления ошибки «Too many parts». + `OPTIMIZE` не устраняет причину появления ошибки `Too many parts`. - -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) \ No newline at end of file +**Синтаксис** + +``` sql +OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] +``` + +Может применяться к таблицам семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md), [MaterializedView](../../engines/table-engines/special/materializedview.md) и [Buffer](../../engines/table-engines/special/buffer.md). Другие движки таблиц не поддерживаются. + +Если запрос `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md), ClickHouse создаёт задачу на слияние и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). + +- По умолчанию, если запросу `OPTIMIZE` не удалось выполнить слияние, то +ClickHouse не оповещает клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop). +- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter/index.md#alter-how-to-specify-part-expr). +- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске данных. Кроме того, слияние является принудительным, даже если выполняются параллельные слияния. +- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех столбцах), имеет смысл только для движка MergeTree. + +## Выражение BY {#by-expression} + +Чтобы выполнить дедупликацию по произвольному набору столбцов, вы можете явно указать список столбцов или использовать любую комбинацию подстановки [`*`](../../sql-reference/statements/select/index.md#asterisk), выражений [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) и [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier). + + Список столбцов для дедупликации должен включать все столбцы, указанные в условиях сортировки (первичный ключ и ключ сортировки), а также в условиях партиционирования (ключ партиционирования). + + !!! note "Примечание" + Обратите внимание, что символ подстановки `*` обрабатывается так же, как и в запросах `SELECT`: столбцы `MATERIALIZED` и `ALIAS` не включаются в результат. + Если указать пустой список или выражение, которое возвращает пустой список, или дедуплицировать столбец по псевдониму (`ALIAS`), то сервер вернет ошибку. + + +**Примеры** + +Рассмотрим таблицу: + +``` sql +CREATE TABLE example ( + primary_key Int32, + secondary_key Int32, + value UInt32, + partition_key UInt32, + materialized_value UInt32 MATERIALIZED 12345, + aliased_value UInt32 ALIAS 2, + PRIMARY KEY primary_key +) ENGINE=MergeTree +PARTITION BY partition_key; +``` + +Прежний способ дедупликации, когда учитываются все столбцы. Строка удаляется только в том случае, если все значения во всех столбцах равны соответствующим значениям в предыдущей строке. 
+ +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE; +``` + +Дедупликация по всем столбцам, кроме `ALIAS` и `MATERIALIZED`: `primary_key`, `secondary_key`, `value`, `partition_key` и `materialized_value`. + + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY *; +``` + +Дедупликация по всем столбцам, кроме `ALIAS`, `MATERIALIZED` и `materialized_value`: столбцы `primary_key`, `secondary_key`, `value` и `partition_key`. + + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY * EXCEPT materialized_value; +``` + +Дедупликация по столбцам `primary_key`, `secondary_key` и `partition_key`. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY primary_key, secondary_key, partition_key; +``` + +Дедупликация по любому столбцу, соответствующему регулярному выражению: столбцам `primary_key`, `secondary_key` и `partition_key`. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY COLUMNS('.*_key'); +``` diff --git a/docs/ru/sql-reference/statements/rename.md b/docs/ru/sql-reference/statements/rename.md index 94bf3c682a1..192426dbafa 100644 --- a/docs/ru/sql-reference/statements/rename.md +++ b/docs/ru/sql-reference/statements/rename.md @@ -3,8 +3,16 @@ toc_priority: 48 toc_title: RENAME --- -# RENAME {#misc_operations-rename} +# RENAME Statement {#misc_operations-rename} +## RENAME DATABASE {#misc_operations-rename_database} +Переименование базы данных + +``` +RENAME DATABASE atomic_database1 TO atomic_database2 [ON CLUSTER cluster] +``` + +## RENAME TABLE {#misc_operations-rename_table} Переименовывает одну или несколько таблиц. ``` sql @@ -12,6 +20,3 @@ RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... ``` Переименовывание таблицы является лёгкой операцией. Если вы указали после `TO` другую базу данных, то таблица будет перенесена в эту базу данных. При этом, директории с базами данных должны быть расположены в одной файловой системе (иначе возвращается ошибка). В случае переименования нескольких таблиц в одном запросе — это неатомарная операция, может выполнится частично, запросы в других сессиях могут получить ошибку `Table ... doesn't exist...`. - - -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/rename/) diff --git a/docs/ru/sql-reference/statements/revoke.md b/docs/ru/sql-reference/statements/revoke.md index 339746b8591..a3a282d6e5c 100644 --- a/docs/ru/sql-reference/statements/revoke.md +++ b/docs/ru/sql-reference/statements/revoke.md @@ -45,4 +45,3 @@ GRANT SELECT ON accounts.staff TO mira; REVOKE SELECT(wage) ON accounts.staff FROM mira; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/operations/settings/settings/) diff --git a/docs/ru/sql-reference/statements/select/all.md b/docs/ru/sql-reference/statements/select/all.md new file mode 100644 index 00000000000..d36a23ca54e --- /dev/null +++ b/docs/ru/sql-reference/statements/select/all.md @@ -0,0 +1,21 @@ +--- +toc_title: ALL +--- + +# Секция ALL {#select-all} + +Если в таблице несколько совпадающих строк, то `ALL` возвращает все из них. Поведение запроса `SELECT ALL` точно такое же, как и `SELECT` без аргумента `DISTINCT`. Если указаны оба аргумента: `ALL` и `DISTINCT`, функция вернет исключение. 
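+
+Например, запрос
+
+```sql
+SELECT ALL number FROM numbers(3);
+```
+
+вернёт тот же результат, что и `SELECT number FROM numbers(3);`.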
+ + +`ALL` может быть указан внутри агрегатной функции, например, результат выполнения запроса: + +```sql +SELECT sum(ALL number) FROM numbers(10); +``` + +равен результату выполнения запроса: + +```sql +SELECT sum(number) FROM numbers(10); +``` + diff --git a/docs/ru/sql-reference/statements/select/index.md b/docs/ru/sql-reference/statements/select/index.md index b0b6e80d7be..886952ea5cf 100644 --- a/docs/ru/sql-reference/statements/select/index.md +++ b/docs/ru/sql-reference/statements/select/index.md @@ -1,8 +1,8 @@ --- -title: "\u0421\u0438\u043d\u0442\u0430\u043a\u0441\u0438\u0441\u0020\u0437\u0430\u043f\u0440\u043e\u0441\u043e\u0432\u0020\u0053\u0045\u004c\u0045\u0043\u0054" +title: "Синтаксис запросов SELECT" toc_folder_title: SELECT toc_priority: 32 -toc_title: "\u041e\u0431\u0437\u043e\u0440" +toc_title: "Обзор" --- # Синтаксис запросов SELECT {#select-queries-syntax} @@ -280,4 +280,3 @@ SELECT * REPLACE(i + 1 AS i) EXCEPT (j) APPLY(sum) from columns_transformers; SELECT * FROM some_table SETTINGS optimize_read_in_order=1, cast_keep_nullable=1; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/select/) diff --git a/docs/ru/sql-reference/statements/select/join.md b/docs/ru/sql-reference/statements/select/join.md index c5548d74156..4bd883c87ff 100644 --- a/docs/ru/sql-reference/statements/select/join.md +++ b/docs/ru/sql-reference/statements/select/join.md @@ -102,7 +102,7 @@ USING (equi_column1, ... equi_columnN, asof_column) - При использовании обычного `JOIN` , запрос отправляется на удалённые серверы. На каждом из них выполняются подзапросы для формирования «правой» таблицы, и с этой таблицей выполняется соединение. То есть, «правая» таблица формируется на каждом сервере отдельно. - При использовании `GLOBAL ... JOIN`, сначала сервер-инициатор запроса запускает подзапрос для вычисления правой таблицы. Эта временная таблица передаётся на каждый удалённый сервер, и на них выполняются запросы с использованием переданных временных данных. -Будьте аккуратны при использовании `GLOBAL`. За дополнительной информацией обращайтесь в раздел [Распределенные подзапросы](#select-distributed-subqueries). +Будьте аккуратны при использовании `GLOBAL`. За дополнительной информацией обращайтесь в раздел [Распределенные подзапросы](../../../sql-reference/operators/in.md#select-distributed-subqueries). ## Рекомендации по использованию {#usage-recommendations} diff --git a/docs/ru/sql-reference/statements/select/order-by.md b/docs/ru/sql-reference/statements/select/order-by.md index f8b838cbd15..9ddec923701 100644 --- a/docs/ru/sql-reference/statements/select/order-by.md +++ b/docs/ru/sql-reference/statements/select/order-by.md @@ -473,4 +473,3 @@ SELECT * FROM test_fetch ORDER BY a OFFSET 3 ROW FETCH FIRST 3 ROWS WITH TIES; └───┴───┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/select/order-by/) diff --git a/docs/ru/sql-reference/statements/select/union.md b/docs/ru/sql-reference/statements/select/union.md index 8f1dc11c802..de8a9b0e4ea 100644 --- a/docs/ru/sql-reference/statements/select/union.md +++ b/docs/ru/sql-reference/statements/select/union.md @@ -78,4 +78,3 @@ SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 2; Запросы, которые являются частью `UNION/UNION ALL/UNION DISTINCT`, выполняются параллельно, и их результаты могут быть смешаны вместе. 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/select/union/) diff --git a/docs/ru/sql-reference/statements/select/with.md b/docs/ru/sql-reference/statements/select/with.md index 328b28c27ef..7e09d94770a 100644 --- a/docs/ru/sql-reference/statements/select/with.md +++ b/docs/ru/sql-reference/statements/select/with.md @@ -67,4 +67,3 @@ WITH test1 AS (SELECT i + 1, j + 1 FROM test1) SELECT * FROM test1; ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/statements/select/with/) diff --git a/docs/ru/sql-reference/statements/set-role.md b/docs/ru/sql-reference/statements/set-role.md index ccbef41aa9b..b21a9ec8319 100644 --- a/docs/ru/sql-reference/statements/set-role.md +++ b/docs/ru/sql-reference/statements/set-role.md @@ -54,4 +54,3 @@ SET DEFAULT ROLE ALL EXCEPT role1, role2 TO user ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/set-role/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/set.md b/docs/ru/sql-reference/statements/set.md index b60dfcf8324..fa96c3c2a1b 100644 --- a/docs/ru/sql-reference/statements/set.md +++ b/docs/ru/sql-reference/statements/set.md @@ -19,4 +19,3 @@ SET profile = 'profile-name-from-the-settings-file' Подробности смотрите в разделе [Настройки](../../operations/settings/settings.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/set/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/show.md b/docs/ru/sql-reference/statements/show.md index 56528f28c65..6d39bab4990 100644 --- a/docs/ru/sql-reference/statements/show.md +++ b/docs/ru/sql-reference/statements/show.md @@ -362,4 +362,68 @@ SHOW [CURRENT] QUOTA SHOW ACCESS ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/show/) +## SHOW SETTINGS {#show-settings} + +Возвращает список системных настроек и их значений. Использует данные из таблицы [system.settings](../../operations/system-tables/settings.md). + +**Синтаксис** + +```sql +SHOW [CHANGED] SETTINGS LIKE|ILIKE +``` + +**Секции** + +При использовании `LIKE|ILIKE` можно задавать шаблон для имени настройки. Этот шаблон может содержать символы подстановки, такие как `%` или `_`. При использовании `LIKE` шаблон чувствителен к регистру, а при использовании `ILIKE` — не чувствителен. + +Если используется `CHANGED`, запрос вернет только те настройки, значения которых были изменены, т.е. отличны от значений по умолчанию. + +**Примеры** + +Запрос с использованием `LIKE`: + +```sql +SHOW SETTINGS LIKE 'send_timeout'; +``` +Результат: + +```text +┌─name─────────┬─type────┬─value─┐ +│ send_timeout │ Seconds │ 300 │ +└──────────────┴─────────┴───────┘ +``` + +Запрос с использованием `ILIKE`: + +```sql +SHOW SETTINGS ILIKE '%CONNECT_timeout%' +``` + +Результат: + +```text +┌─name────────────────────────────────────┬─type─────────┬─value─┐ +│ connect_timeout │ Seconds │ 10 │ +│ connect_timeout_with_failover_ms │ Milliseconds │ 50 │ +│ connect_timeout_with_failover_secure_ms │ Milliseconds │ 100 │ +└─────────────────────────────────────────┴──────────────┴───────┘ +``` + +Запрос с использованием `CHANGED`: + +```sql +SHOW CHANGED SETTINGS ILIKE '%MEMORY%' +``` + +Результат: + +```text +┌─name─────────────┬─type───┬─value───────┐ +│ max_memory_usage │ UInt64 │ 10000000000 │ +└──────────────────┴────────┴─────────────┘ +``` + +**См. 
также** + +- Таблица [system.settings](../../operations/system-tables/settings.md) + diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index a6a6c5047af..ab68033d4f3 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -265,4 +265,3 @@ SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name ### RESTART REPLICAS {#query_language-system-restart-replicas} Реинициализация состояния Zookeeper сессий для всех `ReplicatedMergeTree` таблиц, сравнивает текущее состояние с тем что хранится в Zookeeper как источник правды и добавляет задачи Zookeeper очередь если необходимо -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/system/) diff --git a/docs/ru/sql-reference/statements/truncate.md b/docs/ru/sql-reference/statements/truncate.md index 4909d349658..b23d96d5b08 100644 --- a/docs/ru/sql-reference/statements/truncate.md +++ b/docs/ru/sql-reference/statements/truncate.md @@ -14,4 +14,3 @@ TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] Запрос `TRUNCATE` не поддерживается для следующих движков: [View](../../engines/table-engines/special/view.md), [File](../../engines/table-engines/special/file.md), [URL](../../engines/table-engines/special/url.md) и [Null](../../engines/table-engines/special/null.md). -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/truncate/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/use.md b/docs/ru/sql-reference/statements/use.md index c84329ea5ff..0d40870c23a 100644 --- a/docs/ru/sql-reference/statements/use.md +++ b/docs/ru/sql-reference/statements/use.md @@ -13,4 +13,3 @@ USE db Текущая база данных используется для поиска таблиц, если база данных не указана в запросе явно через точку перед именем таблицы. При использовании HTTP протокола запрос не может быть выполнен, так как понятия сессии не существует. -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/use/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/watch.md b/docs/ru/sql-reference/statements/watch.md new file mode 100644 index 00000000000..ef5b2f80584 --- /dev/null +++ b/docs/ru/sql-reference/statements/watch.md @@ -0,0 +1,106 @@ +--- +toc_priority: 53 +toc_title: WATCH +--- + +# Запрос WATCH {#watch} + +!!! important "Важно" + Это экспериментальная функция. Она может повлечь потерю совместимости в будущих версиях. + Чтобы использовать `LIVE VIEW` и запросы `WATCH`, включите настройку `set allow_experimental_live_view = 1`. + +**Синтаксис** + +``` sql +WATCH [db.]live_view [EVENTS] [LIMIT n] [FORMAT format] +``` + +Запрос `WATCH` постоянно возвращает содержимое [LIVE-представления](./create/view.md#live-view). Если параметр `LIMIT` не был задан, запрос `WATCH` будет непрерывно обновлять содержимое [LIVE-представления](./create/view.md#live-view). + +```sql +WATCH [db.]live_view; +``` +## Виртуальные столбцы {#watch-virtual-columns} + +Виртуальный столбец `_version` в результате запроса обозначает версию данного результата. + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv; +``` + +```bash +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:21 │ 1 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:26 │ 2 │ +└─────────────────────┴──────────┘ +┌───────────────now()─┬─_version─┐ +│ 2021-02-21 09:17:31 │ 3 │ +└─────────────────────┴──────────┘ +... 
+``` + +По умолчанию запрашиваемые данные возвращаются клиенту, однако в сочетании с запросом [INSERT INTO](../../sql-reference/statements/insert-into.md) они могут быть перенаправлены для вставки в другую таблицу. + +**Пример:** + +```sql +INSERT INTO [db.]table WATCH [db.]live_view ... +``` + +## Секция EVENTS {#events-clause} + +С помощью параметра `EVENTS` можно получить компактную форму результата запроса `WATCH`. Вместо полного результата вы получаете номер последней версии результата. + +```sql +WATCH [db.]live_view EVENTS; +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv EVENTS; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +┌─version─┐ +│ 2 │ +└─────────┘ +... +``` + +## Секция LIMIT {#limit-clause} + +Параметр `LIMIT n` задает количество обновлений запроса `WATCH`, после которого отслеживание прекращается. По умолчанию это число не задано, поэтому запрос будет выполняться постоянно. Значение `LIMIT 0` означает, что запрос `WATCH` вернет единственный актуальный результат запроса и прекратит отслеживание. + +```sql +WATCH [db.]live_view LIMIT 1; +``` + +**Пример:** + +```sql +CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now(); +WATCH lv EVENTS LIMIT 1; +``` + +```bash +┌─version─┐ +│ 1 │ +└─────────┘ +``` + +## Секция FORMAT {#format-clause} + +Параметр `FORMAT` работает аналогично одноименному параметру запроса [SELECT](../../sql-reference/statements/select/format.md#format-clause). + +!!! info "Примечание" + При отслеживании [LIVE VIEW](./create/view.md#live-view) через интерфейс HTTP следует использовать формат [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress). Постоянные сообщения об изменениях будут добавлены в поток вывода для поддержания активности долговременного HTTP-соединения до тех пор, пока результат запроса изменяется. Проомежуток времени между сообщениями об изменениях управляется настройкой[live_view_heartbeat_interval](./create/view.md#live-view-settings). diff --git a/docs/ru/sql-reference/syntax.md b/docs/ru/sql-reference/syntax.md index ca73d3a137e..6a923fd6b58 100644 --- a/docs/ru/sql-reference/syntax.md +++ b/docs/ru/sql-reference/syntax.md @@ -1,6 +1,6 @@ --- toc_priority: 31 -toc_title: "\u0421\u0438\u043d\u0442\u0430\u043a\u0441\u0438\u0441" +toc_title: "Синтаксис" --- # Синтаксис {#sintaksis} @@ -181,4 +181,3 @@ Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception Список выражений - одно выражение или несколько выражений через запятую. Функции и операторы, в свою очередь, в качестве аргументов, могут иметь произвольные выражения. -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/syntax/) diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index d3e6e106125..1d8604528be 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -5,23 +5,27 @@ toc_title: file # file {#file} -Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [file](file.md) и [hdfs](hdfs.md). +Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [url](../../sql-reference/table-functions/url.md) и [hdfs](../../sql-reference/table-functions/hdfs.md). + +Функция `file` может использоваться в запросах `SELECT` и `INSERT` при работе с движком таблиц [File](../../engines/table-engines/special/file.md). 
+ +**Синтаксис** ``` sql file(path, format, structure) ``` -**Входные параметры** +**Параметры** -- `path` — относительный путь до файла от [user_files_path](../../sql-reference/table-functions/file.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, \``'abc', 'def'` — строки. +- `path` — относительный путь до файла от [user_files_path](../../sql-reference/table-functions/file.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки. - `format` — [формат](../../interfaces/formats.md#formats) файла. -- `structure` — структура таблицы. Формат `'colunmn1_name column1_ype, column2_name column2_type, ...'`. +- `structure` — структура таблицы. Формат: `'colunmn1_name column1_ype, column2_name column2_type, ...'`. **Возвращаемое значение** Таблица с указанной структурой, предназначенная для чтения или записи данных в указанном файле. -**Пример** +**Примеры** Настройка `user_files_path` и содержимое файла `test.csv`: @@ -35,12 +39,10 @@ $ cat /var/lib/clickhouse/user_files/test.csv 78,43,45 ``` -Таблица из `test.csv` и выборка первых двух строк из неё: +Получение данных из таблицы в файле `test.csv` и выборка первых двух строк из неё: ``` sql -SELECT * -FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2; ``` ``` text @@ -50,45 +52,61 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` -Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). +Получение первых 10 строк таблицы, содержащей 3 столбца типа [UInt32](../../sql-reference/data-types/int-uint.md), из CSV-файла: -- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. -- `?` — Заменяет ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно (может содержать ведущие нули). +``` sql +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10; +``` + +Вставка данных из файла в таблицу: + +``` sql +INSERT INTO FUNCTION file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') VALUES (1, 2, 3), (3, 2, 1); +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32'); +``` + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +## Шаблоны поиска в компонентах пути {#globs-in-path} + +При описании пути к файлу могут использоваться шаблоны поиска. Обрабатываются только те файлы, у которых путь и название соответствуют шаблону полностью (а не только префикс или суффикс). + +- `*` — заменяет любое количество любых символов кроме `/`, включая отсутствие символов. +- `?` — заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — заменяет любое число в интервале от `N` до `M` включительно (может содержать ведущие нули). 
Конструкция с `{}` аналогична табличной функции [remote](remote.md). **Пример** -1. Предположим у нас есть несколько файлов со следующими относительными путями: +Предположим, у нас есть несколько файлов со следующими относительными путями: -- ‘some_dir/some_file_1’ -- ‘some_dir/some_file_2’ -- ‘some_dir/some_file_3’ -- ‘another_dir/some_file_1’ -- ‘another_dir/some_file_2’ -- ‘another_dir/some_file_3’ +- 'some_dir/some_file_1' +- 'some_dir/some_file_2' +- 'some_dir/some_file_3' +- 'another_dir/some_file_1' +- 'another_dir/some_file_2' +- 'another_dir/some_file_3' -1. Запросим количество строк в этих файлах: - - +Запросим количество строк в этих файлах: ``` sql -SELECT count(*) -FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32'); ``` -1. Запросим количество строк во всех файлах этих двух директорий: - - +Запросим количество строк во всех файлах этих двух директорий: ``` sql -SELECT count(*) -FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') +SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); ``` -!!! warning "Warning" +!!! warning "Предупреждение" Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. **Пример** @@ -96,17 +114,15 @@ FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') Запрос данных из файлов с именами `file000`, `file001`, … , `file999`: ``` sql -SELECT count(*) -FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); ``` ## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_path` — Путь к файлу. -- `_file` — Имя файла. +- `_path` — путь к файлу. +- `_file` — имя файла. **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/file/) diff --git a/docs/ru/sql-reference/table-functions/generate.md b/docs/ru/sql-reference/table-functions/generate.md index 47b7e43bc86..6dd88b622fc 100644 --- a/docs/ru/sql-reference/table-functions/generate.md +++ b/docs/ru/sql-reference/table-functions/generate.md @@ -10,10 +10,11 @@ toc_title: generateRandom Поддерживает все типы данных, которые могут храниться в таблице, за исключением `LowCardinality` и `AggregateFunction`. ``` sql -generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); +generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]) ``` -**Входные параметры** +**Аргументы** + - `name` — название соответствующего столбца. - `TypeName` — тип соответствующего столбца. - `max_array_length` — максимальная длина массива для всех сгенерированных массивов. По умолчанию `10`. 
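
Для наглядности ниже приведён примерный запрос, иллюстрирующий описанную выше сигнатуру `generateRandom` (имена и типы столбцов, а также значения `random_seed`, `max_string_length` и `max_array_length` выбраны произвольно для примера):

``` sql
-- структура 'id UInt32, name String, values Array(Int8)', random_seed = 1,
-- max_string_length = 10, max_array_length = 2
SELECT * FROM generateRandom('id UInt32, name String, values Array(Int8)', 1, 10, 2) LIMIT 3;
```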
@@ -38,4 +39,3 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64( └──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/generate/) diff --git a/docs/ru/sql-reference/table-functions/hdfs.md b/docs/ru/sql-reference/table-functions/hdfs.md index 6edd70b7b1b..56aaeae487c 100644 --- a/docs/ru/sql-reference/table-functions/hdfs.md +++ b/docs/ru/sql-reference/table-functions/hdfs.md @@ -61,4 +61,3 @@ LIMIT 2 - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/hdfs/) diff --git a/docs/ru/sql-reference/table-functions/index.md b/docs/ru/sql-reference/table-functions/index.md index 83225d54e60..fcd428df5d1 100644 --- a/docs/ru/sql-reference/table-functions/index.md +++ b/docs/ru/sql-reference/table-functions/index.md @@ -1,8 +1,7 @@ --- -toc_folder_title: "\u0422\u0430\u0431\u043B\u0438\u0447\u043D\u044B\u0435 \u0444\u0443\ - \u043D\u043A\u0446\u0438\u0438" +toc_folder_title: "Табличные функции" toc_priority: 34 -toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" +toc_title: "Введение" --- # Табличные функции {#table-functions} @@ -33,5 +32,6 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435" | [jdbc](jdbc.md) | Создаёт таблицу с дижком [JDBC](../../engines/table-engines/integrations/jdbc.md). | | [odbc](odbc.md) | Создаёт таблицу с движком [ODBC](../../engines/table-engines/integrations/odbc.md). | | [hdfs](hdfs.md) | Создаёт таблицу с движком [HDFS](../../engines/table-engines/integrations/hdfs.md). | +| [s3](s3.md) | Создаёт таблицу с движком [S3](../../engines/table-engines/integrations/s3.md). | -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/) diff --git a/docs/ru/sql-reference/table-functions/input.md b/docs/ru/sql-reference/table-functions/input.md index 96cf7515d52..0f5f621a247 100644 --- a/docs/ru/sql-reference/table-functions/input.md +++ b/docs/ru/sql-reference/table-functions/input.md @@ -43,4 +43,3 @@ $ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/input/) diff --git a/docs/ru/sql-reference/table-functions/jdbc.md b/docs/ru/sql-reference/table-functions/jdbc.md index d388262606f..4fc237f940d 100644 --- a/docs/ru/sql-reference/table-functions/jdbc.md +++ b/docs/ru/sql-reference/table-functions/jdbc.md @@ -24,4 +24,3 @@ SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/jdbc/) diff --git a/docs/ru/sql-reference/table-functions/merge.md b/docs/ru/sql-reference/table-functions/merge.md index 0822fdfe535..5b33f458468 100644 --- a/docs/ru/sql-reference/table-functions/merge.md +++ b/docs/ru/sql-reference/table-functions/merge.md @@ -9,4 +9,3 @@ toc_title: merge Структура таблицы берётся из первой попавшейся таблицы, подходящей под регулярное выражение. 
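
Примерный запрос для иллюстрации: регулярное выражение и предположение о том, что в текущей базе есть таблицы с именами, начинающимися на `hits_`, выбраны произвольно.

``` sql
-- виртуальная таблица merge объединяет все таблицы текущей базы,
-- имена которых соответствуют регулярному выражению '^hits_'
SELECT * FROM merge(currentDatabase(), '^hits_') LIMIT 10;
```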
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/merge/) diff --git a/docs/ru/sql-reference/table-functions/mysql.md b/docs/ru/sql-reference/table-functions/mysql.md index 21841eee67a..665f1058ba2 100644 --- a/docs/ru/sql-reference/table-functions/mysql.md +++ b/docs/ru/sql-reference/table-functions/mysql.md @@ -5,13 +5,15 @@ toc_title: mysql # mysql {#mysql} -Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. +Позволяет выполнять запросы `SELECT` и `INSERT` над данными, хранящимися на удалённом MySQL сервере. + +**Синтаксис** ``` sql -mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); +mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']) ``` -**Параметры** +**Аргументы** - `host:port` — адрес сервера MySQL. @@ -23,13 +25,14 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ - `password` — пароль пользователя. -- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. +- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Возможные значения: + - `0` - выполняется запрос `INSERT INTO`. + - `1` - выполняется запрос `REPLACE INTO`. -- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. +- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. Может быть передано только с помощью `replace_query = 0` (если вы одновременно передадите `replace_query = 1` и `on_duplicate_clause`, будет сгенерировано исключение). - Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к документации MySQL. - - Чтобы указать `'on_duplicate_clause'` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `'on_duplicate_clause'`, то ClickHouse сгенерирует исключение. + Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. + Выражения, которые могут использоваться в качестве `on_duplicate_clause` в секции `ON DUPLICATE KEY`, можно посмотреть в документации по [MySQL](http://www.mysql.ru/docs/). Простые условия `WHERE` такие как `=, !=, >, >=, <, =` выполняются на стороне сервера MySQL. @@ -39,46 +42,58 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ Объект таблицы с теми же столбцами, что и в исходной таблице MySQL. -## Пример использования {#primer-ispolzovaniia} +!!! note "Примечание" + Чтобы отличить табличную функцию `mysql (...)` в запросе `INSERT` от имени таблицы со списком столбцов, используйте ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. 
+ +**Примеры** Таблица в MySQL: ``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) +mysql> INSERT INTO test (`int_id`, `float`) VALUES (1,2); -mysql> select * from test; -+--------+--------------+-------+----------------+ -| int_id | int_nullable | float | float_nullable | -+--------+--------------+-------+----------------+ -| 1 | NULL | 2 | NULL | -+--------+--------------+-------+----------------+ -1 row in set (0,00 sec) +mysql> SELECT * FROM test; ++--------+-------+ +| int_id | float | ++--------+-------+ +| 1 | 2 | ++--------+-------+ ``` Получение данных в ClickHouse: ``` sql -SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); ``` ``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ -└────────┴──────────────┴───────┴────────────────┘ +┌─int_id─┬─float─┐ +│ 1 │ 2 │ +└────────┴───────┘ ``` -## Смотрите также {#smotrite-takzhe} +Замена и вставка: + +```sql +INSERT INTO FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 1) (int_id, float) VALUES (1, 3); +INSERT INTO TABLE FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 0, 'UPDATE int_id = int_id + 1') (int_id, float) VALUES (1, 4); +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); +``` + +``` text +┌─int_id─┬─float─┐ +│ 1 │ 3 │ +│ 2 │ 4 │ +└────────┴───────┘ +``` + +**Смотрите также** - [Движок таблиц ‘MySQL’](../../sql-reference/table-functions/mysql.md) - [Использование MySQL как источника данных для внешнего словаря](../../sql-reference/table-functions/mysql.md#dicts-external_dicts_dict_sources-mysql) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/mysql/) diff --git a/docs/ru/sql-reference/table-functions/numbers.md b/docs/ru/sql-reference/table-functions/numbers.md index 005f400e082..71f63078415 100644 --- a/docs/ru/sql-reference/table-functions/numbers.md +++ b/docs/ru/sql-reference/table-functions/numbers.md @@ -25,4 +25,3 @@ SELECT * FROM system.numbers LIMIT 10; select toDate('2010-01-01') + number as d FROM numbers(365); ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/numbers/) diff --git a/docs/ru/sql-reference/table-functions/odbc.md b/docs/ru/sql-reference/table-functions/odbc.md index 19203123840..557e7d2a15b 100644 --- a/docs/ru/sql-reference/table-functions/odbc.md +++ b/docs/ru/sql-reference/table-functions/odbc.md @@ -103,4 +103,3 @@ SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') - [Внешние словари ODBC](../../sql-reference/table-functions/odbc.md#dicts-external_dicts_dict_sources-odbc) - [Движок таблиц ODBC](../../sql-reference/table-functions/odbc.md). 
-[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/jdbc/)
diff --git a/docs/ru/sql-reference/table-functions/postgresql.md b/docs/ru/sql-reference/table-functions/postgresql.md
new file mode 100644
index 00000000000..2d8afe28f1e
--- /dev/null
+++ b/docs/ru/sql-reference/table-functions/postgresql.md
@@ -0,0 +1,120 @@
+---
+toc_priority: 42
+toc_title: postgresql
+---
+
+# postgresql {#postgresql}
+
+Позволяет выполнять запросы `SELECT` и `INSERT` над таблицами удаленной БД PostgreSQL.
+
+**Синтаксис**
+
+``` sql
+postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`])
+```
+
+**Аргументы**
+
+- `host:port` — адрес сервера PostgreSQL.
+- `database` — имя базы данных на удалённом сервере.
+- `table` — имя таблицы на удалённом сервере.
+- `user` — пользователь PostgreSQL.
+- `password` — пароль пользователя.
+- `schema` — имя схемы, если не используется схема по умолчанию. Необязательный аргумент.
+
+**Возвращаемое значение**
+
+Таблица с теми же столбцами, что и в исходной таблице PostgreSQL.
+
+!!! info "Примечание"
+    В запросах `INSERT`, чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов, указывайте ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже.
+
+## Особенности реализации {#implementation-details}
+
+Запросы `SELECT` на стороне PostgreSQL выполняются как `COPY (SELECT ...) TO STDOUT` внутри транзакции PostgreSQL только на чтение с коммитом после каждого запроса `SELECT`.
+
+Простые условия для `WHERE`, такие как `=`, `!=`, `>`, `>=`, `<`, `<=` и `IN`, исполняются на стороне сервера PostgreSQL.
+
+Все операции объединения, агрегации, сортировки, условия `IN [ array ]` и ограничения `LIMIT` выполняются на стороне ClickHouse только после того, как запрос к PostgreSQL завершился.
+
+Запросы `INSERT` на стороне PostgreSQL выполняются как `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` внутри PostgreSQL транзакции с автоматическим коммитом после каждого запроса `INSERT`.
+
+PostgreSQL массивы конвертируются в массивы ClickHouse.
+
+!!! info "Примечание"
+    Будьте внимательны: в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым количеством измерений во всех строках таблицы.
+
+При использовании словаря PostgreSQL поддерживается приоритет реплик. Чем больше номер реплики, тем ниже ее приоритет. Наивысший приоритет у реплики с номером `0`.
+ +**Примеры** + +Таблица в PostgreSQL: + +``` text +postgres=# CREATE TABLE "public"."test" ( +"int_id" SERIAL, +"int_nullable" INT NULL DEFAULT NULL, +"float" FLOAT NOT NULL, +"str" VARCHAR(100) NOT NULL DEFAULT '', +"float_nullable" FLOAT NULL DEFAULT NULL, +PRIMARY KEY (int_id)); + +CREATE TABLE + +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); +INSERT 0 1 + +postgresql> SELECT * FROM test; + int_id | int_nullable | float | str | float_nullable + --------+--------------+-------+------+---------------- + 1 | | 2 | test | + (1 row) +``` + +Получение данных в ClickHouse: + +```sql +SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password') WHERE str IN ('test'); +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─str──┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ test │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴──────┴────────────────┘ +``` + +Вставка данных: + +```sql +INSERT INTO TABLE FUNCTION postgresql('localhost:5432', 'test', 'test', 'postgrsql_user', 'password') (int_id, float) VALUES (2, 3); +SELECT * FROM postgresql('localhost:5432', 'test', 'test', 'postgresql_user', 'password'); +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─str──┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ test │ ᴺᵁᴸᴸ │ +│ 2 │ ᴺᵁᴸᴸ │ 3 │ │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴──────┴────────────────┘ +``` + +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**См. также** + +- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md) +- [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/table-functions/postgresql.md#dicts-external_dicts_dict_sources-postgresql) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/postgresql/) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 901317a805d..00179abb207 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -5,9 +5,11 @@ toc_title: remote # remote, remoteSecure {#remote-remotesecure} -Позволяет обратиться к удалённым серверам без создания таблицы типа `Distributed`. +Позволяет обратиться к удалённым серверам без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). Функция `remoteSecure` работает аналогично `remote`, но использует защищенное соединение. -Сигнатуры: +Обе функции могут использоваться в запросах `SELECT` и `INSERT`. + +**Синтаксис** ``` sql remote('addresses_expr', db, table[, 'user'[, 'password']]) @@ -16,12 +18,40 @@ remoteSecure('addresses_expr', db, table[, 'user'[, 'password']]) remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) ``` -`addresses_expr` - выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера - это `хост:порт`, или только `хост`. Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. Порт - TCP-порт удалённого сервера. Если порт не указан, используется `tcp_port` из конфигурационного файла сервера (по умолчанию - 9000). 
+**Параметры** + +- `addresses_expr` — выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера — это `host:port` или только `host`. + + Вместо параметра `host` может быть указано имя сервера или его адрес в формате IPv4 или IPv6. IPv6 адрес указывается в квадратных скобках. + + `port` — TCP-порт удалённого сервера. Если порт не указан, используется [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) из конфигурационного файла сервера, к которому обратились через функцию `remote` (по умолчанию - 9000), и [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure), к которому обратились через функцию `remoteSecure` (по умолчанию — 9440). -!!! important "Важно" С IPv6-адресом обязательно нужно указывать порт. -Примеры: + Тип: [String](../../sql-reference/data-types/string.md). + +- `db` — имя базы данных. Тип: [String](../../sql-reference/data-types/string.md). +- `table` — имя таблицы. Тип: [String](../../sql-reference/data-types/string.md). +- `user` — имя пользователя. Если пользователь не указан, то по умолчанию `default`. Тип: [String](../../sql-reference/data-types/string.md). +- `password` — пароль. Если пароль не указан, то используется пустой пароль. Тип: [String](../../sql-reference/data-types/string.md). +- `sharding_key` — ключ шардирования для поддержки распределения данных между узлами. Например: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Тип: [UInt32](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +Набор данных с удаленных серверов. + +**Использование** + +Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае соединения с серверами устанавливаются заново при каждом запросе. Если указываются имена серверов, то приходится также выполнять поиск сервера по имени. Кроме того, не ведётся сквозной подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов всегда создавайте таблицу типа `Distributed`, использовать табличную функцию `remote` в таких случаях не рекомендуется. + +Табличная функция `remote` может быть полезна в следующих случаях: + +- Обращение на конкретный сервер для сравнения данных, отладки и тестирования. +- Запросы между разными кластерами ClickHouse для исследований. +- Нечастые распределённые запросы, задаваемые вручную. +- Распределённые запросы, где набор серверов определяется каждый раз заново. + +**Адреса** ``` text example01-01-1 @@ -32,9 +62,7 @@ localhost [2a02:6b8:0:1111::11]:9000 ``` -Адреса можно указать через запятую, в этом случае ClickHouse обработает запрос как распределённый, т.е. отправит его по всем указанным адресам как на шарды с разными данными. - -Пример: +Адреса можно указать через запятую. В этом случае ClickHouse обработает запрос как распределённый, т.е. отправит его по всем указанным адресам как на шарды с разными данными. Пример: ``` text example01-01-1,example01-02-1 @@ -46,38 +74,35 @@ example01-01-1,example01-02-1 example01-0{1,2}-1 ``` -В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае, диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. 
Предыдущий пример может быть записан следующим образом: +В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. Предыдущий пример может быть записан следующим образом: ``` text example01-{01..02}-1 ``` -При наличии нескольких пар фигурных скобок, генерируется прямое произведение соответствующих множеств. +При наличии нескольких пар фигурных скобок генерируется прямое произведение соответствующих множеств. -Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае, соответствующие множества адресов понимаются как реплики - запрос будет отправлен на первую живую реплику. При этом, реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md). - -Пример: +Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае соответствующие множества адресов понимаются как реплики — запрос будет отправлен на первую живую реплику. При этом реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md#settings-load_balancing). В этом примере указаны два шарда, в каждом из которых имеются две реплики: ``` text example01-{01..02}-{1|2} ``` -В этом примере указано два шарда, в каждом из которых имеется две реплики. +Количество генерируемых адресов ограничено константой. Сейчас это 1000 адресов. -Количество генерируемых адресов ограничено константой - сейчас это 1000 штук. +**Примеры** -Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае, соединения с серверами устанавливаются заново при каждом запросе, в случае задания имён хостов, делается резолвинг имён, а также не ведётся подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов, всегда создавайте `Distributed` таблицу заранее, не используйте табличную функцию `remote`. +Выборка данных с удаленного сервера: -Табличная функция `remote` может быть полезна для следующих случаях: +``` sql +SELECT * FROM remote('127.0.0.1', db.remote_engine_table) LIMIT 3; +``` -- обращение на конкретный сервер в целях сравнения данных, отладки и тестирования; -- запросы между разными кластерами ClickHouse в целях исследований; -- нечастых распределённых запросов, задаваемых вручную; -- распределённых запросов, где набор серверов определяется каждый раз заново. +Вставка данных с удаленного сервера в таблицу: -Если пользователь не задан,то используется `default`. -Если пароль не задан, то используется пустой пароль. +``` sql +CREATE TABLE remote_table (name String, value UInt32) ENGINE=Memory; +INSERT INTO FUNCTION remote('127.0.0.1', currentDatabase(), 'remote_table') VALUES ('test', 42); +SELECT * FROM remote_table; +``` -`remoteSecure` - аналогично функции `remote`, но с соединением по шифрованному каналу. Порт по умолчанию - `tcp_port_secure` из конфига или 9440. 
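
Для упомянутой выше функции `remoteSecure` запрос выглядит так же, отличаются только порт по умолчанию и шифрование соединения. Примерный набросок: адрес сервера, имя таблицы и пароль здесь условные.

``` sql
-- подключение по защищенному каналу к порту tcp_port_secure (по умолчанию 9440)
SELECT * FROM remoteSecure('remote.example.com:9440', db.remote_engine_table, 'default', 'password') LIMIT 3;
```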
- -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/remote/) diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md new file mode 100644 index 00000000000..e062e59c67c --- /dev/null +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -0,0 +1,141 @@ +--- +toc_priority: 45 +toc_title: s3 +--- + +# Табличная Функция S3 {#s3-table-function} + +Предоставляет табличный интерфейс для выбора/вставки файлов в [Amazon S3](https://aws.amazon.com/s3/). Эта табличная функция похожа на [hdfs](../../sql-reference/table-functions/hdfs.md), но обеспечивает специфические для S3 возможности. + +**Синтаксис** + +``` sql +s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +``` + +**Aргументы** + +- `path` — URL-адрес бакета с указанием пути к файлу. Поддерживает следующие подстановочные знаки в режиме "только чтение": `*, ?, {abc,def} и {N..M}` где `N, M` — числа, `'abc', 'def'` — строки. Подробнее смотри [здесь](../../engines/table-engines/integrations/s3.md#wildcards-in-path). +- `format` — [формат](../../interfaces/formats.md#formats) файла. +- `structure` — cтруктура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`. +- `compression` — автоматически обнаруживает сжатие по расширению файла. Возможные значения: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Необязательный параметр. + +**Возвращаемые значения** + +Таблица с указанной структурой для чтения или записи данных в указанный файл. + +**Примеры** + +Создание таблицы из файла S3 `https://storage.yandexcloud.net/my-test-bucket-768/data.csv` и выбор первых трех столбцов из нее: + +Запрос: + +``` sql +SELECT * +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') +LIMIT 2; +``` + +Результат: + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +То же самое, но файл со сжатием `gzip`: + +Запрос: + +``` sql +SELECT * +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip') +LIMIT 2; +``` + +Результат: + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +## Примеры использования {#usage-examples} + +Предположим, у нас есть несколько файлов со следующими URI на S3: + +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_4.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv' +- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_4.csv' + +Подсчитаем количество строк в файлах, заканчивающихся цифрами от 1 до 3: + +``` sql +SELECT count(*) +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}.csv', 'CSV', 'name String, value UInt32'); +``` + +``` text +┌─count()─┐ +│ 18 │ +└─────────┘ +``` + +Подсчитаем общее количество строк во всех файлах этих двух каталогов: + +``` sql +SELECT 
count(*) +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV', 'name String, value UInt32'); +``` + +``` text +┌─count()─┐ +│ 24 │ +└─────────┘ +``` + +!!! warning "Warning" + Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. + +Подсчитаем общее количество строк в файлах с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: + +``` sql +SELECT count(*) +FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32'); +``` + +``` text +┌─count()─┐ +│ 12 │ +└─────────┘ +``` + +Запишем данные в файл `test-data.csv.gz`: + +``` sql +INSERT INTO s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') +VALUES ('test-data', 1), ('test-data-2', 2); +``` + +Запишем данные из существующей таблицы в файл `test-data.csv.gz`: + +``` sql +INSERT INTO s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') +SELECT name, value FROM existing_table; +``` + +**Смотрите также** + +- [Движок таблиц S3](../../engines/table-engines/integrations/s3.md) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/s3/) + diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index 0cd7c24c663..a41a1f53cde 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -5,21 +5,39 @@ toc_title: url # url {#url} -`url(URL, format, structure)` - возвращает таблицу со столбцами, указанными в -`structure`, созданную из данных находящихся по `URL` в формате `format`. +Функция `url` берет данные по указанному адресу `URL` и создает из них таблицу указанной структуры со столбцами указанного формата. -URL - адрес, по которому сервер принимает `GET` и/или `POST` запросы по -протоколу HTTP или HTTPS. +Функция `url` может быть использована в запросах `SELECT` и `INSERT` с таблицами на движке [URL](../../engines/table-engines/special/url.md). -format - [формат](../../interfaces/formats.md#formats) данных. - -structure - структура таблицы в форме `'UserID UInt64, Name String'`. Определяет имена и типы столбцов. - -**Пример** +**Синтаксис** ``` sql --- получение 3-х строк таблицы, состоящей из двух колонк типа String и UInt32 от сервера, отдающего данные в формате CSV -SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 +url(URL, format, structure) +``` + +**Параметры** + +- `URL` — HTTP или HTTPS-адрес сервера, который может принимать запросы `GET` или `POST` (для запросов `SELECT` или `INSERT` соответственно). Тип: [String](../../sql-reference/data-types/string.md). +- `format` — [формат](../../interfaces/formats.md#formats) данных. Тип: [String](../../sql-reference/data-types/string.md). +- `structure` — структура таблицы в формате `'UserID UInt64, Name String'`. Определяет имена и типы столбцов. Тип: [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +Таблица с указанными форматом и структурой, а также с данными, полученными из указанного адреса `URL`. 
+ +**Примеры** + +Получение с HTTP-сервера первых 3 строк таблицы с данными в формате [CSV](../../interfaces/formats.md#csv), содержащей столбцы типа [String](../../sql-reference/data-types/string.md) и [UInt32](../../sql-reference/data-types/int-uint.md). + +``` sql +SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; +``` + +Вставка данных в таблицу: + +``` sql +CREATE TABLE test_table (column1 String, column2 UInt32) ENGINE=Memory; +INSERT INTO FUNCTION url('http://127.0.0.1:8123/?query=INSERT+INTO+test_table+FORMAT+CSV', 'CSV', 'column1 String, column2 UInt32') VALUES ('http interface', 42); +SELECT * FROM test_table; ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/url/) diff --git a/docs/ru/sql-reference/table-functions/view.md b/docs/ru/sql-reference/table-functions/view.md index 8a97253d048..bfd0a891179 100644 --- a/docs/ru/sql-reference/table-functions/view.md +++ b/docs/ru/sql-reference/table-functions/view.md @@ -8,7 +8,7 @@ view(subquery) ``` -**Входные параметры** +**Аргументы** - `subquery` — запрос `SELECT`. @@ -32,7 +32,7 @@ view(subquery) Запрос: ``` sql -SELECT * FROM view(SELECT name FROM months) +SELECT * FROM view(SELECT name FROM months); ``` Результат: @@ -49,14 +49,15 @@ SELECT * FROM view(SELECT name FROM months) Вы можете использовать функцию `view` как параметр табличных функций [remote](https://clickhouse.tech/docs/ru/sql-reference/table-functions/remote/#remote-remotesecure) и [cluster](https://clickhouse.tech/docs/ru/sql-reference/table-functions/cluster/#cluster-clusterallreplicas): ``` sql -SELECT * FROM remote(`127.0.0.1`, view(SELECT a, b, c FROM table_name)) +SELECT * FROM remote(`127.0.0.1`, view(SELECT a, b, c FROM table_name)); ``` ``` sql -SELECT * FROM cluster(`cluster_name`, view(SELECT a, b, c FROM table_name)) +SELECT * FROM cluster(`cluster_name`, view(SELECT a, b, c FROM table_name)); ``` **Смотрите также** - [view](https://clickhouse.tech/docs/ru/engines/table-engines/special/view/#table_engines-view) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/view/) \ No newline at end of file + +[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/table-functions/view/) diff --git a/docs/ru/whats-new/extended-roadmap.md b/docs/ru/whats-new/extended-roadmap.md deleted file mode 120000 index 471ca0aefaa..00000000000 --- a/docs/ru/whats-new/extended-roadmap.md +++ /dev/null @@ -1 +0,0 @@ -roadmap.md \ No newline at end of file diff --git a/docs/ru/whats-new/extended-roadmap.md b/docs/ru/whats-new/extended-roadmap.md new file mode 100644 index 00000000000..7b317d424f1 --- /dev/null +++ b/docs/ru/whats-new/extended-roadmap.md @@ -0,0 +1,9 @@ +--- +toc_priority: 74 +toc_title: Roadmap +--- + +# Планы развития {#roadmap} + +Планы развития на 2021 год опубликованы для обсуждения [здесь](https://github.com/ClickHouse/ClickHouse/issues/17623). + diff --git a/docs/ru/whats-new/index.md b/docs/ru/whats-new/index.md index b8ba6133454..d8a26423813 100644 --- a/docs/ru/whats-new/index.md +++ b/docs/ru/whats-new/index.md @@ -1,6 +1,8 @@ --- -toc_folder_title: "\u0427\u0442\u043E \u043D\u043E\u0432\u043E\u0433\u043E?" +toc_folder_title: "Что нового?" toc_priority: 82 --- +# Что нового в ClickHouse? +Планы развития вкратце изложены [здесь](extended-roadmap.md), а новости по предыдущим релизам подробно описаны в [журнале изменений](changelog/index.md). 
diff --git a/docs/ru/whats-new/roadmap.md b/docs/ru/whats-new/roadmap.md deleted file mode 120000 index 2c383b2ad5d..00000000000 --- a/docs/ru/whats-new/roadmap.md +++ /dev/null @@ -1 +0,0 @@ -../../en/whats-new/roadmap.md \ No newline at end of file diff --git a/docs/ru/whats-new/security-changelog.md b/docs/ru/whats-new/security-changelog.md index 1f46535833d..e3d26e772c4 100644 --- a/docs/ru/whats-new/security-changelog.md +++ b/docs/ru/whats-new/security-changelog.md @@ -73,4 +73,3 @@ unixODBC позволял указать путь для подключения Обнаружено благодаря: the UK’s National Cyber Security Centre (NCSC) -{## [Оригинальная статья](https://clickhouse.tech/docs/ru/security_changelog/) ##} diff --git a/docs/tools/.gitignore b/docs/tools/.gitignore new file mode 100644 index 00000000000..443cee8638c --- /dev/null +++ b/docs/tools/.gitignore @@ -0,0 +1,3 @@ +build +__pycache__ +*.pyc diff --git a/docs/tools/README.md b/docs/tools/README.md index 3c8862f1079..0a6c41d8089 100644 --- a/docs/tools/README.md +++ b/docs/tools/README.md @@ -51,5 +51,5 @@ The easiest way to see the result is to use `--livereload=8888` argument of buil At the moment there’s no easy way to do just that, but you can consider: -- To hit the “Watch” button on top of GitHub web interface to know as early as possible, even during pull request. Alternative to this is `#github-activity` channel of [public ClickHouse Slack](https://join.slack.com/t/clickhousedb/shared_invite/enQtOTUzMjM4ODQwNTc5LWJmMjE3Yjc2YmI1ZDBlZmI4ZTc3OWY3ZTIwYTljYzY4MzBlODM3YzBjZTc1YmYyODRlZTJkYTgzYzBiNTA2Yjk). +- To hit the “Watch” button on top of GitHub web interface to know as early as possible, even during pull request. Alternative to this is `#github-activity` channel of [public ClickHouse Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-nwwakmk4-xOJ6cdy0sJC3It8j348~IA). - Some search engines allow to subscribe on specific website changes via email and you can opt-in for that for https://clickhouse.tech. diff --git a/docs/tools/build.py b/docs/tools/build.py index dfb9661c326..5a1f10268ab 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -65,8 +65,6 @@ def build_for_lang(lang, args): languages = { 'en': 'English', 'zh': '中文', - 'es': 'Español', - 'fr': 'Français', 'ru': 'Русский', 'ja': '日本語' } @@ -74,8 +72,6 @@ def build_for_lang(lang, args): site_names = { 'en': 'ClickHouse %s Documentation', 'zh': 'ClickHouse文档 %s', - 'es': 'Documentación de ClickHouse %s', - 'fr': 'Documentation ClickHouse %s', 'ru': 'Документация ClickHouse %s', 'ja': 'ClickHouseドキュメント %s' } @@ -183,7 +179,7 @@ if __name__ == '__main__': website_dir = os.path.join(src_dir, 'website') arg_parser = argparse.ArgumentParser() - arg_parser.add_argument('--lang', default='en,es,fr,ru,zh,ja') + arg_parser.add_argument('--lang', default='en,ru,zh,ja') arg_parser.add_argument('--blog-lang', default='en,ru') arg_parser.add_argument('--docs-dir', default='.') arg_parser.add_argument('--theme-dir', default=website_dir) diff --git a/docs/tools/make_links.sh b/docs/tools/make_links.sh index 743d4eebf16..801086178bf 100755 --- a/docs/tools/make_links.sh +++ b/docs/tools/make_links.sh @@ -8,7 +8,7 @@ BASE_DIR=$(dirname $(readlink -f $0)) function do_make_links() { set -x - langs=(en es zh fr ru ja tr fa) + langs=(en zh ru ja) src_file="$1" for lang in "${langs[@]}" do diff --git a/docs/tools/output.md b/docs/tools/output.md deleted file mode 100644 index 91ec6e75999..00000000000 --- a/docs/tools/output.md +++ /dev/null @@ -1,204 +0,0 @@ -# What is ClickHouse? 
{#what-is-clickhouse} - -ClickHouse is a column-oriented database management system (DBMS) for -online analytical processing of queries (OLAP). - -In a “normal” row-oriented DBMS, data is stored in this order: - - Row WatchID JavaEnable Title GoodEvent EventTime - ----- ------------- ------------ -------------------- ----------- --------------------- - #0 89354350662 1 Investor Relations 1 2016-05-18 05:19:20 - #1 90329509958 0 Contact us 1 2016-05-18 08:10:20 - #2 89953706054 1 Mission 1 2016-05-18 07:38:00 - #N ... ... ... ... ... - -In other words, all the values related to a row are physically stored -next to each other. - -Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. -{: .grey } - -In a column-oriented DBMS, data is stored like this: - - Row: #0 #1 #2 #N - ------------- --------------------- --------------------- --------------------- ----- - WatchID: 89354350662 90329509958 89953706054 ... - JavaEnable: 1 0 1 ... - Title: Investor Relations Contact us Mission ... - GoodEvent: 1 1 1 ... - EventTime: 2016-05-18 05:19:20 2016-05-18 08:10:20 2016-05-18 07:38:00 ... - -These examples only show the order that data is arranged in. The values -from different columns are stored separately, and data from the same -column is stored together. - -Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and -Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB -(VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google -PowerDrill, Druid, and kdb+. {: .grey } - -Different orders for storing data are better suited to different -scenarios. The data access scenario refers to what queries are made, how -often, and in what proportion; how much data is read for each type of -query – rows, columns, and bytes; the relationship between reading and -updating data; the working size of the data and how locally it is used; -whether transactions are used, and how isolated they are; requirements -for data replication and logical integrity; requirements for latency and -throughput for each type of query, and so on. - -The higher the load on the system, the more important it is to customize -the system set up to match the requirements of the usage scenario, and -the more fine grained this customization becomes. There is no system -that is equally well-suited to significantly different scenarios. If a -system is adaptable to a wide set of scenarios, under a high load, the -system will handle all the scenarios equally poorly, or will work well -for just one or few of possible scenarios. - -## Key Properties of the OLAP scenario {#key-properties-of-the-olap-scenario} - -- The vast majority of requests are for read access. -- Data is updated in fairly large batches (\> 1000 rows), not by - single rows; or it is not updated at all. -- Data is added to the DB but is not modified. -- For reads, quite a large number of rows are extracted from the DB, - but only a small subset of columns. -- Tables are “wide,” meaning they contain a large number of columns. -- Queries are relatively rare (usually hundreds of queries per server - or less per second). -- For simple queries, latencies around 50 ms are allowed. -- Column values are fairly small: numbers and short strings (for - example, 60 bytes per URL). -- Requires high throughput when processing a single query (up to - billions of rows per second per server). -- Transactions are not necessary. -- Low requirements for data consistency. -- There is one large table per query. 
All tables are small, except for - one. -- A query result is significantly smaller than the source data. In - other words, data is filtered or aggregated, so the result fits in a - single server’s RAM. - -It is easy to see that the OLAP scenario is very different from other -popular scenarios (such as OLTP or Key-Value access). So it doesn’t make -sense to try to use OLTP or a Key-Value DB for processing analytical -queries if you want to get decent performance. For example, if you try -to use MongoDB or Redis for analytics, you will get very poor -performance compared to OLAP databases. - -## Why Column-Oriented Databases Work Better in the OLAP Scenario {#why-column-oriented-databases-work-better-in-the-olap-scenario} - -Column-oriented databases are better suited to OLAP scenarios: they are -at least 100 times faster in processing most queries. The reasons are -explained in detail below, but the fact is easier to demonstrate -visually: - -**Row-oriented DBMS** - -![Row-oriented](images/row_oriented.gif#) - -**Column-oriented DBMS** - -![Column-oriented](images/column_oriented.gif#) - -See the difference? - -### Input/output {#inputoutput} - -1. For an analytical query, only a small number of table columns need - to be read. In a column-oriented database, you can read just the - data you need. For example, if you need 5 columns out of 100, you - can expect a 20-fold reduction in I/O. -2. Since data is read in packets, it is easier to compress. Data in - columns is also easier to compress. This further reduces the I/O - volume. -3. Due to the reduced I/O, more data fits in the system cache. - -For example, the query “count the number of records for each advertising -platform” requires reading one “advertising platform ID” column, which -takes up 1 byte uncompressed. If most of the traffic was not from -advertising platforms, you can expect at least 10-fold compression of -this column. When using a quick compression algorithm, data -decompression is possible at a speed of at least several gigabytes of -uncompressed data per second. In other words, this query can be -processed at a speed of approximately several billion rows per second on -a single server. This speed is actually achieved in practice. - -
- -Example - - $ clickhouse-client - ClickHouse client version 0.0.52053. - Connecting to localhost:9000. - Connected to ClickHouse server version 0.0.52053. - - :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 - - SELECT - CounterID, - count() - FROM hits - GROUP BY CounterID - ORDER BY count() DESC - LIMIT 20 - - ┌─CounterID─┬──count()─┐ - │ 114208 │ 56057344 │ - │ 115080 │ 51619590 │ - │ 3228 │ 44658301 │ - │ 38230 │ 42045932 │ - │ 145263 │ 42042158 │ - │ 91244 │ 38297270 │ - │ 154139 │ 26647572 │ - │ 150748 │ 24112755 │ - │ 242232 │ 21302571 │ - │ 338158 │ 13507087 │ - │ 62180 │ 12229491 │ - │ 82264 │ 12187441 │ - │ 232261 │ 12148031 │ - │ 146272 │ 11438516 │ - │ 168777 │ 11403636 │ - │ 4120072 │ 11227824 │ - │ 10938808 │ 10519739 │ - │ 74088 │ 9047015 │ - │ 115079 │ 8837972 │ - │ 337234 │ 8205961 │ - └───────────┴──────────┘ - - 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) - - :) - -
- -### CPU {#cpu} - -Since executing a query requires processing a large number of rows, it -helps to dispatch all operations for entire vectors instead of for -separate rows, or to implement the query engine so that there is almost -no dispatching cost. If you don’t do this, with any half-decent disk -subsystem, the query interpreter inevitably stalls the CPU. It makes -sense to both store data in columns and process it, when possible, by -columns. - -There are two ways to do this: - -1. A vector engine. All operations are written for vectors, instead of - for separate values. This means you don’t need to call operations - very often, and dispatching costs are negligible. Operation code - contains an optimized internal cycle. - -2. Code generation. The code generated for the query has all the - indirect calls in it. - -This is not done in “normal” databases, because it doesn’t make sense -when running simple queries. However, there are exceptions. For example, -MemSQL uses code generation to reduce latency when processing SQL -queries. (For comparison, analytical DBMSs require optimization of -throughput, not latency.) - -Note that for CPU efficiency, the query language must be declarative -(SQL or MDX), or at least a vector (J, K). The query should only contain -implicit loops, allowing for optimization. - -[Original article](https://clickhouse.tech/docs/en/) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 4106100bfa3..85f9dc2a9dd 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -10,7 +10,7 @@ cssmin==0.2.0 future==0.18.2 htmlmin==0.1.12 idna==2.10 -Jinja2==2.11.2 +Jinja2>=2.11.3 jinja2-highlight==0.6.1 jsmin==2.2.2 livereload==2.6.2 @@ -23,10 +23,9 @@ nltk==3.5 nose==1.3.7 protobuf==3.14.0 numpy==1.19.2 -Pygments==2.5.2 pymdown-extensions==8.0 python-slugify==4.0.1 -PyYAML==5.3.1 +PyYAML==5.4.1 repackage==0.7.3 requests==2.24.0 singledispatch==3.4.0.3 @@ -36,3 +35,4 @@ termcolor==1.1.0 tornado==6.1 Unidecode==1.1.1 urllib3==1.25.10 +Pygments>=2.7.4 diff --git a/docs/tools/single_page.py b/docs/tools/single_page.py index 05d50e768e2..a1e650d3ad3 100644 --- a/docs/tools/single_page.py +++ b/docs/tools/single_page.py @@ -24,55 +24,78 @@ def recursive_values(item): yield item +anchor_not_allowed_chars = re.compile(r'[^\w\-]') +def generate_anchor_from_path(path): + return re.sub(anchor_not_allowed_chars, '-', path) + +absolute_link = re.compile(r'^https?://') + + +def replace_link(match, path): + title = match.group(1) + link = match.group(2) + + # Not a relative link + if re.search(absolute_link, link): + return match.group(0) + + if link.endswith('/'): + link = link[0:-1] + '.md' + + return '{}(#{})'.format(title, generate_anchor_from_path(os.path.normpath(os.path.join(os.path.dirname(path), link)))) + + +# Concatenates Markdown files to a single file. 
def concatenate(lang, docs_path, single_page_file, nav): lang_path = os.path.join(docs_path, lang) - az_re = re.compile(r'[a-z]') proj_config = f'{docs_path}/toc_{lang}.yml' if os.path.exists(proj_config): with open(proj_config) as cfg_file: nav = yaml.full_load(cfg_file.read())['nav'] + files_to_concatenate = list(recursive_values(nav)) files_count = len(files_to_concatenate) logging.info(f'{files_count} files will be concatenated into single md-file for {lang}.') logging.debug('Concatenating: ' + ', '.join(files_to_concatenate)) assert files_count > 0, f'Empty single-page for {lang}' + link_regexp = re.compile(r'(\[[^\]]+\])\(([^)#]+)(?:#[^\)]+)?\)') + for path in files_to_concatenate: - if path.endswith('introduction/info.md'): - continue try: with open(os.path.join(lang_path, path)) as f: - anchors = set() - tmp_path = path.replace('/index.md', '/').replace('.md', '/') - prefixes = ['', '../', '../../', '../../../'] - parts = tmp_path.split('/') - anchors.add(parts[-2] + '/') - anchors.add('/'.join(parts[1:])) - - for part in parts[0:-2] if len(parts) > 2 else parts: - for prefix in prefixes: - anchor = prefix + tmp_path - if anchor: - anchors.add(anchor) - anchors.add('../' + anchor) - anchors.add('../../' + anchor) - tmp_path = tmp_path.replace(part, '..') - - for anchor in anchors: - if re.search(az_re, anchor): - single_page_file.write('' % anchor) - - single_page_file.write('\n') + # Insert a horizontal ruler. Then insert an anchor that we will link to. Its name will be a path to the .md file. + single_page_file.write('\n______\n\n' % generate_anchor_from_path(path)) in_metadata = False - for l in f: - if l.startswith('---'): + for line in f: + # Skip YAML metadata. + if line == '---\n': in_metadata = not in_metadata - if l.startswith('#'): - l = '#' + l + continue + if not in_metadata: - single_page_file.write(l) + # Increase the level of headers. + if line.startswith('#'): + line = '#' + line + + # Replace links within the docs. 
+ + if re.search(link_regexp, line): + line = re.sub( + link_regexp, + lambda match: replace_link(match, path), + line) + + # If failed to replace the relative link, print to log + if '../' in line: + logging.info('Failed to resolve relative link:') + logging.info(path) + logging.info(line) + + single_page_file.write(line) + except IOError as e: logging.warning(str(e)) @@ -86,7 +109,8 @@ def build_single_page_version(lang, args, nav, cfg): extra['single_page'] = True extra['is_amp'] = False - with util.autoremoved_file(os.path.join(args.docs_dir, lang, 'single.md')) as single_md: + single_md_path = os.path.join(args.docs_dir, lang, 'single.md') + with open(single_md_path, 'w') as single_md: concatenate(lang, args.docs_dir, single_md, nav) with util.temp_dir() as site_temp: @@ -123,11 +147,14 @@ def build_single_page_version(lang, args, nav, cfg): single_page_index_html = os.path.join(single_page_output_path, 'index.html') single_page_content_js = os.path.join(single_page_output_path, 'content.js') + with open(single_page_index_html, 'r') as f: sp_prefix, sp_js, sp_suffix = f.read().split('') + with open(single_page_index_html, 'w') as f: f.write(sp_prefix) f.write(sp_suffix) + with open(single_page_content_js, 'w') as f: if args.minify: import jsmin @@ -151,6 +178,7 @@ def build_single_page_version(lang, args, nav, cfg): js_in = ' '.join(website.get_js_in(args)) subprocess.check_call(f'cat {css_in} > {test_dir}/css/base.css', shell=True) subprocess.check_call(f'cat {js_in} > {test_dir}/js/base.js', shell=True) + if args.save_raw_single_page: shutil.copytree(test_dir, args.save_raw_single_page) @@ -194,3 +222,7 @@ def build_single_page_version(lang, args, nav, cfg): subprocess.check_call(' '.join(create_pdf_command), shell=True) logging.info(f'Finished building single page version for {lang}') + + if os.path.exists(single_md_path): + os.unlink(single_md_path) + \ No newline at end of file diff --git a/docs/tools/test.py b/docs/tools/test.py index 7d11157c986..00d1d47137f 100755 --- a/docs/tools/test.py +++ b/docs/tools/test.py @@ -68,17 +68,17 @@ def test_single_page(input_path, lang): f, features='html.parser' ) + anchor_points = set() + duplicate_anchor_points = 0 links_to_nowhere = 0 + for tag in soup.find_all(): for anchor_point in [tag.attrs.get('name'), tag.attrs.get('id')]: if anchor_point: - if anchor_point in anchor_points: - duplicate_anchor_points += 1 - logging.info('Duplicate anchor point: %s' % anchor_point) - else: - anchor_points.add(anchor_point) + anchor_points.add(anchor_point) + for tag in soup.find_all(): href = tag.attrs.get('href') if href and href.startswith('#') and href != '#': @@ -87,11 +87,8 @@ def test_single_page(input_path, lang): logging.info("Tag %s", tag) logging.info('Link to nowhere: %s' % href) - if duplicate_anchor_points: - logging.warning('Found %d duplicate anchor points' % duplicate_anchor_points) - if links_to_nowhere: - if lang == 'en' or lang == 'ru': # TODO: check all languages again + if lang == 'en' or lang == 'ru': logging.error(f'Found {links_to_nowhere} links to nowhere in {lang}') sys.exit(1) else: diff --git a/docs/tools/translate/add_meta_flag.py b/docs/tools/translate/add_meta_flag.py deleted file mode 100755 index d87aa044faf..00000000000 --- a/docs/tools/translate/add_meta_flag.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -import util - -if __name__ == '__main__': - flag_name = sys.argv[1] - path = sys.argv[2] - meta, content = util.read_md_file(path) - meta[flag_name] = True - util.write_md_file(path, 
meta, content) diff --git a/docs/tools/translate/babel-mapping.ini b/docs/tools/translate/babel-mapping.ini deleted file mode 100644 index 6a9a3e5c073..00000000000 --- a/docs/tools/translate/babel-mapping.ini +++ /dev/null @@ -1,3 +0,0 @@ -[python: **.py] -[jinja2: **/templates/**.html] -extensions=jinja2.ext.i18n,jinja2.ext.autoescape,jinja2.ext.with_ diff --git a/docs/tools/translate/filter.py b/docs/tools/translate/filter.py deleted file mode 100755 index 61e1104d345..00000000000 --- a/docs/tools/translate/filter.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import json.decoder - -import pandocfilters -import slugify - -import translate -import util - - -is_debug = os.environ.get('DEBUG') is not None - -filename = os.getenv('INPUT') - - -def debug(*args): - if is_debug: - print(*args, file=sys.stderr) - - -def process_buffer(buffer, new_value, item=None, is_header=False): - if buffer: - text = ''.join(buffer) - - try: - translated_text = translate.translate(text) - except TypeError: - translated_text = text - except json.decoder.JSONDecodeError as e: - print('Failed to translate', str(e), file=sys.stderr) - sys.exit(1) - - debug(f'Translate: "{text}" -> "{translated_text}"') - - if text and text[0].isupper() and not translated_text[0].isupper(): - translated_text = translated_text[0].upper() + translated_text[1:] - - if text.startswith(' ') and not translated_text.startswith(' '): - translated_text = ' ' + translated_text - - if text.endswith(' ') and not translated_text.endswith(' '): - translated_text = translated_text + ' ' - - if is_header and translated_text.endswith('.'): - translated_text = translated_text.rstrip('.') - - title_case = is_header and translate.default_target_language == 'en' and text[0].isupper() - title_case_whitelist = { - 'a', 'an', 'the', 'and', 'or', 'that', - 'of', 'on', 'for', 'from', 'with', 'to', 'in' - } - is_first_iteration = True - for token in translated_text.split(' '): - if title_case and token.isascii() and not token.isupper(): - if len(token) > 1 and token.lower() not in title_case_whitelist: - token = token[0].upper() + token[1:] - elif not is_first_iteration: - token = token.lower() - is_first_iteration = False - - new_value.append(pandocfilters.Str(token)) - new_value.append(pandocfilters.Space()) - - if item is None and len(new_value): - new_value.pop(len(new_value) - 1) - else: - new_value[-1] = item - elif item: - new_value.append(item) - - -def process_sentence(value, is_header=False): - new_value = [] - buffer = [] - for item in value: - if isinstance(item, list): - new_value.append([process_sentence(subitem, is_header) for subitem in item]) - continue - elif isinstance(item, dict): - t = item.get('t') - c = item.get('c') - if t == 'Str': - buffer.append(c) - elif t == 'Space': - buffer.append(' ') - elif t == 'DoubleQuote': - buffer.append('"') - else: - process_buffer(buffer, new_value, item, is_header) - buffer = [] - else: - new_value.append(item) - process_buffer(buffer, new_value, is_header=is_header) - return new_value - - -def translate_filter(key, value, _format, _): - if key not in ['Space', 'Str']: - debug(key, value) - try: - cls = getattr(pandocfilters, key) - except AttributeError: - return - - if key == 'Para' and value: - marker = value[0].get('c') - if isinstance(marker, str) and marker.startswith('!!!') and len(value) > 2: - # Admonition case - if marker != '!!!': - # Lost space after !!! 
case - value.insert(1, pandocfilters.Str(marker[3:])) - value.insert(1, pandocfilters.Space()) - value[0]['c'] = '!!!' - admonition_value = [] - remaining_para_value = [] - in_admonition = True - break_value = [pandocfilters.LineBreak(), pandocfilters.Str(' ' * 4)] - for item in value: - if in_admonition: - if item.get('t') == 'SoftBreak': - in_admonition = False - else: - admonition_value.append(item) - else: - if item.get('t') == 'SoftBreak': - remaining_para_value += break_value - else: - remaining_para_value.append(item) - - if admonition_value[-1].get('t') == 'Quoted': - text = process_sentence(admonition_value[-1]['c'][-1]) - text[0]['c'] = '"' + text[0]['c'] - text[-1]['c'] = text[-1]['c'] + '"' - admonition_value.pop(-1) - admonition_value += text - else: - text = admonition_value[-1].get('c') - if text: - text = translate.translate(text[0].upper() + text[1:]) - admonition_value.append(pandocfilters.Space()) - admonition_value.append(pandocfilters.Str(f'"{text}"')) - - return cls(admonition_value + break_value + process_sentence(remaining_para_value)) - else: - return cls(process_sentence(value)) - elif key == 'Plain' or key == 'Strong' or key == 'Emph': - return cls(process_sentence(value)) - elif key == 'Link': - try: - # Plain links case - if value[2][0] == value[1][0].get('c'): - return pandocfilters.Str(value[2][0]) - except IndexError: - pass - - value[1] = process_sentence(value[1]) - href = value[2][0] - if not (href.startswith('http') or href.startswith('#')): - anchor = None - attempts = 10 - if '#' in href: - href, anchor = href.split('#', 1) - if href.endswith('.md') and not href.startswith('/'): - parts = [part for part in os.environ['INPUT'].split('/') if len(part) == 2] - lang = parts[-1] - script_path = os.path.dirname(__file__) - base_path = os.path.abspath(f'{script_path}/../../{lang}') - href = os.path.join( - os.path.relpath(base_path, os.path.dirname(os.environ['INPUT'])), - os.path.relpath(href, base_path) - ) - if anchor: - href = f'{href}#{anchor}' - value[2][0] = href - return cls(*value) - elif key == 'Header': - if value[1][0].islower() and '_' not in value[1][0]: # Preserve some manually specified anchors - value[1][0] = slugify.slugify(value[1][0], separator='-', word_boundary=True, save_order=True) - - # TODO: title case header in en - value[2] = process_sentence(value[2], is_header=True) - return cls(*value) - elif key == 'SoftBreak': - return pandocfilters.LineBreak() - - return - - -if __name__ == "__main__": - os.environ['INPUT'] = os.path.abspath(os.environ['INPUT']) - pwd = os.path.dirname(filename or '.') - if pwd: - with util.cd(pwd): - pandocfilters.toJSONFilter(translate_filter) - else: - pandocfilters.toJSONFilter(translate_filter) diff --git a/docs/tools/translate/normalize-markdown.sh b/docs/tools/translate/normalize-markdown.sh deleted file mode 100755 index 7850fa34b1d..00000000000 --- a/docs/tools/translate/normalize-markdown.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -# Usage: normalize-en-markdown.sh -set -e -BASE_DIR=$(dirname $(readlink -f $0)) -TEMP_FILE=$(mktemp) -trap 'rm -f -- "${TEMP_FILE}"' INT TERM HUP EXIT -INPUT="$1" -if [[ ! 
-L "${INPUT}" ]] -then - export INPUT - cat "${INPUT}" > "${TEMP_FILE}" - "${BASE_DIR}/translate.sh" "en" "${TEMP_FILE}" "${INPUT}" -fi diff --git a/docs/tools/translate/remove_machine_translated_meta.py b/docs/tools/translate/remove_machine_translated_meta.py deleted file mode 100755 index 26cfde97f1e..00000000000 --- a/docs/tools/translate/remove_machine_translated_meta.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) -import convert_toc -import util - - -if __name__ == '__main__': - path = sys.argv[1][2:] - convert_toc.init_redirects() - try: - path = convert_toc.redirects[path] - except KeyError: - pass - meta, content = util.read_md_file(path) - if 'machine_translated' in meta: - del meta['machine_translated'] - if 'machine_translated_rev' in meta: - del meta['machine_translated_rev'] - util.write_md_file(path, meta, content) diff --git a/docs/tools/translate/replace-with-translation.sh b/docs/tools/translate/replace-with-translation.sh deleted file mode 100755 index 922ac65a921..00000000000 --- a/docs/tools/translate/replace-with-translation.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -# Usage: replace-with-translation.sh -set -e -BASE_DIR=$(dirname $(readlink -f $0)) -TEMP_FILE=$(mktemp) -trap 'rm -f -- "${TEMP_FILE}"' INT TERM HUP EXIT -TARGET_LANGUAGE="$1" -export INPUT="$2" -cat "${INPUT}" > "${TEMP_FILE}" -if [[ ! -z $SLEEP ]] -then - sleep $[ ( $RANDOM % 20 ) + 1 ]s -fi -rm -f "${INPUT}" -mkdir -p $(dirname "${INPUT}") || true -YANDEX=1 "${BASE_DIR}/translate.sh" "${TARGET_LANGUAGE}" "${TEMP_FILE}" "${INPUT}" -git add "${INPUT}" diff --git a/docs/tools/translate/requirements.txt b/docs/tools/translate/requirements.txt deleted file mode 100644 index 1bbd119b823..00000000000 --- a/docs/tools/translate/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -Babel==2.8.0 -certifi==2020.6.20 -chardet==3.0.4 -googletrans==3.0.0 -idna==2.10 -Jinja2==2.11.2 -pandocfilters==1.4.2 -python-slugify==4.0.1 -PyYAML==5.3.1 -requests==2.24.0 -text-unidecode==1.3 -urllib3==1.25.10 diff --git a/docs/tools/translate/split_meta.py b/docs/tools/translate/split_meta.py deleted file mode 100755 index b38b93e10b4..00000000000 --- a/docs/tools/translate/split_meta.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -import os -import subprocess -import sys - -import translate -import util - - -if __name__ == '__main__': - path = sys.argv[1] - content_path = f'{path}.content' - meta_path = f'{path}.meta' - meta, content = util.read_md_file(path) - - target_language = os.getenv('TARGET_LANGUAGE') - if target_language is not None and target_language != 'en': - rev = subprocess.check_output( - 'git rev-parse HEAD', shell=True - ).decode('utf-8').strip() - meta['machine_translated'] = True - meta['machine_translated_rev'] = rev - title = meta.get('toc_title') - if title: - meta['toc_title'] = translate.translate(title, target_language) - folder_title = meta.get('toc_folder_title') - if folder_title: - meta['toc_folder_title'] = translate.translate(folder_title, target_language) - if 'en_copy' in meta: - del meta['en_copy'] - - with open(content_path, 'w') as f: - print(content, file=f) - - util.write_md_file(meta_path, meta, '') diff --git a/docs/tools/translate/translate.py b/docs/tools/translate/translate.py deleted file mode 100755 index 605ff78f424..00000000000 --- a/docs/tools/translate/translate.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 - -import os -import random -import re 
-import sys -import time -import urllib.parse - -import googletrans -import requests -import yaml - - -translator = googletrans.Translator() -default_target_language = os.environ.get('TARGET_LANGUAGE', 'ru') -curly_braces_re = re.compile('({[^}]+})') - -is_yandex = os.environ.get('YANDEX') is not None - - -def translate_impl(text, target_language=None): - target_language = target_language or default_target_language - if target_language == 'en': - return text - elif is_yandex: - text = text.replace('‘', '\'') - text = text.replace('’', '\'') - has_alpha = any([char.isalpha() for char in text]) - if text.isascii() and has_alpha and not text.isupper(): - text = urllib.parse.quote(text) - url = f'http://translate.yandex.net/api/v1/tr.json/translate?srv=docs&lang=en-{target_language}&text={text}' - result = requests.get(url).json() - if result.get('code') == 200: - return result['text'][0] - else: - result = str(result) - print(f'Failed to translate "{text}": {result}', file=sys.stderr) - sys.exit(1) - else: - return text - else: - time.sleep(random.random()) - return translator.translate(text, target_language).text - - -def translate(text, target_language=None): - return "".join( - [ - part - if part.startswith("{") and part.endswith("}") - else translate_impl(part, target_language=target_language) - for part in re.split(curly_braces_re, text) - ] - ) - - -def translate_po(): - import babel.messages.pofile - base_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'website', 'locale') - for lang in ['en', 'zh', 'es', 'fr', 'ru', 'ja']: - po_path = os.path.join(base_dir, lang, 'LC_MESSAGES', 'messages.po') - with open(po_path, 'r') as f: - po_file = babel.messages.pofile.read_po(f, locale=lang, domain='messages') - for item in po_file: - if not item.string: - global is_yandex - is_yandex = True - item.string = translate(item.id, lang) - with open(po_path, 'wb') as f: - babel.messages.pofile.write_po(f, po_file) - - -if __name__ == '__main__': - target_language = sys.argv[1] - if target_language == 'po': - translate_po() - else: - result = translate_toc(yaml.full_load(sys.stdin.read())['nav'], sys.argv[1]) - print(yaml.dump({'nav': result})) diff --git a/docs/tools/translate/translate.sh b/docs/tools/translate/translate.sh deleted file mode 100755 index 1acf645eb81..00000000000 --- a/docs/tools/translate/translate.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Usage: translate.sh -set -e -BASE_DIR=$(dirname $(readlink -f $0)) -OUTPUT=${3:-/dev/stdout} -export TARGET_LANGUAGE="$1" -export DEBUG -TEMP_FILE=$(mktemp) -export INPUT_PATH="$2" -INPUT_META="${INPUT_PATH}.meta" -INPUT_CONTENT="${INPUT_PATH}.content" - -trap 'rm -f -- "${TEMP_FILE}" "${INPUT_META}" "${INPUT_CONTENT}"' INT TERM HUP EXIT -source "${BASE_DIR}/venv/bin/activate" - -${BASE_DIR}/split_meta.py "${INPUT_PATH}" - -pandoc "${INPUT_CONTENT}" --filter "${BASE_DIR}/filter.py" -o "${TEMP_FILE}" \ - -f "markdown-space_in_atx_header" -t "markdown_strict+pipe_tables+markdown_attribute+all_symbols_escapable+backtick_code_blocks+autolink_bare_uris-link_attributes+markdown_attribute+mmd_link_attributes-raw_attribute+header_attributes-grid_tables+definition_lists" \ - --atx-headers --wrap=none --columns=99999 --tab-stop=4 -perl -pi -e 's/{\\#\\#/{##/g' "${TEMP_FILE}" -perl -pi -e 's/\\#\\#}/##}/g' "${TEMP_FILE}" -perl -pi -e 's/ *$//gg' "${TEMP_FILE}" -if [[ "${TARGET_LANGUAGE}" == "ru" ]] -then - perl -pi -e 's/“/«/gg' "${TEMP_FILE}" - perl -pi -e 's/”/»/gg' "${TEMP_FILE}" -fi -cat "${INPUT_META}" "${TEMP_FILE}" > 
"${OUTPUT}" diff --git a/docs/tools/translate/typograph_ru.py b/docs/tools/translate/typograph_ru.py deleted file mode 100644 index 2d970cf2a2e..00000000000 --- a/docs/tools/translate/typograph_ru.py +++ /dev/null @@ -1,45 +0,0 @@ -import requests - -class TypographError(Exception): - pass - - -def typograph(text): - text = text.replace('&', '&') - text = text.replace('<', '<') - text = text.replace('>', '>') - template = f''' - - - - {text} - 3 - 0 - 0 - 0 - - - - ''' - result = requests.post( - url='http://typograf.artlebedev.ru/webservices/typograf.asmx', - data=template.encode('utf-8'), - headers={ - 'Content-Type': 'text/xml', - 'SOAPAction': 'http://typograf.artlebedev.ru/webservices/ProcessText' - } - ) - if result.ok and 'ProcessTextResult' in result.text: - result_text = result.text.split('')[1].split('')[0].rstrip() - result_text = result_text.replace('&', '&') - result_text = result_text.replace('<', '<') - result_text = result_text.replace('>', '>') - return result_text - else: - raise TypographError(result.text) - - -if __name__ == '__main__': - import sys - print((typograph(sys.stdin.read()))) diff --git a/docs/tools/translate/update-all-machine-translated.sh b/docs/tools/translate/update-all-machine-translated.sh deleted file mode 100755 index fae2aae787f..00000000000 --- a/docs/tools/translate/update-all-machine-translated.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -BASE_DIR=$(dirname $(readlink -f $0)) - -function translate() { - set -x - LANGUAGE=$1 - DOCS_ROOT="${BASE_DIR}/../../" - REV="$(git rev-parse HEAD)" - for FILENAME in $(find "${DOCS_ROOT}${LANGUAGE}" -name "*.md" -type f) - do - HAS_MT_TAG=$(grep -c "machine_translated: true" "${FILENAME}") - IS_UP_TO_DATE=$(grep -c "machine_translated_rev: \"${REV}\"" "${FILENAME}") - if [ "${HAS_MT_TAG}" -eq "1" ] && [ "${IS_UP_TO_DATE}" -eq "0" ] - then - set -e - EN_FILENAME=${FILENAME/\/${LANGUAGE}\///en/} - rm "${FILENAME}" || true - cp "${EN_FILENAME}" "${FILENAME}" - DEBUG=1 SLEEP=1 ${BASE_DIR}/replace-with-translation.sh ${LANGUAGE} "${FILENAME}" - set +e - fi - done -} -export BASE_DIR -export -f translate -parallel translate ::: es fr zh ja fa tr diff --git a/docs/tools/translate/update-po.sh b/docs/tools/translate/update-po.sh deleted file mode 100755 index f2f4039bcb8..00000000000 --- a/docs/tools/translate/update-po.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -# Usage: update-po.sh -set -ex -BASE_DIR=$(dirname $(readlink -f $0)) -WEBSITE_DIR="${BASE_DIR}/../../../website" -LOCALE_DIR="${WEBSITE_DIR}/locale" -MESSAGES_POT="${LOCALE_DIR}/messages.pot" -BABEL_INI="${BASE_DIR}/babel-mapping.ini" -LANGS="en zh es fr ru ja tr fa" -source "${BASE_DIR}/venv/bin/activate" -cd "${WEBSITE_DIR}" -pybabel extract "." 
-o "${MESSAGES_POT}" -F "${BABEL_INI}" -for L in ${LANGS} -do - pybabel update -d locale -l "${L}" -i "${MESSAGES_POT}" || \ - pybabel init -d locale -l "${L}" -i "${MESSAGES_POT}" -done -python3 "${BASE_DIR}/translate.py" po -for L in ${LANGS} -do - pybabel compile -d locale -l "${L}" -done diff --git a/docs/tools/translate/util.py b/docs/tools/translate/util.py deleted file mode 120000 index 7f16d68497e..00000000000 --- a/docs/tools/translate/util.py +++ /dev/null @@ -1 +0,0 @@ -../util.py \ No newline at end of file diff --git a/docs/tools/util.py b/docs/tools/util.py index b840dc1168a..25961561f99 100644 --- a/docs/tools/util.py +++ b/docs/tools/util.py @@ -22,15 +22,6 @@ def temp_dir(): shutil.rmtree(path) -@contextlib.contextmanager -def autoremoved_file(path): - try: - with open(path, 'w') as handle: - yield handle - finally: - os.unlink(path) - - @contextlib.contextmanager def cd(new_cwd): old_cwd = os.getcwd() diff --git a/docs/zh/commercial/cloud.md b/docs/zh/commercial/cloud.md index 9cca8776d14..c74ffa93e9a 100644 --- a/docs/zh/commercial/cloud.md +++ b/docs/zh/commercial/cloud.md @@ -29,6 +29,18 @@ toc_title: 云 - 跨可用区扩展以实现性能和高可用性 - 内置监控和SQL查询编辑器 +## 阿里云 {#alibaba-cloud} + +阿里云的 ClickHouse 托管服务 [中国站](https://www.aliyun.com/product/clickhouse) (国际站于2021年5月初开放) 提供以下主要功能: + +- 基于阿里飞天分布式系统的高可靠云盘存储引擎 +- 按需扩容,无需手动进行数据搬迁 +- 支持单节点、单副本、多节点、多副本多种架构,支持冷热数据分层 +- 支持访问白名单和一键恢复,多层网络安全防护,云盘加密 +- 与云上日志系统、数据库、数据应用工具无缝集成 +- 内置监控和数据库管理平台 +- 专业的数据库专家技术支持和服务 + ## 腾讯云 {#tencent-cloud} [腾讯云的 ClickHouse 托管服务](https://cloud.tencent.com/product/cdwch)提供以下主要功能: diff --git a/docs/zh/engines/table-engines/mergetree-family/replication.md b/docs/zh/engines/table-engines/mergetree-family/replication.md index 07ef255d6bb..a4d03e8da36 100644 --- a/docs/zh/engines/table-engines/mergetree-family/replication.md +++ b/docs/zh/engines/table-engines/mergetree-family/replication.md @@ -47,7 +47,7 @@ 如果配置文件中没有设置 ZooKeeper ,则无法创建复制表,并且任何现有的复制表都将变为只读。 -`SELECT` 查询并不需要借助 ZooKeeper ,复本并不影响 `SELECT` 的性能,查询复制表与非复制表速度是一样的。查询分布式表时,ClickHouse的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) 修改。 +`SELECT` 查询并不需要借助 ZooKeeper ,副本并不影响 `SELECT` 的性能,查询复制表与非复制表速度是一样的。查询分布式表时,ClickHouse的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) 修改。 对于每个 `INSERT` 语句,会通过几个事务将十来个记录添加到 ZooKeeper。(确切地说,这是针对每个插入的数据块; 每个 INSERT 语句的每 `max_insert_block_size = 1048576` 行和最后剩余的都各算作一个块。)相比非复制表,写 zk 会导致 `INSERT` 的延迟略长一些。但只要你按照建议每秒不超过一个 `INSERT` 地批量插入数据,不会有任何问题。一个 ZooKeeper 集群能给整个 ClickHouse 集群支撑协调每秒几百个 `INSERT`。数据插入的吞吐量(每秒的行数)可以跟不用复制的数据一样高。 diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md new file mode 100644 index 00000000000..d252b4e293e --- /dev/null +++ b/docs/zh/faq/terms_translation_zh.md @@ -0,0 +1,38 @@ +# 术语翻译约定 +本文档用来维护从英文翻译成中文的术语集。 + + + +## 保持英文,不译 +Parquet + +## 英文 <-> 中文 +Integer 整数 +floating-point 浮点数 +Fitting 拟合 +Decimal 定点数 +Tuple 元组 +function 函数 +array 数组/阵列 +hash 哈希/散列 +Parameters 参数 +Arguments 参数 + + +## +1. 对于array的翻译,保持初始翻译 数组/阵列 不变。 + +2. 对于倒装句。翻译时非直译,会调整语序。 +比如, groupArrayInsertAt 翻译中 + +``` text +- `x` — [Expression] resulting in one of the [supported data types]. 
+``` + +``` text +`x` — 生成所[支持的数据类型](数据)的[表达式]。 +``` + +3. See also 参见 + + diff --git a/docs/zh/getting-started/index.md b/docs/zh/getting-started/index.md index fdffca954f7..c5ec7ded932 100644 --- a/docs/zh/getting-started/index.md +++ b/docs/zh/getting-started/index.md @@ -1,7 +1,5 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "\u5BFC\u8A00" +toc_folder_title: 快速上手 toc_priority: 2 --- @@ -9,7 +7,7 @@ toc_priority: 2 如果您是ClickHouse的新手,并希望亲身体验它的性能。 -首先需要进行 [环境安装与部署](install.md). +首先需要完成 [安装与部署](install.md). 之后,您可以通过教程与示例数据完成自己的入门第一步: diff --git a/docs/zh/guides/apply-catboost-model.md b/docs/zh/guides/apply-catboost-model.md index 5e374751052..9002e5cf005 100644 --- a/docs/zh/guides/apply-catboost-model.md +++ b/docs/zh/guides/apply-catboost-model.md @@ -238,6 +238,6 @@ FROM ``` !!! note "注" - 查看函数说明 [avg()](../sql-reference/aggregate-functions/reference.md#agg_function-avg) 和 [log()](../sql-reference/functions/math-functions.md) 。 + 查看函数说明 [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) 和 [log()](../sql-reference/functions/math-functions.md) 。 [原始文章](https://clickhouse.tech/docs/en/guides/apply_catboost_model/) diff --git a/docs/zh/interfaces/index.md b/docs/zh/interfaces/index.md index b678adc765a..4bc14539896 100644 --- a/docs/zh/interfaces/index.md +++ b/docs/zh/interfaces/index.md @@ -1,5 +1,5 @@ --- -toc_folder_title: Interfaces +toc_folder_title: 接口 toc_priority: 14 toc_title: 客户端 --- diff --git a/docs/zh/introduction/distinctive-features.md b/docs/zh/introduction/distinctive-features.md index e9a506f2481..f74c98a0c1d 100644 --- a/docs/zh/introduction/distinctive-features.md +++ b/docs/zh/introduction/distinctive-features.md @@ -17,7 +17,7 @@ toc_title: ClickHouse的特性 在一些列式数据库管理系统中(例如:InfiniDB CE 和 MonetDB) 并没有使用数据压缩。但是, 若想达到比较优异的性能,数据压缩确实起到了至关重要的作用。 -除了在磁盘空间和CPU消耗之间进行不同权衡的高效通用压缩编解码器之外,ClickHouse还提供针对特定类型数据的[专用编解码器](../sql-reference/statements/create/table.md#create-query-specialized-codecs),这使得ClickHouse能够与更小的数据库(如时间序列数据库)竞争并超越它们。 +除了在磁盘空间和CPU消耗之间进行不同权衡的高效通用压缩编解码器之外,ClickHouse还提供针对特定类型数据的[专用编解码器](../sql-reference/statements/create.md#create-query-specialized-codecs),这使得ClickHouse能够与更小的数据库(如时间序列数据库)竞争并超越它们。 ## 数据的磁盘存储 {#shu-ju-de-ci-pan-cun-chu} diff --git a/docs/zh/introduction/index.md b/docs/zh/introduction/index.md index 3b9deddd5cc..64466809d18 100644 --- a/docs/zh/introduction/index.md +++ b/docs/zh/introduction/index.md @@ -1,7 +1,5 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "\u5BFC\u8A00" +toc_folder_title: 简介 toc_priority: 1 --- diff --git a/docs/zh/operations/settings/query-complexity.md b/docs/zh/operations/settings/query-complexity.md index 1e7dc82f7e1..680d5e001e0 100644 --- a/docs/zh/operations/settings/query-complexity.md +++ b/docs/zh/operations/settings/query-complexity.md @@ -45,7 +45,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which 用于在单个服务器上运行用户查询的最大RAM量。 -默认值定义在 [设置。h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L244). 默认情况下,金额不受限制 (`max_memory_usage_for_user = 0`). +默认值定义在 [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L244). 默认情况下,数额不受限制 (`max_memory_usage_for_user = 0`). 另请参阅说明 [max_memory_usage](#settings_max_memory_usage). 
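
The hunk above documents `max_memory_usage_for_user`, whose default of `0` means the amount is unlimited. As a minimal illustrative sketch outside the patch itself (the value below is a placeholder, not a recommendation), the setting can be inspected and overridden for the current session on a recent ClickHouse server:

``` sql
-- Inspect the current limit; 0 means the amount is unlimited.
SELECT name, value
FROM system.settings
WHERE name = 'max_memory_usage_for_user';

-- Cap per-user memory at roughly 10 GB for this session (placeholder value).
SET max_memory_usage_for_user = 10000000000;
```
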
@@ -53,7 +53,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which 用于在单个服务器上运行所有查询的最大RAM数量。 -默认值定义在 [设置。h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L245). 默认情况下,金额不受限制 (`max_memory_usage_for_all_queries = 0`). +默认值定义在 [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L245). 默认情况下,数额不受限制 (`max_memory_usage_for_all_queries = 0`). 另请参阅说明 [max_memory_usage](#settings_max_memory_usage). diff --git a/docs/zh/operations/settings/settings.md b/docs/zh/operations/settings/settings.md index 64625c19c6a..720b822ce29 100644 --- a/docs/zh/operations/settings/settings.md +++ b/docs/zh/operations/settings/settings.md @@ -988,15 +988,15 @@ ClickHouse生成异常 ## count_distinct_implementation {#settings-count_distinct_implementation} -指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference.md#agg_function-count) 建筑。 +指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 可能的值: -- [uniq](../../sql-reference/aggregate-functions/reference.md#agg_function-uniq) -- [uniqCombined](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined) -- [uniqCombined64](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined64) -- [uniqHLL12](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqhll12) -- [uniqExact](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqexact) +- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) 默认值: `uniqExact`. diff --git a/docs/zh/operations/system-tables/query_log.md b/docs/zh/operations/system-tables/query_log.md index 6d8d7a39699..aa954fc4845 100644 --- a/docs/zh/operations/system-tables/query_log.md +++ b/docs/zh/operations/system-tables/query_log.md @@ -5,86 +5,87 @@ machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 # system.query_log {#system_tables-query_log} -包含有关已执行查询的信息,例如,开始时间、处理持续时间、错误消息。 +包含已执行查询的相关信息,例如:开始时间、处理持续时间、错误消息。 !!! note "注" 此表不包含以下内容的摄取数据 `INSERT` 查询。 -您可以更改查询日志记录的设置 [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) 服务器配置部分。 +您可以更改query_log的设置,在服务器配置的 [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) 部分。 -您可以通过设置禁用查询日志记录 [log_queries=0](../../operations/settings/settings.md#settings-log-queries). 我们不建议关闭日志记录,因为此表中的信息对于解决问题很重要。 +您可以通过设置 [log_queries=0](../../operations/settings/settings.md#settings-log-queries)来禁用query_log. 
我们不建议关闭此日志,因为此表中的信息对于解决问题很重要。 -数据的冲洗周期设置在 `flush_interval_milliseconds` 的参数 [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) 服务器设置部分。 要强制冲洗,请使用 [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs) 查询。 +数据刷新的周期可通过 `flush_interval_milliseconds` 参数来设置 [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) 。 要强制刷新,请使用 [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs)。 -ClickHouse不会自动从表中删除数据。 看 [导言](../../operations/system-tables/index.md#system-tables-introduction) 欲了解更多详情。 +ClickHouse不会自动从表中删除数据。更多详情请看 [introduction](../../operations/system-tables/index.md#system-tables-introduction) 。 -该 `system.query_log` 表注册两种查询: +`system.query_log` 表注册两种查询: 1. 客户端直接运行的初始查询。 2. 由其他查询启动的子查询(用于分布式查询执行)。 对于这些类型的查询,有关父查询的信息显示在 `initial_*` 列。 -每个查询创建一个或两个行中 `query_log` 表,这取决于状态(见 `type` 列)的查询: +每个查询在`query_log` 表中创建一或两行记录,这取决于查询的状态(见 `type` 列): -1. 如果查询执行成功,则两行具有 `QueryStart` 和 `QueryFinish` 创建类型。 -2. 如果在查询处理过程中发生错误,两个事件与 `QueryStart` 和 `ExceptionWhileProcessing` 创建类型。 -3. 如果在启动查询之前发生错误,则单个事件具有 `ExceptionBeforeStart` 创建类型。 +1. 如果查询执行成功,会创建type分别为`QueryStart` 和 `QueryFinish` 的两行记录。 +2. 如果在查询处理过程中发生错误,会创建type分别为`QueryStart` 和 `ExceptionWhileProcessing` 的两行记录。 +3. 如果在启动查询之前发生错误,则创建一行type为`ExceptionBeforeStart` 的记录。 列: -- `type` ([枚举8](../../sql-reference/data-types/enum.md)) — Type of an event that occurred when executing the query. Values: - - `'QueryStart' = 1` — Successful start of query execution. - - `'QueryFinish' = 2` — Successful end of query execution. - - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. - - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. -- `event_date` ([日期](../../sql-reference/data-types/date.md)) — Query starting date. -- `event_time` ([日期时间](../../sql-reference/data-types/datetime.md)) — Query starting time. -- `query_start_time` ([日期时间](../../sql-reference/data-types/datetime.md)) — Start time of query execution. -- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Duration of query execution in milliseconds. -- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number or rows read from all tables and table functions participated in query. It includes usual subqueries, subqueries for `IN` 和 `JOIN`. 对于分布式查询 `read_rows` 包括在所有副本上读取的行总数。 每个副本发送它的 `read_rows` 值,并且查询的服务器-发起方汇总所有接收到的和本地的值。 缓存卷不会影响此值。 -- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number or bytes read from all tables and table functions participated in query. It includes usual subqueries, subqueries for `IN` 和 `JOIN`. 对于分布式查询 `read_bytes` 包括在所有副本上读取的行总数。 每个副本发送它的 `read_bytes` 值,并且查询的服务器-发起方汇总所有接收到的和本地的值。 缓存卷不会影响此值。 -- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` 查询,写入的行数。 对于其他查询,列值为0。 -- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` 查询时,写入的字节数。 对于其他查询,列值为0。 -- `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of rows in a result of the `SELECT` 查询,或者在一些行 `INSERT` 查询。 -- `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — RAM volume in bytes used to store a query result. 
-- `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Memory consumption by the query. -- `query` ([字符串](../../sql-reference/data-types/string.md)) — Query string. -- `exception` ([字符串](../../sql-reference/data-types/string.md)) — Exception message. -- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — Code of an exception. -- `stack_trace` ([字符串](../../sql-reference/data-types/string.md)) — [堆栈跟踪](https://en.wikipedia.org/wiki/Stack_trace). 如果查询成功完成,则为空字符串。 -- `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Query type. Possible values: - - 1 — Query was initiated by the client. - - 0 — Query was initiated by another query as part of distributed query execution. -- `user` ([字符串](../../sql-reference/data-types/string.md)) — Name of the user who initiated the current query. -- `query_id` ([字符串](../../sql-reference/data-types/string.md)) — ID of the query. -- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that was used to make the query. -- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the query. -- `initial_user` ([字符串](../../sql-reference/data-types/string.md)) — Name of the user who ran the initial query (for distributed query execution). -- `initial_query_id` ([字符串](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution). -- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that the parent query was launched from. -- `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the parent query. -- `interface` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Interface that the query was initiated from. Possible values: +- `type` ([Enum8](../../sql-reference/data-types/enum.md)) — 执行查询时的事件类型. 值: + - `'QueryStart' = 1` — 查询成功启动. + - `'QueryFinish' = 2` — 查询成功完成. + - `'ExceptionBeforeStart' = 3` — 查询执行前有异常. + - `'ExceptionWhileProcessing' = 4` — 查询执行期间有异常. +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — 查询开始日期. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 查询开始时间. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — 查询开始时间(毫秒精度). +- `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 查询执行的开始时间. +- `query_start_time_microseconds` (DateTime64) — 查询执行的开始时间(毫秒精度). +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 查询消耗的时间(毫秒). +- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 从参与了查询的所有表和表函数读取的总行数. 包括:普通的子查询, `IN` 和 `JOIN`的子查询. 对于分布式查询 `read_rows` 包括在所有副本上读取的行总数。 每个副本发送它的 `read_rows` 值,并且查询的服务器-发起方汇总所有接收到的和本地的值。 缓存卷不会影响此值。 +- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 从参与了查询的所有表和表函数读取的总字节数. 包括:普通的子查询, `IN` 和 `JOIN`的子查询. 
对于分布式查询 `read_bytes` 包括在所有副本上读取的字节总数。 每个副本发送它的 `read_bytes` 值,并且查询的服务器-发起方汇总所有接收到的和本地的值。 缓存卷不会影响此值。 +- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 对于 `INSERT` 查询,为写入的行数。 对于其他查询,值为0。 +- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 对于 `INSERT` 查询时,为写入的字节数。 对于其他查询,值为0。 +- `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — `SELECT` 查询结果的行数,或`INSERT` 的行数。 +- `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 存储查询结果的RAM量. +- `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 查询使用的内存. +- `query` ([String](../../sql-reference/data-types/string.md)) — 查询语句. +- `exception` ([String](../../sql-reference/data-types/string.md)) — 异常信息. +- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — 异常码. +- `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [Stack Trace](https://en.wikipedia.org/wiki/Stack_trace). 如果查询成功完成,则为空字符串。 +- `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 查询类型. 可能的值: + - 1 — 客户端发起的查询. + - 0 — 由另一个查询发起的,作为分布式查询的一部分. +- `user` ([String](../../sql-reference/data-types/string.md)) — 发起查询的用户. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — 查询ID. +- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 发起查询的客户端IP地址. +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — 发起查询的客户端端口. +- `initial_user` ([String](../../sql-reference/data-types/string.md)) — 初始查询的用户名(用于分布式查询执行). +- `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — 运行初始查询的ID(用于分布式查询执行). +- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — 运行父查询的IP地址. +- `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — 发起父查询的客户端端口. +- `interface` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 发起查询的接口. 可能的值: - 1 — TCP. - 2 — HTTP. -- `os_user` ([字符串](../../sql-reference/data-types/string.md)) — Operating system username who runs [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md). -- `client_hostname` ([字符串](../../sql-reference/data-types/string.md)) — Hostname of the client machine where the [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或者运行另一个TCP客户端。 -- `client_name` ([字符串](../../sql-reference/data-types/string.md)) — The [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或另一个TCP客户端名称。 -- `client_revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Revision of the [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或另一个TCP客户端。 -- `client_version_major` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Major version of the [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或另一个TCP客户端。 -- `client_version_minor` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Minor version of the [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或另一个TCP客户端。 -- `client_version_patch` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Patch component of the [ツ环板clientョツ嘉ッツ偲](../../interfaces/cli.md) 或另一个TCP客户端版本。 -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` 方法被使用。 - - 2 — `POST` 方法被使用。 -- `http_user_agent` ([字符串](../../sql-reference/data-types/string.md)) — The `UserAgent` http请求中传递的标头。 -- `quota_key` ([字符串](../../sql-reference/data-types/string.md)) — The “quota key” 在指定 [配额](../../operations/quotas.md) 设置(见 `keyed`). 
+- `os_user` ([String](../../sql-reference/data-types/string.md)) — 运行 [clickhouse-client](../../interfaces/cli.md)的操作系统用户名. +- `client_hostname` ([String](../../sql-reference/data-types/string.md)) — 运行[clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的机器的主机名。 +- `client_name` ([String](../../sql-reference/data-types/string.md)) — [clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的名称。 +- `client_revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — [clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的Revision。 +- `client_version_major` ([UInt32](../../sql-reference/data-types/int-uint.md)) — [clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的Major version。 +- `client_version_minor` ([UInt32](../../sql-reference/data-types/int-uint.md)) — [clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的Minor version。 +- `client_version_patch` ([UInt32](../../sql-reference/data-types/int-uint.md)) — [clickhouse-client](../../interfaces/cli.md) 或其他TCP客户端的Patch component。 +- `http_method` (UInt8) — 发起查询的HTTP方法. 可能值: + - 0 — TCP接口的查询. + - 1 — `GET` + - 2 — `POST` +- `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — The `UserAgent` The UserAgent header passed in the HTTP request。 +- `quota_key` ([String](../../sql-reference/data-types/string.md)) — 在[quotas](../../operations/quotas.md) 配置里设置的“quota key” (见 `keyed`). - `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse revision. -- `thread_numbers` ([数组(UInt32)](../../sql-reference/data-types/array.md)) — Number of threads that are participating in query execution. -- `ProfileEvents.Names` ([数组(字符串)](../../sql-reference/data-types/array.md)) — Counters that measure different metrics. The description of them could be found in the table [系统。活动](../../operations/system-tables/events.md#system_tables-events) -- `ProfileEvents.Values` ([数组(UInt64)](../../sql-reference/data-types/array.md)) — Values of metrics that are listed in the `ProfileEvents.Names` 列。 -- `Settings.Names` ([数组(字符串)](../../sql-reference/data-types/array.md)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` 参数为1。 -- `Settings.Values` ([数组(字符串)](../../sql-reference/data-types/array.md)) — Values of settings that are listed in the `Settings.Names` 列。 - +- `thread_numbers` ([Array(UInt32)](../../sql-reference/data-types/array.md)) — 参与查询的线程数. +- `ProfileEvents.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — 衡量不同指标的计数器。 可以在[system.events](../../operations/system-tables/events.md#system_tables-events)中找到它们的描述。 +- `ProfileEvents.Values` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — `ProfileEvents.Names` 列中列出的指标的值。 +- `Settings.Names` ([Array(String)](../../sql-reference/data-types/array.md)) — 客户端运行查询时更改的设置的名称。 要启用对设置的日志记录更改,请将log_query_settings参数设置为1。 +- `Settings.Values` ([Array(String)](../../sql-reference/data-types/array.md)) — `Settings.Names` 列中列出的设置的值。 **示例** ``` sql @@ -140,4 +141,4 @@ Settings.Values: ['0','random','1','10000000000'] **另请参阅** -- [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — This table contains information about each query execution thread. 
+- [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — 这个表包含了每个查询执行线程的信息 diff --git a/docs/zh/operations/tips.md b/docs/zh/operations/tips.md index 511e8a22644..6b46dbb5285 100644 --- a/docs/zh/operations/tips.md +++ b/docs/zh/operations/tips.md @@ -1,24 +1,8 @@ # 使用建议 {#usage-recommendations} -## CPU {#cpu} +## CPU频率调节器 {#cpu-scaling-governor} -必须支持SSE4.2指令集。 现代处理器(自2008年以来)支持它。 - -选择处理器时,与较少的内核和较高的时钟速率相比,更喜欢大量内核和稍慢的时钟速率。 -例如,具有2600MHz的16核心比具有3600MHz的8核心更好。 - -## 超线程 {#hyper-threading} - -不要禁用超线程。 它有助于某些查询,但不适用于其他查询。 - -## 超频 {#turbo-boost} - -强烈推荐超频(turbo-boost)。 它显着提高了典型负载的性能。 -您可以使用 `turbostat` 要查看负载下的CPU的实际时钟速率。 - -## CPU缩放调控器 {#cpu-scaling-governor} - -始终使用 `performance` 缩放调控器。 该 `on-demand` 随着需求的不断增加,缩放调节器的工作要糟糕得多。 +始终使用 `performance` 频率调节器。 `on-demand` 频率调节器在持续高需求的情况下,效果更差。 ``` bash echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor @@ -26,68 +10,70 @@ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_gover ## CPU限制 {#cpu-limitations} -处理器可能会过热。 使用 `dmesg` 看看CPU的时钟速率是否由于过热而受到限制。 -此限制也可以在数据中心级别的外部设置。 您可以使用 `turbostat` 在负载下监视它。 +处理器可能会过热。 使用 `dmesg` 查看CPU的时钟速率是否由于过热而受到限制。 +该限制也可以在数据中心级别外部设置。 您可以使用 `turbostat` 在负载下对其进行监控。 ## RAM {#ram} -对于少量数据(高达-200GB压缩),最好使用与数据量一样多的内存。 -对于大量数据和处理交互式(在线)查询时,应使用合理数量的RAM(128GB或更多),以便热数据子集适合页面缓存。 -即使对于每台服务器约50TB的数据量,使用128GB的RAM与64GB相比显着提高了查询性能。 +对于少量数据(压缩后约200GB),最好使用与数据量一样多的内存。 +对于大量数据,以及在处理交互式(在线)查询时,应使用合理数量的RAM(128GB或更多),以便热数据子集适合页面缓存。 +即使对于每台服务器约50TB的数据量,与64GB相比,使用128GB的RAM也可以显着提高查询性能。 -## 交换文件 {#swap-file} +不要禁用 overcommit。`cat /proc/sys/vm/overcommit_memory` 的值应该为0或1。运行 -始终禁用交换文件。 不这样做的唯一原因是,如果您使用的ClickHouse在您的个人笔记本电脑。 +``` bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory +``` ## 大页(Huge Pages) {#huge-pages} -始终禁用透明大页(transparent huge pages)。 它会干扰内存分alloc,从而导致显着的性能下降。 +始终禁用透明大页(transparent huge pages)。 它会干扰内存分配器,从而导致显着的性能下降。 ``` bash echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` -使用 `perf top` 观察内核中用于内存管理的时间。 +使用 `perf top` 来查看内核在内存管理上花费的时间。 永久大页(permanent huge pages)也不需要被分配。 -## 存储系统 {#storage-subsystem} +## 存储子系统 {#storage-subsystem} 如果您的预算允许您使用SSD,请使用SSD。 如果没有,请使用硬盘。 SATA硬盘7200转就行了。 -优先选择带有本地硬盘驱动器的大量服务器,而不是带有附加磁盘架的小量服务器。 -但是对于存储具有罕见查询的档案,货架将起作用。 +优先选择许多带有本地硬盘驱动器的服务器,而不是少量带有附加磁盘架的服务器。 +但是对于存储极少查询的档案,架子可以使用。 ## RAID {#raid} 当使用硬盘,你可以结合他们的RAID-10,RAID-5,RAID-6或RAID-50。 -对于Linux,软件RAID更好(与 `mdadm`). 我们不建议使用LVM。 +对于Linux,软件RAID更好(使用 `mdadm`). 我们不建议使用LVM。 当创建RAID-10,选择 `far` 布局。 如果您的预算允许,请选择RAID-10。 -如果您有超过4个磁盘,请使用RAID-6(首选)或RAID-50,而不是RAID-5。 +如果您有4个以上的磁盘,请使用RAID-6(首选)或RAID-50,而不是RAID-5。 当使用RAID-5、RAID-6或RAID-50时,始终增加stripe_cache_size,因为默认值通常不是最佳选择。 ``` bash echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` -使用以下公式,从设备数量和块大小计算确切数量: `2 * num_devices * chunk_size_in_bytes / 4096`. +使用以下公式从设备数量和块大小中计算出确切的数量: `2 * num_devices * chunk_size_in_bytes / 4096`。 -1025KB的块大小足以满足所有RAID配置。 +1024KB的块大小足以满足所有RAID配置。 切勿将块大小设置得太小或太大。 您可以在SSD上使用RAID-0。 -无论使用何种RAID,始终使用复制来保证数据安全。 +无论使用哪种RAID,始终使用复制来保证数据安全。 -使用长队列启用NCQ。 对于HDD,选择CFQ调度程序,对于SSD,选择noop。 不要减少 ‘readahead’ 设置。 +启用有长队列的NCQ。 对于HDD,选择CFQ调度程序,对于SSD,选择noop。 不要减少 ‘readahead’ 设置。 对于HDD,启用写入缓存。 ## 文件系统 {#file-system} Ext4是最可靠的选择。 设置挂载选项 `noatime, nobarrier`. 
-XFS也是合适的,但它还没有经过ClickHouse的彻底测试。 -大多数其他文件系统也应该正常工作。 具有延迟分配的文件系统工作得更好。 +XFS也是合适的,但它还没有经过ClickHouse的全面测试。 +大多数其他文件系统也应该可以正常工作。 具有延迟分配的文件系统工作得更好。 ## Linux内核 {#linux-kernel} @@ -95,26 +81,43 @@ XFS也是合适的,但它还没有经过ClickHouse的彻底测试。 ## 网络 {#network} -如果您使用的是IPv6,请增加路由缓存的大小。 -3.2之前的Linux内核在IPv6实现方面遇到了许多问题。 +如果使用的是IPv6,请增加路由缓存的大小。 +3.2之前的Linux内核在IPv6实现方面存在许多问题。 -如果可能的话,至少使用一个10GB的网络。 1Gb也可以工作,但对于使用数十tb的数据修补副本或处理具有大量中间数据的分布式查询,情况会更糟。 +如果可能的话,至少使用10GB的网络。1GB也可以工作,但对于使用数十TB的数据修补副本或处理具有大量中间数据的分布式查询,情况会更糟。 + +## 虚拟机监视器(Hypervisor)配置 + +如果您使用的是OpenStack,请在nova.conf中设置 +``` +cpu_mode=host-passthrough +``` +。 + +如果您使用的是libvirt,请在XML配置中设置 +``` + +``` +。 + +这对于ClickHouse能够通过 `cpuid` 指令获取正确的信息非常重要。 +否则,当在旧的CPU型号上运行虚拟机监视器时,可能会导致 `Illegal instruction` 崩溃。 ## Zookeeper {#zookeeper} -您可能已经将ZooKeeper用于其他目的。 您可以使用相同的zookeeper安装,如果它还没有超载。 +您可能已经将ZooKeeper用于其他目的。 如果它还没有超载,您可以使用相同的zookeeper。 -最好使用新版本的 Zookeeper – 3.4.9 或之后的版本. 稳定 Liunx 发行版中的 Zookeeper 版本可能是落后的。 +最好使用新版本的Zookeeper – 3.4.9 或更高的版本. 稳定的Liunx发行版中的Zookeeper版本可能已过时。 -你永远不该使用自己手写的脚本在不同的 Zookeeper 集群之间转移数据, 这可能会导致序列节点的数据不正确。出于同样的原因,永远不要使用 zkcopy 工具: https://github.com/ksprojects/zkcopy/issues/15 +你永远不要使用手动编写的脚本在不同的Zookeeper集群之间传输数据, 这可能会导致序列节点的数据不正确。出于相同的原因,永远不要使用 zkcopy 工具: https://github.com/ksprojects/zkcopy/issues/15 -如果要将现有ZooKeeper集群分为两个,正确的方法是增加其副本的数量,然后将其重新配置为两个独立的集群。 +如果要将现有的ZooKeeper集群分为两个,正确的方法是增加其副本的数量,然后将其重新配置为两个独立的集群。 -不要在与ClickHouse相同的服务器上运行ZooKeeper。 因为ZooKeeper对延迟非常敏感,而ClickHouse可能会占用所有可用的系统资源。 +不要在ClickHouse所在的服务器上运行ZooKeeper。 因为ZooKeeper对延迟非常敏感,而ClickHouse可能会占用所有可用的系统资源。 默认设置下,ZooKeeper 就像是一个定时炸弹: -当使用默认配置时,ZooKeeper服务不会从旧快照和日志中删除文件(请参阅autopurge),这是操作员的责任。 +当使用默认配置时,ZooKeeper服务器不会从旧的快照和日志中删除文件(请参阅autopurge),这是操作员的责任。 必须拆除炸弹。 @@ -222,7 +225,7 @@ JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ -XX:+CMSParallelRemarkEnabled" ``` -Salt init: +初始化: description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" diff --git a/docs/zh/sql-reference/aggregate-functions/combinators.md b/docs/zh/sql-reference/aggregate-functions/combinators.md index c458097a5fb..6d1cd9c775c 100644 --- a/docs/zh/sql-reference/aggregate-functions/combinators.md +++ b/docs/zh/sql-reference/aggregate-functions/combinators.md @@ -27,7 +27,7 @@ toc_title: 聚合函数组合器 ## -State {#agg-functions-combinator-state} -如果应用此combinator,则聚合函数不会返回结果值(例如唯一值的数量 [uniq](reference.md#agg_function-uniq) 函数),但是返回聚合的中间状态(对于 `uniq`,返回的是计算唯一值的数量的哈希表)。 这是一个 `AggregateFunction(...)` 可用于进一步处理或存储在表中以完成稍后的聚合。 +如果应用此combinator,则聚合函数不会返回结果值(例如唯一值的数量 [uniq](./reference/uniq.md#agg_function-uniq) 函数),但是返回聚合的中间状态(对于 `uniq`,返回的是计算唯一值的数量的哈希表)。 这是一个 `AggregateFunction(...)` 可用于进一步处理或存储在表中以完成稍后的聚合。 要使用这些状态,请使用: @@ -209,7 +209,7 @@ FROM 让我们得到的人的名字,他们的年龄在于的时间间隔 `[30,60)` 和 `[60,75)`。 由于我们使用整数表示的年龄,我们得到的年龄 `[30, 59]` 和 `[60,74]` 间隔。 -要在数组中聚合名称,我们使用 [groupArray](reference.md#agg_function-grouparray) 聚合函数。 这需要一个参数。 在我们的例子中,它是 `name` 列。 `groupArrayResample` 函数应该使用 `age` 按年龄聚合名称, 要定义所需的时间间隔,我们传入 `30, 75, 30` 参数给 `groupArrayResample` 函数。 +要在数组中聚合名称,我们使用 [groupArray](./reference/grouparray.md#agg_function-grouparray) 聚合函数。 这需要一个参数。 在我们的例子中,它是 `name` 列。 `groupArrayResample` 函数应该使用 `age` 按年龄聚合名称, 要定义所需的时间间隔,我们传入 `30, 75, 30` 参数给 `groupArrayResample` 函数。 ``` sql SELECT groupArrayResample(30, 75, 30)(name, age) FROM people diff --git a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md index d151bbc3957..be9166e5737 100644 --- a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md +++ 
b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md @@ -493,6 +493,6 @@ FROM ## sumMapFiltered(keys_to_keep)(keys, values) {#summapfilteredkeys-to-keepkeys-values} -和 [sumMap](reference.md#agg_functions-summap) 基本一致, 除了一个键数组作为参数传递。这在使用高基数key时尤其有用。 +和 [sumMap](./reference/summap.md#agg_functions-summap) 基本一致, 除了一个键数组作为参数传递。这在使用高基数key时尤其有用。 [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/parametric_functions/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md deleted file mode 100644 index 3a224886a00..00000000000 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ /dev/null @@ -1,1912 +0,0 @@ ---- -toc_priority: 36 -toc_title: 参考手册 ---- - -# 参考手册 {#aggregate-functions-reference} - -## count {#agg_function-count} - -计数行数或非空值。 - -ClickHouse支持以下语法 `count`: -- `count(expr)` 或 `COUNT(DISTINCT expr)`. -- `count()` 或 `COUNT(*)`. 该 `count()` 语法是ClickHouse特定的。 - -**参数** - -该功能可以采取: - -- 零参数。 -- 一 [表达式](../syntax.md#syntax-expressions). - -**返回值** - -- 如果没有参数调用函数,它会计算行数。 -- 如果 [表达式](../syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回的次数非null。 如果表达式返回 [可为空](../../sql-reference/data-types/nullable.md)-键入值,然后结果 `count` 保持不 `Nullable`. 如果返回表达式,则该函数返回0 `NULL` 对于所有的行。 - -在这两种情况下,返回值的类型为 [UInt64](../../sql-reference/data-types/int-uint.md). - -**详细信息** - -ClickHouse支持 `COUNT(DISTINCT ...)` 语法 这种结构的行为取决于 [count_distinct_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation) 设置。 它定义了其中的 [uniq\*](#agg_function-uniq) 函数用于执行操作。 默认值为 [uniqExact](#agg_function-uniqexact) 功能。 - -该 `SELECT count() FROM table` 查询未被优化,因为表中的条目数没有单独存储。 它从表中选择一个小列并计算其中的值数。 - -**例** - -示例1: - -``` sql -SELECT count() FROM t -``` - -``` text -┌─count()─┐ -│ 5 │ -└─────────┘ -``` - -示例2: - -``` sql -SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' -``` - -``` text -┌─name──────────────────────────┬─value─────┐ -│ count_distinct_implementation │ uniqExact │ -└───────────────────────────────┴───────────┘ -``` - -``` sql -SELECT count(DISTINCT num) FROM t -``` - -``` text -┌─uniqExact(num)─┐ -│ 3 │ -└────────────────┘ -``` - -这个例子表明 `count(DISTINCT num)` 由执行 `uniqExact` 根据功能 `count_distinct_implementation` 设定值。 - -## any(x) {#agg_function-any} - -选择第一个遇到的值。 -查询可以以任何顺序执行,甚至每次都以不同的顺序执行,因此此函数的结果是不确定的。 -要获得确定的结果,您可以使用 ‘min’ 或 ‘max’ 功能,而不是 ‘any’. - -在某些情况下,可以依靠执行的顺序。 这适用于SELECT来自使用ORDER BY的子查询的情况。 - -当一个 `SELECT` 查询具有 `GROUP BY` 子句或至少一个聚合函数,ClickHouse(相对于MySQL)要求在所有表达式 `SELECT`, `HAVING`,和 `ORDER BY` 子句可以从键或聚合函数计算。 换句话说,从表中选择的每个列必须在键或聚合函数内使用。 要获得像MySQL这样的行为,您可以将其他列放在 `any` 聚合函数。 - -## anyHeavy(x) {#anyheavyx} - -使用选择一个频繁出现的值 [重打者](http://www.cs.umd.edu/~samir/498/karp.pdf) 算法。 如果某个值在查询的每个执行线程中出现的情况超过一半,则返回此值。 通常情况下,结果是不确定的。 - -``` sql -anyHeavy(column) -``` - -**参数** - -- `column` – The column name. 
- -**示例** - -就拿 [时间](../../getting-started/example-datasets/ontime.md) 数据集,并选择在任何频繁出现的值 `AirlineID` 列。 - -``` sql -SELECT anyHeavy(AirlineID) AS res -FROM ontime -``` - -``` text -┌───res─┐ -│ 19690 │ -└───────┘ -``` - -## anyLast(x) {#anylastx} - -选择遇到的最后一个值。 -其结果是一样不确定的 `any` 功能。 - -## groupBitAnd {#groupbitand} - -按位应用 `AND` 对于一系列的数字。 - -``` sql -groupBitAnd(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitAnd(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -00000100 = 4 -``` - -## groupBitOr {#groupbitor} - -按位应用 `OR` 对于一系列的数字。 - -``` sql -groupBitOr(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitOr(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -01111101 = 125 -``` - -## groupBitXor {#groupbitxor} - -按位应用 `XOR` 对于一系列的数字。 - -``` sql -groupBitXor(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitXor(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -01101000 = 104 -``` - -## groupBitmap {#groupbitmap} - -从无符号整数列的位图或聚合计算,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). - -``` sql -groupBitmap(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -测试数据: - -``` text -UserID -1 -1 -2 -3 -``` - -查询: - -``` sql -SELECT groupBitmap(UserID) as num FROM t -``` - -结果: - -``` text -num -3 -``` - -## min(x) {#agg_function-min} - -计算最小值。 - -## max(x) {#agg_function-max} - -计算最大值。 - -## argMin(arg,val) {#agg-function-argmin} - -计算 ‘arg’ 最小值的值 ‘val’ 价值。 如果有几个不同的值 ‘arg’ 对于最小值 ‘val’,遇到的第一个值是输出。 - -**示例:** - -``` text -┌─user─────┬─salary─┐ -│ director │ 5000 │ -│ manager │ 3000 │ -│ worker │ 1000 │ -└──────────┴────────┘ -``` - -``` sql -SELECT argMin(user, salary) FROM salary -``` - -``` text -┌─argMin(user, salary)─┐ -│ worker │ -└──────────────────────┘ -``` - -## argMax(arg,val) {#agg-function-argmax} - -计算 ‘arg’ 最大值 ‘val’ 价值。 如果有几个不同的值 ‘arg’ 对于最大值 ‘val’,遇到的第一个值是输出。 - -## sum(x) {#agg_function-sum} - -计算总和。 -只适用于数字。 - -## sumWithOverflow(x) {#sumwithoverflowx} - -使用与输入参数相同的数据类型计算数字的总和。 如果总和超过此数据类型的最大值,则函数返回错误。 - -只适用于数字。 - -## sumMap(key,value),sumMap(Tuple(key,value)) {#agg_functions-summap} - -总计 ‘value’ 数组根据在指定的键 ‘key’ 阵列。 -传递键和值数组的元组与传递两个键和值数组是同义的。 -元素的数量 ‘key’ 和 ‘value’ 总计的每一行必须相同。 -返回两个数组的一个二元组: key是排好序的,value是对应key的求和。 - -示例: - -``` sql -CREATE TABLE sum_map( - date Date, - timeslot DateTime, - statusMap Nested( - status UInt16, - requests UInt64 - ), - statusMapTuple Tuple(Array(Int32), Array(Int32)) -) ENGINE = Log; -INSERT INTO sum_map VALUES - ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); - -SELECT 
- timeslot, - sumMap(statusMap.status, statusMap.requests), - sumMap(statusMapTuple) -FROM sum_map -GROUP BY timeslot -``` - -``` text -┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ -│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ -│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ -└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ -``` - -## skewPop {#skewpop} - -计算的序列[偏度](https://en.wikipedia.org/wiki/Skewness)。 - -``` sql -skewPop(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的偏度。类型 — [Float64](../../sql-reference/data-types/float.md) - -**示例** - -``` sql -SELECT skewPop(value) FROM series_with_value_column -``` - -## skewSamp {#skewsamp} - -计算 [样品偏度](https://en.wikipedia.org/wiki/Skewness) 的序列。 - -它表示随机变量的偏度的无偏估计,如果传递的值形成其样本。 - -``` sql -skewSamp(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的偏度。 类型 — [Float64](../../sql-reference/data-types/float.md). 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`. - -**示例** - -``` sql -SELECT skewSamp(value) FROM series_with_value_column -``` - -## kurtPop {#kurtpop} - -计算 [峰度](https://en.wikipedia.org/wiki/Kurtosis) 的序列。 - -``` sql -kurtPop(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的峰度。 类型 — [Float64](../../sql-reference/data-types/float.md) - -**示例** - -``` sql -SELECT kurtPop(value) FROM series_with_value_column -``` - -## kurtSamp {#kurtsamp} - -计算 [峰度样本](https://en.wikipedia.org/wiki/Kurtosis) 的序列。 - -它表示随机变量峰度的无偏估计,如果传递的值形成其样本。 - -``` sql -kurtSamp(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的峰度。类型 — [Float64](../../sql-reference/data-types/float.md). 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`. - -**示例** - -``` sql -SELECT kurtSamp(value) FROM series_with_value_column -``` - -## avg(x) {#agg_function-avg} - -计算平均值。 -只适用于数字。 -结果总是Float64。 - -## avgWeighted {#avgweighted} - -计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). - -**语法** - -``` sql -avgWeighted(x, weight) -``` - -**参数** - -- `x` — 值。 [整数](../data-types/int-uint.md) 或 [浮点](../data-types/float.md). -- `weight` — 值的加权。 [整数](../data-types/int-uint.md) 或 [浮点](../data-types/float.md). - -`x` 和 `weight` 的类型一定是一样的 - -**返回值** - -- 加权平均值。 -- `NaN`. 如果所有的权重都等于0。 - -类型: [Float64](../data-types/float.md). 
- -**示例** - -查询: - -``` sql -SELECT avgWeighted(x, w) -FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) -``` - -结果: - -``` text -┌─avgWeighted(x, weight)─┐ -│ 8 │ -└────────────────────────┘ -``` - -## uniq {#agg_function-uniq} - -计算参数的不同值的近似数量。 - -``` sql -uniq(x[, ...]) -``` - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**返回值** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-键入号码。 - -**实现细节** - -功能: - -- 计算聚合中所有参数的哈希值,然后在计算中使用它。 - -- 使用自适应采样算法。 对于计算状态,该函数使用最多65536个元素哈希值的样本。 - - 这个算法是非常精确的,并且对于CPU来说非常高效。如果查询包含一些这样的函数,那和其他聚合函数相比 `uniq` 将是几乎一样快。 - -- 确定性地提供结果(它不依赖于查询处理顺序)。 - -我们建议在几乎所有情况下使用此功能。 - -**另请参阅** - -- [uniqCombined](#agg_function-uniqcombined) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - -## uniqCombined {#agg_function-uniqcombined} - -计算不同参数值的近似数量。 - -``` sql -uniqCombined(HLL_precision)(x[, ...]) -``` - -该 `uniqCombined` 函数是计算不同数值数量的不错选择。 - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -`HLL_precision` 是以2为底的单元格数的对数 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). 可选,您可以将该函数用作 `uniqCombined(x[, ...])`. 默认值 `HLL_precision` 是17,这是有效的96KiB的空间(2^17个单元,每个6比特)。 - -**返回值** - -- 一个[UInt64](../../sql-reference/data-types/int-uint.md)类型的数字。 - -**实现细节** - -功能: - -- 计算散列(64位散列 `String` 否则32位)对于聚合中的所有参数,然后在计算中使用它。 - -- 使用三种算法的组合:数组、哈希表和包含错误修正表的HyperLogLog。 - - 少量的不同的值,使用数组。 值再多一些,使用哈希表。对于大量的数据来说,使用HyperLogLog,HyperLogLog占用一个固定的内存空间。 - -- 确定性地提供结果(它不依赖于查询处理顺序)。 - -!!! note "注" - 因为它使用32位散列非-`String` 类型,结果将有非常高的误差基数显着大于 `UINT_MAX` (错误将在几百亿不同值之后迅速提高),因此在这种情况下,您应该使用 [uniqCombined64](#agg_function-uniqcombined64) - -相比于 [uniq](#agg_function-uniq) 功能,该 `uniqCombined`: - -- 消耗少几倍的内存。 -- 计算精度高出几倍。 -- 通常具有略低的性能。 在某些情况下, `uniqCombined` 可以表现得比 `uniq` 好,例如,使用通过网络传输大量聚合状态的分布式查询。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - -## uniqCombined64 {#agg_function-uniqcombined64} - -和 [uniqCombined](#agg_function-uniqcombined),但对所有数据类型使用64位哈希。 - -## uniqHLL12 {#agg_function-uniqhll12} - -计算不同参数值的近似数量,使用 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) 算法。 - -``` sql -uniqHLL12(x[, ...]) -``` - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**返回值** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-键入号码。 - -**实现细节** - -功能: - -- 计算聚合中所有参数的哈希值,然后在计算中使用它。 - -- 使用HyperLogLog算法来近似不同参数值的数量。 - - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). 
- -- 提供确定结果(它不依赖于查询处理顺序)。 - -我们不建议使用此功能。 在大多数情况下,使用 [uniq](#agg_function-uniq) 或 [uniqCombined](#agg_function-uniqcombined) 功能。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqExact](#agg_function-uniqexact) - -## uniqExact {#agg_function-uniqexact} - -计算不同参数值的准确数目。 - -``` sql -uniqExact(x[, ...]) -``` - -如果你绝对需要一个确切的结果,使用 `uniqExact` 功能。 否则使用 [uniq](#agg_function-uniq) 功能。 - -`uniqExact` 比 `uniq` 使用更多的内存,因为状态的大小随着不同值的数量的增加而无界增长。 - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqHLL12](#agg_function-uniqhll12) - -## groupArray(x), groupArray(max_size)(x) {#agg_function-grouparray} - -创建参数值的数组。 -值可以按任何(不确定)顺序添加到数组中。 - -第二个版本(与 `max_size` 参数)将结果数组的大小限制为 `max_size` 元素。 -例如, `groupArray (1) (x)` 相当于 `[any (x)]`. - -在某些情况下,您仍然可以依靠执行的顺序。 这适用于以下情况 `SELECT` 来自使用 `ORDER BY`. - -## groupArrayInsertAt {#grouparrayinsertat} - -在指定位置向数组中插入一个值。 - -**语法** - -``` sql -groupArrayInsertAt(default_x, size)(x, pos); -``` - -如果在一个查询中将多个值插入到同一位置,则该函数的行为方式如下: - -- 如果在单个线程中执行查询,则使用第一个插入的值。 -- 如果在多个线程中执行查询,则结果值是未确定的插入值之一。 - -**参数** - -- `x` — 被插入的值。[表达式](../syntax.md#syntax-expressions) 导致的一个 [支持的数据类型](../../sql-reference/data-types/index.md). -- `pos` — `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). -- `default_x`— 如果代入值为空,则使用默认值。可选参数。[表达式](../syntax.md#syntax-expressions) 为 `x` 数据类型的数据。 如果 `default_x` 未定义,则 [默认值](../../sql-reference/statements/create.md#create-default-values) 被使用。 -- `size`— 结果数组的长度。可选参数。如果使用该参数,`default_x` 必须指定。 [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). - -**返回值** - -- 具有插入值的数组。 - -类型: [阵列](../../sql-reference/data-types/array.md#data-type-array). 
- -**示例** - -查询: - -``` sql -SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ -│ ['0','','1','','2','','3','','4'] │ -└───────────────────────────────────────────────────────────┘ -``` - -查询: - -``` sql -SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2','-','3','-','4'] │ -└────────────────────────────────────────────────────────────────┘ -``` - -查询: - -``` sql -SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2'] │ -└───────────────────────────────────────────────────────────────────┘ -``` - -在一个位置多线程插入数据。 - -查询: - -``` sql -SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; -``` - -作为这个查询的结果,你会得到随机整数 `[0,9]` 范围。 例如: - -``` text -┌─groupArrayInsertAt(number, 0)─┐ -│ [7] │ -└───────────────────────────────┘ -``` - -## groupArrayMovingSum {#agg_function-grouparraymovingsum} - -计算输入值的移动和。 - -``` sql -groupArrayMovingSum(numbers_for_summing) -groupArrayMovingSum(window_size)(numbers_for_summing) -``` - -该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 - -**参数** - -- `numbers_for_summing` — [表达式](../syntax.md#syntax-expressions) 为数值数据类型值。 -- `window_size` — 窗口大小。 - -**返回值** - -- 与输入数据大小和类型相同的数组。 - -**示例** - -样品表: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -查询: - -``` sql -SELECT - groupArrayMovingSum(int) AS I, - groupArrayMovingSum(float) AS F, - groupArrayMovingSum(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingSum(2)(int) AS I, - groupArrayMovingSum(2)(float) AS F, - groupArrayMovingSum(2)(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - -## groupArrayMovingAvg {#agg_function-grouparraymovingavg} - -计算输入值的移动平均值。 - -``` sql -groupArrayMovingAvg(numbers_for_summing) -groupArrayMovingAvg(window_size)(numbers_for_summing) -``` - -该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 - -**参数** - -- `numbers_for_summing` — [表达式](../syntax.md#syntax-expressions) 生成数值数据类型值。 -- `window_size` — 窗口大小。 - -**返回值** - -- 与输入数据大小和类型相同的数组。 - -该函数使用 [四舍五入到零](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). 
它截断无意义的小数位来保证结果的数据类型。 - -**示例** - -样品表 `b`: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -查询: - -``` sql -SELECT - groupArrayMovingAvg(int) AS I, - groupArrayMovingAvg(float) AS F, - groupArrayMovingAvg(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ -│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ -└───────────┴─────────────────────────────────────┴───────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingAvg(2)(int) AS I, - groupArrayMovingAvg(2)(float) AS F, - groupArrayMovingAvg(2)(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ -│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ -└───────────┴──────────────────────────────────┴───────────────────────┘ -``` - -## groupUniqArray(x), groupUniqArray(max_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} - -从不同的参数值创建一个数组。 内存消耗是一样的 `uniqExact` 功能。 - -第二个版本(`max_size` 参数)将结果数组的大小限制为 `max_size` 元素。 -例如, `groupUniqArray(1)(x)` 相当于 `[any(x)]`. - -## quantile {#quantile} - -计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -此功能适用 [水塘抽样(](https://en.wikipedia.org/wiki/Reservoir_sampling),使用储存器最大到8192和随机数发生器进行采样。 结果是非确定性的。 要获得精确的分位数,请使用 [quantileExact](#quantileexact) 功能。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantile(level)(expr) -``` - -别名: `median`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). -- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**返回值** - -- 指定层次的近似分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT quantile(val) FROM t -``` - -结果: - -``` text -┌─quantile(val)─┐ -│ 1.5 │ -└───────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileDeterministic {#quantiledeterministic} - -计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -此功能适用 [水塘抽样(](https://en.wikipedia.org/wiki/Reservoir_sampling),使用储存器最大到8192和随机数发生器进行采样。 结果是非确定性的。 要获得精确的分位数,请使用 [quantileExact](#quantileexact) 功能。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileDeterministic(level)(expr, determinator) -``` - -别名: `medianDeterministic`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `determinator` — 一个数字,其hash被用来代替在水塘抽样中随机生成的数字,这样可以保证取样的确定性。你可以使用用户ID或者事件ID等任何正数,但是如果相同的 `determinator` 出现多次,那结果很可能不正确。 - -**返回值** - -- 指定层次的近似分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT quantileDeterministic(val, 1) FROM t -``` - -结果: - -``` text -┌─quantileDeterministic(val, 1)─┐ -│ 1.5 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileExact {#quantileexact} - -准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。因此该函数需要 `O(n)` 的内存,n为输入数据的个数。但是对于少量数据来说,该函数还是非常有效的。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileExact(level)(expr) -``` - -别名: `medianExact`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). -- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileExact(number) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileExact(number)─┐ -│ 5 │ -└───────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileExactWeighted {#quantileexactweighted} - -考虑到每个元素的权重,然后准确计算数值序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。每个输入值需要根据 `weight` 计算求和。该算法使用哈希表。正因为如此,在数据重复较多的时候使用的内存是少于[quantileExact](#quantileexact)的。 您可以使用此函数代替 `quantileExact` 并指定重量1。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileExactWeighted(level)(expr, weight) -``` - -别名: `medianExactWeighted`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─n─┬─val─┐ -│ 0 │ 3 │ -│ 1 │ 2 │ -│ 2 │ 1 │ -│ 5 │ 4 │ -└───┴─────┘ -``` - -查询: - -``` sql -SELECT quantileExactWeighted(n, val) FROM t -``` - -结果: - -``` text -┌─quantileExactWeighted(n, val)─┐ -│ 1 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileTiming {#quantiletiming} - -使用确定的精度计算数字数据序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -结果是确定性的(它不依赖于查询处理顺序)。 该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTiming(level)(expr) -``` - -别名: `medianTiming`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). - -- `expr` — [表达式](../syntax.md#syntax-expressions),返回 [浮动\*](../../sql-reference/data-types/float.md)类型数据。 - - - 如果输入负值,那结果是不可预期的。 - - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - -**精度** - -计算是准确的,如果: - -- 值的总数不超过5670。 -- 总数值超过5670,但页面加载时间小于1024ms。 - -否则,计算结果将四舍五入到16毫秒的最接近倍数。 - -!!! note "注" - 对于计算页面加载时间分位数,此函数比 [分位数](#quantile)更有效和准确。 - -**返回值** - -- 指定层次的分位数。 - -类型: `Float32`. - -!!! note "注" - 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) 被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 看 [ORDER BY clause](../statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 - -**示例** - -输入表: - -``` text -┌─response_time─┐ -│ 72 │ -│ 112 │ -│ 126 │ -│ 145 │ -│ 104 │ -│ 242 │ -│ 313 │ -│ 168 │ -│ 108 │ -└───────────────┘ -``` - -查询: - -``` sql -SELECT quantileTiming(response_time) FROM t -``` - -结果: - -``` text -┌─quantileTiming(response_time)─┐ -│ 126 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileTimingWeighted {#quantiletimingweighted} - -根据每个序列成员的权重,使用确定的精度计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -结果是确定性的(它不依赖于查询处理顺序)。 该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTimingWeighted(level)(expr, weight) -``` - -别名: `medianTimingWeighted`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). - -- `expr` — [表达式](../syntax.md#syntax-expressions),返回 [浮动\*](../../sql-reference/data-types/float.md)类型数据。 - - - 如果输入负值,那结果是不可预期的。 - - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**精度** - -计算是准确的,如果: - -- 值的总数不超过5670。 -- 总数值超过5670,但页面加载时间小于1024ms。 - -否则,计算结果将四舍五入到16毫秒的最接近倍数。 - -!!! note "注" - 对于计算页面加载时间分位数,此函数比 [分位数](#quantile)更高效和准确。 - -**返回值** - -- 指定层次的分位数。 - -类型: `Float32`. - -!!! 
note "注" - 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) 被返回。 这样做的目的是将这些案例与导致零的案例区分开来。看 [ORDER BY clause](../statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 - -**示例** - -输入表: - -``` text -┌─response_time─┬─weight─┐ -│ 68 │ 1 │ -│ 104 │ 2 │ -│ 112 │ 3 │ -│ 126 │ 2 │ -│ 138 │ 1 │ -│ 162 │ 1 │ -└───────────────┴────────┘ -``` - -查询: - -``` sql -SELECT quantileTimingWeighted(response_time, weight) FROM t -``` - -结果: - -``` text -┌─quantileTimingWeighted(response_time, weight)─┐ -│ 112 │ -└───────────────────────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileTDigest {#quantiletdigest} - -使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -最大误差为1%。 内存消耗 `log(n)`,这里 `n` 是值的个数。 结果取决于运行查询的顺序,并且是不确定的。 - -该功能的性能低于性能 [分位数](#quantile) 或 [时间分位](#quantiletiming). 在状态大小与精度的比率方面,这个函数比 `quantile`更优秀。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTDigest(level)(expr) -``` - -别名: `medianTDigest`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). -- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileTDigest(number) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileTDigest(number)─┐ -│ 4.5 │ -└─────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileTDigestWeighted {#quantiletdigestweighted} - -使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 该函数考虑了每个序列成员的权重。最大误差为1%。 内存消耗 `log(n)`,这里 `n` 是值的个数。 - -该功能的性能低于性能 [分位数](#quantile) 或 [时间分位](#quantiletiming). 在状态大小与精度的比率方面,这个函数比 `quantile`更优秀。 - -结果取决于运行查询的顺序,并且是不确定的。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能 - -**语法** - -``` sql -quantileTDigest(level)(expr) -``` - -别名: `medianTDigest`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileTDigestWeighted(number, 1)─┐ -│ 4.5 │ -└────────────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## median {#median} - -`median*` 函数是 `quantile*` 函数的别名。 它们计算数字数据样本的中位数。 - -函数: - -- `median` — [quantile](#quantile)别名。 -- `medianDeterministic` — [quantileDeterministic](#quantiledeterministic)别名。 -- `medianExact` — [quantileExact](#quantileexact)别名。 -- `medianExactWeighted` — [quantileExactWeighted](#quantileexactweighted)别名。 -- `medianTiming` — [quantileTiming](#quantiletiming)别名。 -- `medianTimingWeighted` — [quantileTimingWeighted](#quantiletimingweighted)别名。 -- `medianTDigest` — [quantileTDigest](#quantiletdigest)别名。 -- `medianTDigestWeighted` — [quantileTDigestWeighted](#quantiletdigestweighted)别名。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT medianDeterministic(val, 1) FROM t -``` - -结果: - -``` text -┌─medianDeterministic(val, 1)─┐ -│ 1.5 │ -└─────────────────────────────┘ -``` - -## quantiles(level1, level2, …)(x) {#quantiles} - -所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 - -## varSamp(x) {#varsampx} - -计算 `Σ((x - x̅)^2) / (n - 1)`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 - -它表示随机变量的方差的无偏估计,如果传递的值形成其样本。 - -返回 `Float64`. 当 `n <= 1`,返回 `+∞`. - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## varPop(x) {#varpopx} - -计算 `Σ((x - x̅)^2) / n`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 - -换句话说,计算一组数据的离差。 返回 `Float64`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## stddevSamp(x) {#stddevsampx} - -结果等于平方根 `varSamp(x)`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## stddevPop(x) {#stddevpopx} - -结果等于平方根 `varPop(x)`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## topK(N)(x) {#topknx} - -返回指定列中近似最常见值的数组。 生成的数组按值的近似频率降序排序(而不是值本身)。 - -实现了[过滤节省空间](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf)算法, 使用基于reduce-and-combine的算法,借鉴[并行节省空间](https://arxiv.org/pdf/1401.0702.pdf). 
- -``` sql -topK(N)(column) -``` - -此函数不提供保证的结果。 在某些情况下,可能会发生错误,并且可能会返回不是最高频的值。 - -我们建议使用 `N < 10` 值,`N` 值越大,性能越低。最大值 `N = 65536`。 - -**参数** - -- ‘N’ 是要返回的元素数。 - -如果省略该参数,则使用默认值10。 - -**参数** - -- ' x ' – 计算的频率值。 - -**示例** - -就拿 [OnTime](../../getting-started/example-datasets/ontime.md) 数据集来说,选择`AirlineID` 列中出现最频繁的三个。 - -``` sql -SELECT topK(3)(AirlineID) AS res -FROM ontime -``` - -``` text -┌─res─────────────────┐ -│ [19393,19790,19805] │ -└─────────────────────┘ -``` - -## topKWeighted {#topkweighted} - -类似于 `topK` 但需要一个整数类型的附加参数 - `weight`. 每个输入都被记入 `weight` 次频率计算。 - -**语法** - -``` sql -topKWeighted(N)(x, weight) -``` - -**参数** - -- `N` — 返回值个数。 - -**参数** - -- `x` – 输入值。 -- `weight` — 权重。 [UInt8](../../sql-reference/data-types/int-uint.md)类型。 - -**返回值** - -返回具有最大近似权重总和的值数组。 - -**示例** - -查询: - -``` sql -SELECT topKWeighted(10)(number, number) FROM numbers(1000) -``` - -结果: - -``` text -┌─topKWeighted(10)(number, number)──────────┐ -│ [999,998,997,996,995,994,993,992,991,990] │ -└───────────────────────────────────────────┘ -``` - -## covarSamp(x,y) {#covarsampx-y} - -计算 `Σ((x - x̅)(y - y̅)) / (n - 1)`。 - -返回Float64。 当 `n <= 1`, returns +∞。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## covarPop(x,y) {#covarpopx-y} - -计算 `Σ((x - x̅)(y - y̅)) / n`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 功能。 它的工作速度较慢,但提供了较低的计算错误。 - -## corr(x,y) {#corrx-y} - -计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## categoricalInformationValue {#categoricalinformationvalue} - -对于每个类别计算 `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` 。 - -``` sql -categoricalInformationValue(category1, category2, ..., tag) -``` - -结果指示离散(分类)要素如何使用 `[category1, category2, ...]` 有助于使用学习模型预测`tag`的值。 - -## simpleLinearRegression {#simplelinearregression} - -执行简单(一维)线性回归。 - -``` sql -simpleLinearRegression(x, y) -``` - -参数: - -- `x` — x轴。 -- `y` — y轴。 - -返回值: - -符合`y = a*x + b`的常量 `(a, b)` 。 - -**例** - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ -│ (1,0) │ -└───────────────────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ -│ (1,3) │ -└───────────────────────────────────────────────────────────────────┘ -``` - -## stochasticLinearRegression {#agg_functions-stochasticlinearregression} - -该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 - -### 参数 {#agg_functions-stochasticlinearregression-parameters} - -有4个可自定义的参数。 它们按顺序传递给函数,但是没有必要传递所有四个默认值将被使用,但是好的模型需要一些参数调整。 - -``` text -stochasticLinearRegression(1.0, 1.0, 10, 'SGD') -``` - -1. `learning rate` 当执行梯度下降步骤时,步长上的系数。 过大的学习率可能会导致模型的权重无限大。 默认值为 `0.00001`. -2. `l2 regularization coefficient` 这可能有助于防止过度拟合。 默认值为 `0.1`. -3. 
`mini-batch size` 设置元素的数量,这些元素将被计算和求和以执行梯度下降的一个步骤。 纯随机下降使用一个元素,但是具有小批量(约10个元素)使梯度步骤更稳定。 默认值为 `15`. -4. `method for updating weights` 他们是: `Adam` (默认情况下), `SGD`, `Momentum`, `Nesterov`. `Momentum` 和 `Nesterov` 需要更多的计算和内存,但是它们恰好在收敛速度和随机梯度方法的稳定性方面是有用的。 - -### 用法 {#agg_functions-stochasticlinearregression-usage} - -`stochasticLinearRegression` 用于两个步骤:拟合模型和预测新数据。 为了拟合模型并保存其状态以供以后使用,我们使用 `-State` combinator,它基本上保存了状态(模型权重等)。 -为了预测我们使用函数 [evalMLMethod](../functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod),这需要一个状态作为参数以及特征来预测。 - - - -**1.** 安装 - -可以使用这种查询。 - -``` sql -CREATE TABLE IF NOT EXISTS train_data -( - param1 Float64, - param2 Float64, - target Float64 -) ENGINE = Memory; - -CREATE TABLE your_model ENGINE = Memory AS SELECT -stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) -AS state FROM train_data; -``` - -在这里,我们还需要将数据插入到 `train_data` 桌子 参数的数量不是固定的,它只取决于参数的数量,传递到 `linearRegressionState`. 它们都必须是数值。 -请注意,带有目标值的列(我们想要学习预测)被插入作为第一个参数。 - -**2.** 预测 - -在将状态保存到表中之后,我们可以多次使用它进行预测,甚至与其他状态合并并创建新的更好的模型。 - -``` sql -WITH (SELECT state FROM your_model) AS model SELECT -evalMLMethod(model, param1, param2) FROM test_data -``` - -查询将返回一列预测值。 请注意,第一个参数 `evalMLMethod` 是 `AggregateFunctionState` 对象,接下来是要素列。 - -`test_data` 是一个像表 `train_data` 但可能不包含目标值。 - -### 注 {#agg_functions-stochasticlinearregression-notes} - -1. 要合并两个模型,用户可以创建这样的查询: - `sql SELECT state1 + state2 FROM your_models` - 哪里 `your_models` 表包含这两个模型。 此查询将返回new `AggregateFunctionState` 对象。 - -2. 如果没有,用户可以获取创建的模型的权重用于自己的目的,而不保存模型 `-State` 使用combinator。 - `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` - 这种查询将拟合模型并返回其权重-首先是权重,它对应于模型的参数,最后一个是偏差。 所以在上面的例子中,查询将返回一个具有3个值的列。 - -**另请参阅** - -- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) -- [线性回归和逻辑回归之间的区别](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} - -该函数实现随机逻辑回归。 它可以用于二进制分类问题,支持与stochasticLinearRegression相同的自定义参数,并以相同的方式工作。 - -### 参数 {#agg_functions-stochasticlogisticregression-parameters} - -参数与stochasticLinearRegression中的参数完全相同: -`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -欲了解更多信息,请参阅 [参数](#agg_functions-stochasticlinearregression-parameters). - -``` text -stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') -``` - -**1.** 安装 - - - - 参考stochasticLinearRegression相关文档 - - 预测标签的取值范围为[-1, 1] - -**2.** 预测 - - - - 使用已经保存的state我们可以预测标签为 `1` 的对象的概率。 - - ``` sql - WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) FROM test_data - ``` - - 查询结果返回一个列的概率。注意 `evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象,接下来的参数是列的特性。 - - 我们也可以设置概率的范围, 这样需要给元素指定不同的标签。 - - ``` sql - SELECT ans < 1.1 AND ans > 0.5 FROM - (WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) AS ans FROM test_data) - ``` - - 结果是标签。 - - `test_data` 是一个像 `train_data` 一样的表,但是不包含目标值。 - -**另请参阅** - -- [随机指标线上回归](#agg_functions-stochasticlinearregression) -- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## groupBitmapAnd {#groupbitmapand} - -计算位图列的AND,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). 
- -``` sql -groupBitmapAnd(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapAnd(z)─┐ -│ 3 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐ -│ [6,8,10] │ -└──────────────────────────────────────────────────┘ -``` - -## groupBitmapOr {#groupbitmapor} - -计算位图列的OR,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). 这相当于 `groupBitmapMerge`. - -``` sql -groupBitmapOr(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapOr(z)─┐ -│ 15 │ -└──────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐ -│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] │ -└─────────────────────────────────────────────────┘ -``` - -## groupBitmapXor {#groupbitmapxor} - -计算位图列的XOR,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). 
- -``` sql -groupBitmapOr(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapXor(z)─┐ -│ 10 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ -│ [1,3,5,6,8,10,11,13,14,15] │ -└──────────────────────────────────────────────────┘ -``` - -[原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/any.md b/docs/zh/sql-reference/aggregate-functions/reference/any.md new file mode 100644 index 00000000000..205ff1c1944 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/any.md @@ -0,0 +1,13 @@ +--- +toc_priority: 6 +--- + +# any {#agg_function-any} + +选择第一个遇到的值。 +查询可以以任何顺序执行,甚至每次都以不同的顺序执行,因此此函数的结果是不确定的。 +要获得确定的结果,您可以使用 ‘min’ 或 ‘max’ 功能,而不是 ‘any’. + +在某些情况下,可以依靠执行的顺序。 这适用于SELECT来自使用ORDER BY的子查询的情况。 + +当一个 `SELECT` 查询具有 `GROUP BY` 子句或至少一个聚合函数,ClickHouse(相对于MySQL)要求在所有表达式 `SELECT`, `HAVING`,和 `ORDER BY` 子句可以从键或聚合函数计算。 换句话说,从表中选择的每个列必须在键或聚合函数内使用。 要获得像MySQL这样的行为,您可以将其他列放在 `any` 聚合函数。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md new file mode 100644 index 00000000000..b67be9e48cf --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md @@ -0,0 +1,34 @@ +--- +toc_priority: 103 +--- + +# anyHeavy {#anyheavyx} + +选择一个频繁出现的值,使用[heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) 算法。 如果某个值在查询的每个执行线程中出现的情况超过一半,则返回此值。 通常情况下,结果是不确定的。 + +``` sql +anyHeavy(column) +``` + +**参数** + +- `column` – The column name。 + +**示例** + +使用 [OnTime](../../../getting-started/example-datasets/ontime.md) 数据集,并选择在 `AirlineID` 列任何频繁出现的值。 + +查询: + +``` sql +SELECT anyHeavy(AirlineID) AS res +FROM ontime; +``` + +结果: + +``` text +┌───res─┐ +│ 19690 │ +└───────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anylast.md b/docs/zh/sql-reference/aggregate-functions/reference/anylast.md new file mode 100644 index 00000000000..e6792e0e449 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/anylast.md @@ -0,0 +1,9 @@ +--- +toc_priority: 104 +--- + +## anyLast {#anylastx} + +选择遇到的最后一个值。 +其结果和[any](../../../sql-reference/aggregate-functions/reference/any.md) 函数一样是不确定的 。 + \ No newline at end of file diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmax.md b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md new file mode 100644 index 00000000000..9d90590b2f1 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md @@ -0,0 +1,64 @@ +--- +toc_priority: 106 +--- + +# argMax {#agg-function-argmax} + +计算 `val` 最大值对应的 `arg` 值。 如果 `val` 最大值存在几个不同的 
`arg` 值,输出遇到的第一个值。 + +这个函数的Tuple版本将返回 `val` 最大值对应的元组。本函数适合和 `SimpleAggregateFunction` 搭配使用。 + +**语法** + +``` sql +argMax(arg, val) +``` + +或 + +``` sql +argMax(tuple(arg, val)) +``` + +**参数** + +- `arg` — Argument. +- `val` — Value. + +**返回值** + +- `val` 最大值对应的 `arg` 值。 + +类型: 匹配 `arg` 类型。 + +对于输入中的元组: + +- 元组 `(arg, val)`, 其中 `val` 最大值,`arg` 是对应的值。 + +类型: [元组](../../../sql-reference/data-types/tuple.md)。 + +**示例** + +输入表: + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +查询: + +``` sql +SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary; +``` + +结果: + +``` text +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ ('director',5000) │ +└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmin.md b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md new file mode 100644 index 00000000000..0dd4625ac0d --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md @@ -0,0 +1,37 @@ +--- +toc_priority: 105 +--- + +# argMin {#agg-function-argmin} + +语法: `argMin(arg, val)` 或 `argMin(tuple(arg, val))` + +计算 `val` 最小值对应的 `arg` 值。 如果 `val` 最小值存在几个不同的 `arg` 值,输出遇到的第一个(`arg`)值。 + +这个函数的Tuple版本将返回 `val` 最小值对应的tuple。本函数适合和`SimpleAggregateFunction`搭配使用。 + +**示例:** + +输入表: + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +查询: + +``` sql +SELECT argMin(user, salary), argMin(tuple(user, salary)) FROM salary; +``` + +结果: + +``` text +┌─argMin(user, salary)─┬─argMin(tuple(user, salary))─┐ +│ worker │ ('worker',1000) │ +└──────────────────────┴─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avg.md b/docs/zh/sql-reference/aggregate-functions/reference/avg.md new file mode 100644 index 00000000000..739654adc1c --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/avg.md @@ -0,0 +1,64 @@ +--- +toc_priority: 5 +--- + +# avg {#agg_function-avg} + +计算算术平均值。 + +**语法** + +``` sql +avg(x) +``` + +**参数** + +- `x` — 输入值, 必须是 [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), 或 [Decimal](../../../sql-reference/data-types/decimal.md)。 + +**返回值** + +- 算术平均值,总是 [Float64](../../../sql-reference/data-types/float.md) 类型。 +- 输入参数 `x` 为空时返回 `NaN` 。 + +**示例** + +查询: + +``` sql +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); +``` + +结果: + +``` text +┌─avg(x)─┐ +│ 2.5 │ +└────────┘ +``` + +**示例** + +创建一个临时表: + +查询: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +``` + +获取算术平均值: + +查询: + +``` +SELECT avg(t) FROM test; +``` + +结果: + +``` text +┌─avg(x)─┐ +│ nan │ +└────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md new file mode 100644 index 00000000000..9b732f57b4a --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md @@ -0,0 +1,84 @@ +--- +toc_priority: 107 +--- + +# avgWeighted {#avgweighted} + + +计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean)。 + +**语法** + +``` sql +avgWeighted(x, weight) +``` + +**参数** + +- `x` — 值。 +- `weight` — 值的加权。 + +`x` 和 `weight` 的类型必须是 
+[整数](../../../sql-reference/data-types/int-uint.md), 或 +[浮点数](../../../sql-reference/data-types/float.md), 或 +[定点数](../../../sql-reference/data-types/decimal.md), +但是可以不一样。 + +**返回值** + +- `NaN`。 如果所有的权重都等于0 或所提供的权重参数是空。 +- 加权平均值。 其他。 + +类型: 总是[Float64](../../../sql-reference/data-types/float.md). + +**示例** + +查询: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) +``` + +结果: + +``` text +┌─avgWeighted(x, weight)─┐ +│ 8 │ +└────────────────────────┘ +``` + + +**示例** + +查询: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0)) +``` + +结果: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` + +**示例** + +查询: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +SELECT avgWeighted(t) FROM test +``` + +结果: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md new file mode 100644 index 00000000000..1970e76c2fd --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md @@ -0,0 +1,13 @@ +--- +toc_priority: 250 +--- + +# categoricalInformationValue {#categoricalinformationvalue} + +对于每个类别计算 `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` 。 + +``` sql +categoricalInformationValue(category1, category2, ..., tag) +``` + +结果指示离散(分类)要素如何使用 `[category1, category2, ...]` 有助于使用学习模型预测`tag`的值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/corr.md b/docs/zh/sql-reference/aggregate-functions/reference/corr.md new file mode 100644 index 00000000000..5ab49f75023 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/corr.md @@ -0,0 +1,15 @@ +--- +toc_priority: 107 +--- + +# corr {#corrx-y} + +**语法** +``` sql +`corr(x, y)` +``` + +计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/count.md b/docs/zh/sql-reference/aggregate-functions/reference/count.md new file mode 100644 index 00000000000..fc528980bfa --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/count.md @@ -0,0 +1,70 @@ +--- +toc_priority: 1 +--- + +# count {#agg_function-count} + + +计数行数或非空值。 + +ClickHouse支持以下 `count` 语法: +- `count(expr)` 或 `COUNT(DISTINCT expr)`。 +- `count()` 或 `COUNT(*)`. 
该 `count()` 语法是ClickHouse特定的。 + +**参数** + +该函数可以采取: + +- 零参数。 +- 一个 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 + +**返回值** + +- 如果没有参数调用函数,它会计算行数。 +- 如果 [表达式](../../../sql-reference/syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回非null的次数。 如果表达式返回 [可为空](../../../sql-reference/data-types/nullable.md)类型的值,`count`的结果仍然不 `Nullable`。 如果表达式对于所有的行都返回 `NULL` ,则该函数返回 0 。 + +在这两种情况下,返回值的类型为 [UInt64](../../../sql-reference/data-types/int-uint.md)。 + +**详细信息** + +ClickHouse支持 `COUNT(DISTINCT ...)` 语法,这种结构的行为取决于 [count_distinct_implementation](../../../operations/settings/settings.md#settings-count_distinct_implementation) 设置。 它定义了用于执行该操作的 [uniq\*](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)函数。 默认值是 [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)函数。 + +`SELECT count() FROM table` 这个查询未被优化,因为表中的条目数没有单独存储。 它从表中选择一个小列并计算其值的个数。 + +**示例** + +示例1: + +``` sql +SELECT count() FROM t +``` + +``` text +┌─count()─┐ +│ 5 │ +└─────────┘ +``` + +示例2: + +``` sql +SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' +``` + +``` text +┌─name──────────────────────────┬─value─────┐ +│ count_distinct_implementation │ uniqExact │ +└───────────────────────────────┴───────────┘ +``` + +``` sql +SELECT count(DISTINCT num) FROM t +``` + +``` text +┌─uniqExact(num)─┐ +│ 3 │ +└────────────────┘ +``` + +这个例子表明 `count(DISTINCT num)` 是通过 `count_distinct_implementation` 的设定值 `uniqExact` 函数来执行的。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md new file mode 100644 index 00000000000..c6f43c6b9e9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -0,0 +1,15 @@ +--- +toc_priority: 36 +--- + +# covarPop {#covarpop} + +**语法** +``` sql +covarPop(x, y) +``` + +计算 `Σ((x - x̅)(y - y̅)) / n` 的值。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md new file mode 100644 index 00000000000..5ef5104504b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -0,0 +1,17 @@ +--- +toc_priority: 37 +--- + +# covarSamp {#covarsamp} + +**语法** +``` sql +covarSamp(x, y) +``` + +计算 `Σ((x - x̅)(y - y̅)) / (n - 1)` 的值。 + +返回Float64。 当 `n <= 1`, 返回 +∞。 + +!!! 
note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md new file mode 100644 index 00000000000..e439263bf78 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md @@ -0,0 +1,69 @@ +--- +toc_priority: 141 +--- + +# deltaSum {#agg_functions-deltasum} + +计算连续行之间的差值和。如果差值为负,则忽略。 + +**语法** + +``` sql +deltaSum(value) +``` + +**参数** + +- `value` — 必须是 [整型](../../data-types/int-uint.md) 或者 [浮点型](../../data-types/float.md) 。 + +**返回值** + +- `Integer` or `Float` 型的算术差值和。 + +**示例** + +查询: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3])); +``` + +结果: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3]))─┐ +│ 2 │ +└────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); +``` + +结果: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3]))─┐ +│ 7 │ +└───────────────────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); +``` + +结果: + +``` text +┌─deltaSum(arrayJoin([2.25, 3, 4.5]))─┐ +│ 2.25 │ +└─────────────────────────────────────┘ +``` + +**参见** + +- [runningDifference](../../functions/other-functions.md#other_functions-runningdifference) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md new file mode 100644 index 00000000000..0a8f1cd326d --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md @@ -0,0 +1,20 @@ +--- +toc_priority: 110 +--- + +# groupArray {#agg_function-grouparray} + +**语法** +``` sql +groupArray(x) +或 +groupArray(max_size)(x) +``` + +创建参数值的数组。 +值可以按任何(不确定)顺序添加到数组中。 + +第二个版本(带有 `max_size` 参数)将结果数组的大小限制为 `max_size` 个元素。 +例如, `groupArray (1) (x)` 相当于 `[any (x)]` 。 + +在某些情况下,您仍然可以依赖执行顺序。这适用于SELECT(查询)来自使用了 `ORDER BY` 子查询的情况。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md new file mode 100644 index 00000000000..3a50b24fd7f --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -0,0 +1,91 @@ +--- +toc_priority: 112 +--- + +# groupArrayInsertAt {#grouparrayinsertat} + +在指定位置向数组中插入一个值。 + +**语法** + +``` sql +groupArrayInsertAt(default_x, size)(x, pos); +``` + +如果在一个查询中将多个值插入到同一位置,则该函数的行为方式如下: + +- 如果在单个线程中执行查询,则使用第一个插入的值。 +- 如果在多个线程中执行查询,则结果值是未确定的插入值之一。 + +**参数** + +- `x` — 要插入的值。生成所[支持的数据类型](../../../sql-reference/data-types/index.md)(数据)的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 +- `pos` — 指定元素 `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). 
+- `default_x` — 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 +- `size`— 结果数组的长度。可选参数。如果使用该参数,必须指定默认值 `default_x` 。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges)。 + +**返回值** + +- 具有插入值的数组。 + +类型: [阵列](../../../sql-reference/data-types/array.md#data-type-array)。 + +**示例** + +查询: + +``` sql +SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); +``` + +结果: + +``` text +┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ +│ ['0','','1','','2','','3','','4'] │ +└───────────────────────────────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); +``` + +结果: + +``` text +┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2','-','3','-','4'] │ +└────────────────────────────────────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); +``` + +结果: + +``` text +┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2'] │ +└───────────────────────────────────────────────────────────────────┘ +``` + +在一个位置多线程插入数据。 + +查询: + +``` sql +SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; +``` + +作为这个查询的结果,你会得到 `[0,9]` 范围的随机整数。 例如: + +``` text +┌─groupArrayInsertAt(number, 0)─┐ +│ [7] │ +└───────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md new file mode 100644 index 00000000000..8cdfc302b39 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -0,0 +1,85 @@ +--- +toc_priority: 114 +--- + +# groupArrayMovingAvg {#agg_function-grouparraymovingavg} + +计算输入值的移动平均值。 + +**语法** + +``` sql +groupArrayMovingAvg(numbers_for_summing) +groupArrayMovingAvg(window_size)(numbers_for_summing) +``` + +该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 + +**参数** + +- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。 +- `window_size` — 窗口大小。 + +**返回值** + +- 与输入数据大小相同的数组。 + +对于输入数据类型是[Integer](../../../sql-reference/data-types/int-uint.md), +和[floating-point](../../../sql-reference/data-types/float.md), +对应的返回值类型是 `Float64` 。 +对于输入数据类型是[Decimal](../../../sql-reference/data-types/decimal.md) 返回值类型是 `Decimal128` 。 + +该函数对于 `Decimal128` 使用 [四舍五入到零](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). 
它截断无意义的小数位来保证结果的数据类型。 + +**示例** + +样表 `t`: + +``` sql +CREATE TABLE t +( + `int` UInt8, + `float` Float32, + `dec` Decimal32(2) +) +ENGINE = TinyLog +``` + +``` text +┌─int─┬─float─┬──dec─┐ +│ 1 │ 1.1 │ 1.10 │ +│ 2 │ 2.2 │ 2.20 │ +│ 4 │ 4.4 │ 4.40 │ +│ 7 │ 7.77 │ 7.77 │ +└─────┴───────┴──────┘ +``` + +查询: + +``` sql +SELECT + groupArrayMovingAvg(int) AS I, + groupArrayMovingAvg(float) AS F, + groupArrayMovingAvg(dec) AS D +FROM t +``` + +``` text +┌─I────────────────────┬─F─────────────────────────────────────────────────────────────────────────────┬─D─────────────────────┐ +│ [0.25,0.75,1.75,3.5] │ [0.2750000059604645,0.8250000178813934,1.9250000417232513,3.8499999940395355] │ [0.27,0.82,1.92,3.86] │ +└──────────────────────┴───────────────────────────────────────────────────────────────────────────────┴───────────────────────┘ +``` + +``` sql +SELECT + groupArrayMovingAvg(2)(int) AS I, + groupArrayMovingAvg(2)(float) AS F, + groupArrayMovingAvg(2)(dec) AS D +FROM t +``` + +``` text +┌─I───────────────┬─F───────────────────────────────────────────────────────────────────────────┬─D─────────────────────┐ +│ [0.5,1.5,3,5.5] │ [0.550000011920929,1.6500000357627869,3.3000000715255737,6.049999952316284] │ [0.55,1.65,3.30,6.08] │ +└─────────────────┴─────────────────────────────────────────────────────────────────────────────┴───────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md new file mode 100644 index 00000000000..d58d848e7ac --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -0,0 +1,81 @@ +--- +toc_priority: 113 +--- + +# groupArrayMovingSum {#agg_function-grouparraymovingsum} + + +计算输入值的移动和。 + +**语法** + +``` sql +groupArrayMovingSum(numbers_for_summing) +groupArrayMovingSum(window_size)(numbers_for_summing) +``` + +该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 + +**参数** + +- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。 +- `window_size` — 窗口大小。 + +**返回值** + +- 与输入数据大小相同的数组。 +对于输入数据类型是[Decimal](../../../sql-reference/data-types/decimal.md) 数组元素类型是 `Decimal128` 。 +对于其他的数值类型, 获取其对应的 `NearestFieldType` 。 + +**示例** + +样表: + +``` sql +CREATE TABLE t +( + `int` UInt8, + `float` Float32, + `dec` Decimal32(2) +) +ENGINE = TinyLog +``` + +``` text +┌─int─┬─float─┬──dec─┐ +│ 1 │ 1.1 │ 1.10 │ +│ 2 │ 2.2 │ 2.20 │ +│ 4 │ 4.4 │ 4.40 │ +│ 7 │ 7.77 │ 7.77 │ +└─────┴───────┴──────┘ +``` + +查询: + +``` sql +SELECT + groupArrayMovingSum(int) AS I, + groupArrayMovingSum(float) AS F, + groupArrayMovingSum(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` + +``` sql +SELECT + groupArrayMovingSum(2)(int) AS I, + groupArrayMovingSum(2)(float) AS F, + groupArrayMovingSum(2)(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md new file mode 100644 index 00000000000..529b63a2316 --- /dev/null +++ 
b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -0,0 +1,82 @@ +--- +toc_priority: 114 +--- + +# groupArraySample {#grouparraysample} + +构建一个参数值的采样数组。 +结果数组的大小限制为 `max_size` 个元素。参数值被随机选择并添加到数组中。 + +**语法** + +``` sql +groupArraySample(max_size[, seed])(x) +``` + +**参数** + +- `max_size` — 结果数组的最大长度。[UInt64](../../data-types/int-uint.md)。 +- `seed` — 随机数发生器的种子。可选。[UInt64](../../data-types/int-uint.md)。默认值: `123456`。 +- `x` — 参数 (列名 或者 表达式)。 + +**返回值** + +- 随机选取参数 `x` (的值)组成的数组。 + +类型: [Array](../../../sql-reference/data-types/array.md). + +**示例** + +样表 `colors`: + +``` text +┌─id─┬─color──┐ +│ 1 │ red │ +│ 2 │ blue │ +│ 3 │ green │ +│ 4 │ white │ +│ 5 │ orange │ +└────┴────────┘ +``` + +使用列名做参数查询: + +``` sql +SELECT groupArraySample(3)(color) as newcolors FROM colors; +``` + +结果: + +```text +┌─newcolors──────────────────┐ +│ ['white','blue','green'] │ +└────────────────────────────┘ +``` + +使用列名和不同的(随机数)种子查询: + +``` sql +SELECT groupArraySample(3, 987654321)(color) as newcolors FROM colors; +``` + +结果: + +```text +┌─newcolors──────────────────┐ +│ ['red','orange','green'] │ +└────────────────────────────┘ +``` + +使用表达式做参数查询: + +``` sql +SELECT groupArraySample(3)(concat('light-', color)) as newcolors FROM colors; +``` + +结果: + +```text +┌─newcolors───────────────────────────────────┐ +│ ['light-blue','light-orange','light-green'] │ +└─────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md new file mode 100644 index 00000000000..1a8520b0f08 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md @@ -0,0 +1,48 @@ +--- +toc_priority: 125 +--- + +# groupBitAnd {#groupbitand} + +对于数字序列按位应用 `AND` 。 + +**语法** + +``` sql +groupBitAnd(expr) +``` + +**参数** + +`expr` – 结果为 `UInt*` 类型的表达式。 + +**返回值** + +`UInt*` 类型的值。 + +**示例** + +测试数据: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +查询: + +``` sql +SELECT groupBitAnd(num) FROM t +``` + +`num` 是包含测试数据的列。 + +结果: + +``` text +binary decimal +00000100 = 4 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md new file mode 100644 index 00000000000..5e14c3a21ea --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md @@ -0,0 +1,46 @@ +--- +toc_priority: 128 +--- + +# groupBitmap {#groupbitmap} + +从无符号整数列进行位图或聚合计算,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回[位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** + +``` sql +groupBitmap(expr) +``` + +**参数** + +`expr` – 结果为 `UInt*` 类型的表达式。 + +**返回值** + +`UInt64` 类型的值。 + +**示例** + +测试数据: + +``` text +UserID +1 +1 +2 +3 +``` + +查询: + +``` sql +SELECT groupBitmap(UserID) as num FROM t +``` + +结果: + +``` text +num +3 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md new file mode 100644 index 00000000000..bd5aa17c7ff --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md @@ -0,0 +1,48 @@ +--- +toc_priority: 129 +--- + +# groupBitmapAnd {#groupbitmapand} + +计算位图列的 `AND` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** + +``` sql +groupBitmapAnd(expr) +``` + +**参数** + +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 + 
+**返回值** + +`UInt64` 类型的值。 + +**示例** + +``` sql +DROP TABLE IF EXISTS bitmap_column_expr_test2; +CREATE TABLE bitmap_column_expr_test2 +( + tag_id String, + z AggregateFunction(groupBitmap, UInt32) +) +ENGINE = MergeTree +ORDER BY tag_id; + +INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); + +SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─groupBitmapAnd(z)─┐ +│ 3 │ +└───────────────────┘ + +SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐ +│ [6,8,10] │ +└──────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md new file mode 100644 index 00000000000..52048083d17 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md @@ -0,0 +1,48 @@ +--- +toc_priority: 130 +--- + +# groupBitmapOr {#groupbitmapor} + +计算位图列的 `OR` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** + +``` sql +groupBitmapOr(expr) +``` + +**参数** + +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 + +**返回值** + +`UInt64` 类型的值。 + +**示例** + +``` sql +DROP TABLE IF EXISTS bitmap_column_expr_test2; +CREATE TABLE bitmap_column_expr_test2 +( + tag_id String, + z AggregateFunction(groupBitmap, UInt32) +) +ENGINE = MergeTree +ORDER BY tag_id; + +INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); + +SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─groupBitmapOr(z)─┐ +│ 15 │ +└──────────────────┘ + +SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐ +│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] │ +└─────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md new file mode 100644 index 00000000000..d862e974418 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md @@ -0,0 +1,48 @@ +--- +toc_priority: 131 +--- + +# groupBitmapXor {#groupbitmapxor} + +计算位图列的 `XOR` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** + +``` sql +groupBitmapXor(expr) +``` + +**参数** + +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 + +**返回值** + +`UInt64` 类型的值。 + +**示例** + +``` sql +DROP TABLE IF EXISTS bitmap_column_expr_test2; +CREATE TABLE bitmap_column_expr_test2 +( + tag_id String, + z AggregateFunction(groupBitmap, UInt32) +) +ENGINE = MergeTree +ORDER BY tag_id; + +INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); 
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); + +SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─groupBitmapXor(z)─┐ +│ 10 │ +└───────────────────┘ + +SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ +│ [1,3,5,6,8,10,11,13,14,15] │ +└──────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md new file mode 100644 index 00000000000..175cc8d7286 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md @@ -0,0 +1,48 @@ +--- +toc_priority: 126 +--- + +# groupBitOr {#groupbitor} + +对于数字序列按位应用 `OR` 。 + +**语法** + +``` sql +groupBitOr(expr) +``` + +**参数** + +`expr` – 结果为 `UInt*` 类型的表达式。 + +**返回值** + +`UInt*` 类型的值。 + +**示例** + +测试数据:: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +查询: + +``` sql +SELECT groupBitOr(num) FROM t +``` + +`num` 是包含测试数据的列。 + +结果: + +``` text +binary decimal +01111101 = 125 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md new file mode 100644 index 00000000000..26409f00032 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -0,0 +1,48 @@ +--- +toc_priority: 127 +--- + +# groupBitXor {#groupbitxor} + +对于数字序列按位应用 `XOR` 。 + +**语法** + +``` sql +groupBitXor(expr) +``` + +**参数** + +`expr` – 结果为 `UInt*` 类型的表达式。 + +**返回值** + +`UInt*` 类型的值。 + +**示例** + +测试数据: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +查询: + +``` sql +SELECT groupBitXor(num) FROM t +``` + +`num` 是包含测试数据的列。 + +结果: + +``` text +binary decimal +01101000 = 104 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md new file mode 100644 index 00000000000..f371361bbf6 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -0,0 +1,18 @@ +--- +toc_priority: 111 +--- + +# groupUniqArray {#groupuniqarray} + +**语法** + +``` sql +groupUniqArray(x) +或 +groupUniqArray(max_size)(x) +``` + +从不同的参数值创建一个数组。 内存消耗和 [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) 函数是一样的。 + +第二个版本(带有 `max_size` 参数)将结果数组的大小限制为 `max_size` 个元素。 +例如, `groupUniqArray(1)(x)` 相当于 `[any(x)]`. 
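+
+**示例**
+
+下面是一个简化示例(仅作示意:数据由 `values` 表函数即时构造,元素在结果数组中的顺序不作保证)。
+
+查询:
+
+``` sql
+SELECT groupUniqArray(x) FROM values('x UInt8', 1, 1, 2, 3, 3, 2);
+```
+
+可能的结果(元素顺序可能不同):
+
+``` text
+┌─groupUniqArray(x)─┐
+│ [1,2,3]           │
+└───────────────────┘
+```
+
+如需稳定的输出顺序,可以在外层套用 `arraySort`,例如 `arraySort(groupUniqArray(x))`。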
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/index.md b/docs/zh/sql-reference/aggregate-functions/reference/index.md new file mode 100644 index 00000000000..5070c79775e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/index.md @@ -0,0 +1,72 @@ +--- +toc_folder_title: Reference +toc_priority: 36 +toc_hidden: true +--- + +# 聚合函数列表 {#aggregate-functions-reference} + +标准聚合函数: + +- [count](../../../sql-reference/aggregate-functions/reference/count.md) +- [min](../../../sql-reference/aggregate-functions/reference/min.md) +- [max](../../../sql-reference/aggregate-functions/reference/max.md) +- [sum](../../../sql-reference/aggregate-functions/reference/sum.md) +- [avg](../../../sql-reference/aggregate-functions/reference/avg.md) +- [any](../../../sql-reference/aggregate-functions/reference/any.md) +- [stddevPop](../../../sql-reference/aggregate-functions/reference/stddevpop.md) +- [stddevSamp](../../../sql-reference/aggregate-functions/reference/stddevsamp.md) +- [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md) +- [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md) +- [covarPop](../../../sql-reference/aggregate-functions/reference/covarpop.md) +- [covarSamp](../../../sql-reference/aggregate-functions/reference/covarsamp.md) + +ClickHouse 特有的聚合函数: + +- [anyHeavy](../../../sql-reference/aggregate-functions/reference/anyheavy.md) +- [anyLast](../../../sql-reference/aggregate-functions/reference/anylast.md) +- [argMin](../../../sql-reference/aggregate-functions/reference/argmin.md) +- [argMax](../../../sql-reference/aggregate-functions/reference/argmax.md) +- [avgWeighted](../../../sql-reference/aggregate-functions/reference/avgweighted.md) +- [topK](../../../sql-reference/aggregate-functions/reference/topk.md) +- [topKWeighted](../../../sql-reference/aggregate-functions/reference/topkweighted.md) +- [groupArray](../../../sql-reference/aggregate-functions/reference/grouparray.md) +- [groupUniqArray](../../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md) +- [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md) +- [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md) +- [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md) +- [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md) +- [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md) +- [groupBitmap](../../../sql-reference/aggregate-functions/reference/groupbitmap.md) +- [groupBitmapAnd](../../../sql-reference/aggregate-functions/reference/groupbitmapand.md) +- [groupBitmapOr](../../../sql-reference/aggregate-functions/reference/groupbitmapor.md) +- [groupBitmapXor](../../../sql-reference/aggregate-functions/reference/groupbitmapxor.md) +- [sumWithOverflow](../../../sql-reference/aggregate-functions/reference/sumwithoverflow.md) +- [sumMap](../../../sql-reference/aggregate-functions/reference/summap.md) +- [minMap](../../../sql-reference/aggregate-functions/reference/minmap.md) +- [maxMap](../../../sql-reference/aggregate-functions/reference/maxmap.md) +- [skewSamp](../../../sql-reference/aggregate-functions/reference/skewsamp.md) +- [skewPop](../../../sql-reference/aggregate-functions/reference/skewpop.md) +- 
[kurtSamp](../../../sql-reference/aggregate-functions/reference/kurtsamp.md) +- [kurtPop](../../../sql-reference/aggregate-functions/reference/kurtpop.md) +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md) +- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md) +- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md) +- [quantileExactLow](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow) +- [quantileExactHigh](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh) +- [quantileExactWeighted](../../../sql-reference/aggregate-functions/reference/quantileexactweighted.md) +- [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md) +- [quantileTimingWeighted](../../../sql-reference/aggregate-functions/reference/quantiletimingweighted.md) +- [quantileDeterministic](../../../sql-reference/aggregate-functions/reference/quantiledeterministic.md) +- [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) +- [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md) +- [simpleLinearRegression](../../../sql-reference/aggregate-functions/reference/simplelinearregression.md) +- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) +- [categoricalInformationValue](../../../sql-reference/aggregate-functions/reference/categoricalinformationvalue.md) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md new file mode 100644 index 00000000000..feecd7afb1f --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md @@ -0,0 +1,37 @@ +--- +toc_priority: 150 +--- + +## initializeAggregation {#initializeaggregation} + +初始化你输入行的聚合。用于后缀是 `State` 的函数。 +用它来测试或处理 `AggregateFunction` 和 `AggregationgMergeTree` 类型的列。 + +**语法** + +``` sql +initializeAggregation (aggregate_function, column_1, column_2) +``` + +**参数** + +- `aggregate_function` — 聚合函数名。 这个函数的状态 — 正创建的。[String](../../../sql-reference/data-types/string.md#string)。 +- `column_n` — 将其转换为函数的参数的列。[String](../../../sql-reference/data-types/string.md#string)。 + +**返回值** + +返回输入行的聚合结果。返回类型将与 `initializeAgregation` 用作第一个参数的函数的返回类型相同。 +例如,对于后缀为 `State` 的函数,返回类型将是 `AggregateFunction`。 + +**示例** + +查询: + +```sql +SELECT uniqMerge(state) FROM (SELECT initializeAggregation('uniqState', number % 3) AS state FROM system.numbers LIMIT 10000); +``` +结果: + +┌─uniqMerge(state)─┐ +│ 3 │ +└──────────────────┘ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md new file mode 100644 index 00000000000..d5b76e0c1e9 --- /dev/null +++ 
b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md @@ -0,0 +1,27 @@ +--- +toc_priority: 153 +--- + +# kurtPop {#kurtpop} + +计算给定序列的 [峰度](https://en.wikipedia.org/wiki/Kurtosis)。 + +**语法** + +``` sql +kurtPop(expr) +``` + +**参数** + +`expr` — 结果为数字的 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 + +**返回值** + +给定分布的峰度。 类型 — [Float64](../../../sql-reference/data-types/float.md) + +**示例** + +``` sql +SELECT kurtPop(value) FROM series_with_value_column; +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md new file mode 100644 index 00000000000..a38e14d0792 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -0,0 +1,28 @@ +--- +toc_priority: 154 +--- + +# kurtSamp {#kurtsamp} + +计算给定序列的 [样本峰度](https://en.wikipedia.org/wiki/Kurtosis)。 +如果传递的值形成其样本,它代表了随机变量峰度的无偏估计。 + +**语法** + +``` sql +kurtSamp(expr) +``` + +**参数** + +`expr` — 结果为数字的 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 + +**返回值** + +给定序列的峰度。类型 — [Float64](../../../sql-reference/data-types/float.md)。 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`。 + +**示例** + +``` sql +SELECT kurtSamp(value) FROM series_with_value_column; +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md new file mode 100644 index 00000000000..016a650b61b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -0,0 +1,72 @@ +--- +toc_priority: 310 +toc_title: mannWhitneyUTest +--- + +# mannWhitneyUTest {#mannwhitneyutest} + +对两个总体的样本应用 Mann-Whitney 秩检验。 + +**语法** + +``` sql +mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_index) +``` + +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是两个总体随机相等。也可以检验单边假设。该检验不假设数据具有正态分布。 + +**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 [Decimal](../../../sql-reference/data-types/decimal.md)。 +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md).
+ +**参数** + +- `alternative` — 供选假设。(可选,默认值是: `'two-sided'` 。) [String](../../../sql-reference/data-types/string.md)。 + - `'two-sided'`; + - `'greater'`; + - `'less'`。 +- `continuity_correction` — 如果不为0,那么将对p值进行正态近似的连续性修正。(可选,默认:1。) [UInt64](../../../sql-reference/data-types/int-uint.md)。 + +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出U统计量。[Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 10 │ 0 │ +│ 11 │ 0 │ +│ 12 │ 0 │ +│ 1 │ 1 │ +│ 2 │ 1 │ +│ 3 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest; +``` + +结果: + +``` text +┌─mannWhitneyUTest('greater')(sample_data, sample_index)─┐ +│ (9,0.04042779918503192) │ +└────────────────────────────────────────────────────────┘ +``` + +**参见** + +- [Mann–Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test) +- [Stochastic ordering](https://en.wikipedia.org/wiki/Stochastic_ordering) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/max.md b/docs/zh/sql-reference/aggregate-functions/reference/max.md new file mode 100644 index 00000000000..8372d5c6f85 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/max.md @@ -0,0 +1,7 @@ +--- +toc_priority: 3 +--- + +# max {#agg_function-max} + +计算最大值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md new file mode 100644 index 00000000000..4d91d1e75fd --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md @@ -0,0 +1,33 @@ +--- +toc_priority: 143 +--- + +# maxMap {#agg_functions-maxmap} + +**语法** + +```sql +maxMap(key, value) + 或 +maxMap(Tuple(key, value)) +``` + + +根据 `key` 数组中指定的键对 `value` 数组计算最大值。 + +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的元组: 排好序的`key` 和对应 `key` 的 `value` 计算值(最大值)。 + +示例: + +``` sql +SELECT maxMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─maxMap(a, b)──────┐ +│ ([1,2,3],[2,2,1]) │ +└───────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/median.md b/docs/zh/sql-reference/aggregate-functions/reference/median.md new file mode 100644 index 00000000000..83879f6cb34 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/median.md @@ -0,0 +1,41 @@ +# median {#median} + +`median*` 函数是 `quantile*` 函数的别名。它们计算数字数据样本的中位数。 + +函数: + +- `median` — [quantile](#quantile)别名。 +- `medianDeterministic` — [quantileDeterministic](#quantiledeterministic)别名。 +- `medianExact` — [quantileExact](#quantileexact)别名。 +- `medianExactWeighted` — [quantileExactWeighted](#quantileexactweighted)别名。 +- `medianTiming` — [quantileTiming](#quantiletiming)别名。 +- `medianTimingWeighted` — [quantileTimingWeighted](#quantiletimingweighted)别名。 +- `medianTDigest` — [quantileTDigest](#quantiletdigest)别名。 +- `medianTDigestWeighted` — [quantileTDigestWeighted](#quantiletdigestweighted)别名。 + +**示例** + +输入表: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +查询: + +``` sql +SELECT medianDeterministic(val, 1) FROM t +``` + +结果: + +``` text +┌─medianDeterministic(val, 1)─┐ +│ 1.5 │ +└─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/min.md 
b/docs/zh/sql-reference/aggregate-functions/reference/min.md new file mode 100644 index 00000000000..95a4099a1b7 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/min.md @@ -0,0 +1,7 @@ +--- +toc_priority: 2 +--- + +# min {#agg_function-min} + +计算最小值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md new file mode 100644 index 00000000000..8e0022ac174 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md @@ -0,0 +1,32 @@ +--- +toc_priority: 142 +--- + +# minMap {#agg_functions-minmap} + +**语法** + +```sql +minMap(key, value) +或 +minMap(Tuple(key, value)) +``` + +根据 `key` 数组中指定的键对 `value` 数组计算最小值。 + +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的元组: 排好序的 `key` 和对应 `key` 的 `value` 计算值(最小值)。 + +**示例** + +``` sql +SELECT minMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─minMap(a, b)──────┐ +│ ([1,2,3],[2,1,1]) │ +└───────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md new file mode 100644 index 00000000000..4519688dc7e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -0,0 +1,65 @@ +--- +toc_priority: 200 +--- + +# quantile {#quantile} + +计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 +此函数应用[水塘抽样](https://en.wikipedia.org/wiki/Reservoir_sampling),使用大小最多为8192的水塘和随机数发生器进行采样。 +结果是不确定的。要获得精确的分位数,使用 [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) 函数。 +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantile(level)(expr) +``` + +别名: `median`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +输入表: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +查询: + +``` sql +SELECT quantile(val) FROM t +``` + +结果: + +``` text +┌─quantile(val)─┐ +│ 1.5 │ +└───────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md new file mode 100644 index 00000000000..c6c6b0a63de --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -0,0 +1,66 @@ +--- +toc_priority: 206 +--- + +# quantileDeterministic {#quantiledeterministic} + +计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +此函数应用
[水塘抽样](https://en.wikipedia.org/wiki/Reservoir_sampling),使用储存器最大到8192和随机数发生器进行采样。 结果是非确定性的。 要获得精确的分位数,请使用 [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) 功能。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)功能。 + +**语法** + +``` sql +quantileDeterministic(level)(expr, determinator) +``` + +别名: `medianDeterministic`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `determinator` — 一个数字,其hash被用来代替在水塘抽样中随机生成的数字,这样可以保证取样的确定性。你可以使用用户ID或者事件ID等任何正数,但是如果相同的 `determinator` 出现多次,那结果很可能不正确。 +**返回值** + +- 指定层次的近似分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +输入表: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +查询: + +``` sql +SELECT quantileDeterministic(val, 1) FROM t +``` + +结果: + +``` text +┌─quantileDeterministic(val, 1)─┐ +│ 1.5 │ +└───────────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md new file mode 100644 index 00000000000..a8d39c35700 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md @@ -0,0 +1,170 @@ +--- +toc_priority: 202 +--- + +# quantileExact {#quantileexact} + + +准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。因此该函数需要 `O(n)` 的内存,n为输入数据的个数。但是对于少量数据来说,该函数还是非常有效的。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileExact(level)(expr) +``` + +别名: `medianExact`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算[中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 + +**返回值** + +- 指定层次的分位数。 + + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 对于数字数据类型输入。 +- [日期](../../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 +- [日期时间](../../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 + +**示例** + +查询: + +``` sql +SELECT quantileExact(number) FROM numbers(10) +``` + +结果: + +``` text +┌─quantileExact(number)─┐ +│ 5 │ +└───────────────────────┘ +``` + +# quantileExactLow {#quantileexactlow} + +和 `quantileExact` 相似, 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +为了准确计算,所有输入的数据被合并为一个数组,并且全排序。这排序[算法](https://en.cppreference.com/w/cpp/algorithm/sort)的复杂度是 `O(N·log(N))`, 其中 `N = std::distance(first, last)` 比较。 + +返回值取决于分位数级别和所选取的元素数量,即如果级别是 0.5, 函数返回偶数元素的低位中位数,奇数元素的中位数。中位数计算类似于 
python 中使用的[median_low](https://docs.python.org/3/library/statistics.html#statistics.median_low)的实现。 + +对于所有其他级别, 返回 `level * size_of_array` 值所对应的索引的元素值。 + +例如: + +``` sql +SELECT quantileExactLow(0.1)(number) FROM numbers(10) + +┌─quantileExactLow(0.1)(number)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileExactLow(level)(expr) +``` + +别名: `medianExactLow`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +查询: + +``` sql +SELECT quantileExactLow(number) FROM numbers(10) +``` + +结果: + +``` text +┌─quantileExactLow(number)─┐ +│ 4 │ +└──────────────────────────┘ +``` + +# quantileExactHigh {#quantileexacthigh} + +和 `quantileExact` 相似, 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +为了准确计算,所有输入的数据被合并为一个数组,并且全排序。这排序[算法](https://en.cppreference.com/w/cpp/algorithm/sort)的复杂度是 `O(N·log(N))`, 其中 `N = std::distance(first, last)` 比较。 + +返回值取决于分位数级别和所选取的元素数量,即如果级别是 0.5, 函数返回偶数元素的低位中位数,奇数元素的中位数。中位数计算类似于 python 中使用的[median_high](https://docs.python.org/3/library/statistics.html#statistics.median_high)的实现。 + +对于所有其他级别, 返回 `level * size_of_array` 值所对应的索引的元素值。 + +这个实现与当前的 `quantileExact` 实现完全相似。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileExactHigh(level)(expr) +``` + +别名: `medianExactHigh`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +查询: + +``` sql +SELECT quantileExactHigh(number) FROM numbers(10) +``` + +结果: + +``` text +┌─quantileExactHigh(number)─┐ +│ 5 │ +└───────────────────────────┘ +``` +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md new file mode 100644 index 00000000000..5211ca210f2 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -0,0 +1,66 @@ +--- +toc_priority: 203 +--- + +# quantileExactWeighted {#quantileexactweighted} + 
+考虑到每个元素的权重,然后准确计算数值序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。每个输入值需要根据 `weight` 计算求和。该算法使用哈希表。正因为如此,在数据重复较多的时候使用的内存是少于[quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact)的。 您可以使用此函数代替 `quantileExact` 并指定`weight`为 1 。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileExactWeighted(level)(expr, weight) +``` + +别名: `medianExactWeighted`。 + +**参数** +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `weight` — 权重序列。 权重是一个数据出现的数值。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 对于数字数据类型输入。 +- [日期](../../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 +- [日期时间](../../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 + +**示例** + +输入表: + +``` text +┌─n─┬─val─┐ +│ 0 │ 3 │ +│ 1 │ 2 │ +│ 2 │ 1 │ +│ 5 │ 4 │ +└───┴─────┘ +``` + +查询: + +``` sql +SELECT quantileExactWeighted(n, val) FROM t +``` + +结果: + +``` text +┌─quantileExactWeighted(n, val)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md new file mode 100644 index 00000000000..044c4d6d24e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -0,0 +1,12 @@ +--- +toc_priority: 201 +--- + +# quantiles {#quantiles} + +**语法** +``` sql +quantiles(level1, level2, …)(x) +``` + +所有分位数函数(quantile)也有相应的分位数(quantiles)函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。 这些函数一次计算所列的级别的所有分位数, 并返回结果值的数组。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md new file mode 100644 index 00000000000..fb186da299e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -0,0 +1,57 @@ +--- +toc_priority: 207 +--- + +# quantileTDigest {#quantiletdigest} + +使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算数字序列近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +最大误差为1%。 内存消耗为 `log(n)`,这里 `n` 是值的个数。 结果取决于运行查询的顺序,并且是不确定的。 + +该函数的性能低于 [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) 或 [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming) 的性能。 从状态大小和精度的比值来看,这个函数比 `quantile` 更优秀。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileTDigest(level)(expr) +``` + +别名: `medianTDigest`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 
[中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +查询: + +``` sql +SELECT quantileTDigest(number) FROM numbers(10) +``` + +结果: + +``` text +┌─quantileTDigest(number)─┐ +│ 4.5 │ +└─────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md new file mode 100644 index 00000000000..cf78c4c03bc --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -0,0 +1,58 @@ +--- +toc_priority: 208 +--- + +# quantileTDigestWeighted {#quantiletdigestweighted} + +使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算数字序列近似[分位数](https://en.wikipedia.org/wiki/Quantile)。该函数考虑了每个序列成员的权重。最大误差为1%。 内存消耗为 `log(n)`,这里 `n` 是值的个数。 + +该函数的性能低于 [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) 或 [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming) 的性能。 从状态大小和精度的比值来看,这个函数比 `quantile` 更优秀。 + +结果取决于运行查询的顺序,并且是不确定的。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** + +``` sql +quantileTDigestWeighted(level)(expr, weight) +``` + +别名: `medianTDigestWeighted`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。 当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `weight` — 权重序列。 权重是一个数据出现的数值。 + +**返回值** + +- 指定层次的分位数。 + +类型: + +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 + +**示例** + +查询: + +``` sql +SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) +``` + +结果: + +``` text +┌─quantileTDigestWeighted(number, 1)─┐ +│ 4.5 │ +└────────────────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md new file mode 100644 index 00000000000..a193b60338a --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -0,0 +1,86 @@ +--- +toc_priority: 204 +--- + +# quantileTiming {#quantiletiming} + +使用确定的精度计算数字数据序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + 
+结果是确定性的(它不依赖于查询处理顺序)。该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)函数。 + +**语法** + +``` sql +quantileTiming(level)(expr) +``` + +别名: `medianTiming`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 + + - 如果输入负值,那结果是不可预期的。 + - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 + +**精度** + +计算是准确的,如果: + + +- 值的总数不超过5670。 +- 总数值超过5670,但页面加载时间小于1024ms。 + +否则,计算结果将四舍五入到16毫秒的最接近倍数。 + +!!! note "注" + 对于计算页面加载时间分位数, 此函数比[quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile)更有效和准确。 + +**返回值** + +- 指定层次的分位数。 + +类型: `Float32`。 + +!!! note "注" +如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 + +**示例** + +输入表: + +``` text +┌─response_time─┐ +│ 72 │ +│ 112 │ +│ 126 │ +│ 145 │ +│ 104 │ +│ 242 │ +│ 313 │ +│ 168 │ +│ 108 │ +└───────────────┘ +``` + +查询: + +``` sql +SELECT quantileTiming(response_time) FROM t +``` + +结果: + +``` text +┌─quantileTiming(response_time)─┐ +│ 126 │ +└───────────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md new file mode 100644 index 00000000000..7b130dbddbd --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -0,0 +1,118 @@ +--- +toc_priority: 205 +--- + +# quantileTimingWeighted {#quantiletimingweighted} + +根据每个序列成员的权重,使用确定的精度计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 + +结果是确定性的(它不依赖于查询处理顺序)。该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 + +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)功能。 + +**语法** + +``` sql +quantileTimingWeighted(level)(expr, weight) +``` + +别名: `medianTimingWeighted`。 + +**参数** + +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 + + - 如果输入负值,那结果是不可预期的。 + - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 + +- `weight` — 权重序列。 权重是一个数据出现的数值。 + +**精度** + +计算是准确的,如果: + + +- 值的总数不超过5670。 +- 总数值超过5670,但页面加载时间小于1024ms。 + +否则,计算结果将四舍五入到16毫秒的最接近倍数。 + +!!! note "注" + 对于计算页面加载时间分位数, 此函数比[quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile)更有效和准确。 + +**返回值** + +- 指定层次的分位数。 + +类型: `Float32`。 + +!!! 
note "注" +    如果没有值传递给函数(当使用 `quantileTimingIf` 时),则返回 [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)。这样做的目的是将这些情况与结果为零的情况区分开来。关于 `NaN` 值的排序注意事项,参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by)。 + +**示例** + +输入表: + +``` text +┌─response_time─┬─weight─┐ +│ 68 │ 1 │ +│ 104 │ 2 │ +│ 112 │ 3 │ +│ 126 │ 2 │ +│ 138 │ 1 │ +│ 162 │ 1 │ +└───────────────┴────────┘ +``` + +查询: + +``` sql +SELECT quantileTimingWeighted(response_time, weight) FROM t +``` + +结果: + +``` text +┌─quantileTimingWeighted(response_time, weight)─┐ +│ 112 │ +└───────────────────────────────────────────────┘ +``` + +# quantilesTimingWeighted {#quantilestimingweighted} + +类似于 `quantileTimingWeighted`,但接受多个分位数层次参数,并返回一个由这些分位数值组成的数组。 + +**示例** + +输入表: + +``` text +┌─response_time─┬─weight─┐ +│ 68 │ 1 │ +│ 104 │ 2 │ +│ 112 │ 3 │ +│ 126 │ 2 │ +│ 138 │ 1 │ +│ 162 │ 1 │ +└───────────────┴────────┘ +``` + +查询: + +``` sql +SELECT quantilesTimingWeighted(0.5, 0.99)(response_time, weight) FROM t +``` + +结果: + +``` text +┌─quantilesTimingWeighted(0.5, 0.99)(response_time, weight)─┐ +│ [112,162] │ +└───────────────────────────────────────────────────────────┘ +``` + +**参见** + +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md new file mode 100644 index 00000000000..c29a43f6ca9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md @@ -0,0 +1,53 @@ +## rankCorr {#agg_function-rankcorr} + +计算等级相关系数。 + +**语法** + +``` sql +rankCorr(x, y) +``` + +**参数** + +- `x` — 任意值。[Float32](../../../sql-reference/data-types/float.md#float32-float64) 或 [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 +- `y` — 任意值。[Float32](../../../sql-reference/data-types/float.md#float32-float64) 或 [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 + +**返回值** + +- Returns a rank correlation coefficient of the ranks of x and y. The value of the correlation coefficient ranges from -1 to +1. If less than two arguments are passed, the function will return an exception. The value close to +1 denotes a high linear relationship, and with an increase of one random variable, the second random variable also increases. The value close to -1 denotes a high linear relationship, and with an increase of one random variable, the second random variable decreases. The value close or equal to 0 denotes no relationship between the two random variables.
+ +类型: [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 + +**示例** + +查询: + +``` sql +SELECT rankCorr(number, number) FROM numbers(100); +``` + +结果: + +``` text +┌─rankCorr(number, number)─┐ +│ 1 │ +└──────────────────────────┘ +``` + +查询: + +``` sql +SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100); +``` + +结果: + +``` text +┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐ +│ -0.037 │ +└─────────────────────────────────────────────────────┘ +``` +**参见** + +- 斯皮尔曼等级相关系数[Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) \ No newline at end of file diff --git a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md new file mode 100644 index 00000000000..56cb1539fc9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -0,0 +1,44 @@ +--- +toc_priority: 220 +--- + +# simpleLinearRegression {#simplelinearregression} + +执行简单(一维)线性回归。 + +**语法** + +``` sql +simpleLinearRegression(x, y) +``` + +**参数** + +- `x` — x轴。 +- `y` — y轴。 + +**返回值** + +符合`y = a*x + b`的常量 `(a, b)` 。 + +**示例** + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ +│ (1,0) │ +└───────────────────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ +│ (1,3) │ +└───────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md new file mode 100644 index 00000000000..0771c18c2f3 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md @@ -0,0 +1,27 @@ +--- +toc_priority: 150 +--- + +# skewPop {#skewpop} + +计算给定序列的 [偏度] (https://en.wikipedia.org/wiki/Skewness)。 + +**语法** + +``` sql +skewPop(expr) +``` + +**参数** + +`expr` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回一个数字。 + +**返回值** + +给定分布的偏度。类型 — [Float64](../../../sql-reference/data-types/float.md) + +**示例** + +``` sql +SELECT skewPop(value) FROM series_with_value_column; +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md new file mode 100644 index 00000000000..902d06da8e7 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md @@ -0,0 +1,29 @@ +--- +toc_priority: 151 +--- + +# skewSamp {#skewsamp} + +计算给定序列的 [样本偏度] (https://en.wikipedia.org/wiki/Skewness)。 + +如果传递的值形成其样本,它代表了一个随机变量的偏度的无偏估计。 + +**语法** + +``` sql +skewSamp(expr) +``` + +**参数** + +`expr` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回一个数字。 + +**返回值** + +给定分布的偏度。 类型 — [Float64](../../../sql-reference/data-types/float.md)。 如果 `n <= 1` (`n` 样本的大小), 函数返回 `nan`。 + +**示例** + +``` sql +SELECT skewSamp(value) FROM series_with_value_column; +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md new file mode 100644 index 00000000000..378ef4ae7e4 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md @@ -0,0 +1,10 @@ +--- 
+toc_priority: 30 +--- + +# stddevPop {#stddevpop} + +结果等于 [varPop] (../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md new file mode 100644 index 00000000000..68a348146a9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -0,0 +1,10 @@ +--- +toc_priority: 31 +--- + +# stddevSamp {#stddevsamp} + +结果等于 [varSamp] (../../../sql-reference/aggregate-functions/reference/varsamp.md)的平方根。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md new file mode 100644 index 00000000000..43ebd6be575 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -0,0 +1,77 @@ +--- +toc_priority: 221 +--- + +# stochasticLinearRegression {#agg_functions-stochasticlinearregression} + +该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 + +### 参数 {#agg_functions-stochasticlinearregression-parameters} + +有4个可自定义的参数。它们按顺序传递给函数,但不需要传递所有四个参数——将使用默认值,然而好的模型需要一些参数调整。 + +**语法** + +``` sql +stochasticLinearRegression(1.0, 1.0, 10, 'SGD') +``` + +1. `learning rate` 当执行梯度下降步骤时,步长的系数。 过大的学习率可能会导致模型的权重无限大。 默认值为 `0.00001`。 +2. `l2 regularization coefficient` 这可能有助于防止过度拟合。 默认值为 `0.1`。 +3. `mini-batch size` 设置元素的数量,这些元素将被计算和求和以执行梯度下降的一个步骤。纯随机下降使用一个元素,但是具有小批量(约10个元素)使梯度步骤更稳定。 默认值为 `15`。 +4. `method for updating weights` 他们是: `Adam` (默认情况下), `SGD`, `Momentum`, `Nesterov`。`Momentum` 和 `Nesterov` 需要更多的计算和内存,但是它们恰好在收敛速度和随机梯度方法的稳定性方面是有用的。 + +### 使用 {#agg_functions-stochasticlinearregression-usage} + +`stochasticLinearRegression` 用于两个步骤:拟合模型和预测新数据。 为了拟合模型并保存其状态以供以后使用,我们使用 `-State` 组合器,它基本上保存了状态(模型权重等)。 +为了预测我们使用函数 [evalMLMethod](../../../sql-reference/functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), 这需要一个状态作为参数以及特征来预测。 + + + +**1.** 拟合 + +可以使用这种查询。 + +``` sql +CREATE TABLE IF NOT EXISTS train_data +( + param1 Float64, + param2 Float64, + target Float64 +) ENGINE = Memory; + +CREATE TABLE your_model ENGINE = Memory AS SELECT +stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) +AS state FROM train_data; +``` + +在这里,我们还需要将数据插入到 `train_data` 表。参数的数量不是固定的,它只取决于传入 `linearRegressionState` 的参数数量。它们都必须是数值。 +注意,目标值(我们想学习预测的)列作为第一个参数插入。 + +**2.** 预测 + +在将状态保存到表中之后,我们可以多次使用它进行预测,甚至与其他状态合并,创建新的更好的模型。 + +``` sql +WITH (SELECT state FROM your_model) AS model SELECT +evalMLMethod(model, param1, param2) FROM test_data +``` + +查询将返回一列预测值。注意,`evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象, 接下来是特征列。 + +`test_data` 是一个类似 `train_data` 的表 但可能不包含目标值。 + +### 注 {#agg_functions-stochasticlinearregression-notes} + +1. 要合并两个模型,用户可以创建这样的查询: + `sql SELECT state1 + state2 FROM your_models` + 其中 `your_models` 表包含这两个模型。此查询将返回新的 `AggregateFunctionState` 对象。 + +2. 
如果没有使用 `-State` 组合器,用户可以为自己的目的获取所创建模型的权重,而不保存模型 。 + `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` + 这样的查询将拟合模型,并返回其权重——首先是权重,对应模型的参数,最后一个是偏差。 所以在上面的例子中,查询将返回一个具有3个值的列。 + +**参见** + +- [随机指标逻辑回归](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md new file mode 100644 index 00000000000..5ed2fb74b89 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -0,0 +1,56 @@ +--- +toc_priority: 222 +--- + +# stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} + +该函数实现随机逻辑回归。 它可以用于二进制分类问题,支持与stochasticLinearRegression相同的自定义参数,并以相同的方式工作。 + +### 参数 {#agg_functions-stochasticlogisticregression-parameters} + +参数与stochasticLinearRegression中的参数完全相同: +`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. +欲了解更多信息,参见 [参数] (#agg_functions-stochasticlinearregression-parameters). + +**语法** + +``` sql +stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') +``` + +**1.** 拟合 + + + + 参考[stochasticLinearRegression](#stochasticlinearregression-usage-fitting) `拟合` 章节文档。 + + 预测标签的取值范围为\[-1, 1\] + +**2.** 预测 + + + + 使用已经保存的state我们可以预测标签为 `1` 的对象的概率。 + ``` sql + WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) FROM test_data + ``` + + 查询结果返回一个列的概率。注意 `evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象,接下来的参数是列的特性。 + + 我们也可以设置概率的范围, 这样需要给元素指定不同的标签。 + + ``` sql + SELECT ans < 1.1 AND ans > 0.5 FROM + (WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) AS ans FROM test_data) + ``` + + 结果是标签。 + + `test_data` 是一个像 `train_data` 一样的表,但是不包含目标值。 + +**参见** + +- [随机指标线性回归](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) +- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md b/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md new file mode 100644 index 00000000000..6d84e728330 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md @@ -0,0 +1,64 @@ +--- +toc_priority: 300 +toc_title: studentTTest +--- + +# studentTTest {#studentttest} + +对两个总体的样本应用t检验。 + +**语法** + +``` sql +studentTTest(sample_data, sample_index) +``` + +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是总体的均值相等。假设为方差相等的正态分布。 + +**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 [Decimal](../../../sql-reference/data-types/decimal.md)。 +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md)。 + +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出的t统计量。 [Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 21.1 │ 0 │ +│ 21.9 │ 1 │ +│ 21.7 │ 0 
│ +│ 19.9 │ 1 │ +│ 21.8 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT studentTTest(sample_data, sample_index) FROM student_ttest; +``` + +结果: + +``` text +┌─studentTTest(sample_data, sample_index)───┐ +│ (-0.21739130434783777,0.8385421208415731) │ +└───────────────────────────────────────────┘ +``` + +**参见** + +- [Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test) +- [welchTTest function](../../../sql-reference/aggregate-functions/reference/welchttest.md#welchttest) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/sum.md b/docs/zh/sql-reference/aggregate-functions/reference/sum.md new file mode 100644 index 00000000000..049c491d2a5 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/sum.md @@ -0,0 +1,8 @@ +--- +toc_priority: 4 +--- + +# sum {#agg_function-sum} + +计算总和。 +只适用于数字。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/summap.md b/docs/zh/sql-reference/aggregate-functions/reference/summap.md new file mode 100644 index 00000000000..4a92a1ea1b0 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/summap.md @@ -0,0 +1,52 @@ +--- +toc_priority: 141 +--- + +# sumMap {#agg_functions-summap} + +**语法** + +``` sql +sumMap(key, value) +或 +sumMap(Tuple(key, value)) +``` + +根据 `key` 数组中指定的键对 `value` 数组进行求和。 + +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的一个元组: 排好序的 `key` 和对应 `key` 的 `value` 之和。 + +示例: + +``` sql +CREATE TABLE sum_map( + date Date, + timeslot DateTime, + statusMap Nested( + status UInt16, + requests UInt64 + ), + statusMapTuple Tuple(Array(Int32), Array(Int32)) +) ENGINE = Log; +INSERT INTO sum_map VALUES + ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); + +SELECT + timeslot, + sumMap(statusMap.status, statusMap.requests), + sumMap(statusMapTuple) +FROM sum_map +GROUP BY timeslot +``` + +``` text +┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ +│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ +│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ +└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md new file mode 100644 index 00000000000..0fd5af519da --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -0,0 +1,9 @@ +--- +toc_priority: 140 +--- + +# sumWithOverflow {#sumwithoverflowx} + +使用与输入参数相同的数据类型计算结果的数字总和。如果总和超过此数据类型的最大值,则使用溢出进行计算。 + +只适用于数字。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topk.md b/docs/zh/sql-reference/aggregate-functions/reference/topk.md new file mode 100644 index 00000000000..69e006d1a6c --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/topk.md @@ -0,0 +1,43 @@ +--- +toc_priority: 108 +--- + +# topK {#topk} + +返回指定列中近似最常见值的数组。 生成的数组按值的近似频率降序排序(而不是值本身)。 + +实现了[过滤节省空间](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf)算法, 
使用基于reduce-and-combine的算法,借鉴[并行节省空间](https://arxiv.org/pdf/1401.0702.pdf)。 + +**语法** + +``` sql +topK(N)(x) +``` +此函数不提供保证的结果。 在某些情况下,可能会发生错误,并且可能会返回不是最高频的值。 + +我们建议使用 `N < 10` 值,`N` 值越大,性能越低。最大值 `N = 65536`。 + +**参数** + +- `N` — 要返回的元素数。 + +如果省略该参数,则使用默认值10。 + +**参数** + +- `x` – (要计算频次的)值。 + +**示例** + +就拿 [OnTime](../../../getting-started/example-datasets/ontime.md) 数据集来说,选择`AirlineID` 列中出现最频繁的三个。 + +``` sql +SELECT topK(3)(AirlineID) AS res +FROM ontime +``` + +``` text +┌─res─────────────────┐ +│ [19393,19790,19805] │ +└─────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md new file mode 100644 index 00000000000..66b436f42bb --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md @@ -0,0 +1,42 @@ +--- +toc_priority: 109 +--- + +# topKWeighted {#topkweighted} + +类似于 `topK` 但需要一个整数类型的附加参数 - `weight`。 每个输入都被记入 `weight` 次频率计算。 + +**语法** + +``` sql +topKWeighted(N)(x, weight) +``` + +**参数** + +- `N` — 要返回的元素数。 + +**参数** + +- `x` – (要计算频次的)值。 +- `weight` — 权重。 [UInt8](../../../sql-reference/data-types/int-uint.md)类型。 + +**返回值** + +返回具有最大近似权重总和的值数组。 + +**示例** + +查询: + +``` sql +SELECT topKWeighted(10)(number, number) FROM numbers(1000) +``` + +结果: + +``` text +┌─topKWeighted(10)(number, number)──────────┐ +│ [999,998,997,996,995,994,993,992,991,990] │ +└───────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniq.md b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md new file mode 100644 index 00000000000..2cf020d052b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md @@ -0,0 +1,42 @@ +--- +toc_priority: 190 +--- + +# uniq {#agg_function-uniq} + +计算参数的不同值的近似数量。 + +**语法** + +``` sql +uniq(x[, ...]) +``` + +**参数** + +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`, 或数字类型。 + +**返回值** + +- [UInt64](../../../sql-reference/data-types/int-uint.md) 类型数值。 + +**实现细节** + +功能: + +- 计算聚合中所有参数的哈希值,然后在计算中使用它。 + +- 使用自适应采样算法。 对于计算状态,该函数使用最多65536个元素哈希值的样本。 + + 这个算法是非常精确的,并且对于CPU来说非常高效。如果查询包含一些这样的函数,那和其他聚合函数相比 `uniq` 将是几乎一样快。 + +- 确定性地提供结果(它不依赖于查询处理顺序)。 + +我们建议在几乎所有情况下使用此功能。 + +**参见** + +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md new file mode 100644 index 00000000000..26a681ed5af --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -0,0 +1,52 @@ +--- +toc_priority: 192 +--- + +# uniqCombined {#agg_function-uniqcombined} + +计算不同参数值的近似数量。 + +**语法** +``` sql +uniqCombined(HLL_precision)(x[, ...]) +``` +该 `uniqCombined` 函数是计算不同值数量的不错选择。 + +**参数** + +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 + +`HLL_precision` 是以2为底的单元格数的对数 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog)。可选,您可以将该函数用作 `uniqCombined(x[, ...])`。 `HLL_precision` 的默认值是17,这是有效的96KiB的空间(2^17个单元,每个6比特)。 + +**返回值** + +- 
一个[UInt64](../../../sql-reference/data-types/int-uint.md)类型的数字。 + +**实现细节** + +功能: + +- 为聚合中的所有参数计算哈希(`String`类型用64位哈希,其他32位),然后在计算中使用它。 + +- 使用三种算法的组合:数组、哈希表和包含错误修正表的HyperLogLog。 + + + 少量的不同的值,使用数组。 值再多一些,使用哈希表。对于大量的数据来说,使用HyperLogLog,HyperLogLog占用一个固定的内存空间。 + +- 确定性地提供结果(它不依赖于查询处理顺序)。 + +!!! note "注" + 由于它对非 `String` 类型使用32位哈希,对于基数显著大于`UINT_MAX` ,结果将有非常高的误差(误差将在几百亿不同值之后迅速提高), 因此这种情况,你应该使用 [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) + +相比于 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 函数, 该 `uniqCombined`: + +- 消耗内存要少几倍。 +- 计算精度高出几倍。 +- 通常具有略低的性能。 在某些情况下, `uniqCombined` 可以表现得比 `uniq` 好,例如,使用通过网络传输大量聚合状态的分布式查询。 + +**参见** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md new file mode 100644 index 00000000000..3c07791450d --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -0,0 +1,7 @@ +--- +toc_priority: 193 +--- + +# uniqCombined64 {#agg_function-uniqcombined64} + +和 [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)一样, 但对于所有数据类型使用64位哈希。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md new file mode 100644 index 00000000000..bdd60ca1d30 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md @@ -0,0 +1,26 @@ +--- +toc_priority: 191 +--- + +# uniqExact {#agg_function-uniqexact} + +计算不同参数值的准确数目。 + +**语法** + +``` sql +uniqExact(x[, ...]) +``` +如果你绝对需要一个确切的结果,使用 `uniqExact` 函数。 否则使用 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 函数。 + +`uniqExact` 函数比 `uniq` 使用更多的内存,因为状态的大小随着不同值的数量的增加而无界增长。 + +**参数** + +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 + +**参见** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqhll12) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md new file mode 100644 index 00000000000..7521065b954 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -0,0 +1,41 @@ +--- +toc_priority: 194 +--- + +# uniqHLL12 {#agg_function-uniqhll12} + +计算不同参数值的近似数量,使用 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) 算法。 + +**语法** + +``` sql +uniqHLL12(x[, ...]) +``` + +**参数** + +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 + +**返回值** + +- 一个[UInt64](../../../sql-reference/data-types/int-uint.md)类型的数字。 + +**实现细节** + +功能: + +- 计算聚合中所有参数的哈希值,然后在计算中使用它。 + +- 使用 HyperLogLog 算法来近似不同参数值的数量。 + + 使用2^12个5比特单元。 状态的大小略大于2.5KB。 对于小数据集(<10K元素),结果不是很准确(误差高达10%)。 但是,
对于高基数数据集(10K-100M),结果相当准确,最大误差约为1.6%。Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). + +- 提供确定结果(它不依赖于查询处理顺序)。 + +我们不建议使用此函数。 在大多数情况下, 使用 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 或 [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) 函数。 + +**参见** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md new file mode 100644 index 00000000000..4dca8efde38 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md @@ -0,0 +1,12 @@ +--- +toc_priority: 32 +--- + +# varPop(x) {#varpopx} + +计算 `Σ((x - x̅)^2) / n`,这里 `n` 是样本大小, `x̅` 是 `x` 的平均值。 + +换句话说,计算一组数据的离差。 返回 `Float64`。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md new file mode 100644 index 00000000000..c83ee7e24d2 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md @@ -0,0 +1,15 @@ +--- +toc_priority: 33 +--- + +# varSamp {#varsamp} + +计算 `Σ((x - x̅)^2) / (n - 1)`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 + +它表示随机变量的方差的无偏估计,如果传递的值形成其样本。 + +返回 `Float64`。 当 `n <= 1`,返回 `+∞`。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + diff --git a/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md b/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md new file mode 100644 index 00000000000..44b8e81d4d9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md @@ -0,0 +1,62 @@ +--- +toc_priority: 301 +toc_title: welchTTest +--- + +# welchTTest {#welchttest} + +对两个总体的样本应用 Welch t检验。 + +**语法** + +``` sql +welchTTest(sample_data, sample_index) +``` +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是群体的均值相等。假设为正态分布。总体可能具有不相等的方差。 + +**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md). 
+ +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出的t统计量。 [Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 22.1 │ 0 │ +│ 21.9 │ 0 │ +│ 18.9 │ 1 │ +│ 20.3 │ 1 │ +│ 19 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT welchTTest(sample_data, sample_index) FROM welch_ttest; +``` + +结果: + +``` text +┌─welchTTest(sample_data, sample_index)─────┐ +│ (2.7988719532211235,0.051807360348581945) │ +└───────────────────────────────────────────┘ +``` + +**参见** + +- [Welch's t-test](https://en.wikipedia.org/wiki/Welch%27s_t-test) +- [studentTTest function](../../../sql-reference/aggregate-functions/reference/studentttest.md#studentttest) diff --git a/docs/zh/sql-reference/data-types/index.md b/docs/zh/sql-reference/data-types/index.md index 70aa976cb11..c7f5c63e357 100644 --- a/docs/zh/sql-reference/data-types/index.md +++ b/docs/zh/sql-reference/data-types/index.md @@ -1,5 +1,12 @@ +--- +toc_folder_title: 数据类型 +toc_priority: 37 +toc_title: 简介 +--- + # 数据类型 {#data_types} ClickHouse 可以在数据表中存储多种数据类型。 本节描述 ClickHouse 支持的数据类型,以及使用或者实现它们时(如果有的话)的注意事项。 +你可以在系统表 [system.data_type_families](../../operations/system-tables/data_type_families.md#system_tables-data_type_families) 中检查数据类型名称是否区分大小写。 diff --git a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md index e827adb817e..38d7699c176 100644 --- a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md @@ -1,26 +1,31 @@ ---- -machine_translated: true -machine_translated_rev: 71d72c1f237f4a553fe91ba6c6c633e81a49e35b ---- - # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值,而不将其完整状态存储为 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 有 此优化可应用于具有以下属性的函数:应用函数的结果 `f` 到行集 `S1 UNION ALL S2` 可以通过应用来获得 `f` 行的部分单独设置,然后再次应用 `f` 到结果: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. 
此属性保证部分聚合结果足以计算组合结果,因此我们不必存储和处理任何额外的数据。 +`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 支持以下聚合函数: -- [`any`](../../sql-reference/aggregate-functions/reference.md#agg_function-any) -- [`anyLast`](../../sql-reference/aggregate-functions/reference.md#anylastx) -- [`min`](../../sql-reference/aggregate-functions/reference.md#agg_function-min) -- [`max`](../../sql-reference/aggregate-functions/reference.md#agg_function-max) -- [`sum`](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) -- [`groupBitAnd`](../../sql-reference/aggregate-functions/reference.md#groupbitand) -- [`groupBitOr`](../../sql-reference/aggregate-functions/reference.md#groupbitor) -- [`groupBitXor`](../../sql-reference/aggregate-functions/reference.md#groupbitxor) -- [`groupArrayArray`](../../sql-reference/aggregate-functions/reference.md#agg_function-grouparray) -- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference.md#groupuniqarrayx-groupuniqarraymax-sizex) +- [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) +- [`anyLast`](../../sql-reference/aggregate-functions/reference/anylast.md#anylastx) +- [`min`](../../sql-reference/aggregate-functions/reference/min.md#agg_function-min) +- [`max`](../../sql-reference/aggregate-functions/reference/max.md#agg_function-max) +- [`sum`](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum) +- [`sumWithOverflow`](../../sql-reference/aggregate-functions/reference/sumwithoverflow.md#sumwithoverflowx) +- [`groupBitAnd`](../../sql-reference/aggregate-functions/reference/groupbitand.md#groupbitand) +- [`groupBitOr`](../../sql-reference/aggregate-functions/reference/groupbitor.md#groupbitor) +- [`groupBitXor`](../../sql-reference/aggregate-functions/reference/groupbitxor.md#groupbitxor) +- [`groupArrayArray`](../../sql-reference/aggregate-functions/reference/grouparray.md#agg_function-grouparray) +- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [`sumMap`](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap) +- [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap) +- [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) +- [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md) +- [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md) -的值 `SimpleAggregateFunction(func, Type)` 看起来和存储方式相同 `Type`,所以你不需要应用函数 `-Merge`/`-State` 后缀。 `SimpleAggregateFunction` 具有比更好的性能 `AggregateFunction` 具有相同的聚合功能。 + +!!! note "注" + `SimpleAggregateFunction(func, Type)` 的值外观和存储方式于 `Type` 相同, 所以你不需要应用带有 `-Merge`/`-State` 后缀的函数。 + + `SimpleAggregateFunction` 的性能优于具有相同聚合函数的 `AggregateFunction` 。 **参数** @@ -30,11 +35,7 @@ machine_translated_rev: 71d72c1f237f4a553fe91ba6c6c633e81a49e35b **示例** ``` sql -CREATE TABLE t -( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... 
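
作为补充,下面给出一个最小的端到端使用草稿:建表语句与下方更新后的示例一致,插入的数值仅为演示用的假设。`SimpleAggregateFunction` 列的值可以像普通 `Double` 一样直接写入;主键相同的行会在后台合并时按 `sum` 聚合,因此查询时仍建议显式聚合,以得到与合并进度无关的结果。

``` sql
-- 建表语句与下方示例相同
CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double))
ENGINE = AggregatingMergeTree ORDER BY id;

-- 直接写入普通数值(演示用的假设数据)
INSERT INTO simple VALUES (1, 1.5), (1, 2.5), (2, 4);

-- 显式聚合,结果与后台合并是否已经发生无关:id=1 得到 4,id=2 得到 4
SELECT id, sum(val) AS total FROM simple GROUP BY id ORDER BY id;
```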
+CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` [原始文章](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) diff --git a/docs/zh/sql-reference/data-types/uuid.md b/docs/zh/sql-reference/data-types/uuid.md index 2ff1e391e81..b454484003c 100644 --- a/docs/zh/sql-reference/data-types/uuid.md +++ b/docs/zh/sql-reference/data-types/uuid.md @@ -1,21 +1,19 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 46 toc_title: UUID --- # UUID {#uuid-data-type} -通用唯一标识符(UUID)是用于标识记录的16字节数。 有关UUID的详细信息,请参阅 [维基百科](https://en.wikipedia.org/wiki/Universally_unique_identifier). +通用唯一标识符(UUID)是一个16字节的数字,用于标识记录。有关UUID的详细信息, 参见[维基百科](https://en.wikipedia.org/wiki/Universally_unique_identifier)。 -UUID类型值的示例如下所示: +UUID类型值的示例如下: ``` text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` -如果在插入新记录时未指定UUID列值,则UUID值将用零填充: +如果在插入新记录时未指定UUID列的值,则UUID值将用零填充: ``` text 00000000-0000-0000-0000-000000000000 @@ -23,13 +21,13 @@ UUID类型值的示例如下所示: ## 如何生成 {#how-to-generate} -要生成UUID值,ClickHouse提供了 [generateuidv4](../../sql-reference/functions/uuid-functions.md) 功能。 +要生成UUID值,ClickHouse提供了 [generateuidv4](../../sql-reference/functions/uuid-functions.md) 函数。 ## 用法示例 {#usage-example} **示例1** -此示例演示如何创建具有UUID类型列的表并将值插入到表中。 +这个例子演示了创建一个具有UUID类型列的表,并在表中插入一个值。 ``` sql CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog @@ -51,7 +49,7 @@ SELECT * FROM t_uuid **示例2** -在此示例中,插入新记录时未指定UUID列值。 +在这个示例中,插入新记录时未指定UUID列的值。 ``` sql INSERT INTO t_uuid (y) VALUES ('Example 2') @@ -70,8 +68,7 @@ SELECT * FROM t_uuid ## 限制 {#restrictions} -UUID数据类型仅支持以下功能 [字符串](string.md) 数据类型也支持(例如, [min](../../sql-reference/aggregate-functions/reference.md#agg_function-min), [max](../../sql-reference/aggregate-functions/reference.md#agg_function-max),和 [计数](../../sql-reference/aggregate-functions/reference.md#agg_function-count)). +UUID数据类型只支持 [字符串](../../sql-reference/data-types/string.md) 数据类型也支持的函数(比如, [min](../../sql-reference/aggregate-functions/reference/min.md#agg_function-min), [max](../../sql-reference/aggregate-functions/reference/max.md#agg_function-max), 和 [count](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count))。 -算术运算不支持UUID数据类型(例如, [abs](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs))或聚合函数,例如 [sum](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) 和 [avg](../../sql-reference/aggregate-functions/reference.md#agg_function-avg). +算术运算不支持UUID数据类型(例如, [abs](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs))或聚合函数,例如 [sum](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum) 和 [avg](../../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg). -[原始文章](https://clickhouse.tech/docs/en/data_types/uuid/) diff --git a/docs/zh/sql-reference/functions/other-functions.md b/docs/zh/sql-reference/functions/other-functions.md index b17a5e89332..c58c4bd1510 100644 --- a/docs/zh/sql-reference/functions/other-functions.md +++ b/docs/zh/sql-reference/functions/other-functions.md @@ -477,6 +477,103 @@ FROM 1 rows in set. Elapsed: 0.002 sec. 
+ +## indexHint {#indexhint} +输出符合索引选择范围内的所有数据,同时不实用参数中的表达式进行过滤。 + +传递给函数的表达式参数将不会被计算,但ClickHouse使用参数中的表达式进行索引过滤。 + +**返回值** + +- 1。 + +**示例** + +这是一个包含[ontime](../../getting-started/example-datasets/ontime.md)测试数据集的测试表。 + +``` +SELECT count() FROM ontime + +┌─count()─┐ +│ 4276457 │ +└─────────┘ +``` + +该表使用`(FlightDate, (Year, FlightDate))`作为索引。 + +对该表进行如下的查询: + +``` +:) SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k + +SELECT + FlightDate AS k, + count() +FROM ontime +GROUP BY k +ORDER BY k ASC + +┌──────────k─┬─count()─┐ +│ 2017-01-01 │ 13970 │ +│ 2017-01-02 │ 15882 │ +........................ +│ 2017-09-28 │ 16411 │ +│ 2017-09-29 │ 16384 │ +│ 2017-09-30 │ 12520 │ +└────────────┴─────────┘ + +273 rows in set. Elapsed: 0.072 sec. Processed 4.28 million rows, 8.55 MB (59.00 million rows/s., 118.01 MB/s.) +``` + +在这个查询中,由于没有使用索引,所以ClickHouse将处理整个表的所有数据(`Processed 4.28 million rows`)。使用下面的查询尝试使用索引进行查询: + +``` +:) SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k + +SELECT + FlightDate AS k, + count() +FROM ontime +WHERE k = '2017-09-15' +GROUP BY k +ORDER BY k ASC + +┌──────────k─┬─count()─┐ +│ 2017-09-15 │ 16428 │ +└────────────┴─────────┘ + +1 rows in set. Elapsed: 0.014 sec. Processed 32.74 thousand rows, 65.49 KB (2.31 million rows/s., 4.63 MB/s.) +``` + +在最后一行的显示中,通过索引ClickHouse处理的行数明显减少(`Processed 32.74 thousand rows`)。 + +现在将表达式`k = '2017-09-15'`传递给`indexHint`函数: + +``` +:) SELECT FlightDate AS k, count() FROM ontime WHERE indexHint(k = '2017-09-15') GROUP BY k ORDER BY k + +SELECT + FlightDate AS k, + count() +FROM ontime +WHERE indexHint(k = '2017-09-15') +GROUP BY k +ORDER BY k ASC + +┌──────────k─┬─count()─┐ +│ 2017-09-14 │ 7071 │ +│ 2017-09-15 │ 16428 │ +│ 2017-09-16 │ 1077 │ +│ 2017-09-30 │ 8167 │ +└────────────┴─────────┘ + +4 rows in set. Elapsed: 0.004 sec. Processed 32.74 thousand rows, 65.49 KB (8.97 million rows/s., 17.94 MB/s.) +``` + +对于这个请求,根据ClickHouse显示ClickHouse与上一次相同的方式应用了索引(`Processed 32.74 thousand rows`)。但是,最终返回的结果集中并没有根据`k = '2017-09-15'`表达式进行过滤结果。 + +由于ClickHouse中使用稀疏索引,因此在读取范围时(本示例中为相邻日期),"额外"的数据将包含在索引结果中。使用`indexHint`函数可以查看到它们。 + ## 复制 {#replicate} 使用单个值填充一个数组。 diff --git a/docs/zh/sql-reference/operators/in.md b/docs/zh/sql-reference/operators/in.md index a16b75fe555..f39bd02c309 100644 --- a/docs/zh/sql-reference/operators/in.md +++ b/docs/zh/sql-reference/operators/in.md @@ -18,7 +18,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... 如果左侧是索引中的单列,而右侧是一组常量,则系统将使用索引处理查询。 -Don't list too many values explicitly (i.e. millions). 
If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”),然后使用子查询。 +请不要列举太多具体的常量 (比方说 几百万条)。如果数据集非常大,请把它放在一张临时表里(例如,参考章节[用于查询处理的外部数据](../../engines/table-engines/special/external-data.md)),然后使用子查询。 运算符的右侧可以是一组常量表达式、一组带有常量表达式的元组(如上面的示例所示),或括号中的数据库表或SELECT子查询的名称。 diff --git a/docs/zh/sql-reference/statements/index.md b/docs/zh/sql-reference/statements/index.md index 1c5f4e9a7ef..ab080584c66 100644 --- a/docs/zh/sql-reference/statements/index.md +++ b/docs/zh/sql-reference/statements/index.md @@ -1,7 +1,7 @@ --- machine_translated: true machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "\u53D1\u8A00" +toc_folder_title: "\u8BED\u53E5" toc_priority: 31 --- diff --git a/docs/zh/sql-reference/table-functions/file.md b/docs/zh/sql-reference/table-functions/file.md index 4d694cb6729..84fddada867 100644 --- a/docs/zh/sql-reference/table-functions/file.md +++ b/docs/zh/sql-reference/table-functions/file.md @@ -1,23 +1,25 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 37 -toc_title: "\u6587\u4EF6" +toc_title: file --- -# 文件 {#file} +# file {#file} -从文件创建表。 此表函数类似于 [url](url.md) 和 [hdfs](hdfs.md) 一些的。 +从文件创建表。 此表函数类似于 [url](../../sql-reference/table-functions/url.md) 和 [hdfs](../../sql-reference/table-functions/hdfs.md)。 + +`file` 函数可用于对[File](../../engines/table-engines/special/file.md) 表中的数据进行 `SELECT` 和 `INSERT` 查询。 + +**语法** ``` sql file(path, format, structure) ``` -**输入参数** +**参数** -- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). 只读模式下的globs后的文件支持路径: `*`, `?`, `{abc,def}` 和 `{N..M}` 哪里 `N`, `M` — numbers, \``'abc', 'def'` — strings. -- `format` — The [格式](../../interfaces/formats.md#formats) 的文件。 -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. 
+- `path` — [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path)中文件的相对路径。在只读模式下,文件路径支持以下通配符: `*`, `?`, `{abc,def}` 和 `{N..M}`,其中 `N`, `M` 是数字, \``'abc', 'def'` 是字符串。 +- `format` —文件的[格式](../../interfaces/formats.md#formats)。 +- `structure` — 表的结构。格式 `'column1_name column1_type, column2_name column2_type, ...'`。 **返回值** @@ -25,7 +27,7 @@ file(path, format, structure) **示例** -设置 `user_files_path` 和文件的内容 `test.csv`: +设置 `user_files_path` 和文件 `test.csv` 的内容: ``` bash $ grep user_files_path /etc/clickhouse-server/config.xml @@ -37,12 +39,10 @@ $ cat /var/lib/clickhouse/user_files/test.csv 78,43,45 ``` -表从`test.csv` 并从中选择前两行: +从 `test.csv` 中的表中获取数据,并从表中选择前两行: ``` sql -SELECT * -FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2; ``` ``` text @@ -52,25 +52,40 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` +从CSV文件获取包含3列 [UInt32](../../sql-reference/data-types/int-uint.md) 类型的表的前10行: + ``` sql --- getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file -SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10; ``` -**路径中的水珠** +将文件中的数据插入表中: -多个路径组件可以具有globs。 对于正在处理的文件应该存在并匹配到整个路径模式(不仅后缀或前缀)。 +``` sql +INSERT INTO FUNCTION file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') VALUES (1, 2, 3), (3, 2, 1); +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32'); +``` -- `*` — Substitutes any number of any characters except `/` 包括空字符串。 -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` -建筑与 `{}` 类似于 [远程表功能](../../sql-reference/table-functions/remote.md)). +**路径中的通配符** + +多个路径组件可以具有通配符。 对于要处理的文件必须存在并与整个路径模式匹配(不仅后缀或前缀)。 + +- `*` — 替换任意数量的任何字符,除了 `/` 包括空字符串。 +- `?` — 替换任何单个字符。 +- `{some_string,another_string,yet_another_one}` — 替换任何字符串 `'some_string', 'another_string', 'yet_another_one'`。 +- `{N..M}` — 替换范围从N到M的任何数字(包括两个边界)。 + +使用 `{}` 的构造类似于 [remote](../../sql-reference/table-functions/remote.md))表函数。 **示例** -1. 假设我们有几个具有以下相对路径的文件: +假设我们有几个文件,这些文件具有以下相对路径: - ‘some_dir/some_file_1’ - ‘some_dir/some_file_2’ @@ -79,18 +94,14 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U - ‘another_dir/some_file_2’ - ‘another_dir/some_file_3’ -1. 查询这些文件中的行数: - - +查询这些文件中的行数: ``` sql SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') ``` -1. 查询这两个目录的所有文件中的行数: - - +查询这两个目录的所有文件中的行数: ``` sql SELECT count(*) @@ -98,11 +109,11 @@ FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` !!! warning "警告" - 如果您的文件列表包含带前导零的数字范围,请单独使用带大括号的构造或使用 `?`. + 如果您的文件列表包含带前导零的数字范围,请对每个数字分别使用带有大括号的结构或使用 `?`。 **示例** -从名为 `file000`, `file001`, … , `file999`: +从名为 `file000`, `file001`, … , `file999`的文件中查询数据: ``` sql SELECT count(*) @@ -111,8 +122,8 @@ FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') ## 虚拟列 {#virtual-columns} -- `_path` — Path to the file. 
-- `_file` — Name of the file. +- `_path` — 文件路径。 +- `_file` — 文件名称。 **另请参阅** diff --git a/docs/zh/sql-reference/table-functions/generate.md b/docs/zh/sql-reference/table-functions/generate.md index 1b535161acb..b9b02793cf3 100644 --- a/docs/zh/sql-reference/table-functions/generate.md +++ b/docs/zh/sql-reference/table-functions/generate.md @@ -1,15 +1,13 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 47 toc_title: generateRandom --- # generateRandom {#generaterandom} -使用给定的模式生成随机数据。 -允许用数据填充测试表。 -支持可以存储在表中的所有数据类型,除了 `LowCardinality` 和 `AggregateFunction`. +生成具用给定的模式的随机数据。 +允许用数据来填充测试表。 +支持所有可以存储在表中的数据类型, `LowCardinality` 和 `AggregateFunction`除外。 ``` sql generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); @@ -17,15 +15,15 @@ generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_stri **参数** -- `name` — Name of corresponding column. -- `TypeName` — Type of corresponding column. -- `max_array_length` — Maximum array length for all generated arrays. Defaults to `10`. -- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. -- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated. +- `name` — 对应列的名称。 +- `TypeName` — 对应列的类型。 +- `max_array_length` — 生成数组的最大长度。 默认为10。 +- `max_string_length` — 生成字符串的最大长度。 默认为10。 +- `random_seed` — 手动指定随机种子以产生稳定的结果。 如果为NULL-种子是随机生成的。 **返回值** -具有请求架构的表对象。 +具有请求模式的表对象。 ## 用法示例 {#usage-example} diff --git a/docs/zh/sql-reference/table-functions/hdfs.md b/docs/zh/sql-reference/table-functions/hdfs.md index 112c88450e2..715d9671dc8 100644 --- a/docs/zh/sql-reference/table-functions/hdfs.md +++ b/docs/zh/sql-reference/table-functions/hdfs.md @@ -1,13 +1,11 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 45 toc_title: hdfs --- # hdfs {#hdfs} -从HDFS中的文件创建表。 此表函数类似于 [url](url.md) 和 [文件](file.md) 一些的。 +根据HDFS中的文件创建表。 该表函数类似于 [url](url.md) 和 [文件](file.md)。 ``` sql hdfs(URI, format, structure) @@ -15,9 +13,9 @@ hdfs(URI, format, structure) **输入参数** -- `URI` — The relative URI to the file in HDFS. Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` 和 `{N..M}` 哪里 `N`, `M` — numbers, \``'abc', 'def'` — strings. -- `format` — The [格式](../../interfaces/formats.md#formats) 的文件。 -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. +- `URI` — HDFS中文件的相对URI。 在只读模式下,文件路径支持以下通配符: `*`, `?`, `{abc,def}` 和 `{N..M}` ,其中 `N`, `M` 是数字, \``'abc', 'def'` 是字符串。 +- `format` — 文件的[格式](../../interfaces/formats.md#formats)。 +- `structure` — 表的结构。格式 `'column1_name column1_type, column2_name column2_type, ...'`。 **返回值** @@ -25,7 +23,7 @@ hdfs(URI, format, structure) **示例** -表从 `hdfs://hdfs1:9000/test` 并从中选择前两行: +表来自 `hdfs://hdfs1:9000/test` 并从中选择前两行: ``` sql SELECT * @@ -40,20 +38,20 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` -**路径中的水珠** +**路径中的通配符** -多个路径组件可以具有globs。 对于正在处理的文件应该存在并匹配到整个路径模式(不仅后缀或前缀)。 +多个路径组件可以具有通配符。 对于要处理的文件必须存在并与整个路径模式匹配(不仅后缀或前缀)。 -- `*` — Substitutes any number of any characters except `/` 包括空字符串。 -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. 
+- `*` — 替换任意数量的任何字符,除了 `/` 包括空字符串。 +- `?` — 替换任何单个字符。 +- `{some_string,another_string,yet_another_one}` — 替换任何字符串 `'some_string', 'another_string', 'yet_another_one'`。 +- `{N..M}` — 替换范围从N到M的任何数字(包括两个边界)。 -建筑与 `{}` 类似于 [远程表功能](../../sql-reference/table-functions/remote.md)). +使用 `{}` 的构造类似于 [remote](../../sql-reference/table-functions/remote.md))表函数。 **示例** -1. 假设我们在HDFS上有几个具有以下Uri的文件: +1. 假设我们在HDFS上有几个带有以下URI的文件: - ‘hdfs://hdfs1:9000/some_dir/some_file_1’ - ‘hdfs://hdfs1:9000/some_dir/some_file_2’ @@ -62,7 +60,7 @@ LIMIT 2 - ‘hdfs://hdfs1:9000/another_dir/some_file_2’ - ‘hdfs://hdfs1:9000/another_dir/some_file_3’ -1. 查询这些文件中的行数: +2. 查询这些文件中的行数: @@ -71,7 +69,7 @@ SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') ``` -1. 查询这两个目录的所有文件中的行数: +3. 查询这两个目录的所有文件中的行数: @@ -81,11 +79,11 @@ FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value U ``` !!! warning "警告" - 如果您的文件列表包含带前导零的数字范围,请单独使用带大括号的构造或使用 `?`. + 如果您的文件列表包含带前导零的数字范围,请对每个数字分别使用带有大括号的结构或使用 `?`。 **示例** -从名为 `file000`, `file001`, … , `file999`: +从名为 `file000`, `file001`, … , `file999`的文件中查询数据: ``` sql SELECT count(*) @@ -94,8 +92,8 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin ## 虚拟列 {#virtual-columns} -- `_path` — Path to the file. -- `_file` — Name of the file. +- `_path` — 文件路径。 +- `_file` — 文件名称。 **另请参阅** diff --git a/docs/zh/sql-reference/table-functions/index.md b/docs/zh/sql-reference/table-functions/index.md index d9eadb9c592..20a335de0fc 100644 --- a/docs/zh/sql-reference/table-functions/index.md +++ b/docs/zh/sql-reference/table-functions/index.md @@ -1,38 +1,36 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd -toc_folder_title: "\u8868\u51FD\u6570" +toc_folder_title: 表函数 toc_priority: 34 toc_title: "\u5BFC\u8A00" --- # 表函数 {#table-functions} -表函数是构造表的方法。 +表函数是用来构造表的方法。 -您可以使用表函数: +您可以在以下位置使用表函数: -- [FROM](../statements/select/from.md) 《公约》条款 `SELECT` 查询。 +- `SELECT` 查询的[FROM](../../sql-reference/statements/select/from.md)子句。 - The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. + 创建临时表的方法,该临时表仅在当前查询中可用。当查询完成后,该临时表将被删除。 -- [创建表为\](../statements/create.md#create-table-query) 查询。 +- [CREATE TABLE AS \](../statements/create.md#create-table-query) 查询。 - It's one of the methods of creating a table. + 这是创建表的方法之一。 !!! 
warning "警告" - 你不能使用表函数,如果 [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) 设置被禁用。 + 如果 [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) 设置被禁用,则不能使用表函数。 -| 功能 | 产品描述 | -|--------------------|--------------------------------------------------------------------------------------------------------| -| [文件](file.md) | 创建一个 [文件](../../engines/table-engines/special/file.md)-发动机表。 | -| [合并](merge.md) | 创建一个 [合并](../../engines/table-engines/special/merge.md)-发动机表。 | -| [数字](numbers.md) | 创建一个包含整数填充的单列的表。 | -| [远程](remote.md) | 允许您访问远程服务器,而无需创建 [分布](../../engines/table-engines/special/distributed.md)-发动机表。 | -| [url](url.md) | 创建一个 [Url](../../engines/table-engines/special/url.md)-发动机表。 | -| [mysql](mysql.md) | 创建一个 [MySQL](../../engines/table-engines/integrations/mysql.md)-发动机表。 | -| [jdbc](jdbc.md) | 创建一个 [JDBC](../../engines/table-engines/integrations/jdbc.md)-发动机表。 | -| [odbc](odbc.md) | 创建一个 [ODBC](../../engines/table-engines/integrations/odbc.md)-发动机表。 | -| [hdfs](hdfs.md) | 创建一个 [HDFS](../../engines/table-engines/integrations/hdfs.md)-发动机表。 | +| 函数 | 描述 | +|-----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| [file](../../sql-reference/table-functions/file.md) | 创建一个file引擎表。 | +| [merge](../../sql-reference/table-functions/merge.md) | 创建一个merge引擎表。 | +| [numbers](../../sql-reference/table-functions/numbers.md) | 创建一个单列的表,其中包含整数。 | +| [remote](../../sql-reference/table-functions/remote.md) | 允许您访问远程服务器,而无需创建分布式表。 | +| [url](../../sql-reference/table-functions/url.md) | 创建一个URL引擎表。 | +| [mysql](../../sql-reference/table-functions/mysql.md) | 创建一个MySQL引擎表。 | +| [jdbc](../../sql-reference/table-functions/jdbc.md) | 创建一个JDBC引擎表。 | +| [odbc](../../sql-reference/table-functions/odbc.md) | 创建一个ODBC引擎表。 | +| [hdfs](../../sql-reference/table-functions/hdfs.md) | 创建一个HDFS引擎表。 | [原始文章](https://clickhouse.tech/docs/en/query_language/table_functions/) diff --git a/docs/zh/sql-reference/table-functions/input.md b/docs/zh/sql-reference/table-functions/input.md index 42b354dc935..a0215b26c8a 100644 --- a/docs/zh/sql-reference/table-functions/input.md +++ b/docs/zh/sql-reference/table-functions/input.md @@ -1,33 +1,29 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 46 -toc_title: "\u8F93\u5165" +toc_title: input --- -# 输入 {#input} +# input {#input} -`input(structure)` -表功能,允许有效地转换和插入数据发送到 -服务器与给定结构的表与另一种结构。 +`input(structure)` -表函数,可以有效地将发送给服务器的数据转换为具有给定结构的数据并将其插入到具有其他结构的表中。 -`structure` -以下格式发送到服务器的数据结构 `'column1_name column1_type, column2_name column2_type, ...'`. -例如, `'id UInt32, name String'`. +`structure` -发送到服务器的数据结构的格式 `'column1_name column1_type, column2_name column2_type, ...'`。 +例如, `'id UInt32, name String'`。 -此功能只能用于 `INSERT SELECT` 查询,只有一次,但其他行为像普通表函数 +该函数只能在 `INSERT SELECT` 查询中使用,并且只能使用一次,但在其他方面,行为类似于普通的表函数 (例如,它可以用于子查询等。). -数据可以以任何方式像普通发送 `INSERT` 查询并传递任何可用 [格式](../../interfaces/formats.md#formats) -必须在查询结束时指定(不像普通 `INSERT SELECT`). 
+数据可以像普通 `INSERT` 查询一样发送,并以必须在查询末尾指定的任何可用[格式](../../interfaces/formats.md#formats) +传递(与普通 `INSERT SELECT`不同)。 -这个功能的主要特点是,当服务器从客户端接收数据时,它同时将其转换 -根据表达式中的列表 `SELECT` 子句并插入到目标表中。 临时表 -不创建所有传输的数据。 +该函数的主要特点是,当服务器从客户端接收数据时,它会同时根据 `SELECT` 子句中的表达式列表将其转换,并插入到目标表中。 +不会创建包含所有已传输数据的临时表。 **例** - 让 `test` 表具有以下结构 `(a String, b String)` - 和数据 `data.csv` 具有不同的结构 `(col1 String, col2 Date, col3 Int32)`. 查询插入 - 从数据 `data.csv` 进 `test` 同时转换的表如下所示: + 并且 `data.csv` 中的数据具有不同的结构 `(col1 String, col2 Date, col3 Int32)`。 + 将数据从 `data.csv` 插入到 `test` 表中,同时进行转换的查询如下所示: @@ -35,7 +31,7 @@ toc_title: "\u8F93\u5165" $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; ``` -- 如果 `data.csv` 包含相同结构的数据 `test_structure` 作为表 `test` 那么这两个查询是相等的: +- 如果 `data.csv` 包含与表 `test` 相同结构 `test_structure` 的数据,那么这两个查询是相等的: diff --git a/docs/zh/sql-reference/table-functions/jdbc.md b/docs/zh/sql-reference/table-functions/jdbc.md index c1833462171..af8c82f0097 100644 --- a/docs/zh/sql-reference/table-functions/jdbc.md +++ b/docs/zh/sql-reference/table-functions/jdbc.md @@ -1,6 +1,4 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 43 toc_title: jdbc --- @@ -9,10 +7,10 @@ toc_title: jdbc `jdbc(jdbc_connection_uri, schema, table)` -返回通过JDBC驱动程序连接的表。 -此表函数需要单独的 `clickhouse-jdbc-bridge` 程序正在运行。 +此表函数需要单独的 `clickhouse-jdbc-bridge` 程序才能运行。 它支持可空类型(基于查询的远程表的DDL)。 -**例** +**示例** ``` sql SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') diff --git a/docs/zh/sql-reference/table-functions/merge.md b/docs/zh/sql-reference/table-functions/merge.md index 0e94dcc4d42..410468b3d8a 100644 --- a/docs/zh/sql-reference/table-functions/merge.md +++ b/docs/zh/sql-reference/table-functions/merge.md @@ -1,14 +1,12 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 38 -toc_title: "\u5408\u5E76" +toc_title: merge --- -# 合并 {#merge} +# merge {#merge} -`merge(db_name, 'tables_regexp')` – Creates a temporary Merge table. For more information, see the section “Table engines, Merge”. +`merge(db_name, 'tables_regexp')` – 创建一个临时Merge表。 有关更多信息,请参见 “Table engines, Merge”。 -表结构取自与正则表达式匹配的第一个表。 +表结构取自遇到的第一个与正则表达式匹配的表。 [原始文章](https://clickhouse.tech/docs/en/query_language/table_functions/merge/) diff --git a/docs/zh/sql-reference/table-functions/numbers.md b/docs/zh/sql-reference/table-functions/numbers.md index e5f13d60791..59a57b157e0 100644 --- a/docs/zh/sql-reference/table-functions/numbers.md +++ b/docs/zh/sql-reference/table-functions/numbers.md @@ -1,18 +1,16 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 39 -toc_title: "\u6570\u5B57" +toc_title: numbers --- -# 数字 {#numbers} +# numbers {#numbers} -`numbers(N)` – Returns a table with the single ‘number’ 包含从0到N-1的整数的列(UInt64)。 -`numbers(N, M)` -返回一个表与单 ‘number’ 包含从N到(N+M-1)的整数的列(UInt64)。 +`numbers(N)` – 返回一个包含单个 ‘number’ 列(UInt64)的表,其中包含从0到N-1的整数。 +`numbers(N, M)` - 返回一个包含单个 ‘number’ 列(UInt64)的表,其中包含从N到(N+M-1)的整数。 -类似于 `system.numbers` 表,它可以用于测试和生成连续的值, `numbers(N, M)` 比 `system.numbers`. 
+类似于 `system.numbers` 表,它可以用于测试和生成连续的值, `numbers(N, M)` 比 `system.numbers`更有效。 -以下查询是等效的: +以下查询是等价的: ``` sql SELECT * FROM numbers(10); @@ -20,10 +18,10 @@ SELECT * FROM numbers(0, 10); SELECT * FROM system.numbers LIMIT 10; ``` -例: +示例: ``` sql --- Generate a sequence of dates from 2010-01-01 to 2010-12-31 +-- 生成2010-01-01至2010-12-31的日期序列 select toDate('2010-01-01') + number as d FROM numbers(365); ``` diff --git a/docs/zh/sql-reference/table-functions/odbc.md b/docs/zh/sql-reference/table-functions/odbc.md index 95fb2277474..dd2826e892f 100644 --- a/docs/zh/sql-reference/table-functions/odbc.md +++ b/docs/zh/sql-reference/table-functions/odbc.md @@ -1,13 +1,11 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 44 toc_title: odbc --- # odbc {#table-functions-odbc} -返回通过连接的表 [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). +返回通过 [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity) 连接的表。 ``` sql odbc(connection_settings, external_database, external_table) @@ -15,23 +13,23 @@ odbc(connection_settings, external_database, external_table) 参数: -- `connection_settings` — Name of the section with connection settings in the `odbc.ini` 文件 -- `external_database` — Name of a database in an external DBMS. -- `external_table` — Name of a table in the `external_database`. +- `connection_settings` — 在 `odbc.ini` 文件中连接设置的部分的名称。 +- `external_database` — 外部DBMS的数据库名。 +- `external_table` — `external_database` 数据库中的表名。 -为了安全地实现ODBC连接,ClickHouse使用单独的程序 `clickhouse-odbc-bridge`. 如果直接从ODBC驱动程序加载 `clickhouse-server`,驱动程序问题可能会导致ClickHouse服务器崩溃。 ClickHouse自动启动 `clickhouse-odbc-bridge` 当它是必需的。 ODBC桥程序是从相同的软件包作为安装 `clickhouse-server`. +为了安全地实现ODBC连接,ClickHouse使用单独的程序 `clickhouse-odbc-bridge`。 如果ODBC驱动程序直接从 `clickhouse-server` 加载,则驱动程序问题可能会导致ClickHouse服务器崩溃。 当需要时,ClickHouse自动启动 `clickhouse-odbc-bridge`。 ODBC桥程序是从与 `clickhouse-server` 相同的软件包安装的。 -与字段 `NULL` 外部表中的值将转换为基数据类型的默认值。 例如,如果远程MySQL表字段具有 `INT NULL` 键入它将转换为0(ClickHouse的默认值 `Int32` 数据类型)。 +外部表中字段包含的 `NULL` 值将转换为基本据类型的默认值。 例如,如果远程MySQL表字段包含 `INT NULL` 类型,则将被转换为0(ClickHouse`Int32` 数据类型的默认值)。 ## 用法示例 {#usage-example} -**通过ODBC从本地MySQL安装获取数据** +**通过ODBC从本地安装的MySQL获取数据** -此示例检查Ubuntu Linux18.04和MySQL服务器5.7。 +这个例子检查Ubuntu Linux18.04和MySQL服务器5.7。 -确保安装了unixODBC和MySQL连接器。 +确保已经安装了unixODBC和MySQL连接器。 -默认情况下(如果从软件包安装),ClickHouse以用户身份启动 `clickhouse`. 因此,您需要在MySQL服务器中创建和配置此用户。 +默认情况下(如果从软件包安装),ClickHouse以用户 `clickhouse` 启动。 因此,您需要在MySQL服务器中创建和配置此用户。 ``` bash $ sudo mysql @@ -42,7 +40,7 @@ mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` -然后配置连接 `/etc/odbc.ini`. 
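
补充一个草稿,说明完成下文 `/etc/odbc.ini` 配置并通过 `isql` 检查之后,最终查询的大致形式;其中 `DSN=mysqlconn` 对应下文配置中的节名,而库名和表名 `test` 只是假设,需要替换为 MySQL 侧实际存在的对象。

``` sql
-- 库名和表名仅为假设,DSN 节名来自下文的 /etc/odbc.ini 配置
SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test');
```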
+然后在 `/etc/odbc.ini` 中配置连接。 ``` bash $ cat /etc/odbc.ini @@ -55,7 +53,7 @@ USERNAME = clickhouse PASSWORD = clickhouse ``` -您可以使用 `isql` unixodbc安装中的实用程序。 +您可以使用unixODBC安装的 `isql` 实用程序检查连接。 ``` bash $ isql -v mysqlconn diff --git a/docs/zh/sql-reference/table-functions/remote.md b/docs/zh/sql-reference/table-functions/remote.md index b7bd494609b..cacc68c0b71 100644 --- a/docs/zh/sql-reference/table-functions/remote.md +++ b/docs/zh/sql-reference/table-functions/remote.md @@ -1,22 +1,52 @@ -# 远程,远程安全 {#remote-remotesecure} +# remote, remoteSecure {#remote-remotesecure} -允许您访问远程服务器,而无需创建 `Distributed` 表 +允许您访问远程服务器,而无需创建 `Distributed` 表。`remoteSecure` - 与 `remote` 相同,但是会使用加密链接。 -签名: +这两个函数都可以在 `SELECT` 和 `INSERT` 查询中使用。 + +语法: ``` sql -remote('addresses_expr', db, table[, 'user'[, 'password']]) -remote('addresses_expr', db.table[, 'user'[, 'password']]) -remoteSecure('addresses_expr', db, table[, 'user'[, 'password']]) -remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) +remote('addresses_expr', db, table[, 'user'[, 'password'], sharding_key]) +remote('addresses_expr', db.table[, 'user'[, 'password'], sharding_key]) +remoteSecure('addresses_expr', db, table[, 'user'[, 'password'], sharding_key]) +remoteSecure('addresses_expr', db.table[, 'user'[, 'password'], sharding_key]) ``` -`addresses_expr` – 代表远程服务器地址的一个表达式。可以只是单个服务器地址。 服务器地址可以是 `host:port` 或 `host`。`host` 可以指定为服务器域名,或是IPV4或IPV6地址。IPv6地址在方括号中指定。`port` 是远程服务器上的TCP端口。 如果省略端口,则使用服务器配置文件中的 `tcp_port` (默认情况为,9000)。 +**参数** + +- `addresses_expr` – 代表远程服务器地址的一个表达式。可以只是单个服务器地址。 服务器地址可以是 `host:port` 或 `host`。 + + `host` 可以指定为服务器名称,或是IPV4或IPV6地址。IPv6地址在方括号中指定。 + + `port` 是远程服务器上的TCP端口。 如果省略端口,则 `remote` 使用服务器配置文件中的 [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) (默认情况为,9000),`remoteSecure` 使用 [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) (默认情况为,9440)。 -!!! 
important "重要事项" IPv6地址需要指定端口。 + + 类型: [String](../../sql-reference/data-types/string.md)。 + + - `db` — 数据库名。类型: [String](../../sql-reference/data-types/string.md)。 + - `table` — 表名。类型: [String](../../sql-reference/data-types/string.md)。 + - `user` — 用户名。如果未指定用户,则使用 `default` 。类型: [String](../../sql-reference/data-types/string.md)。 + - `password` — 用户密码。如果未指定密码,则使用空密码。类型: [String](../../sql-reference/data-types/string.md)。 + - `sharding_key` — 分片键以支持在节点之间分布数据。 例如: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`。 类型: [UInt32](../../sql-reference/data-types/int-uint.md)。 + + **返回值** + + 来自远程服务器的数据集。 + + **用法** + + 使用 `remote` 表函数没有创建一个 `Distributed` 表更优,因为在这种情况下,将为每个请求重新建立服务器连接。此外,如果设置了主机名,则会解析这些名称,并且在使用各种副本时不会计入错误。 在处理大量查询时,始终优先创建 `Distributed` 表,不要使用 `remote` 表函数。 -例: + 该 `remote` 表函数可以在以下情况下是有用的: + + - 访问特定服务器进行数据比较、调试和测试。 + - 在多个ClickHouse集群之间的用户研究目的的查询。 + - 手动发出的不频繁分布式请求。 + - 每次重新定义服务器集的分布式请求。 + + **地址** ``` text example01-01-1 @@ -29,8 +59,6 @@ localhost 多个地址可以用逗号分隔。在这种情况下,ClickHouse将使用分布式处理,因此它将将查询发送到所有指定的地址(如具有不同数据的分片)。 -示例: - ``` text example01-01-1,example01-02-1 ``` @@ -49,30 +77,28 @@ example01-{01..02}-1 如果您有多对大括号,它会生成相应集合的直接乘积。 -大括号中的地址和部分地址可以用管道符号(\|)分隔。 在这种情况下,相应的地址集被解释为副本,并且查询将被发送到第一个正常副本。 但是,副本将按照当前[load_balancing](../../operations/settings/settings.md)设置的顺序进行迭代。 - -示例: +大括号中的地址和部分地址可以用管道符号(\|)分隔。 在这种情况下,相应的地址集被解释为副本,并且查询将被发送到第一个正常副本。 但是,副本将按照当前[load_balancing](../../operations/settings/settings.md)设置的顺序进行迭代。此示例指定两个分片,每个分片都有两个副本: ``` text example01-{01..02}-{1|2} ``` -此示例指定两个分片,每个分片都有两个副本。 - 生成的地址数由常量限制。目前这是1000个地址。 -使用 `remote` 表函数没有创建一个 `Distributed` 表更优,因为在这种情况下,将为每个请求重新建立服务器连接。此外,如果设置了主机名,则会解析这些名称,并且在使用各种副本时不会计算错误。 在处理大量查询时,始终优先创建 `Distributed` 表,不要使用 `remote` 表功能。 +**示例** -该 `remote` 表函数可以在以下情况下是有用的: +从远程服务器选择数据: -- 访问特定服务器进行数据比较、调试和测试。 -- 在多个ClickHouse集群之间的用户研究目的的查询。 -- 手动发出的不频繁分布式请求。 -- 每次重新定义服务器集的分布式请求。 +``` sql +SELECT * FROM remote('127.0.0.1', db.remote_engine_table) LIMIT 3; +``` -如果未指定用户, 将会使用`default`。 -如果未指定密码,则使用空密码。 +将远程服务器中的数据插入表中: -`remoteSecure` - 与 `remote` 相同,但是会使用加密链接。默认端口为配置文件中的[tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure),或9440。 +``` sql +CREATE TABLE remote_table (name String, value UInt32) ENGINE=Memory; +INSERT INTO FUNCTION remote('127.0.0.1', currentDatabase(), 'remote_table') VALUES ('test', 42); +SELECT * FROM remote_table; +``` [原始文章](https://clickhouse.tech/docs/en/query_language/table_functions/remote/) diff --git a/docs/zh/sql-reference/table-functions/url.md b/docs/zh/sql-reference/table-functions/url.md index c2efe09913a..d726cddd748 100644 --- a/docs/zh/sql-reference/table-functions/url.md +++ b/docs/zh/sql-reference/table-functions/url.md @@ -1,26 +1,43 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 41 toc_title: url --- # url {#url} -`url(URL, format, structure)` -返回从创建的表 `URL` 与给定 -`format` 和 `structure`. 
+`url` 函数从 `URL` 创建一个具有给定 `format` 和 `structure` 的表。 -URL-HTTP或HTTPS服务器地址,它可以接受 `GET` 和/或 `POST` 请求。 +`url` 函数可用于对[URL](../../engines/table-engines/special/url.md)表中的数据进行 `SELECT` 和 `INSERT` 的查询中。 -格式 - [格式](../../interfaces/formats.md#formats) 的数据。 +**语法** -结构-表结构 `'UserID UInt64, Name String'` 格式。 确定列名称和类型。 +``` sql +url(URL, format, structure) +``` + +**参数** + +- `URL` — HTTP或HTTPS服务器地址,它可以接受 `GET` 或 `POST` 请求 (对应于 `SELECT` 或 `INSERT` 查询)。类型: [String](../../sql-reference/data-types/string.md)。 +- `format` — 数据[格式](../../interfaces/formats.md#formats)。类型: [String](../../sql-reference/data-types/string.md)。 +- `structure` — 以 `'UserID UInt64, Name String'` 格式的表结构。确定列名和类型。 类型: [String](../../sql-reference/data-types/string.md)。 + +**返回值** + +A table with the specified format and structure and with data from the defined `URL`. **示例** +获取一个表的前3行,该表是从HTTP服务器获取的包含 `String` 和 [UInt32](../../sql-reference/data-types/int-uint.md) 类型的列,以[CSV](../../interfaces/formats.md#csv)格式返回。 + ``` sql --- getting the first 3 lines of a table that contains columns of String and UInt32 type from HTTP-server which answers in CSV format. -SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 +SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; ``` +将 `URL` 的数据插入到表中: + +``` sql +CREATE TABLE test_table (column1 String, column2 UInt32) ENGINE=Memory; +INSERT INTO FUNCTION url('http://127.0.0.1:8123/?query=INSERT+INTO+test_table+FORMAT+CSV', 'CSV', 'column1 String, column2 UInt32') VALUES ('http interface', 42); +SELECT * FROM test_table; +``` [原始文章](https://clickhouse.tech/docs/en/query_language/table_functions/url/) diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 9adca58b55a..ad3ff84d8bf 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -33,7 +33,14 @@ option (ENABLE_CLICKHOUSE_OBFUSCATOR "Table data obfuscator (convert real data t ${ENABLE_CLICKHOUSE_ALL}) # https://clickhouse.tech/docs/en/operations/utilities/odbc-bridge/ -option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" +if (ENABLE_ODBC) + option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" + ${ENABLE_CLICKHOUSE_ALL}) +else () + option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" OFF) +endif () + +option (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE "HTTP-server working like a proxy to Library dictionary source" ${ENABLE_CLICKHOUSE_ALL}) # https://presentations.clickhouse.tech/matemarketing_2020/ @@ -109,6 +116,12 @@ else() message(STATUS "ODBC bridge mode: OFF") endif() +if (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE) + message(STATUS "Library bridge mode: ON") +else() + message(STATUS "Library bridge mode: OFF") +endif() + if (ENABLE_CLICKHOUSE_INSTALL) message(STATUS "ClickHouse install: ON") else() @@ -188,11 +201,16 @@ add_subdirectory (format) add_subdirectory (obfuscator) add_subdirectory (install) add_subdirectory (git-import) +add_subdirectory (bash-completion) if (ENABLE_CLICKHOUSE_ODBC_BRIDGE) add_subdirectory (odbc-bridge) endif () +if (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE) + add_subdirectory (library-bridge) +endif () + if (CLICKHOUSE_ONE_SHARED) add_library(clickhouse-lib SHARED ${CLICKHOUSE_SERVER_SOURCES} ${CLICKHOUSE_CLIENT_SOURCES} ${CLICKHOUSE_LOCAL_SOURCES} ${CLICKHOUSE_BENCHMARK_SOURCES} ${CLICKHOUSE_COPIER_SOURCES} ${CLICKHOUSE_EXTRACT_FROM_CONFIG_SOURCES} ${CLICKHOUSE_COMPRESSOR_SOURCES} ${CLICKHOUSE_FORMAT_SOURCES} 
${CLICKHOUSE_OBFUSCATOR_SOURCES} ${CLICKHOUSE_GIT_IMPORT_SOURCES} ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) target_link_libraries(clickhouse-lib ${CLICKHOUSE_SERVER_LINK} ${CLICKHOUSE_CLIENT_LINK} ${CLICKHOUSE_LOCAL_LINK} ${CLICKHOUSE_BENCHMARK_LINK} ${CLICKHOUSE_COPIER_LINK} ${CLICKHOUSE_EXTRACT_FROM_CONFIG_LINK} ${CLICKHOUSE_COMPRESSOR_LINK} ${CLICKHOUSE_FORMAT_LINK} ${CLICKHOUSE_OBFUSCATOR_LINK} ${CLICKHOUSE_GIT_IMPORT_LINK} ${CLICKHOUSE_ODBC_BRIDGE_LINK}) @@ -208,6 +226,10 @@ if (CLICKHOUSE_SPLIT_BINARY) list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-odbc-bridge) endif () + if (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE) + list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-library-bridge) + endif () + set_target_properties(${CLICKHOUSE_ALL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..) add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_ALL_TARGETS}) @@ -325,7 +347,7 @@ else () endif () if (ENABLE_TESTS AND USE_GTEST) - set (CLICKHOUSE_UNIT_TESTS_TARGETS unit_tests_libcommon unit_tests_dbms) + set (CLICKHOUSE_UNIT_TESTS_TARGETS unit_tests_dbms) add_custom_target (clickhouse-tests ALL DEPENDS ${CLICKHOUSE_UNIT_TESTS_TARGETS}) add_dependencies(clickhouse-bundle clickhouse-tests) endif() diff --git a/programs/bash-completion/CMakeLists.txt b/programs/bash-completion/CMakeLists.txt new file mode 100644 index 00000000000..d3a47f5a35e --- /dev/null +++ b/programs/bash-completion/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(completions) diff --git a/programs/bash-completion/completions/CMakeLists.txt b/programs/bash-completion/completions/CMakeLists.txt new file mode 100644 index 00000000000..d364e07ef6e --- /dev/null +++ b/programs/bash-completion/completions/CMakeLists.txt @@ -0,0 +1,28 @@ +macro(configure_bash_completion) + set(out "/usr/share/bash-completion/completions") + find_program(pkg-config PKG_CONFIG_BIN) + if (PKG_CONFIG_BIN) + execute_process( + COMMAND ${PKG_CONFIG_BIN} --variable=completionsdir bash-completion + OUTPUT_VARIABLE ${out} + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endif() + string(REPLACE /usr "${CMAKE_INSTALL_PREFIX}" out "${out}") + message(STATUS "bash_completion will be written to ${out}") +endmacro() + +configure_bash_completion() +foreach (name + # set of functions + clickhouse-bootstrap + + # binaries that accept settings as command line argument + clickhouse-client + clickhouse-local + clickhouse-benchmark + + clickhouse +) + install(FILES ${name} DESTINATION ${out}) +endforeach() diff --git a/programs/bash-completion/completions/clickhouse b/programs/bash-completion/completions/clickhouse new file mode 100644 index 00000000000..c4b77cf3f7a --- /dev/null +++ b/programs/bash-completion/completions/clickhouse @@ -0,0 +1,43 @@ +[[ -v $_CLICKHOUSE_COMPLETION_LOADED ]] || source "$(dirname "${BASH_SOURCE[0]}")/clickhouse-bootstrap" + +function _clickhouse_get_utils() +{ + local cmd=$1 && shift + "$cmd" --help |& awk '/^clickhouse.*args/ { print $2 }' +} + +function _complete_for_clickhouse_entrypoint_bin() +{ + local cur prev cword words + eval local cmd="$( _clickhouse_quote "$1" )" + _clickhouse_bin_exist "$cmd" || return 0 + + COMPREPLY=() + _get_comp_words_by_ref cur prev cword words + + local util="$cur" + # complete utils, until it will be finished + if [[ $cword -lt 2 ]]; then + COMPREPLY=( $(compgen -W "$(_clickhouse_get_utils "$cmd")" -- "$cur") ) + return + fi + util="${words[1]}" + + case "$prev" in + -C|--config-file|--config) + return + ;; + # Argh... This looks like a bash bug... 
+ # Redirections are passed to the completion function + # although it is managed by the shell directly... + '<'|'>'|'>>'|[12]'>'|[12]'>>') + return + ;; + esac + + COMPREPLY=( $(compgen -W "$(_clickhouse_get_options "$cmd" "$util")" -- "$cur") ) + + return 0 +} + +_complete_clickhouse_generic clickhouse _complete_for_clickhouse_entrypoint_bin diff --git a/programs/bash-completion/completions/clickhouse-benchmark b/programs/bash-completion/completions/clickhouse-benchmark new file mode 100644 index 00000000000..13064b7417d --- /dev/null +++ b/programs/bash-completion/completions/clickhouse-benchmark @@ -0,0 +1,2 @@ +[[ -v $_CLICKHOUSE_COMPLETION_LOADED ]] || source "$(dirname "${BASH_SOURCE[0]}")/clickhouse-bootstrap" +_complete_clickhouse_generic clickhouse-benchmark diff --git a/programs/bash-completion/completions/clickhouse-bootstrap b/programs/bash-completion/completions/clickhouse-bootstrap new file mode 100644 index 00000000000..dc8dcd5ad8d --- /dev/null +++ b/programs/bash-completion/completions/clickhouse-bootstrap @@ -0,0 +1,81 @@ +# +# bash autocomplete, that can work with: +# a) --help of program +# +# Also you may like: +# $ bind "set completion-ignore-case on" +# $ bind "set show-all-if-ambiguous on" +# +# It uses bash-completion dynamic loader. + +# Known to work with bash 3.* with programmable completion and extended +# pattern matching enabled (use 'shopt -s extglob progcomp' to enable +# these if they are not already enabled). +shopt -s extglob + +export _CLICKHOUSE_COMPLETION_LOADED=1 + +function _clickhouse_bin_exist() +{ [ -x "$1" ] || command -v "$1" >& /dev/null; } + +function _clickhouse_quote() +{ + local quoted=${1//\'/\'\\\'\'}; + printf "'%s'" "$quoted" +} + +# Extract every option (everything that starts with "-") from the --help dialog. +function _clickhouse_get_options() +{ + "$@" --help 2>&1 | awk -F '[ ,=<>]' '{ for (i=1; i <= NF; ++i) { if (substr($i, 0, 1) == "-" && length($i) > 1) print $i; } }' | sort -u +} + +function _complete_for_clickhouse_generic_bin() +{ + local cur prev + eval local cmd="$( _clickhouse_quote "$1" )" + _clickhouse_bin_exist "$cmd" || return 0 + + COMPREPLY=() + _get_comp_words_by_ref cur prev + + case "$prev" in + -C|--config-file|--config) + return + ;; + # Argh... This looks like a bash bug... + # Redirections are passed to the completion function + # although it is managed by the shell directly... + '<'|'>'|'>>'|[12]'>'|[12]'>>') + return + ;; + esac + + COMPREPLY=( $(compgen -W "$(_clickhouse_get_options "$cmd")" -- "$cur") ) + + return 0 +} + +function _complete_clickhouse_generic() +{ + local bin=$1 && shift + local f=${1:-_complete_for_clickhouse_generic_bin} + local o=( + -o default + -o bashdefault + -o nospace + -F "$f" + "$bin" + ) + complete "${o[@]}" +} + +function _complete_clickhouse_bootstrap_main() +{ + local runtime=/usr/share/bash-completion/bash_completion + if ! 
type _get_comp_words_by_ref >& /dev/null && [[ -f $runtime ]]; then + source $runtime + fi + type _get_comp_words_by_ref >& /dev/null || return 0 +} +_complete_clickhouse_bootstrap_main "$@" diff --git a/programs/bash-completion/completions/clickhouse-client b/programs/bash-completion/completions/clickhouse-client new file mode 100644 index 00000000000..6b7899b7263 --- /dev/null +++ b/programs/bash-completion/completions/clickhouse-client @@ -0,0 +1,2 @@ +[[ -v $_CLICKHOUSE_COMPLETION_LOADED ]] || source "$(dirname "${BASH_SOURCE[0]}")/clickhouse-bootstrap" +_complete_clickhouse_generic clickhouse-client diff --git a/programs/bash-completion/completions/clickhouse-local b/programs/bash-completion/completions/clickhouse-local new file mode 100644 index 00000000000..7b12b48c7cd --- /dev/null +++ b/programs/bash-completion/completions/clickhouse-local @@ -0,0 +1,2 @@ +[[ -v $_CLICKHOUSE_COMPLETION_LOADED ]] || source "$(dirname "${BASH_SOURCE[0]}")/clickhouse-bootstrap" +_complete_clickhouse_generic clickhouse-local diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index a0e2ea155ba..1d2b579db3a 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -95,8 +95,8 @@ public: comparison_info_total.emplace_back(std::make_shared()); } - global_context.makeGlobalContext(); - global_context.setSettings(settings); + global_context->makeGlobalContext(); + global_context->setSettings(settings); std::cerr << std::fixed << std::setprecision(3); @@ -159,7 +159,7 @@ private: bool print_stacktrace; const Settings & settings; SharedContextHolder shared_context; - Context global_context; + ContextPtr global_context; QueryProcessingStage::Enum query_processing_stage; /// Don't execute new queries after timelimit or SIGINT or exception diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 3c27908741c..1aec3677b41 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,34 @@ namespace ErrorCodes } +static bool queryHasWithClause(const IAST * ast) +{ + if (const auto * select = dynamic_cast(ast); + select && select->with()) + { + return true; + } + + // This full recursive walk is somewhat excessive, because most of the + // children are not queries, but on the other hand it will let us to avoid + // breakage when the AST structure changes and some new variant of query + // nesting is added. This function is used in fuzzer, so it's better to be + // defensive and avoid weird unexpected errors. + // clang-tidy is confused by this function: it thinks that if `select` is + // nullptr, `ast` is also nullptr, and complains about nullptr dereference. + // NOLINTNEXTLINE + for (const auto & child : ast->children) + { + if (queryHasWithClause(child.get())) + { + return true; + } + } + + return false; +} + + class Client : public Poco::Util::Application { public: @@ -162,7 +191,7 @@ private: bool has_vertical_output_suffix = false; /// Is \G present at the end of the query string? SharedContextHolder shared_context = Context::createShared(); - Context context = Context::createGlobal(shared_context.get()); + ContextPtr context = Context::createGlobal(shared_context.get()); /// Buffer that reads from stdin in batch mode. 
ReadBufferFromFileDescriptor std_in {STDIN_FILENO}; @@ -245,20 +274,20 @@ private: configReadClient(config(), home_path); - context.setApplicationType(Context::ApplicationType::CLIENT); - context.setQueryParameters(query_parameters); + context->setApplicationType(Context::ApplicationType::CLIENT); + context->setQueryParameters(query_parameters); /// settings and limits could be specified in config file, but passed settings has higher priority - for (const auto & setting : context.getSettingsRef().allUnchanged()) + for (const auto & setting : context->getSettingsRef().allUnchanged()) { const auto & name = setting.getName(); if (config().has(name)) - context.setSetting(name, config().getString(name)); + context->setSetting(name, config().getString(name)); } /// Set path for format schema files if (config().has("format_schema_path")) - context.setFormatSchemaPath(Poco::Path(config().getString("format_schema_path")).toString()); + context->setFormatSchemaPath(Poco::Path(config().getString("format_schema_path")).toString()); /// Initialize query_id_formats if any if (config().has("query_id_formats")) @@ -390,7 +419,7 @@ private: for (auto d : chineseNewYearIndicators) { /// Let's celebrate until Lantern Festival - if (d <= days && d + 25u >= days) + if (d <= days && d + 25 >= days) return true; else if (d > days) return false; @@ -498,7 +527,10 @@ private: std::cerr << std::fixed << std::setprecision(3); if (is_interactive) + { + clearTerminal(); showClientVersion(); + } is_default_format = !config().has("vertical") && !config().has("format"); if (config().has("vertical")) @@ -506,15 +538,15 @@ private: else format = config().getString("format", is_interactive ? "PrettyCompact" : "TabSeparated"); - format_max_block_size = config().getInt("format_max_block_size", context.getSettingsRef().max_block_size); + format_max_block_size = config().getInt("format_max_block_size", context->getSettingsRef().max_block_size); insert_format = "Values"; /// Setting value from cmd arg overrides one from config - if (context.getSettingsRef().max_insert_block_size.changed) - insert_format_max_block_size = context.getSettingsRef().max_insert_block_size; + if (context->getSettingsRef().max_insert_block_size.changed) + insert_format_max_block_size = context->getSettingsRef().max_insert_block_size; else - insert_format_max_block_size = config().getInt("insert_format_max_block_size", context.getSettingsRef().max_insert_block_size); + insert_format_max_block_size = config().getInt("insert_format_max_block_size", context->getSettingsRef().max_insert_block_size); if (!is_interactive) { @@ -523,7 +555,7 @@ private: ignore_error = config().getBool("ignore-error", false); } - ClientInfo & client_info = context.getClientInfo(); + ClientInfo & client_info = context->getClientInfo(); client_info.setInitialQuery(); client_info.quota_key = config().getString("quota_key", ""); @@ -531,7 +563,7 @@ private: /// Initialize DateLUT here to avoid counting time spent here as query execution time. 
const auto local_tz = DateLUT::instance().getTimeZone(); - if (!context.getSettingsRef().use_client_time_zone) + if (!context->getSettingsRef().use_client_time_zone) { const auto & time_zone = connection->getServerTimezone(connection_parameters.timeouts); if (!time_zone.empty()) @@ -706,7 +738,7 @@ private: { auto query_id = config().getString("query_id", ""); if (!query_id.empty()) - context.setCurrentQueryId(query_id); + context->setCurrentQueryId(query_id); nonInteractive(); @@ -1006,7 +1038,7 @@ private: { Tokens tokens(this_query_begin, all_queries_end); IParser::Pos token_iterator(tokens, - context.getSettingsRef().max_parser_depth); + context->getSettingsRef().max_parser_depth); if (!token_iterator.isValid()) { break; @@ -1055,7 +1087,7 @@ private: if (ignore_error) { Tokens tokens(this_query_begin, all_queries_end); - IParser::Pos token_iterator(tokens, context.getSettingsRef().max_parser_depth); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); while (token_iterator->type != TokenType::Semicolon && token_iterator.isValid()) ++token_iterator; this_query_begin = token_iterator->end; @@ -1101,7 +1133,7 @@ private: // beneficial so that we see proper trailing comments in "echo" and // server log. adjustQueryEnd(this_query_end, all_queries_end, - context.getSettingsRef().max_parser_depth); + context->getSettingsRef().max_parser_depth); // full_query is the query + inline INSERT data + trailing comments // (the latter is our best guess for now). @@ -1141,7 +1173,7 @@ private: { this_query_end = insert_ast->end; adjustQueryEnd(this_query_end, all_queries_end, - context.getSettingsRef().max_parser_depth); + context->getSettingsRef().max_parser_depth); } // Now we know for sure where the query ends. @@ -1255,6 +1287,29 @@ private: return true; } + // Prints changed settings to stderr. Useful for debugging fuzzing failures. + void printChangedSettings() const + { + const auto & changes = context->getSettingsRef().changes(); + if (!changes.empty()) + { + fmt::print(stderr, "Changed settings: "); + for (size_t i = 0; i < changes.size(); ++i) + { + if (i) + { + fmt::print(stderr, ", "); + } + fmt::print(stderr, "{} = '{}'", changes[i].name, + toString(changes[i].value)); + } + fmt::print(stderr, "\n"); + } + else + { + fmt::print(stderr, "No changed settings.\n"); + } + } /// Returns false when server is not available. bool processWithFuzzing(const String & text) @@ -1317,9 +1372,14 @@ private: auto base_after_fuzz = fuzz_base->formatForErrorMessage(); - // Debug AST cloning errors. + // Check that the source AST didn't change after fuzzing. This + // helps debug AST cloning errors, where the cloned AST doesn't + // clone all its children, and erroneously points to some source + // child elements. if (base_before_fuzz != base_after_fuzz) { + printChangedSettings(); + fmt::print(stderr, "Base before fuzz: {}\n" "Base after fuzz: {}\n", @@ -1334,7 +1394,7 @@ private: fmt::print(stderr, "IAST::clone() is broken for some AST node. This is a bug. The original AST ('dump before fuzz') and its cloned copy ('dump of cloned AST') refer to the same nodes, which must never happen. This means that their parent node doesn't implement clone() correctly."); - assert(false); + exit(1); } auto fuzzed_text = ast_to_process->formatForErrorMessage(); @@ -1378,29 +1438,76 @@ private: // Print the changed settings because they might be needed to // reproduce the error. 
- const auto & changes = context.getSettingsRef().changes(); - if (!changes.empty()) - { - fmt::print(stderr, "Changed settings: "); - for (size_t i = 0; i < changes.size(); ++i) - { - if (i) - { - fmt::print(stderr, ", "); - } - fmt::print(stderr, "{} = '{}'", changes[i].name, - toString(changes[i].value)); - } - fmt::print(stderr, "\n"); - } - else - { - fmt::print(stderr, "No changed settings.\n"); - } + printChangedSettings(); return false; } + // Check that after the query is formatted, we can parse it back, + // format again and get the same result. Unfortunately, we can't + // compare the ASTs, which would be more sensitive to errors. This + // double formatting check doesn't catch all errors, e.g. we can + // format query incorrectly, but to a valid SQL that we can then + // parse and format into the same SQL. + // There are some complicated cases where we can generate the SQL + // which we can't parse: + // * first argument of lambda() replaced by fuzzer with + // something else, leading to constructs such as + // arrayMap((min(x) + 3) -> x + 1, ....) + // * internals of Enum replaced, leading to: + // Enum(equals(someFunction(y), 3)). + // And there are even the cases when we can parse the query, but + // it's logically incorrect and its formatting is a mess, such as + // when `lambda()` function gets substituted into a wrong place. + // To avoid dealing with these cases, run the check only for the + // queries we were able to successfully execute. + // The final caveat is that sometimes WITH queries are not executed, + // if they are not referenced by the main SELECT, so they can still + // have the aforementioned problems. Disable this check for such + // queries, for lack of a better solution. + if (!have_error && queryHasWithClause(parsed_query.get())) + { + ASTPtr parsed_formatted_query; + try + { + const auto * tmp_pos = query_to_send.c_str(); + parsed_formatted_query = parseQuery(tmp_pos, + tmp_pos + query_to_send.size(), + false /* allow_multi_statements */); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::SYNTAX_ERROR) + { + throw; + } + } + + if (parsed_formatted_query) + { + const auto formatted_twice + = parsed_formatted_query->formatForErrorMessage(); + + if (formatted_twice != query_to_send) + { + fmt::print(stderr, "The query formatting is broken.\n"); + + printChangedSettings(); + + fmt::print(stderr, "Got the following (different) text after formatting the fuzzed query and parsing it back:\n'{}'\n, expected:\n'{}'\n", + formatted_twice, query_to_send); + fmt::print(stderr, "In more detail:\n"); + fmt::print(stderr, "AST-1:\n'{}'\n", parsed_query->dumpTree()); + fmt::print(stderr, "Text-1 (AST-1 formatted):\n'{}'\n", query_to_send); + fmt::print(stderr, "AST-2 (Text-1 parsed):\n'{}'\n", parsed_formatted_query->dumpTree()); + fmt::print(stderr, "Text-2 (AST-2 formatted):\n'{}'\n", formatted_twice); + fmt::print(stderr, "Text-1 must be equal to Text-2, but it is not.\n"); + + exit(1); + } + } + } + // The server is still alive so we're going to continue fuzzing. // Determine what we're going to use as the starting AST. 
if (have_error) @@ -1483,11 +1590,11 @@ private: if (is_interactive) { // Generate a new query_id - context.setCurrentQueryId(""); + context->setCurrentQueryId(""); for (const auto & query_id_format : query_id_formats) { writeString(query_id_format.first, std_out); - writeString(fmt::format(query_id_format.second, fmt::arg("query_id", context.getCurrentQueryId())), std_out); + writeString(fmt::format(query_id_format.second, fmt::arg("query_id", context->getCurrentQueryId())), std_out); writeChar('\n', std_out); std_out.next(); } @@ -1503,12 +1610,12 @@ private: { /// Temporarily apply query settings to context. std::optional old_settings; - SCOPE_EXIT({ if (old_settings) context.setSettings(*old_settings); }); + SCOPE_EXIT_SAFE({ if (old_settings) context->setSettings(*old_settings); }); auto apply_query_settings = [&](const IAST & settings_ast) { if (!old_settings) - old_settings.emplace(context.getSettingsRef()); - context.applySettingsChanges(settings_ast.as()->changes); + old_settings.emplace(context->getSettingsRef()); + context->applySettingsChanges(settings_ast.as()->changes); }; const auto * insert = parsed_query->as(); if (insert && insert->settings_ast) @@ -1546,7 +1653,7 @@ private: if (change.name == "profile") current_profile = change.value.safeGet(); else - context.applySettingChange(change); + context->applySettingChange(change); } } @@ -1618,10 +1725,10 @@ private: connection->sendQuery( connection_parameters.timeouts, query_to_send, - context.getCurrentQueryId(), + context->getCurrentQueryId(), query_processing_stage, - &context.getSettingsRef(), - &context.getClientInfo(), + &context->getSettingsRef(), + &context->getClientInfo(), true); sendExternalTables(); @@ -1659,10 +1766,10 @@ private: connection->sendQuery( connection_parameters.timeouts, query_to_send, - context.getCurrentQueryId(), + context->getCurrentQueryId(), query_processing_stage, - &context.getSettingsRef(), - &context.getClientInfo(), + &context->getSettingsRef(), + &context->getClientInfo(), true); sendExternalTables(); @@ -1685,7 +1792,7 @@ private: ParserQuery parser(end); ASTPtr res; - const auto & settings = context.getSettingsRef(); + const auto & settings = context->getSettingsRef(); size_t max_length = 0; if (!allow_multi_statements) max_length = settings.max_query_size; @@ -1773,7 +1880,7 @@ private: current_format = insert->format; } - BlockInputStreamPtr block_input = context.getInputFormat( + BlockInputStreamPtr block_input = context->getInputFormat( current_format, buf, sample, insert_format_max_block_size); if (columns_description.hasDefaults()) @@ -2097,9 +2204,9 @@ private: /// It is not clear how to write progress with parallel formatting. It may increase code complexity significantly. if (!need_render_progress) - block_out_stream = context.getOutputStreamParallelIfPossible(current_format, *out_buf, block); + block_out_stream = context->getOutputStreamParallelIfPossible(current_format, *out_buf, block); else - block_out_stream = context.getOutputStream(current_format, *out_buf, block); + block_out_stream = context->getOutputStream(current_format, *out_buf, block); block_out_stream->writePrefix(); } @@ -2363,6 +2470,17 @@ private: std::cout << DBMS_NAME << " client version " << VERSION_STRING << VERSION_OFFICIAL << "." << std::endl; } + static void clearTerminal() + { + /// Clear from cursor until end of screen. + /// It is needed if garbage is left in terminal. + /// Show cursor. It can be left hidden by invocation of previous programs. 
+ /// A test for this feature: perl -e 'print "x"x100000'; echo -ne '\033[0;0H\033[?25l'; clickhouse-client + std::cout << + "\033[0J" + "\033[?25h"; + } + public: void init(int argc, char ** argv) { @@ -2472,7 +2590,7 @@ public: /** If "--password [value]" is used but the value is omitted, the bad argument exception will be thrown. * implicit_value is used to avoid this exception (to allow user to type just "--password") * Since currently boost provides no way to check if a value has been set implicitly for an option, - * the "\n" is used to distinguish this case because there is hardly a chance an user would use "\n" + * the "\n" is used to distinguish this case because there is hardly a chance a user would use "\n" * as the password. */ ("password", po::value()->implicit_value("\n", ""), "password") @@ -2592,12 +2710,12 @@ public: } } - context.makeGlobalContext(); - context.setSettings(cmd_settings); + context->makeGlobalContext(); + context->setSettings(cmd_settings); /// Copy settings-related program options to config. /// TODO: Is this code necessary? - for (const auto & setting : context.getSettingsRef().all()) + for (const auto & setting : context->getSettingsRef().all()) { const auto & name = setting.getName(); if (options.count(name)) @@ -2689,7 +2807,7 @@ public: { std::string traceparent = options["opentelemetry-traceparent"].as(); std::string error; - if (!context.getClientInfo().client_trace_context.parseTraceparentHeader( + if (!context->getClientInfo().client_trace_context.parseTraceparentHeader( traceparent, error)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -2700,7 +2818,7 @@ public: if (options.count("opentelemetry-tracestate")) { - context.getClientInfo().client_trace_context.tracestate = + context->getClientInfo().client_trace_context.tracestate = options["opentelemetry-tracestate"].as(); } diff --git a/programs/client/ConnectionParameters.cpp b/programs/client/ConnectionParameters.cpp index 19734dd5ffa..6faf43759df 100644 --- a/programs/client/ConnectionParameters.cpp +++ b/programs/client/ConnectionParameters.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -60,7 +62,9 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati #endif } - compression = config.getBool("compression", true) ? Protocol::Compression::Enable : Protocol::Compression::Disable; + /// By default compression is disabled if address looks like localhost. + compression = config.getBool("compression", !isLocalAddress(DNSResolver::instance().resolveHost(host))) + ? 
Protocol::Compression::Enable : Protocol::Compression::Disable; timeouts = ConnectionTimeouts( Poco::Timespan(config.getInt("connect_timeout", DBMS_DEFAULT_CONNECT_TIMEOUT_SEC), 0), diff --git a/programs/client/QueryFuzzer.cpp b/programs/client/QueryFuzzer.cpp index 8d8d8daaf39..6243e2c82ec 100644 --- a/programs/client/QueryFuzzer.cpp +++ b/programs/client/QueryFuzzer.cpp @@ -37,34 +37,33 @@ namespace ErrorCodes Field QueryFuzzer::getRandomField(int type) { + static constexpr Int64 bad_int64_values[] + = {-2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024, + 1025, 65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024, + 1024 * 1024 + 1, INT_MIN - 1ll, INT_MIN, INT_MIN + 1, + INT_MAX - 1, INT_MAX, INT_MAX + 1ll, INT64_MIN, INT64_MIN + 1, + INT64_MAX - 1, INT64_MAX}; switch (type) { case 0: { - static constexpr Int64 values[] - = {-2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024, - 1025, 65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024, - 1024 * 1024 + 1, INT64_MIN, INT64_MAX}; - return values[fuzz_rand() % (sizeof(values) / sizeof(*values))]; + return bad_int64_values[fuzz_rand() % (sizeof(bad_int64_values) + / sizeof(*bad_int64_values))]; } case 1: { static constexpr float values[] - = {NAN, INFINITY, -INFINITY, 0., 0.0001, 0.5, 0.9999, - 1., 1.0001, 2., 10.0001, 100.0001, 1000.0001}; - return values[fuzz_rand() % (sizeof(values) / sizeof(*values))]; + = {NAN, INFINITY, -INFINITY, 0., -0., 0.0001, 0.5, 0.9999, + 1., 1.0001, 2., 10.0001, 100.0001, 1000.0001, 1e10, 1e20, + FLT_MIN, FLT_MIN + FLT_EPSILON, FLT_MAX, FLT_MAX + FLT_EPSILON}; return values[fuzz_rand() % (sizeof(values) / sizeof(*values))]; } case 2: { - static constexpr Int64 values[] - = {-2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024, - 1025, 65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024, - 1024 * 1024 + 1, INT64_MIN, INT64_MAX}; static constexpr UInt64 scales[] = {0, 1, 2, 10}; return DecimalField( - values[fuzz_rand() % (sizeof(values) / sizeof(*values))], - scales[fuzz_rand() % (sizeof(scales) / sizeof(*scales))] - ); + bad_int64_values[fuzz_rand() % (sizeof(bad_int64_values) + / sizeof(*bad_int64_values))], + scales[fuzz_rand() % (sizeof(scales) / sizeof(*scales))]); } default: assert(false); @@ -570,6 +569,15 @@ void QueryFuzzer::addColumnLike(const ASTPtr ast) } const auto name = ast->formatForErrorMessage(); + if (name == "Null") + { + // The `Null` identifier from FORMAT Null clause. We don't quote it + // properly when formatting the AST, and while the resulting query + // technically works, it has non-standard case for Null (the standard + // is NULL), so it breaks the query formatting idempotence check. + // Just plug this particular case for now. 
+ return; + } if (name.size() < 200) { column_like_map.insert({name, ast}); diff --git a/programs/client/Suggest.cpp b/programs/client/Suggest.cpp index dfa7048349e..8d4c0fdbd5a 100644 --- a/programs/client/Suggest.cpp +++ b/programs/client/Suggest.cpp @@ -108,14 +108,6 @@ void Suggest::loadImpl(Connection & connection, const ConnectionTimeouts & timeo " UNION ALL " "SELECT cluster FROM system.clusters" " UNION ALL " - "SELECT name FROM system.errors" - " UNION ALL " - "SELECT event FROM system.events" - " UNION ALL " - "SELECT metric FROM system.asynchronous_metrics" - " UNION ALL " - "SELECT metric FROM system.metrics" - " UNION ALL " "SELECT macro FROM system.macros" " UNION ALL " "SELECT policy_name FROM system.storage_policies" @@ -139,17 +131,12 @@ void Suggest::loadImpl(Connection & connection, const ConnectionTimeouts & timeo query << ") WHERE notEmpty(res)"; - Settings settings; - /// To show all rows from: - /// - system.errors - /// - system.events - settings.system_events_show_zero_values = true; - fetch(connection, timeouts, query.str(), settings); + fetch(connection, timeouts, query.str()); } -void Suggest::fetch(Connection & connection, const ConnectionTimeouts & timeouts, const std::string & query, Settings & settings) +void Suggest::fetch(Connection & connection, const ConnectionTimeouts & timeouts, const std::string & query) { - connection.sendQuery(timeouts, query, "" /* query_id */, QueryProcessingStage::Complete, &settings); + connection.sendQuery(timeouts, query, "" /* query_id */, QueryProcessingStage::Complete); while (true) { diff --git a/programs/client/Suggest.h b/programs/client/Suggest.h index 0049bc08ebf..03332088cbe 100644 --- a/programs/client/Suggest.h +++ b/programs/client/Suggest.h @@ -33,7 +33,7 @@ public: private: void loadImpl(Connection & connection, const ConnectionTimeouts & timeouts, size_t suggestion_limit); - void fetch(Connection & connection, const ConnectionTimeouts & timeouts, const std::string & query, Settings & settings); + void fetch(Connection & connection, const ConnectionTimeouts & timeouts, const std::string & query); void fillWordsFromBlock(const Block & block); /// Words are fetched asynchronously. 
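The QueryFuzzer change above consolidates the per-case constants into one shared bad_int64_values table, so the plain integer and Decimal branches draw from the same set of boundary values (off-by-one neighbours of INT_MIN/INT_MAX/INT64_MIN/INT64_MAX, powers of two, and similar). Below is a minimal standalone sketch of that boundary-value idea; it is not ClickHouse code, and the fixed seed and the main() driver are invented here purely for illustration.

// Sketch: pick "interesting" 64-bit boundary values with a PRNG, the way
// QueryFuzzer::getRandomField indexes its static table with fuzz_rand().
#include <climits>
#include <cstdint>
#include <cstdio>
#include <iterator>
#include <random>

static constexpr int64_t bad_int64_values[] =
{
    -2, -1, 0, 1, 2, 3, 7, 10, 100, 255, 256, 257, 1023, 1024, 1025,
    65535, 65536, 65537, 1024 * 1024 - 1, 1024 * 1024, 1024 * 1024 + 1,
    INT_MIN - 1LL, INT_MIN, INT_MIN + 1, INT_MAX - 1, INT_MAX, INT_MAX + 1LL,
    INT64_MIN, INT64_MIN + 1, INT64_MAX - 1, INT64_MAX
};

int main()
{
    std::mt19937_64 fuzz_rand(42); // fixed seed, only to make the demo reproducible
    for (int i = 0; i < 5; ++i)
    {
        // Same selection pattern as the patch: random index into the table.
        const int64_t value = bad_int64_values[fuzz_rand() % std::size(bad_int64_values)];
        std::printf("%lld\n", static_cast<long long>(value));
    }
    return 0;
}

A fixed table of such "bad" values tends to hit overflow and off-by-one code paths much faster than uniformly random integers, which is presumably why the patch reuses the same table for the Decimal case as well.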
diff --git a/programs/config_tools.h.in b/programs/config_tools.h.in index 7cb5a6d883a..abe9ef8c562 100644 --- a/programs/config_tools.h.in +++ b/programs/config_tools.h.in @@ -15,3 +15,4 @@ #cmakedefine01 ENABLE_CLICKHOUSE_GIT_IMPORT #cmakedefine01 ENABLE_CLICKHOUSE_INSTALL #cmakedefine01 ENABLE_CLICKHOUSE_ODBC_BRIDGE +#cmakedefine01 ENABLE_CLICKHOUSE_LIBRARY_BRIDGE diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index 7eea23160b2..aa9b359993e 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes void ClusterCopier::init() { - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); task_description_watch_callback = [this] (const Coordination::WatchResponse & response) { @@ -39,14 +39,14 @@ void ClusterCopier::init() task_cluster_initial_config = task_cluster_current_config; task_cluster->loadTasks(*task_cluster_initial_config); - context.setClustersConfig(task_cluster_initial_config, task_cluster->clusters_prefix); + getContext()->setClustersConfig(task_cluster_initial_config, task_cluster->clusters_prefix); /// Set up shards and their priority task_cluster->random_engine.seed(task_cluster->random_device()); for (auto & task_table : task_cluster->table_tasks) { - task_table.cluster_pull = context.getCluster(task_table.cluster_pull_name); - task_table.cluster_push = context.getCluster(task_table.cluster_push_name); + task_table.cluster_pull = getContext()->getCluster(task_table.cluster_pull_name); + task_table.cluster_push = getContext()->getCluster(task_table.cluster_push_name); task_table.initShards(task_cluster->random_engine); } @@ -106,7 +106,7 @@ void ClusterCopier::discoverShardPartitions(const ConnectionTimeouts & timeouts, try { - type->deserializeAsTextQuoted(*column_dummy, rb, FormatSettings()); + type->getDefaultSerialization()->deserializeTextQuoted(*column_dummy, rb, FormatSettings()); } catch (Exception & e) { @@ -206,7 +206,7 @@ void ClusterCopier::uploadTaskDescription(const std::string & task_path, const s if (task_config_str.empty()) return; - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); zookeeper->createAncestors(local_task_description_path); auto code = zookeeper->tryCreate(local_task_description_path, task_config_str, zkutil::CreateMode::Persistent); @@ -219,7 +219,7 @@ void ClusterCopier::uploadTaskDescription(const std::string & task_path, const s void ClusterCopier::reloadTaskDescription() { - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); task_description_watch_zookeeper = zookeeper; String task_config_str; @@ -235,7 +235,7 @@ void ClusterCopier::reloadTaskDescription() /// Setup settings task_cluster->reloadSettings(*config); - context.setSettings(task_cluster->settings_common); + getContext()->setSettings(task_cluster->settings_common); task_cluster_current_config = config; task_description_current_stat = stat; @@ -440,7 +440,7 @@ bool ClusterCopier::checkPartitionPieceIsDone(const TaskTable & task_table, cons { LOG_DEBUG(log, "Check that all shards processed partition {} piece {} successfully", partition_name, piece_number); - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); /// Collect all shards that contain partition piece number piece_number. 
Strings piece_status_paths; @@ -532,7 +532,7 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t LOG_DEBUG(log, "Try to move {} to destination table", partition_name); - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); const auto current_partition_attach_is_active = task_table.getPartitionAttachIsActivePath(partition_name); const auto current_partition_attach_is_done = task_table.getPartitionAttachIsDonePath(partition_name); @@ -599,11 +599,13 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t toString(current_piece_number)); Settings settings_push = task_cluster->settings_push; - - /// It is important, ALTER ATTACH PARTITION must be done synchronously - /// And we will execute this ALTER query on each replica of a shard. - /// It is correct, because this query is idempotent. - settings_push.replication_alter_partitions_sync = 2; + ClusterExecutionMode execution_mode = ClusterExecutionMode::ON_EACH_NODE; + UInt64 max_successful_executions_per_shard = 0; + if (settings_push.replication_alter_partitions_sync == 1) + { + execution_mode = ClusterExecutionMode::ON_EACH_SHARD; + max_successful_executions_per_shard = 1; + } query_alter_ast_string += " ALTER TABLE " + getQuotedTable(original_table) + ((partition_name == "'all'") ? " ATTACH PARTITION ID " : " ATTACH PARTITION ") + partition_name + @@ -613,14 +615,33 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t try { - size_t num_nodes = executeQueryOnCluster( - task_table.cluster_push, - query_alter_ast_string, - settings_push, - PoolMode::GET_MANY, - ClusterExecutionMode::ON_EACH_NODE); + /// Try to attach the partition on each shard + UInt64 num_nodes = executeQueryOnCluster( + task_table.cluster_push, + query_alter_ast_string, + task_cluster->settings_push, + PoolMode::GET_MANY, + execution_mode, + max_successful_executions_per_shard); - LOG_INFO(log, "Number of nodes that executed ALTER query successfully : {}", toString(num_nodes)); + if (settings_push.replication_alter_partitions_sync == 1) + { + LOG_INFO( + log, + "ALTER query for destination tables {} was executed successfully on {} shards of {}", + getQuotedTable(task_table.table_push), + num_nodes, + task_table.cluster_push->getShardCount()); + + if (num_nodes != task_table.cluster_push->getShardCount()) + { + return TaskStatus::Error; + } + } + else + { + LOG_INFO(log, "Number of nodes that executed ALTER query successfully: {}", toString(num_nodes)); + } } catch (...)
{ @@ -856,6 +877,16 @@ bool ClusterCopier::tryDropPartitionPiece( bool ClusterCopier::tryProcessTable(const ConnectionTimeouts & timeouts, TaskTable & task_table) { + /// Create destination table + TaskStatus task_status = TaskStatus::Error; + + task_status = tryCreateDestinationTable(timeouts, task_table); + /// Exit if the destination table could not be created + if (task_status != TaskStatus::Finished) + { + LOG_WARNING(log, "Creating destination table failed"); + return false; + } /// An heuristic: if previous shard is already done, then check next one without sleeps due to max_workers constraint bool previous_shard_is_instantly_finished = false; @@ -932,7 +963,7 @@ bool ClusterCopier::tryProcessTable(const ConnectionTimeouts & timeouts, TaskTab /// Do not sleep if there is a sequence of already processed shards to increase startup bool is_unprioritized_task = !previous_shard_is_instantly_finished && shard->priority.is_remote; - TaskStatus task_status = TaskStatus::Error; + task_status = TaskStatus::Error; bool was_error = false; has_shard_to_process = true; for (UInt64 try_num = 0; try_num < max_shard_partition_tries; ++try_num) @@ -1050,6 +1081,44 @@ bool ClusterCopier::tryProcessTable(const ConnectionTimeouts & timeouts, TaskTab return table_is_done; } +TaskStatus ClusterCopier::tryCreateDestinationTable(const ConnectionTimeouts & timeouts, TaskTable & task_table) +{ + /// Try to create the original table (if not exists) on each shard + + //TaskTable & task_table = task_shard.task_table; + const TaskShardPtr task_shard = task_table.all_shards.at(0); + /// We need to update table definitions for each part, they could have been changed after ALTER + task_shard->current_pull_table_create_query = getCreateTableForPullShard(timeouts, *task_shard); + try + { + auto create_query_push_ast + = rewriteCreateQueryStorage(task_shard->current_pull_table_create_query, task_table.table_push, task_table.engine_push_ast); + auto & create = create_query_push_ast->as(); + create.if_not_exists = true; + InterpreterCreateQuery::prepareOnClusterQuery(create, getContext(), task_table.cluster_push_name); + String query = queryToString(create_query_push_ast); + + LOG_DEBUG(log, "Create destination tables. Query: {}", query); + UInt64 shards = executeQueryOnCluster(task_table.cluster_push, query, task_cluster->settings_push, PoolMode::GET_MANY); + LOG_INFO( + log, + "Destination tables {} have been created on {} shards of {}", + getQuotedTable(task_table.table_push), + shards, + task_table.cluster_push->getShardCount()); + if (shards != task_table.cluster_push->getShardCount()) + { + return TaskStatus::Error; + } + } + catch (...) + { + tryLogCurrentException(log, "Error while creating original table. Maybe we are not first."); + } + + return TaskStatus::Finished; +} + /// Job for copying partition from particular shard.
TaskStatus ClusterCopier::tryProcessPartitionTask(const ConnectionTimeouts & timeouts, ShardPartition & task_partition, bool is_unprioritized_task) { @@ -1142,7 +1211,7 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( auto split_table_for_current_piece = task_shard.list_of_split_tables_on_shard[current_piece_number]; - auto zookeeper = context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); const String piece_is_dirty_flag_path = partition_piece.getPartitionPieceIsDirtyPath(); const String piece_is_dirty_cleaned_path = partition_piece.getPartitionPieceIsCleanedPath(); @@ -1193,7 +1262,7 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( ParserQuery p_query(query.data() + query.size()); - const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); return parseQuery(p_query, query, settings.max_query_size, settings.max_parser_depth); }; @@ -1297,10 +1366,10 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( ASTPtr query_select_ast = get_select_query(split_table_for_current_piece, "count()", /*enable_splitting*/ true); UInt64 count; { - Context local_context = context; + auto local_context = Context::createCopy(context); // Use pull (i.e. readonly) settings, but fetch data from destination servers - local_context.setSettings(task_cluster->settings_pull); - local_context.setSetting("skip_unavailable_shards", true); + local_context->setSettings(task_cluster->settings_pull); + local_context->setSetting("skip_unavailable_shards", true); Block block = getBlockWithAllStreamData(InterpreterFactory::get(query_select_ast, local_context)->execute().getInputStream()); count = (block) ? block.safeGetByPosition(0).column->getUInt(0) : 0; @@ -1366,8 +1435,17 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( LOG_DEBUG(log, "Create destination tables. 
Query: {}", query); UInt64 shards = executeQueryOnCluster(task_table.cluster_push, query, task_cluster->settings_push, PoolMode::GET_MANY); - LOG_DEBUG(log, "Destination tables {} have been created on {} shards of {}", - getQuotedTable(task_table.table_push), shards, task_table.cluster_push->getShardCount()); + LOG_INFO( + log, + "Destination tables {} have been created on {} shards of {}", + getQuotedTable(task_table.table_push), + shards, + task_table.cluster_push->getShardCount()); + + if (shards != task_table.cluster_push->getShardCount()) + { + return TaskStatus::Error; + } } /// Do the copying @@ -1390,7 +1468,7 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( query += "INSERT INTO " + getQuotedTable(split_table_for_current_piece) + " VALUES "; ParserQuery p_query(query.data() + query.size()); - const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); query_insert_ast = parseQuery(p_query, query, settings.max_query_size, settings.max_parser_depth); LOG_DEBUG(log, "Executing INSERT query: {}", query); @@ -1398,18 +1476,18 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( try { - std::unique_ptr context_select = std::make_unique(context); + auto context_select = Context::createCopy(context); context_select->setSettings(task_cluster->settings_pull); - std::unique_ptr context_insert = std::make_unique(context); + auto context_insert = Context::createCopy(context); context_insert->setSettings(task_cluster->settings_push); /// Custom INSERT SELECT implementation BlockInputStreamPtr input; BlockOutputStreamPtr output; { - BlockIO io_select = InterpreterFactory::get(query_select_ast, *context_select)->execute(); - BlockIO io_insert = InterpreterFactory::get(query_insert_ast, *context_insert)->execute(); + BlockIO io_select = InterpreterFactory::get(query_select_ast, context_select)->execute(); + BlockIO io_insert = InterpreterFactory::get(query_insert_ast, context_insert)->execute(); input = io_select.getInputStream(); output = io_insert.out; @@ -1477,26 +1555,6 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl( LOG_INFO(log, "Partition {} piece {} copied. But not moved to original destination table.", task_partition.name, toString(current_piece_number)); - - /// Try create original table (if not exists) on each shard - try - { - auto create_query_push_ast = rewriteCreateQueryStorage(task_shard.current_pull_table_create_query, - task_table.table_push, task_table.engine_push_ast); - auto & create = create_query_push_ast->as(); - create.if_not_exists = true; - InterpreterCreateQuery::prepareOnClusterQuery(create, context, task_table.cluster_push_name); - String query = queryToString(create_query_push_ast); - - LOG_DEBUG(log, "Create destination tables. Query: {}", query); - UInt64 shards = executeQueryOnCluster(task_table.cluster_push, query, task_cluster->settings_push, PoolMode::GET_MANY); - LOG_DEBUG(log, "Destination tables {} have been created on {} shards of {}", getQuotedTable(task_table.table_push), shards, task_table.cluster_push->getShardCount()); - } - catch (...) - { - tryLogCurrentException(log, "Error while creating original table. 
Maybe we are not first."); - } - /// Finalize the processing, change state of current partition task (and also check is_dirty flag) { String state_finished = TaskStateWithOwner::getData(TaskState::Finished, host_id); @@ -1523,7 +1581,7 @@ void ClusterCopier::dropAndCreateLocalTable(const ASTPtr & create_ast) const auto & create = create_ast->as(); dropLocalTableIfExists({create.database, create.table}); - InterpreterCreateQuery interpreter(create_ast, context); + InterpreterCreateQuery interpreter(create_ast, getContext()); interpreter.execute(); } @@ -1534,37 +1592,40 @@ void ClusterCopier::dropLocalTableIfExists(const DatabaseAndTableName & table_na drop_ast->database = table_name.first; drop_ast->table = table_name.second; - InterpreterDropQuery interpreter(drop_ast, context); + InterpreterDropQuery interpreter(drop_ast, getContext()); interpreter.execute(); } +void ClusterCopier::dropHelpingTablesByPieceNumber(const TaskTable & task_table, size_t current_piece_number) +{ + LOG_DEBUG(log, "Removing helping tables piece {}", current_piece_number); + + DatabaseAndTableName original_table = task_table.table_push; + DatabaseAndTableName helping_table + = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number)); + + String query = "DROP TABLE IF EXISTS " + getQuotedTable(helping_table); + + const ClusterPtr & cluster_push = task_table.cluster_push; + Settings settings_push = task_cluster->settings_push; + + LOG_DEBUG(log, "Execute distributed DROP TABLE: {}", query); + + /// We have to drop partition_piece on each replica + UInt64 num_nodes = executeQueryOnCluster(cluster_push, query, settings_push, PoolMode::GET_MANY, ClusterExecutionMode::ON_EACH_NODE); + + LOG_INFO(log, "DROP TABLE query was successfully executed on {} nodes.", toString(num_nodes)); +} void ClusterCopier::dropHelpingTables(const TaskTable & task_table) { LOG_DEBUG(log, "Removing helping tables"); for (size_t current_piece_number = 0; current_piece_number < task_table.number_of_splits; ++current_piece_number) { - DatabaseAndTableName original_table = task_table.table_push; - DatabaseAndTableName helping_table = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number)); - - String query = "DROP TABLE IF EXISTS " + getQuotedTable(helping_table); - - const ClusterPtr & cluster_push = task_table.cluster_push; - Settings settings_push = task_cluster->settings_push; - - LOG_DEBUG(log, "Execute distributed DROP TABLE: {}", query); - /// We have to drop partition_piece on each replica - UInt64 num_nodes = executeQueryOnCluster( - cluster_push, query, - settings_push, - PoolMode::GET_MANY, - ClusterExecutionMode::ON_EACH_NODE); - - LOG_DEBUG(log, "DROP TABLE query was successfully executed on {} nodes.", toString(num_nodes)); + dropHelpingTablesByPieceNumber(task_table, current_piece_number); } } - void ClusterCopier::dropParticularPartitionPieceFromAllHelpingTables(const TaskTable & task_table, const String & partition_name) { LOG_DEBUG(log, "Try drop partition partition from all helping tables."); @@ -1586,15 +1647,15 @@ void ClusterCopier::dropParticularPartitionPieceFromAllHelpingTables(const TaskT PoolMode::GET_MANY, ClusterExecutionMode::ON_EACH_NODE); - LOG_DEBUG(log, "DROP PARTITION query was successfully executed on {} nodes.", toString(num_nodes)); + LOG_INFO(log, "DROP PARTITION query was successfully executed on {} nodes.", toString(num_nodes)); } LOG_DEBUG(log, "All helping tables dropped partition {}", 
partition_name); } String ClusterCopier::getRemoteCreateTable(const DatabaseAndTableName & table, Connection & connection, const Settings & settings) { - Context remote_context(context); - remote_context.setSettings(settings); + auto remote_context = Context::createCopy(context); + remote_context->setSettings(settings); String query = "SHOW CREATE TABLE " + getQuotedTable(table); Block block = getBlockWithAllStreamData(std::make_shared( @@ -1613,7 +1674,7 @@ ASTPtr ClusterCopier::getCreateTableForPullShard(const ConnectionTimeouts & time task_cluster->settings_pull); ParserCreateQuery parser_create_query; - const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); return parseQuery(parser_create_query, create_query_pull_str, settings.max_query_size, settings.max_parser_depth); } @@ -1642,7 +1703,7 @@ void ClusterCopier::createShardInternalTables(const ConnectionTimeouts & timeout /// Create special cluster with single shard String shard_read_cluster_name = read_shard_prefix + task_table.cluster_pull_name; ClusterPtr cluster_pull_current_shard = task_table.cluster_pull->getClusterWithSingleShard(task_shard.indexInCluster()); - context.setCluster(shard_read_cluster_name, cluster_pull_current_shard); + getContext()->setCluster(shard_read_cluster_name, cluster_pull_current_shard); auto storage_shard_ast = createASTStorageDistributed(shard_read_cluster_name, task_table.table_pull.first, task_table.table_pull.second); @@ -1702,13 +1763,13 @@ std::set ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti } ParserQuery parser_query(query.data() + query.size()); - const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); ASTPtr query_ast = parseQuery(parser_query, query, settings.max_query_size, settings.max_parser_depth); LOG_DEBUG(log, "Computing destination partition set, executing query: {}", query); - Context local_context = context; - local_context.setSettings(task_cluster->settings_pull); + auto local_context = Context::createCopy(context); + local_context->setSettings(task_cluster->settings_pull); Block block = getBlockWithAllStreamData(InterpreterFactory::get(query_ast, local_context)->execute().getInputStream()); if (block) @@ -1719,7 +1780,7 @@ std::set ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti for (size_t i = 0; i < column.column->size(); ++i) { WriteBufferFromOwnString wb; - column.type->serializeAsTextQuoted(*column.column, i, wb, FormatSettings()); + column.type->getDefaultSerialization()->serializeTextQuoted(*column.column, i, wb, FormatSettings()); res.emplace(wb.str()); } } @@ -1748,11 +1809,11 @@ bool ClusterCopier::checkShardHasPartition(const ConnectionTimeouts & timeouts, LOG_DEBUG(log, "Checking shard {} for partition {} existence, executing query: {}", task_shard.getDescription(), partition_quoted_name, query); ParserQuery parser_query(query.data() + query.size()); -const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); ASTPtr query_ast = parseQuery(parser_query, query, settings.max_query_size, settings.max_parser_depth); - Context local_context = context; - local_context.setSettings(task_cluster->settings_pull); + auto local_context = Context::createCopy(context); + local_context->setSettings(task_cluster->settings_pull); return InterpreterFactory::get(query_ast, local_context)->execute().getInputStream()->read().rows() != 0; } @@ -1787,11 +1848,11 @@ bool 
ClusterCopier::checkPresentPartitionPiecesOnCurrentShard(const ConnectionTi LOG_DEBUG(log, "Checking shard {} for partition {} piece {} existence, executing query: {}", task_shard.getDescription(), partition_quoted_name, std::to_string(current_piece_number), query); ParserQuery parser_query(query.data() + query.size()); - const auto & settings = context.getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); ASTPtr query_ast = parseQuery(parser_query, query, settings.max_query_size, settings.max_parser_depth); - Context local_context = context; - local_context.setSettings(task_cluster->settings_pull); + auto local_context = Context::createCopy(context); + local_context->setSettings(task_cluster->settings_pull); auto result = InterpreterFactory::get(query_ast, local_context)->execute().getInputStream()->read().rows(); if (result != 0) LOG_DEBUG(log, "Partition {} piece number {} is PRESENT on shard {}", partition_quoted_name, std::to_string(current_piece_number), task_shard.getDescription()); @@ -1847,7 +1908,7 @@ UInt64 ClusterCopier::executeQueryOnCluster( /// In that case we don't have local replicas, but do it just in case for (UInt64 i = 0; i < num_local_replicas; ++i) { - auto interpreter = InterpreterFactory::get(query_ast, context); + auto interpreter = InterpreterFactory::get(query_ast, getContext()); interpreter->execute(); if (increment_and_check_exit()) @@ -1862,8 +1923,8 @@ UInt64 ClusterCopier::executeQueryOnCluster( auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(shard_settings).getSaturated(shard_settings.max_execution_time); auto connections = shard.pool->getMany(timeouts, &shard_settings, pool_mode); - Context shard_context(context); - shard_context.setSettings(shard_settings); + auto shard_context = Context::createCopy(context); + shard_context->setSettings(shard_settings); for (auto & connection : connections) { diff --git a/programs/copier/ClusterCopier.h b/programs/copier/ClusterCopier.h index 9aff5493cf8..e875ca7df2e 100644 --- a/programs/copier/ClusterCopier.h +++ b/programs/copier/ClusterCopier.h @@ -12,18 +12,17 @@ namespace DB { -class ClusterCopier +class ClusterCopier : WithContext { public: ClusterCopier(const String & task_path_, const String & host_id_, const String & proxy_database_name_, - Context & context_) - : + ContextPtr context_) + : WithContext(context_), task_zookeeper_path(task_path_), host_id(host_id_), working_database_name(proxy_database_name_), - context(context_), log(&Poco::Logger::get("ClusterCopier")) {} void init(); @@ -36,7 +35,7 @@ public: /// Compute set of partitions, assume set of partitions aren't changed during the processing void discoverTablePartitions(const ConnectionTimeouts & timeouts, TaskTable & task_table, UInt64 num_threads = 0); - void uploadTaskDescription(const std::string & task_path, const std::string & task_file, const bool force); + void uploadTaskDescription(const std::string & task_path, const std::string & task_file, bool force); void reloadTaskDescription(); @@ -120,15 +119,16 @@ protected: /// Removes MATERIALIZED and ALIAS columns from create table query static ASTPtr removeAliasColumnsFromCreateQuery(const ASTPtr & query_ast); - bool tryDropPartitionPiece(ShardPartition & task_partition, const size_t current_piece_number, + bool tryDropPartitionPiece(ShardPartition & task_partition, size_t current_piece_number, const zkutil::ZooKeeperPtr & zookeeper, const CleanStateClock & clean_state_clock); - static constexpr UInt64 max_table_tries = 1000; - static constexpr UInt64 
max_shard_partition_tries = 600; - static constexpr UInt64 max_shard_partition_piece_tries_for_alter = 100; + static constexpr UInt64 max_table_tries = 3; + static constexpr UInt64 max_shard_partition_tries = 3; + static constexpr UInt64 max_shard_partition_piece_tries_for_alter = 3; bool tryProcessTable(const ConnectionTimeouts & timeouts, TaskTable & task_table); + TaskStatus tryCreateDestinationTable(const ConnectionTimeouts & timeouts, TaskTable & task_table); /// Job for copying partition from particular shard. TaskStatus tryProcessPartitionTask(const ConnectionTimeouts & timeouts, ShardPartition & task_partition, @@ -140,7 +140,7 @@ protected: TaskStatus processPartitionPieceTaskImpl(const ConnectionTimeouts & timeouts, ShardPartition & task_partition, - const size_t current_piece_number, + size_t current_piece_number, bool is_unprioritized_task); void dropAndCreateLocalTable(const ASTPtr & create_ast); @@ -149,6 +149,8 @@ protected: void dropHelpingTables(const TaskTable & task_table); + void dropHelpingTablesByPieceNumber(const TaskTable & task_table, size_t current_piece_number); + /// Is used for usage less disk space. /// After all pieces were successfully moved to original destination /// table we can get rid of partition pieces (partitions in helping tables). @@ -216,7 +218,6 @@ private: bool experimental_use_sample_offset{false}; - Context & context; Poco::Logger * log; std::chrono::milliseconds default_sleep_time{1000}; diff --git a/programs/copier/ClusterCopierApp.cpp b/programs/copier/ClusterCopierApp.cpp index e3169a49ecf..d3fff616b65 100644 --- a/programs/copier/ClusterCopierApp.cpp +++ b/programs/copier/ClusterCopierApp.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -110,9 +111,9 @@ void ClusterCopierApp::mainImpl() LOG_INFO(log, "Starting clickhouse-copier (id {}, host_id {}, path {}, revision {})", process_id, host_id, process_path, ClickHouseRevision::getVersionRevision()); SharedContextHolder shared_context = Context::createShared(); - auto context = std::make_unique(Context::createGlobal(shared_context.get())); + auto context = Context::createGlobal(shared_context.get()); context->makeGlobalContext(); - SCOPE_EXIT(context->shutdown()); + SCOPE_EXIT_SAFE(context->shutdown()); context->setConfig(loaded_config.configuration); context->setApplicationType(Context::ApplicationType::LOCAL); @@ -127,13 +128,13 @@ void ClusterCopierApp::mainImpl() registerFormats(); static const std::string default_database = "_local"; - DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, *context)); + DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, context)); context->setCurrentDatabase(default_database); /// Initialize query scope just in case. 
- CurrentThread::QueryScope query_scope(*context); + CurrentThread::QueryScope query_scope(context); - auto copier = std::make_unique(task_path, host_id, default_database, *context); + auto copier = std::make_unique(task_path, host_id, default_database, context); copier->setSafeMode(is_safe_mode); copier->setCopyFaultProbability(copy_fault_probability); copier->setMoveFaultProbability(move_fault_probability); diff --git a/programs/copier/Internals.cpp b/programs/copier/Internals.cpp index ea2be469945..bec612a8226 100644 --- a/programs/copier/Internals.cpp +++ b/programs/copier/Internals.cpp @@ -222,8 +222,8 @@ Names extractPrimaryKeyColumnNames(const ASTPtr & storage_ast) { String pk_column = primary_key_expr_list->children[i]->getColumnName(); if (pk_column != sorting_key_column) - throw Exception("Primary key must be a prefix of the sorting key, but in position " - + toString(i) + " its column is " + pk_column + ", not " + sorting_key_column, + throw Exception("Primary key must be a prefix of the sorting key, but the column in the position " + + toString(i) + " is " + sorting_key_column +", not " + pk_column, ErrorCodes::BAD_ARGUMENTS); if (!primary_key_columns_set.emplace(pk_column).second) diff --git a/programs/copier/TaskCluster.h b/programs/copier/TaskCluster.h index 5b28f461dd8..1a50597d07f 100644 --- a/programs/copier/TaskCluster.h +++ b/programs/copier/TaskCluster.h @@ -98,6 +98,7 @@ inline void DB::TaskCluster::reloadSettings(const Poco::Util::AbstractConfigurat set_default_value(settings_pull.max_block_size, 8192UL); set_default_value(settings_pull.preferred_block_size_bytes, 0); set_default_value(settings_push.insert_distributed_timeout, 0); + set_default_value(settings_push.replication_alter_partitions_sync, 2); } } diff --git a/programs/format/Format.cpp b/programs/format/Format.cpp index 86a85d1d4a5..5bf19191353 100644 --- a/programs/format/Format.cpp +++ b/programs/format/Format.cpp @@ -1,16 +1,18 @@ +#include #include #include -#include #include #include #include #include #include +#include #include -#include #include #include +#include +#include #include #include @@ -28,6 +30,14 @@ #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wmissing-declarations" +namespace DB +{ +namespace ErrorCodes +{ +extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA; +} +} + int mainEntryClickHouseFormat(int argc, char ** argv) { using namespace DB; @@ -40,6 +50,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv) ("quiet,q", "just check syntax, no output on success") ("multiquery,n", "allow multiple queries in the same file") ("obfuscate", "obfuscate instead of formatting") + ("backslash", "add a backslash at the end of each line of the formatted query") ("seed", po::value(), "seed (arbitrary string) that determines the result of obfuscation") ; @@ -60,6 +71,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv) bool quiet = options.count("quiet"); bool multiple = options.count("multiquery"); bool obfuscate = options.count("obfuscate"); + bool backslash = options.count("backslash"); if (quiet && (hilite || oneline || obfuscate)) { @@ -90,8 +102,8 @@ int mainEntryClickHouseFormat(int argc, char ** argv) } SharedContextHolder shared_context = Context::createShared(); - Context context = Context::createGlobal(shared_context.get()); - context.makeGlobalContext(); + auto context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); registerFunctions(); registerAggregateFunctions(); @@ -128,15 +140,70 @@ int 
mainEntryClickHouseFormat(int argc, char ** argv) do { ASTPtr res = parseQueryAndMovePosition(parser, pos, end, "query", multiple, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); + /// For insert query with data(INSERT INTO ... VALUES ...), will lead to format fail, + /// should throw exception early and make exception message more readable. + if (const auto * insert_query = res->as(); insert_query && insert_query->data) + { + throw Exception( + "Can't format ASTInsertQuery with data, since data will be lost", + DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA); + } if (!quiet) { - WriteBufferFromOStream res_buf(std::cout, 4096); - formatAST(*res, res_buf, hilite, oneline); - res_buf.next(); - if (multiple) - std::cout << "\n;\n"; - std::cout << std::endl; + if (!backslash) + { + WriteBufferFromOStream res_buf(std::cout, 4096); + formatAST(*res, res_buf, hilite, oneline); + res_buf.next(); + if (multiple) + std::cout << "\n;\n"; + std::cout << std::endl; + } + /// add additional '\' at the end of each line; + else + { + WriteBufferFromOwnString str_buf; + formatAST(*res, str_buf, hilite, oneline); + + auto res_string = str_buf.str(); + WriteBufferFromOStream res_cout(std::cout, 4096); + + const char * s_pos= res_string.data(); + const char * s_end = s_pos + res_string.size(); + + while (s_pos != s_end) + { + if (*s_pos == '\n') + res_cout.write(" \\", 2); + res_cout.write(*s_pos++); + } + + res_cout.next(); + if (multiple) + std::cout << " \\\n;\n"; + std::cout << std::endl; + } } + + do + { + /// skip spaces to avoid throw exception after last query + while (pos != end && std::isspace(*pos)) + ++pos; + + /// for skip comment after the last query and to not throw exception + if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-') + { + pos += 2; + /// skip until the end of the line + while (pos != end && *pos != '\n') + ++pos; + } + /// need to parse next sql + else + break; + } while (pos != end); + } while (multiple && pos != end); } } diff --git a/programs/git-import/git-import.cpp b/programs/git-import/git-import.cpp index ae8b55e2aff..b07435dcf78 100644 --- a/programs/git-import/git-import.cpp +++ b/programs/git-import/git-import.cpp @@ -1064,7 +1064,7 @@ void processCommit( time_t commit_time; readText(commit_time, in); - commit.time = commit_time; + commit.time = LocalDateTime(commit_time); assertChar('\0', in); readNullTerminated(commit.author, in); std::string parent_hash; diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 8404586d394..2b0f390f709 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -66,10 +66,14 @@ namespace ErrorCodes extern const int CANNOT_OPEN_FILE; extern const int SYSTEM_ERROR; extern const int NOT_ENOUGH_SPACE; + extern const int CANNOT_KILL; } } +/// ANSI escape sequence for intense color in terminal. +#define HILITE "\033[1m" +#define END_HILITE "\033[0m" using namespace DB; namespace po = boost::program_options; @@ -558,20 +562,32 @@ int mainEntryClickHouseInstall(int argc, char ** argv) bool stdin_is_a_tty = isatty(STDIN_FILENO); bool stdout_is_a_tty = isatty(STDOUT_FILENO); - bool is_interactive = stdin_is_a_tty && stdout_is_a_tty; + + /// dpkg or apt installers can ask for non-interactive work explicitly. 
+ + const char * debian_frontend_var = getenv("DEBIAN_FRONTEND"); + bool noninteractive = debian_frontend_var && debian_frontend_var == std::string_view("noninteractive"); + + bool is_interactive = !noninteractive && stdin_is_a_tty && stdout_is_a_tty; + + /// We can ask password even if stdin is closed/redirected but /dev/tty is available. + bool can_ask_password = !noninteractive && stdout_is_a_tty; if (has_password_for_default_user) { - fmt::print("Password for default user is already specified. To remind or reset, see {} and {}.\n", + fmt::print(HILITE "Password for default user is already specified. To remind or reset, see {} and {}." END_HILITE "\n", users_config_file.string(), users_d.string()); } - else if (!is_interactive) + else if (!can_ask_password) { - fmt::print("Password for default user is empty string. See {} and {} to change it.\n", + fmt::print(HILITE "Password for default user is empty string. See {} and {} to change it." END_HILITE "\n", users_config_file.string(), users_d.string()); } else { + /// NOTE: When installing debian package with dpkg -i, stdin is not a terminal but we are still being able to enter password. + /// More sophisticated method with /dev/tty is used inside the `readpassphrase` function. + char buf[1000] = {}; std::string password; if (auto * result = readpassphrase("Enter password for default user: ", buf, sizeof(buf), 0)) @@ -599,7 +615,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "
\n"; out.sync(); out.finalize(); - fmt::print("Password for default user is saved in file {}.\n", password_file); + fmt::print(HILITE "Password for default user is saved in file {}." END_HILITE "\n", password_file); #else out << "\n" " \n" @@ -610,12 +626,12 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "\n"; out.sync(); out.finalize(); - fmt::print("Password for default user is saved in plaintext in file {}.\n", password_file); + fmt::print(HILITE "Password for default user is saved in plaintext in file {}." END_HILITE "\n", password_file); #endif has_password_for_default_user = true; } else - fmt::print("Password for default user is empty string. See {} and {} to change it.\n", + fmt::print(HILITE "Password for default user is empty string. See {} and {} to change it." END_HILITE "\n", users_config_file.string(), users_d.string()); } @@ -640,7 +656,6 @@ int mainEntryClickHouseInstall(int argc, char ** argv) " This is optional. Taskstats accounting will be disabled." " To enable taskstats accounting you may add the required capability later manually.\"", "/tmp/test_setcap.sh", fs::canonical(main_bin_path).string()); - fmt::print(" {}\n", command); executeScript(command); #endif @@ -886,6 +901,27 @@ namespace fmt::print("Sent kill signal.\n", pid); else throwFromErrno("Cannot send kill signal", ErrorCodes::SYSTEM_ERROR); + + /// Wait for the process (100 seconds). + constexpr size_t num_kill_check_tries = 1000; + constexpr size_t kill_check_delay_ms = 100; + for (size_t i = 0; i < num_kill_check_tries; ++i) + { + fmt::print("Waiting for server to be killed\n"); + if (!isRunning(pid_file)) + { + fmt::print("Server exited\n"); + break; + } + sleepForMilliseconds(kill_check_delay_ms); + } + + if (isRunning(pid_file)) + { + throw Exception(ErrorCodes::CANNOT_KILL, + "The server process still exists after %zu ms", + num_kill_check_tries, kill_check_delay_ms); + } } return 0; diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt new file mode 100644 index 00000000000..0913c6e4a9a --- /dev/null +++ b/programs/library-bridge/CMakeLists.txt @@ -0,0 +1,25 @@ +set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES + library-bridge.cpp + LibraryInterface.cpp + LibraryBridge.cpp + Handlers.cpp + HandlerFactory.cpp + SharedLibraryHandler.cpp + SharedLibraryHandlerFactory.cpp +) + +if (OS_LINUX) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") +endif () + +add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES}) + +target_link_libraries(clickhouse-library-bridge PRIVATE + daemon + dbms + bridge +) + +set_target_properties(clickhouse-library-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..) 
+ +install(TARGETS clickhouse-library-bridge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) diff --git a/programs/library-bridge/HandlerFactory.cpp b/programs/library-bridge/HandlerFactory.cpp new file mode 100644 index 00000000000..9f53a24156f --- /dev/null +++ b/programs/library-bridge/HandlerFactory.cpp @@ -0,0 +1,23 @@ +#include "HandlerFactory.h" + +#include +#include +#include "Handlers.h" + + +namespace DB +{ + std::unique_ptr LibraryBridgeHandlerFactory::createRequestHandler(const HTTPServerRequest & request) + { + Poco::URI uri{request.getURI()}; + LOG_DEBUG(log, "Request URI: {}", uri.toString()); + + if (uri == "/ping" && request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) + return std::make_unique(keep_alive_timeout); + + if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) + return std::make_unique(keep_alive_timeout, getContext()); + + return nullptr; + } +} diff --git a/programs/library-bridge/HandlerFactory.h b/programs/library-bridge/HandlerFactory.h new file mode 100644 index 00000000000..93f0721bf01 --- /dev/null +++ b/programs/library-bridge/HandlerFactory.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class SharedLibraryHandler; +using SharedLibraryHandlerPtr = std::shared_ptr; + +/// Factory for '/ping', '/' handlers. +class LibraryBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContext +{ +public: + LibraryBridgeHandlerFactory( + const std::string & name_, + size_t keep_alive_timeout_, + ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get(name_)) + , name(name_) + , keep_alive_timeout(keep_alive_timeout_) + { + } + + std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; + +private: + Poco::Logger * log; + std::string name; + size_t keep_alive_timeout; +}; + +} diff --git a/programs/library-bridge/Handlers.cpp b/programs/library-bridge/Handlers.cpp new file mode 100644 index 00000000000..6a1bfbbccb7 --- /dev/null +++ b/programs/library-bridge/Handlers.cpp @@ -0,0 +1,288 @@ +#include "Handlers.h" +#include "SharedLibraryHandlerFactory.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace +{ + std::shared_ptr parseColumns(std::string && column_string) + { + auto sample_block = std::make_shared(); + auto names_and_types = NamesAndTypesList::parse(column_string); + + for (const NameAndTypePair & column_data : names_and_types) + sample_block->insert({column_data.type, column_data.name}); + + return sample_block; + } + + std::vector parseIdsFromBinary(const std::string & ids_string) + { + ReadBufferFromString buf(ids_string); + std::vector ids; + readVectorBinary(ids, buf); + return ids; + } + + std::vector parseNamesFromBinary(const std::string & names_string) + { + ReadBufferFromString buf(names_string); + std::vector names; + readVectorBinary(names, buf); + return names; + } +} + + +void LibraryRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) +{ + LOG_TRACE(log, "Request URI: {}", request.getURI()); + HTMLForm params(request); + + if (!params.has("method")) + { + processError(response, "No 'method' in request URL"); + return; + } + + if (!params.has("dictionary_id")) + { + processError(response, "No 'dictionary_id in request URL"); + return; + } + + std::string method = params.get("method"); + std::string dictionary_id = params.get("dictionary_id"); + LOG_TRACE(log, "Library method: '{}', 
dictionary id: {}", method, dictionary_id); + + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + + try + { + if (method == "libNew") + { + auto & read_buf = request.getStream(); + params.read(read_buf); + + if (!params.has("library_path")) + { + processError(response, "No 'library_path' in request URL"); + return; + } + + if (!params.has("library_settings")) + { + processError(response, "No 'library_settings' in request URL"); + return; + } + + std::string library_path = params.get("library_path"); + const auto & settings_string = params.get("library_settings"); + std::vector library_settings = parseNamesFromBinary(settings_string); + + /// Needed for library dictionary + if (!params.has("attributes_names")) + { + processError(response, "No 'attributes_names' in request URL"); + return; + } + + const auto & attributes_string = params.get("attributes_names"); + std::vector attributes_names = parseNamesFromBinary(attributes_string); + + /// Needed to parse block from binary string format + if (!params.has("sample_block")) + { + processError(response, "No 'sample_block' in request URL"); + return; + } + std::string sample_block_string = params.get("sample_block"); + + std::shared_ptr sample_block; + try + { + sample_block = parseColumns(std::move(sample_block_string)); + } + catch (const Exception & ex) + { + processError(response, "Invalid 'sample_block' parameter in request body '" + ex.message() + "'"); + LOG_WARNING(log, ex.getStackTraceString()); + return; + } + + if (!params.has("null_values")) + { + processError(response, "No 'null_values' in request URL"); + return; + } + + ReadBufferFromString read_block_buf(params.get("null_values")); + auto format = FormatFactory::instance().getInput(FORMAT, read_block_buf, *sample_block, getContext(), DEFAULT_BLOCK_SIZE); + auto reader = std::make_shared(format); + auto sample_block_with_nulls = reader->read(); + + LOG_DEBUG(log, "Dictionary sample block with null values: {}", sample_block_with_nulls.dumpStructure()); + + SharedLibraryHandlerFactory::instance().create(dictionary_id, library_path, library_settings, sample_block_with_nulls, attributes_names); + writeStringBinary("1", out); + } + else if (method == "libClone") + { + if (!params.has("from_dictionary_id")) + { + processError(response, "No 'from_dictionary_id' in request URL"); + return; + } + + std::string from_dictionary_id = params.get("from_dictionary_id"); + LOG_TRACE(log, "Calling libClone from {} to {}", from_dictionary_id, dictionary_id); + SharedLibraryHandlerFactory::instance().clone(from_dictionary_id, dictionary_id); + writeStringBinary("1", out); + } + else if (method == "libDelete") + { + SharedLibraryHandlerFactory::instance().remove(dictionary_id); + writeStringBinary("1", out); + } + else if (method == "isModified") + { + auto library_handler = SharedLibraryHandlerFactory::instance().get(dictionary_id); + bool res = library_handler->isModified(); + writeStringBinary(std::to_string(res), out); + } + else if (method == "supportsSelectiveLoad") + { + auto library_handler = SharedLibraryHandlerFactory::instance().get(dictionary_id); + bool res = library_handler->supportsSelectiveLoad(); + writeStringBinary(std::to_string(res), out); + } + else if (method == "loadAll") + { + auto library_handler = SharedLibraryHandlerFactory::instance().get(dictionary_id); + const auto & sample_block = library_handler->getSampleBlock(); + auto input = library_handler->loadAll(); + + BlockOutputStreamPtr output = 
FormatFactory::instance().getOutputStream(FORMAT, out, sample_block, getContext()); + copyData(*input, *output); + } + else if (method == "loadIds") + { + params.read(request.getStream()); + + if (!params.has("ids")) + { + processError(response, "No 'ids' in request URL"); + return; + } + + std::vector ids = parseIdsFromBinary(params.get("ids")); + auto library_handler = SharedLibraryHandlerFactory::instance().get(dictionary_id); + const auto & sample_block = library_handler->getSampleBlock(); + auto input = library_handler->loadIds(ids); + BlockOutputStreamPtr output = FormatFactory::instance().getOutputStream(FORMAT, out, sample_block, getContext()); + copyData(*input, *output); + } + else if (method == "loadKeys") + { + if (!params.has("requested_block_sample")) + { + processError(response, "No 'requested_block_sample' in request URL"); + return; + } + + std::string requested_block_string = params.get("requested_block_sample"); + + std::shared_ptr requested_sample_block; + try + { + requested_sample_block = parseColumns(std::move(requested_block_string)); + } + catch (const Exception & ex) + { + processError(response, "Invalid 'requested_block' parameter in request body '" + ex.message() + "'"); + LOG_WARNING(log, ex.getStackTraceString()); + return; + } + + auto & read_buf = request.getStream(); + auto format = FormatFactory::instance().getInput(FORMAT, read_buf, *requested_sample_block, getContext(), DEFAULT_BLOCK_SIZE); + auto reader = std::make_shared(format); + auto block = reader->read(); + + auto library_handler = SharedLibraryHandlerFactory::instance().get(dictionary_id); + const auto & sample_block = library_handler->getSampleBlock(); + auto input = library_handler->loadKeys(block.getColumns()); + BlockOutputStreamPtr output = FormatFactory::instance().getOutputStream(FORMAT, out, sample_block, getContext()); + copyData(*input, *output); + } + } + catch (...) + { + auto message = getCurrentExceptionMessage(true); + response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR, message); // can't call process_error, because of too soon response sending + + try + { + writeStringBinary(message, out); + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); + } + + tryLogCurrentException(log); + } + + try + { + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); + } +} + + +void LibraryRequestHandler::processError(HTTPServerResponse & response, const std::string & message) +{ + response.setStatusAndReason(HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); + + if (!response.sent()) + *response.send() << message << std::endl; + + LOG_WARNING(log, message); +} + + +void PingHandler::handleRequest(HTTPServerRequest & /* request */, HTTPServerResponse & response) +{ + try + { + setResponseDefaultHeaders(response, keep_alive_timeout); + const char * data = "Ok.\n"; + response.sendBuffer(data, strlen(data)); + } + catch (...) + { + tryLogCurrentException("PingHandler"); + } +} + + +} diff --git a/programs/library-bridge/Handlers.h b/programs/library-bridge/Handlers.h new file mode 100644 index 00000000000..dac61d3a735 --- /dev/null +++ b/programs/library-bridge/Handlers.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include "SharedLibraryHandler.h" + + +namespace DB +{ + + +/// Handler for requests to Library Dictionary Source, returns response in RowBinary format. +/// When a library dictionary source is created, it sends libNew request to library bridge (which is started on first +/// request to it, if it was not yet started). 
On this request a new SharedLibraryHandler is added to the +/// SharedLibraryHandlerFactory under the dictionary UUID. With the libNew request come: library_path, library_settings, +/// the names of the dictionary attributes, a sample block to parse the block of null values, and the block of null values. Everything is +/// passed in binary format and is urlencoded. When a dictionary is cloned, a new handler is created. +/// Each handler is unique to a dictionary. +class LibraryRequestHandler : public HTTPRequestHandler, WithContext +{ +public: + + LibraryRequestHandler( + size_t keep_alive_timeout_, + ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get("LibraryRequestHandler")) + , keep_alive_timeout(keep_alive_timeout_) + { + } + + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; + +private: + static constexpr inline auto FORMAT = "RowBinary"; + + void processError(HTTPServerResponse & response, const std::string & message); + + Poco::Logger * log; + size_t keep_alive_timeout; +}; + + +class PingHandler : public HTTPRequestHandler +{ +public: + explicit PingHandler(size_t keep_alive_timeout_) + : keep_alive_timeout(keep_alive_timeout_) + { + } + + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; + +private: + const size_t keep_alive_timeout; +}; + +} diff --git a/programs/library-bridge/LibraryBridge.cpp b/programs/library-bridge/LibraryBridge.cpp new file mode 100644 index 00000000000..2e5d6041151 --- /dev/null +++ b/programs/library-bridge/LibraryBridge.cpp @@ -0,0 +1,17 @@ +#include "LibraryBridge.h" + +#pragma GCC diagnostic ignored "-Wmissing-declarations" +int mainEntryClickHouseLibraryBridge(int argc, char ** argv) +{ + DB::LibraryBridge app; + try + { + return app.run(argc, argv); + } + catch (...) + { + std::cerr << DB::getCurrentExceptionMessage(true) << "\n"; + auto code = DB::getCurrentExceptionCode(); + return code ?
code : 1; + } +} diff --git a/programs/library-bridge/LibraryBridge.h b/programs/library-bridge/LibraryBridge.h new file mode 100644 index 00000000000..9f2dafb89ab --- /dev/null +++ b/programs/library-bridge/LibraryBridge.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include "HandlerFactory.h" + + +namespace DB +{ + +class LibraryBridge : public IBridge +{ + +protected: + std::string bridgeName() const override + { + return "LibraryBridge"; + } + + HandlerFactoryPtr getHandlerFactoryPtr(ContextPtr context) const override + { + return std::make_shared("LibraryRequestHandlerFactory-factory", keep_alive_timeout, context); + } +}; + +} diff --git a/src/Dictionaries/LibraryDictionarySourceExternal.cpp b/programs/library-bridge/LibraryInterface.cpp similarity index 55% rename from src/Dictionaries/LibraryDictionarySourceExternal.cpp rename to programs/library-bridge/LibraryInterface.cpp index 2e944056283..3975368c17f 100644 --- a/src/Dictionaries/LibraryDictionarySourceExternal.cpp +++ b/programs/library-bridge/LibraryInterface.cpp @@ -1,4 +1,5 @@ -#include "LibraryDictionarySourceExternal.h" +#include "LibraryInterface.h" + #include namespace @@ -6,10 +7,25 @@ namespace const char DICT_LOGGER_NAME[] = "LibraryDictionarySourceExternal"; } -void ClickHouseLibrary::log(ClickHouseLibrary::LogLevel level, ClickHouseLibrary::CString msg) +namespace ClickHouseLibrary { - using ClickHouseLibrary::LogLevel; +std::string_view LIBRARY_CREATE_NEW_FUNC_NAME = "ClickHouseDictionary_v3_libNew"; +std::string_view LIBRARY_CLONE_FUNC_NAME = "ClickHouseDictionary_v3_libClone"; +std::string_view LIBRARY_DELETE_FUNC_NAME = "ClickHouseDictionary_v3_libDelete"; + +std::string_view LIBRARY_DATA_NEW_FUNC_NAME = "ClickHouseDictionary_v3_dataNew"; +std::string_view LIBRARY_DATA_DELETE_FUNC_NAME = "ClickHouseDictionary_v3_dataDelete"; + +std::string_view LIBRARY_LOAD_ALL_FUNC_NAME = "ClickHouseDictionary_v3_loadAll"; +std::string_view LIBRARY_LOAD_IDS_FUNC_NAME = "ClickHouseDictionary_v3_loadIds"; +std::string_view LIBRARY_LOAD_KEYS_FUNC_NAME = "ClickHouseDictionary_v3_loadKeys"; + +std::string_view LIBRARY_IS_MODIFIED_FUNC_NAME = "ClickHouseDictionary_v3_isModified"; +std::string_view LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME = "ClickHouseDictionary_v3_supportsSelectiveLoad"; + +void log(LogLevel level, CString msg) +{ auto & logger = Poco::Logger::get(DICT_LOGGER_NAME); switch (level) { @@ -47,3 +63,5 @@ void ClickHouseLibrary::log(ClickHouseLibrary::LogLevel level, ClickHouseLibrary break; } } + +} diff --git a/programs/library-bridge/LibraryInterface.h b/programs/library-bridge/LibraryInterface.h new file mode 100644 index 00000000000..d23de59bbb1 --- /dev/null +++ b/programs/library-bridge/LibraryInterface.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include + +#define CLICKHOUSE_DICTIONARY_LIBRARY_API 1 + +namespace ClickHouseLibrary +{ +using CString = const char *; +using ColumnName = CString; +using ColumnNames = ColumnName[]; + +struct CStrings +{ + CString * data = nullptr; + uint64_t size = 0; +}; + +struct VectorUInt64 +{ + const uint64_t * data = nullptr; + uint64_t size = 0; +}; + +struct ColumnsUInt64 +{ + VectorUInt64 * data = nullptr; + uint64_t size = 0; +}; + +struct Field +{ + const void * data = nullptr; + uint64_t size = 0; +}; + +struct Row +{ + const Field * data = nullptr; + uint64_t size = 0; +}; + +struct Table +{ + const Row * data = nullptr; + uint64_t size = 0; + uint64_t error_code = 0; // 0 = ok; !0 = error, with message in error_string + const char * error_string = nullptr; 
+}; + +enum LogLevel +{ + FATAL = 1, + CRITICAL, + ERROR, + WARNING, + NOTICE, + INFORMATION, + DEBUG, + TRACE, +}; + +void log(LogLevel level, CString msg); + +extern std::string_view LIBRARY_CREATE_NEW_FUNC_NAME; +extern std::string_view LIBRARY_CLONE_FUNC_NAME; +extern std::string_view LIBRARY_DELETE_FUNC_NAME; + +extern std::string_view LIBRARY_DATA_NEW_FUNC_NAME; +extern std::string_view LIBRARY_DATA_DELETE_FUNC_NAME; + +extern std::string_view LIBRARY_LOAD_ALL_FUNC_NAME; +extern std::string_view LIBRARY_LOAD_IDS_FUNC_NAME; +extern std::string_view LIBRARY_LOAD_KEYS_FUNC_NAME; + +extern std::string_view LIBRARY_IS_MODIFIED_FUNC_NAME; +extern std::string_view LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME; + +using LibraryContext = void *; + +using LibraryLoggerFunc = void (*)(LogLevel, CString /* message */); + +using LibrarySettings = CStrings *; + +using LibraryNewFunc = LibraryContext (*)(LibrarySettings, LibraryLoggerFunc); +using LibraryCloneFunc = LibraryContext (*)(LibraryContext); +using LibraryDeleteFunc = void (*)(LibraryContext); + +using LibraryData = void *; +using LibraryDataNewFunc = LibraryData (*)(LibraryContext); +using LibraryDataDeleteFunc = void (*)(LibraryContext, LibraryData); + +/// Can be safely casted into const Table * with static_cast +using RawClickHouseLibraryTable = void *; +using RequestedColumnsNames = CStrings *; + +using LibraryLoadAllFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedColumnsNames); + +using RequestedIds = const VectorUInt64 *; +using LibraryLoadIdsFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedColumnsNames, RequestedIds); + +using RequestedKeys = Table *; +/// There are no requested column names for load keys func +using LibraryLoadKeysFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedKeys); + +using LibraryIsModifiedFunc = bool (*)(LibraryContext, LibrarySettings); +using LibrarySupportsSelectiveLoadFunc = bool (*)(LibraryContext, LibrarySettings); + +} diff --git a/programs/library-bridge/LibraryUtils.h b/programs/library-bridge/LibraryUtils.h new file mode 100644 index 00000000000..8ced8df1c48 --- /dev/null +++ b/programs/library-bridge/LibraryUtils.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include +#include + +#include "LibraryInterface.h" + + +namespace DB +{ + +class CStringsHolder +{ + +public: + using Container = std::vector; + + explicit CStringsHolder(const Container & strings_pass) + { + strings_holder = strings_pass; + strings.size = strings_holder.size(); + + ptr_holder = std::make_unique(strings.size); + strings.data = ptr_holder.get(); + + size_t i = 0; + for (auto & str : strings_holder) + { + strings.data[i] = str.c_str(); + ++i; + } + } + + ClickHouseLibrary::CStrings strings; // will pass pointer to lib + +private: + std::unique_ptr ptr_holder = nullptr; + Container strings_holder; +}; + + +} diff --git a/programs/library-bridge/SharedLibraryHandler.cpp b/programs/library-bridge/SharedLibraryHandler.cpp new file mode 100644 index 00000000000..ab8cf2417c2 --- /dev/null +++ b/programs/library-bridge/SharedLibraryHandler.cpp @@ -0,0 +1,219 @@ +#include "SharedLibraryHandler.h" + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EXTERNAL_LIBRARY_ERROR; + extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; +} + + +SharedLibraryHandler::SharedLibraryHandler( + const std::string & library_path_, + const std::vector & library_settings, + const Block & sample_block_, + const std::vector & 
attributes_names_) + : library_path(library_path_) + , sample_block(sample_block_) + , attributes_names(attributes_names_) +{ + library = std::make_shared(library_path, RTLD_LAZY); + settings_holder = std::make_shared(CStringsHolder(library_settings)); + + auto lib_new = library->tryGet(ClickHouseLibrary::LIBRARY_CREATE_NEW_FUNC_NAME); + + if (lib_new) + lib_data = lib_new(&settings_holder->strings, ClickHouseLibrary::log); + else + throw Exception("Method libNew failed", ErrorCodes::EXTERNAL_LIBRARY_ERROR); +} + + +SharedLibraryHandler::SharedLibraryHandler(const SharedLibraryHandler & other) + : library_path{other.library_path} + , sample_block{other.sample_block} + , attributes_names{other.attributes_names} + , library{other.library} + , settings_holder{other.settings_holder} +{ + + auto lib_clone = library->tryGet(ClickHouseLibrary::LIBRARY_CLONE_FUNC_NAME); + + if (lib_clone) + { + lib_data = lib_clone(other.lib_data); + } + else + { + auto lib_new = library->tryGet(ClickHouseLibrary::LIBRARY_CREATE_NEW_FUNC_NAME); + + if (lib_new) + lib_data = lib_new(&settings_holder->strings, ClickHouseLibrary::log); + } +} + + +SharedLibraryHandler::~SharedLibraryHandler() +{ + auto lib_delete = library->tryGet(ClickHouseLibrary::LIBRARY_DELETE_FUNC_NAME); + + if (lib_delete) + lib_delete(lib_data); +} + + +bool SharedLibraryHandler::isModified() +{ + auto func_is_modified = library->tryGet(ClickHouseLibrary::LIBRARY_IS_MODIFIED_FUNC_NAME); + + if (func_is_modified) + return func_is_modified(lib_data, &settings_holder->strings); + + return true; +} + + +bool SharedLibraryHandler::supportsSelectiveLoad() +{ + auto func_supports_selective_load = library->tryGet(ClickHouseLibrary::LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME); + + if (func_supports_selective_load) + return func_supports_selective_load(lib_data, &settings_holder->strings); + + return true; +} + + +BlockInputStreamPtr SharedLibraryHandler::loadAll() +{ + auto columns_holder = std::make_unique(attributes_names.size()); + ClickHouseLibrary::CStrings columns{static_cast(columns_holder.get()), attributes_names.size()}; + for (size_t i = 0; i < attributes_names.size(); ++i) + columns.data[i] = attributes_names[i].c_str(); + + auto load_all_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_ALL_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + ClickHouseLibrary::RawClickHouseLibraryTable data = load_all_func(data_ptr, &settings_holder->strings, &columns); + auto block = dataToBlock(data); + + return std::make_shared(block); +} + + +BlockInputStreamPtr SharedLibraryHandler::loadIds(const std::vector & ids) +{ + const ClickHouseLibrary::VectorUInt64 ids_data{ext::bit_cast(ids.data()), ids.size()}; + + auto columns_holder = std::make_unique(attributes_names.size()); + ClickHouseLibrary::CStrings columns_pass{static_cast(columns_holder.get()), attributes_names.size()}; + + auto load_ids_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_IDS_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + 
ClickHouseLibrary::RawClickHouseLibraryTable data = load_ids_func(data_ptr, &settings_holder->strings, &columns_pass, &ids_data); + auto block = dataToBlock(data); + + return std::make_shared(block); +} + + +BlockInputStreamPtr SharedLibraryHandler::loadKeys(const Columns & key_columns) +{ + auto holder = std::make_unique(key_columns.size()); + std::vector> column_data_holders; + + for (size_t i = 0; i < key_columns.size(); ++i) + { + auto cell_holder = std::make_unique(key_columns[i]->size()); + + for (size_t j = 0; j < key_columns[i]->size(); ++j) + { + auto data_ref = key_columns[i]->getDataAt(j); + + cell_holder[j] = ClickHouseLibrary::Field{ + .data = static_cast(data_ref.data), + .size = data_ref.size}; + } + + holder[i] = ClickHouseLibrary::Row{ + .data = static_cast(cell_holder.get()), + .size = key_columns[i]->size()}; + + column_data_holders.push_back(std::move(cell_holder)); + } + + ClickHouseLibrary::Table request_cols{ + .data = static_cast(holder.get()), + .size = key_columns.size()}; + + auto load_keys_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_KEYS_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + ClickHouseLibrary::RawClickHouseLibraryTable data = load_keys_func(data_ptr, &settings_holder->strings, &request_cols); + auto block = dataToBlock(data); + + return std::make_shared(block); +} + + +Block SharedLibraryHandler::dataToBlock(const ClickHouseLibrary::RawClickHouseLibraryTable data) +{ + if (!data) + throw Exception("LibraryDictionarySource: No data returned", ErrorCodes::EXTERNAL_LIBRARY_ERROR); + + const auto * columns_received = static_cast(data); + if (columns_received->error_code) + throw Exception( + "LibraryDictionarySource: Returned error: " + std::to_string(columns_received->error_code) + " " + (columns_received->error_string ? columns_received->error_string : ""), + ErrorCodes::EXTERNAL_LIBRARY_ERROR); + + MutableColumns columns = sample_block.cloneEmptyColumns(); + + for (size_t col_n = 0; col_n < columns_received->size; ++col_n) + { + if (columns.size() != columns_received->data[col_n].size) + throw Exception( + "LibraryDictionarySource: Returned unexpected number of columns: " + std::to_string(columns_received->data[col_n].size) + ", must be " + std::to_string(columns.size()), + ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + + for (size_t row_n = 0; row_n < columns_received->data[col_n].size; ++row_n) + { + const auto & field = columns_received->data[col_n].data[row_n]; + if (!field.data) + { + /// sample_block contains null_value (from config) inside corresponding column + const auto & col = sample_block.getByPosition(row_n); + columns[row_n]->insertFrom(*(col.column), 0); + } + else + { + const auto & size = field.size; + columns[row_n]->insertData(static_cast(field.data), size); + } + } + } + + return sample_block.cloneWithColumns(std::move(columns)); +} + +} diff --git a/programs/library-bridge/SharedLibraryHandler.h b/programs/library-bridge/SharedLibraryHandler.h new file mode 100644 index 00000000000..5c0334ac89f --- /dev/null +++ b/programs/library-bridge/SharedLibraryHandler.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include "LibraryUtils.h" + + +namespace DB +{ + +/// A class that manages all operations with library dictionary. 
+/// Every library dictionary source has its own object of this class, accessed by UUID. +class SharedLibraryHandler +{ + +public: + SharedLibraryHandler( + const std::string & library_path_, + const std::vector & library_settings, + const Block & sample_block_, + const std::vector & attributes_names_); + + SharedLibraryHandler(const SharedLibraryHandler & other); + + ~SharedLibraryHandler(); + + BlockInputStreamPtr loadAll(); + + BlockInputStreamPtr loadIds(const std::vector & ids); + + BlockInputStreamPtr loadKeys(const Columns & key_columns); + + bool isModified(); + + bool supportsSelectiveLoad(); + + const Block & getSampleBlock() { return sample_block; } + +private: + Block dataToBlock(const ClickHouseLibrary::RawClickHouseLibraryTable data); + + std::string library_path; + const Block sample_block; + std::vector attributes_names; + + SharedLibraryPtr library; + std::shared_ptr settings_holder; + void * lib_data; +}; + +using SharedLibraryHandlerPtr = std::shared_ptr; + +} diff --git a/programs/library-bridge/SharedLibraryHandlerFactory.cpp b/programs/library-bridge/SharedLibraryHandlerFactory.cpp new file mode 100644 index 00000000000..05494c313c4 --- /dev/null +++ b/programs/library-bridge/SharedLibraryHandlerFactory.cpp @@ -0,0 +1,67 @@ +#include "SharedLibraryHandlerFactory.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +SharedLibraryHandlerPtr SharedLibraryHandlerFactory::get(const std::string & dictionary_id) +{ + std::lock_guard lock(mutex); + auto library_handler = library_handlers.find(dictionary_id); + + if (library_handler != library_handlers.end()) + return library_handler->second; + + return nullptr; +} + + +void SharedLibraryHandlerFactory::create( + const std::string & dictionary_id, + const std::string & library_path, + const std::vector & library_settings, + const Block & sample_block, + const std::vector & attributes_names) +{ + std::lock_guard lock(mutex); + library_handlers[dictionary_id] = std::make_shared(library_path, library_settings, sample_block, attributes_names); +} + + +void SharedLibraryHandlerFactory::clone(const std::string & from_dictionary_id, const std::string & to_dictionary_id) +{ + std::lock_guard lock(mutex); + auto from_library_handler = library_handlers.find(from_dictionary_id); + + /// This is not supposed to happen as libClone is called from the copy constructor of the LibraryDictionarySource + /// object, and the shared library handler of from_dictionary is removed only in its destructor. + /// And if there was no shared library handler for from_dictionary, it would have received an exception in + /// its constructor, so no libClone would be made from it. + if (from_library_handler == library_handlers.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No shared library handler found"); + + /// libClone method will be called in the copy constructor + library_handlers[to_dictionary_id] = std::make_shared(*from_library_handler->second); +} + + +void SharedLibraryHandlerFactory::remove(const std::string & dictionary_id) +{ + std::lock_guard lock(mutex); + /// libDelete is called in the destructor.
+ library_handlers.erase(dictionary_id); +} + + +SharedLibraryHandlerFactory & SharedLibraryHandlerFactory::instance() +{ + static SharedLibraryHandlerFactory ret; + return ret; +} + +} diff --git a/programs/library-bridge/SharedLibraryHandlerFactory.h b/programs/library-bridge/SharedLibraryHandlerFactory.h new file mode 100644 index 00000000000..473d90618a2 --- /dev/null +++ b/programs/library-bridge/SharedLibraryHandlerFactory.h @@ -0,0 +1,37 @@ +#pragma once + +#include "SharedLibraryHandler.h" +#include +#include + + +namespace DB +{ + +/// Each library dictionary source has a unique UUID. When the clone() method is called, a new UUID is generated. +/// There is a unique mapping from dictionary UUID to SharedLibraryHandler. +class SharedLibraryHandlerFactory final : private boost::noncopyable +{ +public: + static SharedLibraryHandlerFactory & instance(); + + SharedLibraryHandlerPtr get(const std::string & dictionary_id); + + void create( + const std::string & dictionary_id, + const std::string & library_path, + const std::vector & library_settings, + const Block & sample_block, + const std::vector & attributes_names); + + void clone(const std::string & from_dictionary_id, const std::string & to_dictionary_id); + + void remove(const std::string & dictionary_id); + +private: + /// map: dict_id -> SharedLibraryHandler + std::unordered_map library_handlers; + std::mutex mutex; +}; + +} diff --git a/programs/library-bridge/library-bridge.cpp b/programs/library-bridge/library-bridge.cpp new file mode 100644 index 00000000000..5fff2ffe525 --- /dev/null +++ b/programs/library-bridge/library-bridge.cpp @@ -0,0 +1,3 @@ +int mainEntryClickHouseLibraryBridge(int argc, char ** argv); +int main(int argc_, char ** argv_) { return mainEntryClickHouseLibraryBridge(argc_, argv_); } + diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 5a8d35e204d..f680c2c2da6 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -99,9 +99,9 @@ void LocalServer::initialize(Poco::Util::Application & self) } } -void LocalServer::applyCmdSettings(Context & context) +void LocalServer::applyCmdSettings(ContextPtr context) { - context.applySettingsChanges(cmd_settings.changes()); + context->applySettingsChanges(cmd_settings.changes()); } /// If path is specified and not empty, will try to setup server environment and load existing metadata @@ -176,7 +176,7 @@ void LocalServer::tryInitPath() } -static void attachSystemTables(const Context & context) +static void attachSystemTables(ContextPtr context) { DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(DatabaseCatalog::SYSTEM_DATABASE); if (!system_database) @@ -211,7 +211,7 @@ try } shared_context = Context::createShared(); - global_context = std::make_unique(Context::createGlobal(shared_context.get())); + global_context = Context::createGlobal(shared_context.get()); global_context->makeGlobalContext(); global_context->setApplicationType(Context::ApplicationType::LOCAL); tryInitPath(); @@ -240,7 +240,7 @@ try /// Skip networking - /// Sets external authenticators config (LDAP). + /// Sets external authenticators config (LDAP, Kerberos). global_context->setExternalAuthenticatorsConfig(config()); setupUsers(); @@ -260,6 +260,11 @@ try if (mark_cache_size) global_context->setMarkCache(mark_cache_size); + /// A cache for mmapped files. + size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary.
+ if (mmap_cache_size) + global_context->setMMappedFileCache(mmap_cache_size); + /// Load global settings from default_profile and system_profile. global_context->setDefaultProfiles(config()); @@ -269,9 +274,9 @@ try * if such tables will not be dropped, clickhouse-server will not be able to load them due to security reasons. */ std::string default_database = config().getString("default_database", "_local"); - DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, *global_context)); + DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, global_context)); global_context->setCurrentDatabase(default_database); - applyCmdOptions(*global_context); + applyCmdOptions(global_context); if (config().has("path")) { @@ -283,15 +288,15 @@ try LOG_DEBUG(log, "Loading metadata from {}", path); Poco::File(path + "data/").createDirectories(); Poco::File(path + "metadata/").createDirectories(); - loadMetadataSystem(*global_context); - attachSystemTables(*global_context); - loadMetadata(*global_context); + loadMetadataSystem(global_context); + attachSystemTables(global_context); + loadMetadata(global_context); DatabaseCatalog::instance().loadDatabases(); LOG_DEBUG(log, "Loaded metadata."); } else if (!config().has("no-system-tables")) { - attachSystemTables(*global_context); + attachSystemTables(global_context); } processQueries(); @@ -370,13 +375,13 @@ void LocalServer::processQueries() /// we can't mutate global global_context (can lead to races, as it was already passed to some background threads) /// so we can't reuse it safely as a query context and need a copy here - auto context = Context(*global_context); + auto context = Context::createCopy(global_context); - context.makeSessionContext(); - context.makeQueryContext(); + context->makeSessionContext(); + context->makeQueryContext(); - context.setUser("default", "", Poco::Net::SocketAddress{}); - context.setCurrentQueryId(""); + context->setUser("default", "", Poco::Net::SocketAddress{}); + context->setCurrentQueryId(""); applyCmdSettings(context); /// Use the same query_id (and thread group) for all queries @@ -613,9 +618,9 @@ void LocalServer::init(int argc, char ** argv) argsToConfig(arguments, config(), 100); } -void LocalServer::applyCmdOptions(Context & context) +void LocalServer::applyCmdOptions(ContextPtr context) { - context.setDefaultFormat(config().getString("output-format", config().getString("format", "TSV"))); + context->setDefaultFormat(config().getString("output-format", config().getString("format", "TSV"))); applyCmdSettings(context); } diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index 02778bd86cb..3555e8a38ad 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -36,15 +36,15 @@ private: std::string getInitialCreateTableQuery(); void tryInitPath(); - void applyCmdOptions(Context & context); - void applyCmdSettings(Context & context); + void applyCmdOptions(ContextPtr context); + void applyCmdSettings(ContextPtr context); void processQueries(); void setupUsers(); void cleanup(); protected: SharedContextHolder shared_context; - std::unique_ptr global_context; + ContextPtr global_context; /// Settings specified via command line args Settings cmd_settings; diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 950db4e4f05..c92eb5c6647 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -100,16 +100,16 @@ class IModel { public: 
/// Call train iteratively for each block to train a model. - virtual void train(const IColumn & column); + virtual void train(const IColumn & column) = 0; /// Call finalize one time after training before generating. - virtual void finalize(); + virtual void finalize() = 0; /// Call generate: pass source data column to obtain a column with anonymized data as a result. - virtual ColumnPtr generate(const IColumn & column); + virtual ColumnPtr generate(const IColumn & column) = 0; /// Deterministically change seed to some other value. This can be used to generate more values than were in source. - virtual void updateSeed(); + virtual void updateSeed() = 0; virtual ~IModel() = default; }; @@ -1129,8 +1129,8 @@ try } SharedContextHolder shared_context = Context::createShared(); - Context context = Context::createGlobal(shared_context.get()); - context.makeGlobalContext(); + ContextPtr context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); ReadBufferFromFileDescriptor file_in(STDIN_FILENO); WriteBufferFromFileDescriptor file_out(STDOUT_FILENO); @@ -1152,7 +1152,7 @@ try if (!silent) std::cerr << "Training models\n"; - BlockInputStreamPtr input = context.getInputFormat(input_format, file_in, header, max_block_size); + BlockInputStreamPtr input = context->getInputFormat(input_format, file_in, header, max_block_size); input->readPrefix(); while (Block block = input->read()) @@ -1179,8 +1179,8 @@ try file_in.seek(0, SEEK_SET); - BlockInputStreamPtr input = context.getInputFormat(input_format, file_in, header, max_block_size); - BlockOutputStreamPtr output = context.getOutputStream(output_format, file_out, header); + BlockInputStreamPtr input = context->getInputFormat(input_format, file_in, header, max_block_size); + BlockOutputStreamPtr output = context->getOutputStreamParallelIfPossible(output_format, file_out, header); if (processed_rows + source_rows > limit) input = std::make_shared(input, limit - processed_rows, 0); diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 11864354619..7b232f2b5dc 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -24,12 +24,14 @@ add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) target_link_libraries(clickhouse-odbc-bridge PRIVATE daemon dbms + bridge clickhouse_parsers - Poco::Data - Poco::Data::ODBC + nanodbc + unixodbc ) set_target_properties(clickhouse-odbc-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..) 
+target_compile_options (clickhouse-odbc-bridge PRIVATE -Wno-reserved-id-macro -Wno-keyword-macro) if (USE_GDB_ADD_INDEX) add_custom_command(TARGET clickhouse-odbc-bridge POST_BUILD COMMAND ${GDB_ADD_INDEX_EXE} ../clickhouse-odbc-bridge COMMENT "Adding .gdb-index to clickhouse-odbc-bridge" VERBATIM) diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index 14fa734f246..e33858583c2 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -2,29 +2,36 @@ #if USE_ODBC -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include "getIdentifierQuote.h" -# include "validateODBCConnectionString.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "getIdentifierQuote.h" +#include "validateODBCConnectionString.h" +#include "ODBCConnectionFactory.h" + +#include +#include -# define POCO_SQL_ODBC_CLASS Poco::Data::ODBC namespace DB { + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; +} + namespace { DataTypePtr getDataType(SQLSMALLINT type) @@ -59,6 +66,7 @@ namespace } } + void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { HTMLForm params(request, request.getStream()); @@ -77,88 +85,79 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ process_error("No 'table' param in request URL"); return; } + if (!params.has("connection_string")) { process_error("No 'connection_string' in request URL"); return; } + std::string schema_name; std::string table_name = params.get("table"); std::string connection_string = params.get("connection_string"); if (params.has("schema")) - { schema_name = params.get("schema"); - LOG_TRACE(log, "Will fetch info for table '{}'", schema_name + "." + table_name); - } - else - LOG_TRACE(log, "Will fetch info for table '{}'", table_name); + LOG_TRACE(log, "Got connection str '{}'", connection_string); try { const bool external_table_functions_use_nulls = Poco::NumberParser::parseBool(params.get("external_table_functions_use_nulls", "false")); - POCO_SQL_ODBC_CLASS::SessionImpl session(validateODBCConnectionString(connection_string), DBMS_DEFAULT_CONNECT_TIMEOUT_SEC); - SQLHDBC hdbc = session.dbc().handle(); + auto connection = ODBCConnectionFactory::instance().get( + validateODBCConnectionString(connection_string), + getContext()->getSettingsRef().odbc_bridge_connection_pool_size); - SQLHSTMT hstmt = nullptr; + nanodbc::catalog catalog(*connection); + std::string catalog_name; - if (POCO_SQL_ODBC_CLASS::Utility::isError(SQLAllocStmt(hdbc, &hstmt))) - throw POCO_SQL_ODBC_CLASS::ODBCException("Could not allocate connection handle."); - - SCOPE_EXIT(SQLFreeStmt(hstmt, SQL_DROP)); - - const auto & context_settings = context.getSettingsRef(); - - /// TODO Why not do SQLColumns instead? - std::string name = schema_name.empty() ? backQuoteIfNeed(table_name) : backQuoteIfNeed(schema_name) + "." 
+ backQuoteIfNeed(table_name); - WriteBufferFromOwnString buf; - std::string input = "SELECT * FROM " + name + " WHERE 1 = 0"; - ParserQueryWithOutput parser(input.data() + input.size()); - ASTPtr select = parseQuery(parser, input.data(), input.data() + input.size(), "", context_settings.max_query_size, context_settings.max_parser_depth); - - IAST::FormatSettings settings(buf, true); - settings.always_quote_identifiers = true; - settings.identifier_quoting_style = getQuotingStyle(hdbc); - select->format(settings); - std::string query = buf.str(); - - LOG_TRACE(log, "Inferring structure with query '{}'", query); - - if (POCO_SQL_ODBC_CLASS::Utility::isError(POCO_SQL_ODBC_CLASS::SQLPrepare(hstmt, reinterpret_cast(query.data()), query.size()))) - throw POCO_SQL_ODBC_CLASS::DescriptorException(session.dbc()); - - if (POCO_SQL_ODBC_CLASS::Utility::isError(SQLExecute(hstmt))) - throw POCO_SQL_ODBC_CLASS::StatementException(hstmt); - - SQLSMALLINT cols = 0; - if (POCO_SQL_ODBC_CLASS::Utility::isError(SQLNumResultCols(hstmt, &cols))) - throw POCO_SQL_ODBC_CLASS::StatementException(hstmt); - - /// TODO cols not checked - - NamesAndTypesList columns; - for (SQLSMALLINT ncol = 1; ncol <= cols; ++ncol) + /// In XDBC tables it is allowed to pass either database_name or schema_name in the table definition, but not both of them. + /// Both are passed as the 'schema' parameter in the request URL, so it is not clear whether database_name or schema_name was passed. + /// If it is schema_name, then we know that the database is added in odbc.ini. But if we have database_name as 'schema', + /// it is not guaranteed. For nanodbc, database_name must be either in odbc.ini or passed as catalog_name. + auto get_columns = [&]() { - SQLSMALLINT type = 0; - /// TODO Why 301? - SQLCHAR column_name[301]; - - SQLSMALLINT is_nullable; - const auto result = POCO_SQL_ODBC_CLASS::SQLDescribeCol(hstmt, ncol, column_name, sizeof(column_name), nullptr, &type, nullptr, nullptr, &is_nullable); - if (POCO_SQL_ODBC_CLASS::Utility::isError(result)) - throw POCO_SQL_ODBC_CLASS::StatementException(hstmt); - - auto column_type = getDataType(type); - if (external_table_functions_use_nulls && is_nullable == SQL_NULLABLE) + nanodbc::catalog::tables tables = catalog.find_tables(table_name, /* type = */ "", /* schema = */ "", /* catalog = */ schema_name); + if (tables.next()) { - column_type = std::make_shared(column_type); + catalog_name = tables.table_catalog(); + LOG_TRACE(log, "Will fetch info for table '{}.{}'", catalog_name, table_name); + return catalog.find_columns(/* column = */ "", table_name, /* schema = */ "", catalog_name); } - columns.emplace_back(reinterpret_cast(column_name), std::move(column_type)); + tables = catalog.find_tables(table_name, /* type = */ "", /* schema = */ schema_name); + if (tables.next()) + { + catalog_name = tables.table_catalog(); + LOG_TRACE(log, "Will fetch info for table '{}.{}.{}'", catalog_name, schema_name, table_name); + return catalog.find_columns(/* column = */ "", table_name, schema_name, catalog_name); + } + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table {} not found", schema_name.empty() ? table_name : schema_name + '.'
+ table_name); + }; + + nanodbc::catalog::columns columns_definition = get_columns(); + + NamesAndTypesList columns; + while (columns_definition.next()) + { + SQLSMALLINT type = columns_definition.sql_data_type(); + std::string column_name = columns_definition.column_name(); + + bool is_nullable = columns_definition.nullable() == SQL_NULLABLE; + + auto column_type = getDataType(type); + + if (external_table_functions_use_nulls && is_nullable == SQL_NULLABLE) + column_type = std::make_shared(column_type); + + columns.emplace_back(column_name, std::move(column_type)); } + if (columns.empty()) + throw Exception("Columns definition was not returned", ErrorCodes::LOGICAL_ERROR); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try { diff --git a/programs/odbc-bridge/ColumnInfoHandler.h b/programs/odbc-bridge/ColumnInfoHandler.h index 9b5b470b31d..bc976f54aee 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.h +++ b/programs/odbc-bridge/ColumnInfoHandler.h @@ -2,24 +2,23 @@ #if USE_ODBC -# include -# include -# include +#include +#include +#include +#include +#include -# include -/** The structure of the table is taken from the query "SELECT * FROM table WHERE 1=0". - * TODO: It would be much better to utilize ODBC methods dedicated for columns description. - * If there is no such table, an exception is thrown. - */ namespace DB { -class ODBCColumnsInfoHandler : public HTTPRequestHandler +class ODBCColumnsInfoHandler : public HTTPRequestHandler, WithContext { public: - ODBCColumnsInfoHandler(size_t keep_alive_timeout_, Context & context_) - : log(&Poco::Logger::get("ODBCColumnsInfoHandler")), keep_alive_timeout(keep_alive_timeout_), context(context_) + ODBCColumnsInfoHandler(size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get("ODBCColumnsInfoHandler")) + , keep_alive_timeout(keep_alive_timeout_) { } @@ -28,7 +27,6 @@ public: private: Poco::Logger * log; size_t keep_alive_timeout; - Context & context; }; } diff --git a/programs/odbc-bridge/HandlerFactory.cpp b/programs/odbc-bridge/HandlerFactory.cpp index 9ac48af4ace..49984453d33 100644 --- a/programs/odbc-bridge/HandlerFactory.cpp +++ b/programs/odbc-bridge/HandlerFactory.cpp @@ -8,7 +8,7 @@ namespace DB { -std::unique_ptr HandlerFactory::createRequestHandler(const HTTPServerRequest & request) +std::unique_ptr ODBCBridgeHandlerFactory::createRequestHandler(const HTTPServerRequest & request) { Poco::URI uri{request.getURI()}; LOG_TRACE(log, "Request URI: {}", uri.toString()); @@ -21,26 +21,26 @@ std::unique_ptr HandlerFactory::createRequestHandler(const H if (uri.getPath() == "/columns_info") #if USE_ODBC - return std::make_unique(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, getContext()); #else return nullptr; #endif else if (uri.getPath() == "/identifier_quote") #if USE_ODBC - return std::make_unique(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, getContext()); #else return nullptr; #endif else if (uri.getPath() == "/schema_allowed") #if USE_ODBC - return std::make_unique(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, getContext()); #else return nullptr; #endif else if (uri.getPath() == "/write") - return std::make_unique(pool_map, keep_alive_timeout, context, "write"); + return std::make_unique(keep_alive_timeout, getContext(), "write"); else - return std::make_unique(pool_map, keep_alive_timeout, context, "read"); + return 
std::make_unique(keep_alive_timeout, getContext(), "read"); } return nullptr; } diff --git a/programs/odbc-bridge/HandlerFactory.h b/programs/odbc-bridge/HandlerFactory.h index 5dce6f02ecd..ffbbe3670af 100644 --- a/programs/odbc-bridge/HandlerFactory.h +++ b/programs/odbc-bridge/HandlerFactory.h @@ -1,32 +1,28 @@ #pragma once -#include +#include #include #include "ColumnInfoHandler.h" #include "IdentifierQuoteHandler.h" #include "MainHandler.h" #include "SchemaAllowedHandler.h" - #include -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#include -#pragma GCC diagnostic pop - namespace DB { /** Factory for '/ping', '/', '/columns_info', '/identifier_quote', '/schema_allowed' handlers. * Also stores Session pools for ODBC connections */ -class HandlerFactory : public HTTPRequestHandlerFactory +class ODBCBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContext { public: - HandlerFactory(const std::string & name_, size_t keep_alive_timeout_, Context & context_) - : log(&Poco::Logger::get(name_)), name(name_), keep_alive_timeout(keep_alive_timeout_), context(context_) + ODBCBridgeHandlerFactory(const std::string & name_, size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get(name_)) + , name(name_) + , keep_alive_timeout(keep_alive_timeout_) { - pool_map = std::make_shared(); } std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; @@ -35,7 +31,6 @@ private: Poco::Logger * log; std::string name; size_t keep_alive_timeout; - Context & context; - std::shared_ptr pool_map; }; + } diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index 5060d37c479..a5a97cb8086 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -2,23 +2,20 @@ #if USE_ODBC -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include "getIdentifierQuote.h" -# include "validateODBCConnectionString.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "getIdentifierQuote.h" +#include "validateODBCConnectionString.h" +#include "ODBCConnectionFactory.h" -# define POCO_SQL_ODBC_CLASS Poco::Data::ODBC namespace DB { @@ -44,10 +41,12 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ try { std::string connection_string = params.get("connection_string"); - POCO_SQL_ODBC_CLASS::SessionImpl session(validateODBCConnectionString(connection_string), DBMS_DEFAULT_CONNECT_TIMEOUT_SEC); - SQLHDBC hdbc = session.dbc().handle(); - auto identifier = getIdentifierQuote(hdbc); + auto connection = ODBCConnectionFactory::instance().get( + validateODBCConnectionString(connection_string), + getContext()->getSettingsRef().odbc_bridge_connection_pool_size); + + auto identifier = getIdentifierQuote(*connection); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.h b/programs/odbc-bridge/IdentifierQuoteHandler.h index dad88c72ad8..ef3806fd802 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.h +++ b/programs/odbc-bridge/IdentifierQuoteHandler.h @@ -11,11 +11,13 @@ namespace DB { -class IdentifierQuoteHandler : public HTTPRequestHandler +class IdentifierQuoteHandler : public HTTPRequestHandler, 
WithContext { public: - IdentifierQuoteHandler(size_t keep_alive_timeout_, Context &) - : log(&Poco::Logger::get("IdentifierQuoteHandler")), keep_alive_timeout(keep_alive_timeout_) + IdentifierQuoteHandler(size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get("IdentifierQuoteHandler")) + , keep_alive_timeout(keep_alive_timeout_) { } diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index 4fcc9deea6a..e24b51f6037 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -18,18 +18,17 @@ #include #include #include +#include "ODBCConnectionFactory.h" #include #include +#include -#if USE_ODBC -#include -#define POCO_SQL_ODBC_CLASS Poco::Data::ODBC -#endif namespace DB { + namespace { std::unique_ptr parseColumns(std::string && column_string) @@ -42,37 +41,6 @@ namespace } } -using PocoSessionPoolConstructor = std::function()>; -/** Is used to adjust max size of default Poco thread pool. See issue #750 - * Acquire the lock, resize pool and construct new Session. - */ -static std::shared_ptr createAndCheckResizePocoSessionPool(PocoSessionPoolConstructor pool_constr) -{ - static std::mutex mutex; - - Poco::ThreadPool & pool = Poco::ThreadPool::defaultPool(); - - /// NOTE: The lock don't guarantee that external users of the pool don't change its capacity - std::unique_lock lock(mutex); - - if (pool.available() == 0) - pool.addCapacity(2 * std::max(pool.capacity(), 1)); - - return pool_constr(); -} - -ODBCHandler::PoolPtr ODBCHandler::getPool(const std::string & connection_str) -{ - std::lock_guard lock(mutex); - if (!pool_map->count(connection_str)) - { - pool_map->emplace(connection_str, createAndCheckResizePocoSessionPool([connection_str] - { - return std::make_shared("ODBC", validateODBCConnectionString(connection_str)); - })); - } - return pool_map->at(connection_str); -} void ODBCHandler::processError(HTTPServerResponse & response, const std::string & message) { @@ -82,12 +50,14 @@ void ODBCHandler::processError(HTTPServerResponse & response, const std::string LOG_WARNING(log, message); } + void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { HTMLForm params(request); + LOG_TRACE(log, "Request URI: {}", request.getURI()); + if (mode == "read") params.read(request.getStream()); - LOG_TRACE(log, "Request URI: {}", request.getURI()); if (mode == "read" && !params.has("query")) { @@ -95,11 +65,6 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse return; } - if (!params.has("columns")) - { - processError(response, "No 'columns' in request URL"); - return; - } if (!params.has("connection_string")) { @@ -107,6 +72,16 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse return; } + if (!params.has("sample_block")) + { + processError(response, "No 'sample_block' in request URL"); + return; + } + + std::string format = params.get("format", "RowBinary"); + std::string connection_string = params.get("connection_string"); + LOG_TRACE(log, "Connection string: '{}'", connection_string); + UInt64 max_block_size = DEFAULT_BLOCK_SIZE; if (params.has("max_block_size")) { @@ -119,28 +94,27 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse max_block_size = parse(max_block_size_str); } - std::string columns = params.get("columns"); + std::string sample_block_string = params.get("sample_block"); std::unique_ptr sample_block; try { - sample_block = 
parseColumns(std::move(columns)); + sample_block = parseColumns(std::move(sample_block_string)); } catch (const Exception & ex) { - processError(response, "Invalid 'columns' parameter in request body '" + ex.message() + "'"); - LOG_WARNING(log, ex.getStackTraceString()); + processError(response, "Invalid 'sample_block' parameter in request body '" + ex.message() + "'"); + LOG_ERROR(log, ex.getStackTraceString()); return; } - std::string format = params.get("format", "RowBinary"); - - std::string connection_string = params.get("connection_string"); - LOG_TRACE(log, "Connection string: '{}'", connection_string); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try { + auto connection = ODBCConnectionFactory::instance().get( + validateODBCConnectionString(connection_string), + getContext()->getSettingsRef().odbc_bridge_connection_pool_size); + if (mode == "write") { if (!params.has("db_name")) @@ -159,15 +133,12 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse auto quoting_style = IdentifierQuotingStyle::None; #if USE_ODBC - POCO_SQL_ODBC_CLASS::SessionImpl session(validateODBCConnectionString(connection_string), DBMS_DEFAULT_CONNECT_TIMEOUT_SEC); - quoting_style = getQuotingStyle(session.dbc().handle()); + quoting_style = getQuotingStyle(*connection); #endif - - auto pool = getPool(connection_string); auto & read_buf = request.getStream(); - auto input_format = FormatFactory::instance().getInput(format, read_buf, *sample_block, context, max_block_size); + auto input_format = FormatFactory::instance().getInput(format, read_buf, *sample_block, getContext(), max_block_size); auto input_stream = std::make_shared(input_format); - ODBCBlockOutputStream output_stream(pool->get(), db_name, table_name, *sample_block, quoting_style); + ODBCBlockOutputStream output_stream(*connection, db_name, table_name, *sample_block, getContext(), quoting_style); copyData(*input_stream, output_stream); writeStringBinary("Ok.", out); } @@ -176,9 +147,8 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse std::string query = params.get("query"); LOG_TRACE(log, "Query: {}", query); - BlockOutputStreamPtr writer = FormatFactory::instance().getOutputStream(format, out, *sample_block, context); - auto pool = getPool(connection_string); - ODBCBlockInputStream inp(pool->get(), query, *sample_block, max_block_size); + BlockOutputStreamPtr writer = FormatFactory::instance().getOutputStreamParallelIfPossible(format, out, *sample_block, getContext()); + ODBCBlockInputStream inp(*connection, query, *sample_block, max_block_size); copyData(inp, *writer); } } diff --git a/programs/odbc-bridge/MainHandler.h b/programs/odbc-bridge/MainHandler.h index e237ede5814..bc0fca8b9a5 100644 --- a/programs/odbc-bridge/MainHandler.h +++ b/programs/odbc-bridge/MainHandler.h @@ -1,14 +1,13 @@ #pragma once -#include +#include #include - #include -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#include -#pragma GCC diagnostic pop + +#include +#include + namespace DB { @@ -17,20 +16,16 @@ namespace DB * and also query in request body * response in RowBinary format */ -class ODBCHandler : public HTTPRequestHandler +class ODBCHandler : public HTTPRequestHandler, WithContext { public: - using PoolPtr = std::shared_ptr; - using PoolMap = std::unordered_map; - - ODBCHandler(std::shared_ptr pool_map_, + ODBCHandler( size_t keep_alive_timeout_, - Context & context_, + ContextPtr 
context_, const String & mode_) - : log(&Poco::Logger::get("ODBCHandler")) - , pool_map(pool_map_) + : WithContext(context_) + , log(&Poco::Logger::get("ODBCHandler")) , keep_alive_timeout(keep_alive_timeout_) - , context(context_) , mode(mode_) { } @@ -40,14 +35,11 @@ public: private: Poco::Logger * log; - std::shared_ptr pool_map; size_t keep_alive_timeout; - Context & context; String mode; static inline std::mutex mutex; - PoolPtr getPool(const std::string & connection_str); void processError(HTTPServerResponse & response, const std::string & message); }; diff --git a/programs/odbc-bridge/ODBCBlockInputStream.cpp b/programs/odbc-bridge/ODBCBlockInputStream.cpp index 3e2a2d0c7d4..3a73cb9f601 100644 --- a/programs/odbc-bridge/ODBCBlockInputStream.cpp +++ b/programs/odbc-bridge/ODBCBlockInputStream.cpp @@ -1,5 +1,7 @@ #include "ODBCBlockInputStream.h" #include +#include +#include #include #include #include @@ -14,137 +16,143 @@ namespace DB { namespace ErrorCodes { - extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int UNKNOWN_TYPE; } ODBCBlockInputStream::ODBCBlockInputStream( - Poco::Data::Session && session_, const std::string & query_str, const Block & sample_block, const UInt64 max_block_size_) - : session{session_} - , statement{(this->session << query_str, Poco::Data::Keywords::now)} - , result{statement} - , iterator{result.begin()} + nanodbc::connection & connection_, const std::string & query_str, const Block & sample_block, const UInt64 max_block_size_) + : log(&Poco::Logger::get("ODBCBlockInputStream")) , max_block_size{max_block_size_} - , log(&Poco::Logger::get("ODBCBlockInputStream")) + , connection(connection_) + , query(query_str) { - if (sample_block.columns() != result.columnCount()) - throw Exception{"RecordSet contains " + toString(result.columnCount()) + " columns while " + toString(sample_block.columns()) - + " expected", - ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; - description.init(sample_block); -} - - -namespace -{ - using ValueType = ExternalResultDescription::ValueType; - - void insertValue(IColumn & column, const ValueType type, const Poco::Dynamic::Var & value) - { - switch (type) - { - case ValueType::vtUInt8: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtUInt16: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtUInt32: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtUInt64: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtInt8: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtInt16: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtInt32: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtInt64: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtFloat32: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtFloat64: - assert_cast(column).insertValue(value.convert()); - break; - case ValueType::vtString: - assert_cast(column).insert(value.convert()); - break; - case ValueType::vtDate: - { - Poco::DateTime date = value.convert(); - assert_cast(column).insertValue(UInt16{LocalDate(date.year(), date.month(), date.day()).getDayNum()}); - break; - } - case ValueType::vtDateTime: - { - Poco::DateTime datetime = value.convert(); - assert_cast(column).insertValue(time_t{LocalDateTime( - datetime.year(), datetime.month(), datetime.day(), datetime.hour(), datetime.minute(), 
datetime.second())}); - break; - } - case ValueType::vtUUID: - assert_cast(column).insert(parse(value.convert())); - break; - default: - throw Exception("Unsupported value type", ErrorCodes::UNKNOWN_TYPE); - } - } - - void insertDefaultValue(IColumn & column, const IColumn & sample_column) { column.insertFrom(sample_column, 0); } + result = execute(connection, NANODBC_TEXT(query)); } Block ODBCBlockInputStream::readImpl() { - if (iterator == result.end()) - return {}; - - MutableColumns columns(description.sample_block.columns()); - for (const auto i : ext::range(0, columns.size())) - columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); + if (finished) + return Block(); + MutableColumns columns(description.sample_block.cloneEmptyColumns()); size_t num_rows = 0; - while (iterator != result.end()) + + while (true) { - Poco::Data::Row & row = *iterator; - - for (const auto idx : ext::range(0, row.fieldCount())) + if (!result.next()) { - /// TODO This is extremely slow. - const Poco::Dynamic::Var & value = row[idx]; + finished = true; + break; + } - if (!value.isEmpty()) + for (int idx = 0; idx < result.columns(); ++idx) + { + const auto & sample = description.sample_block.getByPosition(idx); + + if (!result.is_null(idx)) { - if (description.types[idx].second) + bool is_nullable = description.types[idx].second; + + if (is_nullable) { ColumnNullable & column_nullable = assert_cast(*columns[idx]); - insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value); + const auto & data_type = assert_cast(*sample.type); + insertValue(column_nullable.getNestedColumn(), data_type.getNestedType(), description.types[idx].first, result, idx); column_nullable.getNullMapData().emplace_back(0); } else - insertValue(*columns[idx], description.types[idx].first, value); + { + insertValue(*columns[idx], sample.type, description.types[idx].first, result, idx); + } } else - insertDefaultValue(*columns[idx], *description.sample_block.getByPosition(idx).column); + insertDefaultValue(*columns[idx], *sample.column); } - ++iterator; - - ++num_rows; - if (num_rows == max_block_size) + if (++num_rows == max_block_size) break; } return description.sample_block.cloneWithColumns(std::move(columns)); } + +void ODBCBlockInputStream::insertValue( + IColumn & column, const DataTypePtr data_type, const ValueType type, nanodbc::result & row, size_t idx) +{ + switch (type) + { + case ValueType::vtUInt8: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtUInt16: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtUInt32: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtUInt64: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtInt8: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtInt16: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtInt32: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtInt64: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtFloat32: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtFloat64: + assert_cast(column).insertValue(row.get(idx)); + break; + case ValueType::vtFixedString:[[fallthrough]]; + case ValueType::vtString: + assert_cast(column).insert(row.get(idx)); + break; + case ValueType::vtUUID: + { + auto value = row.get(idx); + assert_cast(column).insert(parse(value.data(), value.size())); + break; + } + case 
ValueType::vtDate: + assert_cast(column).insertValue(UInt16{LocalDate{row.get(idx)}.getDayNum()}); + break; + case ValueType::vtDateTime: + { + auto value = row.get(idx); + ReadBufferFromString in(value); + time_t time = 0; + readDateTimeText(time, in); + if (time < 0) + time = 0; + assert_cast(column).insertValue(time); + break; + } + case ValueType::vtDateTime64:[[fallthrough]]; + case ValueType::vtDecimal32: [[fallthrough]]; + case ValueType::vtDecimal64: [[fallthrough]]; + case ValueType::vtDecimal128: [[fallthrough]]; + case ValueType::vtDecimal256: + { + auto value = row.get(idx); + ReadBufferFromString istr(value); + data_type->getDefaultSerialization()->deserializeWholeText(column, istr, FormatSettings{}); + break; + } + default: + throw Exception("Unsupported value type", ErrorCodes::UNKNOWN_TYPE); + } +} + } diff --git a/programs/odbc-bridge/ODBCBlockInputStream.h b/programs/odbc-bridge/ODBCBlockInputStream.h index 13491e05822..bbd90ce4d6c 100644 --- a/programs/odbc-bridge/ODBCBlockInputStream.h +++ b/programs/odbc-bridge/ODBCBlockInputStream.h @@ -3,10 +3,8 @@ #include #include #include -#include -#include -#include #include +#include namespace DB @@ -15,25 +13,33 @@ namespace DB class ODBCBlockInputStream final : public IBlockInputStream { public: - ODBCBlockInputStream( - Poco::Data::Session && session_, const std::string & query_str, const Block & sample_block, const UInt64 max_block_size_); + ODBCBlockInputStream(nanodbc::connection & connection_, const std::string & query_str, const Block & sample_block, const UInt64 max_block_size_); String getName() const override { return "ODBC"; } Block getHeader() const override { return description.sample_block.cloneEmpty(); } private: + using QueryResult = std::shared_ptr; + using ValueType = ExternalResultDescription::ValueType; + Block readImpl() override; - Poco::Data::Session session; - Poco::Data::Statement statement; - Poco::Data::RecordSet result; - Poco::Data::RecordSet::Iterator iterator; + static void insertValue(IColumn & column, const DataTypePtr data_type, const ValueType type, nanodbc::result & row, size_t idx); + static void insertDefaultValue(IColumn & column, const IColumn & sample_column) + { + column.insertFrom(sample_column, 0); + } + + Poco::Logger * log; const UInt64 max_block_size; ExternalResultDescription description; - Poco::Logger * log; + nanodbc::connection & connection; + nanodbc::result result; + String query; + bool finished = false; }; } diff --git a/programs/odbc-bridge/ODBCBlockOutputStream.cpp b/programs/odbc-bridge/ODBCBlockOutputStream.cpp index 4d8b9fa6bdf..e4614204178 100644 --- a/programs/odbc-bridge/ODBCBlockOutputStream.cpp +++ b/programs/odbc-bridge/ODBCBlockOutputStream.cpp @@ -8,16 +8,14 @@ #include #include #include "getIdentifierQuote.h" +#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int UNKNOWN_TYPE; -} - namespace { using ValueType = ExternalResultDescription::ValueType; @@ -40,69 +38,21 @@ namespace return buf.str(); } - std::string getQuestionMarks(size_t n) - { - std::string result = "("; - for (size_t i = 0; i < n; ++i) - { - if (i > 0) - result += ","; - result += "?"; - } - return result + ")"; - } - - Poco::Dynamic::Var getVarFromField(const Field & field, const ValueType type) - { - switch (type) - { - case ValueType::vtUInt8: - return Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtUInt16: - return Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtUInt32: - return 
Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtUInt64: - return Poco::Dynamic::Var(field.get()).convert(); - case ValueType::vtInt8: - return Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtInt16: - return Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtInt32: - return Poco::Dynamic::Var(static_cast(field.get())).convert(); - case ValueType::vtInt64: - return Poco::Dynamic::Var(field.get()).convert(); - case ValueType::vtFloat32: - return Poco::Dynamic::Var(field.get()).convert(); - case ValueType::vtFloat64: - return Poco::Dynamic::Var(field.get()).convert(); - case ValueType::vtString: - return Poco::Dynamic::Var(field.get()).convert(); - case ValueType::vtDate: - return Poco::Dynamic::Var(LocalDate(DayNum(field.get())).toString()).convert(); - case ValueType::vtDateTime: - return Poco::Dynamic::Var(std::to_string(LocalDateTime(time_t(field.get())))).convert(); - case ValueType::vtUUID: - return Poco::Dynamic::Var(UUID(field.get()).toUnderType().toHexString()).convert(); - default: - throw Exception("Unsupported value type", ErrorCodes::UNKNOWN_TYPE); - - } - __builtin_unreachable(); - } } -ODBCBlockOutputStream::ODBCBlockOutputStream(Poco::Data::Session && session_, +ODBCBlockOutputStream::ODBCBlockOutputStream(nanodbc::connection & connection_, const std::string & remote_database_name_, const std::string & remote_table_name_, const Block & sample_block_, + ContextPtr local_context_, IdentifierQuotingStyle quoting_) - : session(session_) + : log(&Poco::Logger::get("ODBCBlockOutputStream")) + , connection(connection_) , db_name(remote_database_name_) , table_name(remote_table_name_) , sample_block(sample_block_) + , local_context(local_context_) , quoting(quoting_) - , log(&Poco::Logger::get("ODBCBlockOutputStream")) { description.init(sample_block); } @@ -114,28 +64,12 @@ Block ODBCBlockOutputStream::getHeader() const void ODBCBlockOutputStream::write(const Block & block) { - ColumnsWithTypeAndName columns; - for (size_t i = 0; i < block.columns(); ++i) - columns.push_back({block.getColumns()[i], sample_block.getDataTypes()[i], sample_block.getNames()[i]}); + WriteBufferFromOwnString values_buf; + auto writer = FormatFactory::instance().getOutputStream("Values", values_buf, sample_block, local_context); + writer->write(block); - std::vector row_to_insert(block.columns()); - Poco::Data::Statement statement(session << getInsertQuery(db_name, table_name, columns, quoting) + getQuestionMarks(block.columns())); - for (size_t i = 0; i < block.columns(); ++i) - statement.addBind(Poco::Data::Keywords::use(row_to_insert[i])); - - for (size_t i = 0; i < block.rows(); ++i) - { - for (size_t col_idx = 0; col_idx < block.columns(); ++col_idx) - { - Field val; - columns[col_idx].column->get(i, val); - if (val.isNull()) - row_to_insert[col_idx] = Poco::Dynamic::Var(); - else - row_to_insert[col_idx] = getVarFromField(val, description.types[col_idx].first); - } - statement.execute(); - } + std::string query = getInsertQuery(db_name, table_name, block.getColumnsWithTypeAndName(), quoting) + values_buf.str(); + execute(connection, query); } } diff --git a/programs/odbc-bridge/ODBCBlockOutputStream.h b/programs/odbc-bridge/ODBCBlockOutputStream.h index 39e1d6f77ac..0b13f7039b5 100644 --- a/programs/odbc-bridge/ODBCBlockOutputStream.h +++ b/programs/odbc-bridge/ODBCBlockOutputStream.h @@ -2,30 +2,41 @@ #include #include -#include #include #include +#include +#include + namespace DB { + class ODBCBlockOutputStream : public 
IBlockOutputStream { + public: - ODBCBlockOutputStream(Poco::Data::Session && session_, const std::string & remote_database_name_, - const std::string & remote_table_name_, const Block & sample_block_, IdentifierQuotingStyle quoting); + ODBCBlockOutputStream( + nanodbc::connection & connection_, + const std::string & remote_database_name_, + const std::string & remote_table_name_, + const Block & sample_block_, + ContextPtr local_context_, + IdentifierQuotingStyle quoting); Block getHeader() const override; void write(const Block & block) override; private: - Poco::Data::Session session; + Poco::Logger * log; + + nanodbc::connection & connection; std::string db_name; std::string table_name; Block sample_block; + ContextPtr local_context; IdentifierQuotingStyle quoting; ExternalResultDescription description; - Poco::Logger * log; }; } diff --git a/programs/odbc-bridge/ODBCBridge.cpp b/programs/odbc-bridge/ODBCBridge.cpp index 8869a2639c1..0deefe46014 100644 --- a/programs/odbc-bridge/ODBCBridge.cpp +++ b/programs/odbc-bridge/ODBCBridge.cpp @@ -1,244 +1,4 @@ #include "ODBCBridge.h" -#include "HandlerFactory.h" - -#include -#include -#include -#include - -#if USE_ODBC -// It doesn't make much sense to build this bridge without ODBC, but we still do this. -# include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int ARGUMENT_OUT_OF_BOUND; -} - -namespace -{ - Poco::Net::SocketAddress makeSocketAddress(const std::string & host, UInt16 port, Poco::Logger * log) - { - Poco::Net::SocketAddress socket_address; - try - { - socket_address = Poco::Net::SocketAddress(host, port); - } - catch (const Poco::Net::DNSException & e) - { - const auto code = e.code(); - if (code == EAI_FAMILY -#if defined(EAI_ADDRFAMILY) - || code == EAI_ADDRFAMILY -#endif - ) - { - LOG_ERROR(log, "Cannot resolve listen_host ({}), error {}: {}. If it is an IPv6 address and your host has disabled IPv6, then consider to specify IPv4 address to listen in element of configuration file. 
Example: 0.0.0.0", host, e.code(), e.message()); - } - - throw; - } - return socket_address; - } - - Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, Poco::Logger * log) - { - auto address = makeSocketAddress(host, port, log); -#if POCO_VERSION < 0x01080000 - socket.bind(address, /* reuseAddress = */ true); -#else - socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ false); -#endif - - socket.listen(/* backlog = */ 64); - - return address; - } -} - -void ODBCBridge::handleHelp(const std::string &, const std::string &) -{ - Poco::Util::HelpFormatter help_formatter(options()); - help_formatter.setCommand(commandName()); - help_formatter.setHeader("HTTP-proxy for odbc requests"); - help_formatter.setUsage("--http-port "); - help_formatter.format(std::cerr); - - stopOptionsProcessing(); -} - - -void ODBCBridge::defineOptions(Poco::Util::OptionSet & options) -{ - options.addOption(Poco::Util::Option("http-port", "", "port to listen").argument("http-port", true).binding("http-port")); - options.addOption( - Poco::Util::Option("listen-host", "", "hostname or address to listen, default 127.0.0.1").argument("listen-host").binding("listen-host")); - options.addOption( - Poco::Util::Option("http-timeout", "", "http timeout for socket, default 1800").argument("http-timeout").binding("http-timeout")); - - options.addOption(Poco::Util::Option("max-server-connections", "", "max connections to server, default 1024") - .argument("max-server-connections") - .binding("max-server-connections")); - options.addOption(Poco::Util::Option("keep-alive-timeout", "", "keepalive timeout, default 10") - .argument("keep-alive-timeout") - .binding("keep-alive-timeout")); - - options.addOption(Poco::Util::Option("log-level", "", "sets log level, default info").argument("log-level").binding("logger.level")); - - options.addOption( - Poco::Util::Option("log-path", "", "log path for all logs, default console").argument("log-path").binding("logger.log")); - - options.addOption(Poco::Util::Option("err-log-path", "", "err log path for all logs, default no") - .argument("err-log-path") - .binding("logger.errorlog")); - - options.addOption(Poco::Util::Option("stdout-path", "", "stdout log path, default console") - .argument("stdout-path") - .binding("logger.stdout")); - - options.addOption(Poco::Util::Option("stderr-path", "", "stderr log path, default console") - .argument("stderr-path") - .binding("logger.stderr")); - - using Me = std::decay_t; - options.addOption(Poco::Util::Option("help", "", "produce this help message") - .binding("help") - .callback(Poco::Util::OptionCallback(this, &Me::handleHelp))); - - ServerApplication::defineOptions(options); // NOLINT Don't need complex BaseDaemon's .xml config -} - -void ODBCBridge::initialize(Application & self) -{ - BaseDaemon::closeFDs(); - is_help = config().has("help"); - - if (is_help) - return; - - config().setString("logger", "ODBCBridge"); - - /// Redirect stdout, stderr to specified files. - /// Some libraries and sanitizers write to stderr in case of errors. - const auto stdout_path = config().getString("logger.stdout", ""); - if (!stdout_path.empty()) - { - if (!freopen(stdout_path.c_str(), "a+", stdout)) - throw Poco::OpenFileException("Cannot attach stdout to " + stdout_path); - - /// Disable buffering for stdout. 
- setbuf(stdout, nullptr); - } - const auto stderr_path = config().getString("logger.stderr", ""); - if (!stderr_path.empty()) - { - if (!freopen(stderr_path.c_str(), "a+", stderr)) - throw Poco::OpenFileException("Cannot attach stderr to " + stderr_path); - - /// Disable buffering for stderr. - setbuf(stderr, nullptr); - } - - buildLoggers(config(), logger(), self.commandName()); - - BaseDaemon::logRevision(); - - log = &logger(); - hostname = config().getString("listen-host", "127.0.0.1"); - port = config().getUInt("http-port"); - if (port > 0xFFFF) - throw Exception("Out of range 'http-port': " + std::to_string(port), ErrorCodes::ARGUMENT_OUT_OF_BOUND); - - http_timeout = config().getUInt("http-timeout", DEFAULT_HTTP_READ_BUFFER_TIMEOUT); - max_server_connections = config().getUInt("max-server-connections", 1024); - keep_alive_timeout = config().getUInt("keep-alive-timeout", 10); - - initializeTerminationAndSignalProcessing(); - -#if USE_ODBC - // It doesn't make much sense to build this bridge without ODBC, but we - // still do this. - Poco::Data::ODBC::Connector::registerConnector(); -#endif - - ServerApplication::initialize(self); // NOLINT -} - -void ODBCBridge::uninitialize() -{ - BaseDaemon::uninitialize(); -} - -int ODBCBridge::main(const std::vector & /*args*/) -{ - if (is_help) - return Application::EXIT_OK; - - registerFormats(); - - LOG_INFO(log, "Starting up"); - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, hostname, port, log); - socket.setReceiveTimeout(http_timeout); - socket.setSendTimeout(http_timeout); - Poco::ThreadPool server_pool(3, max_server_connections); - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(http_timeout); - http_params->setKeepAliveTimeout(keep_alive_timeout); - - auto shared_context = Context::createShared(); - Context context(Context::createGlobal(shared_context.get())); - context.makeGlobalContext(); - - if (config().has("query_masking_rules")) - { - SensitiveDataMasker::setInstance(std::make_unique(config(), "query_masking_rules")); - } - - auto server = HTTPServer( - context, - std::make_shared("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context), - server_pool, - socket, - http_params); - server.start(); - - LOG_INFO(log, "Listening http://{}", address.toString()); - - SCOPE_EXIT({ - LOG_DEBUG(log, "Received termination signal."); - LOG_DEBUG(log, "Waiting for current connections to close."); - server.stop(); - for (size_t count : ext::range(1, 6)) - { - if (server.currentConnections() == 0) - break; - LOG_DEBUG(log, "Waiting for {} connections, try {}", server.currentConnections(), count); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - }); - - waitForTerminationRequest(); - return Application::EXIT_OK; -} -} #pragma GCC diagnostic ignored "-Wmissing-declarations" int mainEntryClickHouseODBCBridge(int argc, char ** argv) diff --git a/programs/odbc-bridge/ODBCBridge.h b/programs/odbc-bridge/ODBCBridge.h index 9a0d37fa0f9..b17051dce91 100644 --- a/programs/odbc-bridge/ODBCBridge.h +++ b/programs/odbc-bridge/ODBCBridge.h @@ -2,38 +2,25 @@ #include #include -#include +#include +#include "HandlerFactory.h" + namespace DB { -/** Class represents clickhouse-odbc-bridge server, which listen - * incoming HTTP POST and GET requests on specified port and host. 
- * Has two handlers '/' for all incoming POST requests to ODBC driver - * and /ping for GET request about service status - */ -class ODBCBridge : public BaseDaemon + +class ODBCBridge : public IBridge { -public: - void defineOptions(Poco::Util::OptionSet & options) override; protected: - void initialize(Application & self) override; + std::string bridgeName() const override + { + return "ODBCBridge"; + } - void uninitialize() override; - - int main(const std::vector & args) override; - -private: - void handleHelp(const std::string &, const std::string &); - - bool is_help; - std::string hostname; - size_t port; - size_t http_timeout; - std::string log_level; - size_t max_server_connections; - size_t keep_alive_timeout; - - Poco::Logger * log; + HandlerFactoryPtr getHandlerFactoryPtr(ContextPtr context) const override + { + return std::make_shared("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context); + } }; } diff --git a/programs/odbc-bridge/ODBCConnectionFactory.h b/programs/odbc-bridge/ODBCConnectionFactory.h new file mode 100644 index 00000000000..56961ddb2fb --- /dev/null +++ b/programs/odbc-bridge/ODBCConnectionFactory.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace nanodbc +{ + +static constexpr inline auto ODBC_CONNECT_TIMEOUT = 100; + +using ConnectionPtr = std::shared_ptr; +using Pool = BorrowedObjectPool; +using PoolPtr = std::shared_ptr; + +class ConnectionHolder +{ + +public: + ConnectionHolder(const std::string & connection_string_, PoolPtr pool_) : connection_string(connection_string_), pool(pool_) {} + + ~ConnectionHolder() + { + if (connection) + pool->returnObject(std::move(connection)); + } + + nanodbc::connection & operator*() + { + if (!connection) + { + pool->borrowObject(connection, [&]() + { + return std::make_shared(connection_string, ODBC_CONNECT_TIMEOUT); + }); + } + + return *connection; + } + +private: + std::string connection_string; + PoolPtr pool; + ConnectionPtr connection; +}; + +} + + +namespace DB +{ + +class ODBCConnectionFactory final : private boost::noncopyable +{ +public: + static ODBCConnectionFactory & instance() + { + static ODBCConnectionFactory ret; + return ret; + } + + nanodbc::ConnectionHolder get(const std::string & connection_string, size_t pool_size) + { + std::lock_guard lock(mutex); + + if (!factory.count(connection_string)) + factory.emplace(std::make_pair(connection_string, std::make_shared(pool_size))); + + return nanodbc::ConnectionHolder(connection_string, factory[connection_string]); + } + +private: + /// [connection_settings_string] -> [connection_pool] + using PoolFactory = std::unordered_map; + PoolFactory factory; + std::mutex mutex; +}; + +} diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index d4a70db61f4..4cceaee962c 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -2,33 +2,26 @@ #if USE_ODBC -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include "validateODBCConnectionString.h" +#include +#include +#include +#include +#include +#include +#include "validateODBCConnectionString.h" +#include "ODBCConnectionFactory.h" +#include +#include -# define POCO_SQL_ODBC_CLASS Poco::Data::ODBC namespace DB { namespace { - bool isSchemaAllowed(SQLHDBC hdbc) + bool isSchemaAllowed(nanodbc::connection & connection) { - SQLUINTEGER value; - SQLSMALLINT value_length = sizeof(value); - SQLRETURN r = 
POCO_SQL_ODBC_CLASS::SQLGetInfo(hdbc, SQL_SCHEMA_USAGE, &value, sizeof(value), &value_length); - - if (POCO_SQL_ODBC_CLASS::Utility::isError(r)) - throw POCO_SQL_ODBC_CLASS::ConnectionException(hdbc); - - return value != 0; + uint32_t result = connection.get_info(SQL_SCHEMA_USAGE); + return result != 0; } } @@ -55,10 +48,12 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer try { std::string connection_string = params.get("connection_string"); - POCO_SQL_ODBC_CLASS::SessionImpl session(validateODBCConnectionString(connection_string), DBMS_DEFAULT_CONNECT_TIMEOUT_SEC); - SQLHDBC hdbc = session.dbc().handle(); - bool result = isSchemaAllowed(hdbc); + auto connection = ODBCConnectionFactory::instance().get( + validateODBCConnectionString(connection_string), + getContext()->getSettingsRef().odbc_bridge_connection_pool_size); + + bool result = isSchemaAllowed(*connection); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try diff --git a/programs/odbc-bridge/SchemaAllowedHandler.h b/programs/odbc-bridge/SchemaAllowedHandler.h index 91eddf67803..d7b922ed05b 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.h +++ b/programs/odbc-bridge/SchemaAllowedHandler.h @@ -1,22 +1,25 @@ #pragma once +#include #include - #include #if USE_ODBC + namespace DB { class Context; /// This handler establishes connection to database, and retrieves whether schema is allowed. -class SchemaAllowedHandler : public HTTPRequestHandler +class SchemaAllowedHandler : public HTTPRequestHandler, WithContext { public: - SchemaAllowedHandler(size_t keep_alive_timeout_, Context &) - : log(&Poco::Logger::get("SchemaAllowedHandler")), keep_alive_timeout(keep_alive_timeout_) + SchemaAllowedHandler(size_t keep_alive_timeout_, ContextPtr context_) + : WithContext(context_) + , log(&Poco::Logger::get("SchemaAllowedHandler")) + , keep_alive_timeout(keep_alive_timeout_) { } diff --git a/programs/odbc-bridge/getIdentifierQuote.cpp b/programs/odbc-bridge/getIdentifierQuote.cpp index 15b3749d37d..9ccad6e6e1d 100644 --- a/programs/odbc-bridge/getIdentifierQuote.cpp +++ b/programs/odbc-bridge/getIdentifierQuote.cpp @@ -2,11 +2,10 @@ #if USE_ODBC -# include -# include -# include - -# define POCO_SQL_ODBC_CLASS Poco::Data::ODBC +#include +#include +#include +#include namespace DB @@ -17,33 +16,27 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -std::string getIdentifierQuote(SQLHDBC hdbc) + +std::string getIdentifierQuote(nanodbc::connection & connection) { - std::string identifier; - - SQLSMALLINT t; - SQLRETURN r = POCO_SQL_ODBC_CLASS::SQLGetInfo(hdbc, SQL_IDENTIFIER_QUOTE_CHAR, nullptr, 0, &t); - - if (POCO_SQL_ODBC_CLASS::Utility::isError(r)) - throw POCO_SQL_ODBC_CLASS::ConnectionException(hdbc); - - if (t > 0) + std::string quote; + try { - // I have no idea, why to add '2' here, got from: contrib/poco/Data/ODBC/src/ODBCStatementImpl.cpp:60 (SQL_DRIVER_NAME) - identifier.resize(static_cast(t) + 2); - - if (POCO_SQL_ODBC_CLASS::Utility::isError(POCO_SQL_ODBC_CLASS::SQLGetInfo( - hdbc, SQL_IDENTIFIER_QUOTE_CHAR, &identifier[0], SQLSMALLINT((identifier.length() - 1) * sizeof(identifier[0])), &t))) - throw POCO_SQL_ODBC_CLASS::ConnectionException(hdbc); - - identifier.resize(static_cast(t)); + quote = connection.get_info(SQL_IDENTIFIER_QUOTE_CHAR); } - return identifier; + catch (...) + { + LOG_WARNING(&Poco::Logger::get("ODBCGetIdentifierQuote"), "Cannot fetch identifier quote. Default double quote is used. 
Reason: {}", getCurrentExceptionMessage(false)); + return "\""; + } + + return quote; } -IdentifierQuotingStyle getQuotingStyle(SQLHDBC hdbc) + +IdentifierQuotingStyle getQuotingStyle(nanodbc::connection & connection) { - auto identifier_quote = getIdentifierQuote(hdbc); + auto identifier_quote = getIdentifierQuote(connection); if (identifier_quote.length() == 0) return IdentifierQuotingStyle::None; else if (identifier_quote[0] == '`') diff --git a/programs/odbc-bridge/getIdentifierQuote.h b/programs/odbc-bridge/getIdentifierQuote.h index 0fb4c3bddb1..7f7156eff82 100644 --- a/programs/odbc-bridge/getIdentifierQuote.h +++ b/programs/odbc-bridge/getIdentifierQuote.h @@ -2,20 +2,19 @@ #if USE_ODBC -# include -# include -# include - -# include - +#include +#include +#include #include +#include + namespace DB { -std::string getIdentifierQuote(SQLHDBC hdbc); +std::string getIdentifierQuote(nanodbc::connection & connection); -IdentifierQuotingStyle getQuotingStyle(SQLHDBC hdbc); +IdentifierQuotingStyle getQuotingStyle(nanodbc::connection & connection); } diff --git a/programs/server/.gitignore b/programs/server/.gitignore index b774776e4be..ddc480e4b29 100644 --- a/programs/server/.gitignore +++ b/programs/server/.gitignore @@ -1,8 +1,11 @@ -/access -/dictionaries_lib -/flags -/format_schemas +/metadata /metadata_dropped +/data +/store +/access +/flags +/dictionaries_lib +/format_schemas /preprocessed_configs /shadow /tmp diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index 198d9081168..3a04228942b 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -19,6 +19,7 @@ set (CLICKHOUSE_SERVER_LINK clickhouse_storages_system clickhouse_table_functions string_utils + jemalloc ${LINK_RESOURCE_LIB} @@ -42,11 +43,16 @@ if (OS_LINUX) set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ}) # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - add_custom_command(OUTPUT ${RESOURCE_OBJ} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} - COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents - ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) - + # PPC64LE fails to do this with objcopy, use ld or lld instead + if (ARCH_PPC64LE) + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${RESOURCE_FILE}) + else() + add_custom_command(OUTPUT ${RESOURCE_OBJ} + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} + COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents + ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ} ${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}) + endif() set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) endforeach(RESOURCE_FILE) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index f501e182cb7..e874122250c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include #include @@ -82,6 +84,7 @@ #if defined(OS_LINUX) # include +# include # include #endif @@ -95,7 +98,11 @@ #endif #if USE_NURAFT -# include +# include 
+#endif + +#if USE_JEMALLOC +# include #endif namespace CurrentMetrics @@ -106,11 +113,35 @@ namespace CurrentMetrics extern const Metric MaxDDLEntryID; } +#if USE_JEMALLOC +static bool jemallocOptionEnabled(const char *name) +{ + bool value; + size_t size = sizeof(value); + + if (mallctl(name, reinterpret_cast(&value), &size, /* newp= */ nullptr, /* newlen= */ 0)) + throw Poco::SystemException("mallctl() failed"); + + return value; +} +#else +static bool jemallocOptionEnabled(const char *) { return 0; } +#endif + int mainEntryClickHouseServer(int argc, char ** argv) { DB::Server app; + if (jemallocOptionEnabled("opt.background_thread")) + { + LOG_ERROR(&app.logger(), + "jemalloc.background_thread was requested, " + "however ClickHouse uses percpu_arena and background_thread most likely will not give any benefits, " + "and also background_thread is not compatible with ClickHouse watchdog " + "(that can be disabled with CLICKHOUSE_WATCHDOG_ENABLE=0)"); + } + /// Do not fork separate process from watchdog if we attached to terminal. /// Otherwise it breaks gdb usage. /// Can be overridden by environment variable (cannot use server config at this moment). @@ -170,18 +201,24 @@ int waitServersToFinish(std::vector & servers, size_t const int sleep_one_ms = 100; int sleep_current_ms = 0; int current_connections = 0; - while (sleep_current_ms < sleep_max_ms) + for (;;) { current_connections = 0; + for (auto & server : servers) { server.stop(); current_connections += server.currentConnections(); } + if (!current_connections) break; + sleep_current_ms += sleep_one_ms; - std::this_thread::sleep_for(std::chrono::milliseconds(sleep_one_ms)); + if (sleep_current_ms < sleep_max_ms) + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_one_ms)); + else + break; } return current_connections; } @@ -423,8 +460,7 @@ int Server::main(const std::vector & /*args*/) * settings, available functions, data types, aggregate functions, databases, ... */ auto shared_context = Context::createShared(); - auto global_context = std::make_unique(Context::createGlobal(shared_context.get())); - global_context_ptr = global_context.get(); + global_context = Context::createGlobal(shared_context.get()); global_context->makeGlobalContext(); global_context->setApplicationType(Context::ApplicationType::SERVER); @@ -480,16 +516,26 @@ int Server::main(const std::vector & /*args*/) } else { - throw Exception(ErrorCodes::CORRUPTED_DATA, - "Calculated checksum of the ClickHouse binary ({0}) does not correspond" - " to the reference checksum stored in the binary ({1})." - " It may indicate one of the following:" - " - the file {2} was changed just after startup;" - " - the file {2} is damaged on disk due to faulty hardware;" - " - the loaded executable is damaged in memory due to faulty hardware;" - " - the file {2} was intentionally modified;" - " - logical error in code." - , calculated_binary_hash, stored_binary_hash, executable_path); + /// If program is run under debugger, ptrace will fail. + if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == -1) + { + /// Program is run under debugger. Modification of it's binary image is ok for breakpoints. + LOG_WARNING(log, "Server is run under debugger and its binary image is modified (most likely with breakpoints).", + calculated_binary_hash); + } + else + { + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Calculated checksum of the ClickHouse binary ({0}) does not correspond" + " to the reference checksum stored in the binary ({1})." 
+ " It may indicate one of the following:" + " - the file {2} was changed just after startup;" + " - the file {2} is damaged on disk due to faulty hardware;" + " - the loaded executable is damaged in memory due to faulty hardware;" + " - the file {2} was intentionally modified;" + " - logical error in code." + , calculated_binary_hash, stored_binary_hash, executable_path); + } } } else @@ -676,16 +722,8 @@ int Server::main(const std::vector & /*args*/) } } - if (config().has("interserver_http_credentials")) - { - String user = config().getString("interserver_http_credentials.user", ""); - String password = config().getString("interserver_http_credentials.password", ""); - - if (user.empty()) - throw Exception("Configuration parameter interserver_http_credentials user can't be empty", ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - global_context->setInterserverCredentials(user, password); - } + LOG_DEBUG(log, "Initiailizing interserver credentials."); + global_context->updateInterserverCredentials(config()); if (config().has("macros")) global_context->setMacros(std::make_unique(config(), "macros", log)); @@ -704,7 +742,7 @@ int Server::main(const std::vector & /*args*/) config().getString("path", ""), std::move(main_config_zk_node_cache), main_config_zk_changed_event, - [&](ConfigurationPtr config) + [&](ConfigurationPtr config, bool initial_loading) { Settings::checkNoSettingNamesAtTopLevel(*config, config_path); @@ -746,6 +784,7 @@ int Server::main(const std::vector & /*args*/) global_context->setClustersConfig(config); global_context->setMacros(std::make_unique(*config, "macros", log)); global_context->setExternalAuthenticatorsConfig(*config); + global_context->setExternalModelsConfig(config); /// Setup protection to avoid accidental DROP for big tables (that are greater than 50 GB by default) if (config->has("max_table_size_to_drop")) @@ -754,14 +793,20 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("zookeeper")) - global_context->reloadZooKeeperIfChanged(config); + if (!initial_loading) + { + /// We do not load ZooKeeper configuration on the first config loading + /// because TestKeeper server is not started yet. + if (config->has("zookeeper")) + global_context->reloadZooKeeperIfChanged(config); - global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + } global_context->updateStorageConfiguration(*config); + global_context->updateInterserverCredentials(*config); }, - /* already_loaded = */ true); + /* already_loaded = */ false); /// Reload it right now (initial loading) auto & access_control = global_context->getAccessControlManager(); if (config().has("custom_settings_prefixes")) @@ -812,10 +857,14 @@ int Server::main(const std::vector & /*args*/) } global_context->setMarkCache(mark_cache_size); + /// A cache for mmapped files. + size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary. 
+ if (mmap_cache_size) + global_context->setMMappedFileCache(mmap_cache_size); + #if USE_EMBEDDED_COMPILER size_t compiled_expression_cache_size = config().getUInt64("compiled_expression_cache_size", 500); - if (compiled_expression_cache_size) - global_context->setCompiledExpressionCache(compiled_expression_cache_size); + CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_size); #endif /// Set path for format schema files @@ -846,15 +895,15 @@ int Server::main(const std::vector & /*args*/) listen_try = true; } - if (config().has("test_keeper_server")) + if (config().has("keeper_server")) { #if USE_NURAFT /// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config. - global_context->initializeNuKeeperStorageDispatcher(); + global_context->initializeKeeperStorageDispatcher(); for (const auto & listen_host : listen_hosts) { - /// TCP NuKeeper - const char * port_name = "test_keeper_server.tcp_port"; + /// TCP Keeper + const char * port_name = "keeper_server.tcp_port"; createServer(listen_host, port_name, listen_try, [&](UInt16 port) { Poco::Net::ServerSocket socket; @@ -864,9 +913,29 @@ int Server::main(const std::vector & /*args*/) servers_to_start_before_tables->emplace_back( port_name, std::make_unique( - new NuKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + new KeeperTCPHandlerFactory(*this, false), server_pool, socket, new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections to NuKeeper (tcp): {}", address.toString()); + LOG_INFO(log, "Listening for connections to Keeper (tcp): {}", address.toString()); + }); + + const char * secure_port_name = "keeper_server.tcp_port_secure"; + createServer(listen_host, secure_port_name, listen_try, [&](UInt16 port) + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + servers_to_start_before_tables->emplace_back( + secure_port_name, + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket, new Poco::Net::TCPServerParams)); + LOG_INFO(log, "Listening for connections to Keeper with secure protocol (tcp_secure): {}", address.toString()); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif }); } #else @@ -913,13 +982,15 @@ int Server::main(const std::vector & /*args*/) else LOG_INFO(log, "Closed connections to servers for tables."); - global_context->shutdownNuKeeperStorageDispatcher(); + global_context->shutdownKeeperStorageDispatcher(); } + /// Wait server pool to avoid use-after-free of destroyed context in the handlers + server_pool.joinAll(); + /** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available. * At this moment, no one could own shared part of Context. */ - global_context_ptr = nullptr; global_context.reset(); shared_context.reset(); LOG_DEBUG(log, "Destroyed global context."); @@ -933,14 +1004,14 @@ int Server::main(const std::vector & /*args*/) try { - loadMetadataSystem(*global_context); + loadMetadataSystem(global_context); /// After attaching system databases we can initialize system log. 
global_context->initializeSystemLogs(); auto & database_catalog = DatabaseCatalog::instance(); /// After the system database is created, attach virtual system tables (in addition to query_log and part_log) attachSystemTablesServer(*database_catalog.getSystemDatabase(), has_zookeeper); /// Then, load remaining databases - loadMetadata(*global_context, default_database); + loadMetadata(global_context, default_database); database_catalog.loadDatabases(); /// After loading validate that default database exists database_catalog.assertDatabaseExists(default_database); @@ -965,7 +1036,7 @@ int Server::main(const std::vector & /*args*/) /// /// Look at compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h /// -#if USE_UNWIND && !WITH_COVERAGE && !defined(SANITIZER) +#if USE_UNWIND && !WITH_COVERAGE && !defined(SANITIZER) && defined(__x86_64__) /// Profilers cannot work reliably with any other libunwind or without PHDR cache. if (hasPHDRCache()) { @@ -1002,21 +1073,14 @@ int Server::main(const std::vector & /*args*/) " when two different stack unwinding methods will interfere with each other."); #endif +#if !defined(__x86_64__) + LOG_INFO(log, "Query Profiler is only tested on x86_64. It also known to not work under qemu-user."); +#endif + if (!hasPHDRCache()) LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created" " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe)."); - if (has_zookeeper && config().has("distributed_ddl")) - { - /// DDL worker should be started after all tables were loaded - String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); - int pool_size = config().getInt("distributed_ddl.pool_size", 1); - if (pool_size < 1) - throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, *global_context, &config(), - "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID)); - } - std::unique_ptr dns_cache_updater; if (config().has("disable_internal_dns_cache") && config().getInt("disable_internal_dns_cache")) { @@ -1027,7 +1091,7 @@ int Server::main(const std::vector & /*args*/) else { /// Initialize a watcher periodically updating DNS cache - dns_cache_updater = std::make_unique(*global_context, config().getInt("dns_cache_update_period", 15)); + dns_cache_updater = std::make_unique(global_context, config().getInt("dns_cache_update_period", 15)); } #if defined(OS_LINUX) @@ -1059,7 +1123,7 @@ int Server::main(const std::vector & /*args*/) { /// This object will periodically calculate some metrics. AsynchronousMetrics async_metrics( - *global_context, config().getUInt("asynchronous_metrics_update_period_s", 60), servers_to_start_before_tables, servers); + global_context, config().getUInt("asynchronous_metrics_update_period_s", 60), servers_to_start_before_tables, servers); attachSystemTablesAsync(*DatabaseCatalog::instance().getSystemDatabase(), async_metrics); for (const auto & listen_host : listen_hosts) @@ -1275,9 +1339,6 @@ int Server::main(const std::vector & /*args*/) async_metrics.start(); global_context->enableNamedSessions(); - for (auto & server : *servers) - server.start(); - { String level_str = config().getString("text_log.level", ""); int level = level_str.empty() ? 
INT_MAX : Poco::Logger::parseLevel(level_str); @@ -1298,6 +1359,37 @@ int Server::main(const std::vector & /*args*/) std::thread::hardware_concurrency()); } + /// try to load dictionaries immediately, throw on error and die + ext::scope_guard dictionaries_xmls; + try + { + if (!config().getBool("dictionaries_lazy_load", true)) + { + global_context->tryCreateEmbeddedDictionaries(); + global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true); + } + dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository( + std::make_unique(config(), "dictionaries_config")); + } + catch (...) + { + LOG_ERROR(log, "Caught exception while loading dictionaries."); + throw; + } + + if (has_zookeeper && config().has("distributed_ddl")) + { + /// DDL worker should be started after all tables were loaded + String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); + int pool_size = config().getInt("distributed_ddl.pool_size", 1); + if (pool_size < 1) + throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, global_context, &config(), + "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID)); + } + + for (auto & server : *servers) + server.start(); LOG_INFO(log, "Ready for connections."); SCOPE_EXIT({ @@ -1347,26 +1439,6 @@ int Server::main(const std::vector & /*args*/) } }); - /// try to load dictionaries immediately, throw on error and die - ext::scope_guard dictionaries_xmls, models_xmls; - try - { - if (!config().getBool("dictionaries_lazy_load", true)) - { - global_context->tryCreateEmbeddedDictionaries(); - global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true); - } - dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository( - std::make_unique(config(), "dictionaries_config")); - models_xmls = global_context->getExternalModelsLoader().addConfigRepository( - std::make_unique(config(), "models_config")); - } - catch (...) 
- { - LOG_ERROR(log, "Caught exception while loading dictionaries."); - throw; - } - std::vector> metrics_transmitters; for (const auto & graphite_key : DB::getMultipleKeysFromConfig(config(), "", "graphite")) { diff --git a/programs/server/Server.h b/programs/server/Server.h index fbfc26f6ee5..c698108767c 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -40,9 +40,9 @@ public: return BaseDaemon::logger(); } - Context & context() const override + ContextPtr context() const override { - return *global_context_ptr; + return global_context; } bool isCancelled() const override @@ -64,8 +64,7 @@ protected: std::string getDefaultCorePath() const override; private: - Context * global_context_ptr = nullptr; - + ContextPtr global_context; Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; using CreateServerFunc = std::function; diff --git a/programs/server/config.d/keeper_port.xml b/programs/server/config.d/keeper_port.xml new file mode 120000 index 00000000000..6ebfce266fc --- /dev/null +++ b/programs/server/config.d/keeper_port.xml @@ -0,0 +1 @@ +../../../tests/config/config.d/keeper_port.xml \ No newline at end of file diff --git a/programs/server/config.d/test_keeper_port.xml b/programs/server/config.d/test_keeper_port.xml deleted file mode 120000 index f3f721caae0..00000000000 --- a/programs/server/config.d/test_keeper_port.xml +++ /dev/null @@ -1 +0,0 @@ -../../../tests/config/config.d/test_keeper_port.xml \ No newline at end of file diff --git a/programs/server/config.xml b/programs/server/config.xml index ba9b8b04b05..195b6263595 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -7,7 +7,20 @@ --> - + trace /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.err.log @@ -76,7 +89,7 @@ - + 9005 + 1000 + + /var/lib/clickhouse/ @@ -362,6 +394,27 @@ --> + + @@ -578,7 +631,7 @@ - + - + + + + + + + + + + + + diff --git a/programs/server/data/.gitignore b/programs/server/data/.gitignore deleted file mode 100644 index b9719d9d1d1..00000000000 --- a/programs/server/data/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.txt -*.dat -*.idx diff --git a/programs/server/metadata/.gitignore b/programs/server/metadata/.gitignore deleted file mode 100644 index d1b811b7de5..00000000000 --- a/programs/server/metadata/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.sql diff --git a/programs/server/play.html b/programs/server/play.html index 81fc13f1f86..0c039097ce1 100644 --- a/programs/server/play.html +++ b/programs/server/play.html @@ -306,10 +306,10 @@ /// Incremental request number. When response is received, /// if it's request number does not equal to the current request number, response will be ignored. /// This is to avoid race conditions. - var request_num = 0; + let request_num = 0; /// Save query in history only if it is different. - var previous_query = ''; + let previous_query = ''; /// Substitute the address of the server where the page is served. if (location.protocol != 'file:') { @@ -317,7 +317,7 @@ } /// Substitute user name if it's specified in the query string - var user_from_url = (new URL(window.location)).searchParams.get('user'); + let user_from_url = (new URL(window.location)).searchParams.get('user'); if (user_from_url) { document.getElementById('user').value = user_from_url; } @@ -326,10 +326,10 @@ { /// TODO: Check if URL already contains query string (append parameters). 
- var user = document.getElementById('user').value; - var password = document.getElementById('password').value; + let user = document.getElementById('user').value; + let password = document.getElementById('password').value; - var url = document.getElementById('url').value + + let url = document.getElementById('url').value + /// Ask server to allow cross-domain requests. '?add_http_cors_header=1' + '&user=' + encodeURIComponent(user) + @@ -338,7 +338,7 @@ /// Safety settings to prevent results that browser cannot display. '&max_result_rows=1000&max_result_bytes=10000000&result_overflow_mode=break'; - var xhr = new XMLHttpRequest; + let xhr = new XMLHttpRequest; xhr.open('POST', url, true); @@ -352,13 +352,13 @@ /// The query is saved in browser history (in state JSON object) /// as well as in URL fragment identifier. if (query != previous_query) { - var state = { + let state = { query: query, status: this.status, response: this.response.length > 100000 ? null : this.response /// Lower than the browser's limit. }; - var title = "ClickHouse Query: " + query; - var url = window.location.pathname + '?user=' + encodeURIComponent(user) + '#' + window.btoa(query); + let title = "ClickHouse Query: " + query; + let url = window.location.pathname + '?user=' + encodeURIComponent(user) + '#' + window.btoa(query); if (previous_query == '') { history.replaceState(state, title, url); } else { @@ -382,7 +382,7 @@ document.getElementById('hourglass').style.display = 'none'; if (status === 200) { - var json; + let json; try { json = JSON.parse(response); } catch (e) {} if (json !== undefined && json.statistics !== undefined) { renderResult(json); @@ -415,7 +415,7 @@ function post() { ++request_num; - var query = document.getElementById('query').value; + let query = document.getElementById('query').value; postImpl(request_num, query); } @@ -434,7 +434,7 @@ function clear() { - var table = document.getElementById('data-table'); + let table = document.getElementById('data-table'); while (table.firstChild) { table.removeChild(table.lastChild); } @@ -456,33 +456,45 @@ //console.log(response); clear(); - var stats = document.getElementById('stats'); + let stats = document.getElementById('stats'); stats.innerText = 'Elapsed: ' + response.statistics.elapsed.toFixed(3) + " sec, read " + response.statistics.rows_read + " rows."; - var thead = document.createElement('thead'); - for (var idx in response.meta) { - var th = document.createElement('th'); - var name = document.createTextNode(response.meta[idx].name); + let thead = document.createElement('thead'); + for (let idx in response.meta) { + let th = document.createElement('th'); + let name = document.createTextNode(response.meta[idx].name); th.appendChild(name); thead.appendChild(th); } /// To prevent hanging the browser, limit the number of cells in a table. /// It's important to have the limit on number of cells, not just rows, because tables may be wide or narrow. - var max_rows = 10000 / response.meta.length; - var row_num = 0; + let max_rows = 10000 / response.meta.length; + let row_num = 0; - var tbody = document.createElement('tbody'); - for (var row_idx in response.data) { - var tr = document.createElement('tr'); - for (var col_idx in response.data[row_idx]) { - var td = document.createElement('td'); - var cell = response.data[row_idx][col_idx]; - var is_null = (cell === null); - var content = document.createTextNode(is_null ? 'ᴺᵁᴸᴸ' : cell); - td.appendChild(content); - /// TODO: Execute regexp only once for each column. 
- td.className = response.meta[col_idx].type.match(/^(U?Int|Decimal|Float)/) ? 'right' : 'left'; + let column_classes = response.meta.map(elem => elem.type.match(/^(U?Int|Decimal|Float)/) ? 'right' : 'left'); + + let tbody = document.createElement('tbody'); + for (let row_idx in response.data) { + let tr = document.createElement('tr'); + for (let col_idx in response.data[row_idx]) { + let td = document.createElement('td'); + let cell = response.data[row_idx][col_idx]; + + let is_null = (cell === null); + + /// Test: SELECT number, toString(number) AS str, number % 2 ? number : NULL AS nullable, range(number) AS arr, CAST((['hello', 'world'], [number, number % 2]) AS Map(String, UInt64)) AS map FROM numbers(10) + let text; + if (is_null) { + text = 'ᴺᵁᴸᴸ'; + } else if (typeof(cell) === 'object') { + text = JSON.stringify(cell); + } else { + text = cell; + } + + td.appendChild(document.createTextNode(text)); + td.className = column_classes[col_idx]; if (is_null) { td.className += ' null'; } @@ -496,7 +508,7 @@ } } - var table = document.getElementById('data-table'); + let table = document.getElementById('data-table'); table.appendChild(thead); table.appendChild(tbody); } @@ -505,7 +517,7 @@ function renderUnparsedResult(response) { clear(); - var data = document.getElementById('data-unparsed') + let data = document.getElementById('data-unparsed') if (response === '') { /// TODO: Fade or remove previous result when new request will be performed. @@ -531,12 +543,12 @@ } /// The choice of color theme is saved in browser. - var theme = window.localStorage.getItem('theme'); + let theme = window.localStorage.getItem('theme'); if (theme) { setColorTheme(theme); } else { /// Obtain system-level user preference - var media_query_list = window.matchMedia('prefers-color-scheme: dark') + let media_query_list = window.matchMedia('prefers-color-scheme: dark') if (media_query_list.matches) { /// Set without saving to localstorage diff --git a/programs/server/users.xml b/programs/server/users.xml index ef66891a6a0..b33dc0628d1 100644 --- a/programs/server/users.xml +++ b/programs/server/users.xml @@ -41,9 +41,18 @@ If you want to specify double SHA1, place it in 'password_double_sha1_hex' element. Example: e395796d6546b1b65db9d665cd43f0e858dd4303 - If you want to specify a previously defined LDAP server (see 'ldap_servers' in main config) for authentication, place its name in 'server' element inside 'ldap' element. + If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication, + place its name in 'server' element inside 'ldap' element. Example: my_ldap_server + If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config), + place 'kerberos' element instead of 'password' (and similar) elements. + The name part of the canonical principal name of the initiator must match the user name for authentication to succeed. + You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests + whose initiator's realm matches it. + Example: + Example: EXAMPLE.COM + How to generate decent password: Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' In first line will be password and in second - corresponding SHA256. 
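/// Illustrative sketch (not part of the patch): a minimal example of how the connection
/// pooling introduced above in programs/odbc-bridge/ODBCConnectionFactory.h is expected to
/// be consumed by the bridge handlers (compare SchemaAllowedHandler::handleRequest in this
/// diff, where pool_size comes from the odbc_bridge_connection_pool_size setting).
/// The factory, the ConnectionHolder and the nanodbc calls are taken from the patch itself;
/// the connection string, the query, the function name and the include paths below are
/// hypothetical placeholders added only for illustration.
#include <string>
#include <nanodbc/nanodbc.h>
#include "ODBCConnectionFactory.h"
#include "validateODBCConnectionString.h"

void runSampleQuery(const std::string & connection_string, size_t pool_size)
{
    /// get() returns a nanodbc::ConnectionHolder. The underlying nanodbc::connection is
    /// borrowed lazily from the per-connection-string BorrowedObjectPool on the first
    /// dereference and returned to the pool when the holder goes out of scope.
    auto connection = DB::ODBCConnectionFactory::instance().get(
        DB::validateODBCConnectionString(connection_string), pool_size);

    /// Same call pattern as ODBCBlockInputStream: run the query, then iterate the rows.
    nanodbc::result result = nanodbc::execute(*connection, NANODBC_TEXT("SELECT 1"));
    while (result.next())
    {
        if (!result.is_null(0))
        {
            auto value = result.get<int>(0);
            (void)value; /// A real handler would insert the value into an IColumn here.
        }
    }
}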
diff --git a/src/Access/AccessControlManager.cpp b/src/Access/AccessControlManager.cpp index e874bda5b69..66023c1c0ea 100644 --- a/src/Access/AccessControlManager.cpp +++ b/src/Access/AccessControlManager.cpp @@ -361,9 +361,9 @@ void AccessControlManager::addStoragesFromMainConfig( } -UUID AccessControlManager::login(const String & user_name, const String & password, const Poco::Net::IPAddress & address) const +UUID AccessControlManager::login(const Credentials & credentials, const Poco::Net::IPAddress & address) const { - return MultipleAccessStorage::login(user_name, password, address, *external_authenticators); + return MultipleAccessStorage::login(credentials, address, *external_authenticators); } void AccessControlManager::setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config) @@ -403,7 +403,7 @@ void AccessControlManager::checkSettingNameIsAllowed(const std::string_view & se std::shared_ptr AccessControlManager::getContextAccess( const UUID & user_id, - const boost::container::flat_set & current_roles, + const std::vector & current_roles, bool use_default_roles, const Settings & settings, const String & current_database, @@ -411,7 +411,7 @@ std::shared_ptr AccessControlManager::getContextAccess( { ContextAccessParams params; params.user_id = user_id; - params.current_roles = current_roles; + params.current_roles.insert(current_roles.begin(), current_roles.end()); params.use_default_roles = use_default_roles; params.current_database = current_database; params.readonly = settings.readonly; @@ -444,8 +444,8 @@ std::shared_ptr AccessControlManager::getContextAccess(cons std::shared_ptr AccessControlManager::getEnabledRoles( - const boost::container::flat_set & current_roles, - const boost::container::flat_set & current_roles_with_admin_option) const + const std::vector & current_roles, + const std::vector & current_roles_with_admin_option) const { return role_cache->getEnabledRoles(current_roles, current_roles_with_admin_option); } diff --git a/src/Access/AccessControlManager.h b/src/Access/AccessControlManager.h index 07edfd93475..789c33af1c1 100644 --- a/src/Access/AccessControlManager.h +++ b/src/Access/AccessControlManager.h @@ -109,12 +109,12 @@ public: bool isSettingNameAllowed(const std::string_view & name) const; void checkSettingNameIsAllowed(const std::string_view & name) const; - UUID login(const String & user_name, const String & password, const Poco::Net::IPAddress & address) const; + UUID login(const Credentials & credentials, const Poco::Net::IPAddress & address) const; void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); std::shared_ptr getContextAccess( const UUID & user_id, - const boost::container::flat_set & current_roles, + const std::vector & current_roles, bool use_default_roles, const Settings & settings, const String & current_database, @@ -123,8 +123,8 @@ public: std::shared_ptr getContextAccess(const ContextAccessParams & params) const; std::shared_ptr getEnabledRoles( - const boost::container::flat_set & current_roles, - const boost::container::flat_set & current_roles_with_admin_option) const; + const std::vector & current_roles, + const std::vector & current_roles_with_admin_option) const; std::shared_ptr getEnabledRowPolicies( const UUID & user_id, diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index 8ce71dd8da8..f9c1d23350d 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -7,16 +7,19 @@ namespace DB { +namespace ErrorCodes +{ + extern const 
int BAD_ARGUMENTS; +} + namespace { - using Kind = AccessRightsElementWithOptions::Kind; - struct ProtoElement { AccessFlags access_flags; boost::container::small_vector full_name; bool grant_option = false; - Kind kind = Kind::GRANT; + bool is_partial_revoke = false; friend bool operator<(const ProtoElement & left, const ProtoElement & right) { @@ -43,8 +46,8 @@ namespace if (int cmp = compare_name(left.full_name, right.full_name, 1)) return cmp < 0; - if (left.kind != right.kind) - return (left.kind == Kind::GRANT); + if (left.is_partial_revoke != right.is_partial_revoke) + return right.is_partial_revoke; if (left.grant_option != right.grant_option) return right.grant_option; @@ -55,12 +58,12 @@ namespace return (left.access_flags < right.access_flags); } - AccessRightsElementWithOptions getResult() const + AccessRightsElement getResult() const { - AccessRightsElementWithOptions res; + AccessRightsElement res; res.access_flags = access_flags; res.grant_option = grant_option; - res.kind = kind; + res.is_partial_revoke = is_partial_revoke; switch (full_name.size()) { case 0: @@ -105,11 +108,11 @@ namespace class ProtoElements : public std::vector { public: - AccessRightsElementsWithOptions getResult() const + AccessRightsElements getResult() const { ProtoElements sorted = *this; boost::range::sort(sorted); - AccessRightsElementsWithOptions res; + AccessRightsElements res; res.reserve(sorted.size()); for (size_t i = 0; i != sorted.size();) @@ -144,7 +147,7 @@ namespace { return (element.full_name.size() != 3) || (element.full_name[0] != start_element.full_name[0]) || (element.full_name[1] != start_element.full_name[1]) || (element.grant_option != start_element.grant_option) - || (element.kind != start_element.kind); + || (element.is_partial_revoke != start_element.is_partial_revoke); }); return it - (begin() + start); @@ -153,7 +156,7 @@ namespace /// Collects columns together to write multiple columns into one AccessRightsElement. /// That procedure allows to output access rights in more compact way, /// e.g. "SELECT(x, y)" instead of "SELECT(x), SELECT(y)". 
- void appendResultWithElementsWithDifferenceInColumnOnly(size_t start, size_t count, AccessRightsElementsWithOptions & res) const + void appendResultWithElementsWithDifferenceInColumnOnly(size_t start, size_t count, AccessRightsElements & res) const { const auto * pbegin = data() + start; const auto * pend = pbegin + count; @@ -180,7 +183,7 @@ namespace res.emplace_back(); auto & back = res.back(); back.grant_option = pbegin->grant_option; - back.kind = pbegin->kind; + back.is_partial_revoke = pbegin->is_partial_revoke; back.any_database = false; back.database = pbegin->full_name[0]; back.any_table = false; @@ -515,10 +518,10 @@ private: auto grants = flags - parent_fl; if (revokes) - res.push_back(ProtoElement{revokes, full_name, false, Kind::REVOKE}); + res.push_back(ProtoElement{revokes, full_name, false, true}); if (grants) - res.push_back(ProtoElement{grants, full_name, false, Kind::GRANT}); + res.push_back(ProtoElement{grants, full_name, false, false}); if (node.children) { @@ -550,16 +553,16 @@ private: auto grants = flags - parent_fl - grants_go; if (revokes) - res.push_back(ProtoElement{revokes, full_name, false, Kind::REVOKE}); + res.push_back(ProtoElement{revokes, full_name, false, true}); if (revokes_go) - res.push_back(ProtoElement{revokes_go, full_name, true, Kind::REVOKE}); + res.push_back(ProtoElement{revokes_go, full_name, true, true}); if (grants) - res.push_back(ProtoElement{grants, full_name, false, Kind::GRANT}); + res.push_back(ProtoElement{grants, full_name, false, false}); if (grants_go) - res.push_back(ProtoElement{grants_go, full_name, true, Kind::GRANT}); + res.push_back(ProtoElement{grants_go, full_name, true, false}); if (node && node->children) { @@ -774,8 +777,10 @@ void AccessRights::grantImpl(const AccessFlags & flags, const Args &... args) } template -void AccessRights::grantImpl(const AccessRightsElement & element) +void AccessRights::grantImplHelper(const AccessRightsElement & element) { + assert(!element.is_partial_revoke); + assert(!element.grant_option || with_grant_option); if (element.any_database) grantImpl(element.access_flags); else if (element.any_table) @@ -786,6 +791,24 @@ void AccessRights::grantImpl(const AccessRightsElement & element) grantImpl(element.access_flags, element.database, element.table, element.columns); } +template +void AccessRights::grantImpl(const AccessRightsElement & element) +{ + if (element.is_partial_revoke) + throw Exception("A partial revoke should be revoked, not granted", ErrorCodes::BAD_ARGUMENTS); + if constexpr (with_grant_option) + { + grantImplHelper(element); + } + else + { + if (element.grant_option) + grantImplHelper(element); + else + grantImplHelper(element); + } +} + template void AccessRights::grantImpl(const AccessRightsElements & elements) { @@ -830,8 +853,9 @@ void AccessRights::revokeImpl(const AccessFlags & flags, const Args &... 
args) } template -void AccessRights::revokeImpl(const AccessRightsElement & element) +void AccessRights::revokeImplHelper(const AccessRightsElement & element) { + assert(!element.grant_option || grant_option); if (element.any_database) revokeImpl(element.access_flags); else if (element.any_table) @@ -842,6 +866,22 @@ void AccessRights::revokeImpl(const AccessRightsElement & element) revokeImpl(element.access_flags, element.database, element.table, element.columns); } +template +void AccessRights::revokeImpl(const AccessRightsElement & element) +{ + if constexpr (grant_option) + { + revokeImplHelper(element); + } + else + { + if (element.grant_option) + revokeImplHelper(element); + else + revokeImplHelper(element); + } +} + template void AccessRights::revokeImpl(const AccessRightsElements & elements) { @@ -868,7 +908,7 @@ void AccessRights::revokeGrantOption(const AccessRightsElement & element) { revo void AccessRights::revokeGrantOption(const AccessRightsElements & elements) { revokeImpl(elements); } -AccessRightsElementsWithOptions AccessRights::getElements() const +AccessRightsElements AccessRights::getElements() const { #if 0 logTree(); @@ -903,8 +943,9 @@ bool AccessRights::isGrantedImpl(const AccessFlags & flags, const Args &... args } template -bool AccessRights::isGrantedImpl(const AccessRightsElement & element) const +bool AccessRights::isGrantedImplHelper(const AccessRightsElement & element) const { + assert(!element.grant_option || grant_option); if (element.any_database) return isGrantedImpl(element.access_flags); else if (element.any_table) @@ -915,6 +956,22 @@ bool AccessRights::isGrantedImpl(const AccessRightsElement & element) const return isGrantedImpl(element.access_flags, element.database, element.table, element.columns); } +template +bool AccessRights::isGrantedImpl(const AccessRightsElement & element) const +{ + if constexpr (grant_option) + { + return isGrantedImplHelper(element); + } + else + { + if (element.grant_option) + return isGrantedImplHelper(element); + else + return isGrantedImplHelper(element); + } +} + template bool AccessRights::isGrantedImpl(const AccessRightsElements & elements) const { diff --git a/src/Access/AccessRights.h b/src/Access/AccessRights.h index c610795ab45..a90616ea27f 100644 --- a/src/Access/AccessRights.h +++ b/src/Access/AccessRights.h @@ -30,7 +30,7 @@ public: String toString() const; /// Returns the information about all the access granted. - AccessRightsElementsWithOptions getElements() const; + AccessRightsElements getElements() const; /// Grants access on a specified database/table/column. /// Does nothing if the specified access has been already granted. @@ -119,12 +119,15 @@ private: template void grantImpl(const AccessFlags & flags, const Args &... args); - template + template void grantImpl(const AccessRightsElement & element); - template + template void grantImpl(const AccessRightsElements & elements); + template + void grantImplHelper(const AccessRightsElement & element); + template void revokeImpl(const AccessFlags & flags, const Args &... args); @@ -134,6 +137,9 @@ private: template void revokeImpl(const AccessRightsElements & elements); + template + void revokeImplHelper(const AccessRightsElement & element); + template bool isGrantedImpl(const AccessFlags & flags, const Args &... 
args) const; @@ -143,6 +149,9 @@ private: template bool isGrantedImpl(const AccessRightsElements & elements) const; + template + bool isGrantedImplHelper(const AccessRightsElement & element) const; + void logTree() const; struct Node; diff --git a/src/Access/AccessRightsElement.cpp b/src/Access/AccessRightsElement.cpp index e69fb6d3b74..823019ffebd 100644 --- a/src/Access/AccessRightsElement.cpp +++ b/src/Access/AccessRightsElement.cpp @@ -1,169 +1,162 @@ #include -#include #include -#include -#include -#include #include -#include namespace DB { namespace { - using Kind = AccessRightsElementWithOptions::Kind; - - String formatOptions(bool grant_option, Kind kind, const String & inner_part) + void formatColumnNames(const Strings & columns, String & result) { - if (kind == Kind::REVOKE) + result += "("; + bool need_comma = false; + for (const auto & column : columns) { - if (grant_option) - return "REVOKE GRANT OPTION " + inner_part; - else - return "REVOKE " + inner_part; - } - else - { - if (grant_option) - return "GRANT " + inner_part + " WITH GRANT OPTION"; - else - return "GRANT " + inner_part; + if (need_comma) + result += ", "; + need_comma = true; + result += backQuoteIfNeed(column); } + result += ")"; } - - String formatONClause(const String & database, bool any_database, const String & table, bool any_table) + void formatONClause(const String & database, bool any_database, const String & table, bool any_table, String & result) { - String msg = "ON "; - + result += "ON "; if (any_database) - msg += "*."; - else if (!database.empty()) - msg += backQuoteIfNeed(database) + "."; - - if (any_table) - msg += "*"; + { + result += "*.*"; + } else - msg += backQuoteIfNeed(table); - return msg; + { + if (!database.empty()) + { + result += backQuoteIfNeed(database); + result += "."; + } + if (any_table) + result += "*"; + else + result += backQuoteIfNeed(table); + } } - - String formatAccessFlagsWithColumns(const AccessFlags & access_flags, const Strings & columns, bool any_column) + void formatOptions(bool grant_option, bool is_partial_revoke, String & result) { - String columns_in_parentheses; + if (is_partial_revoke) + { + if (grant_option) + result.insert(0, "REVOKE GRANT OPTION "); + else + result.insert(0, "REVOKE "); + } + else + { + if (grant_option) + result.insert(0, "GRANT ").append(" WITH GRANT OPTION"); + else + result.insert(0, "GRANT "); + } + } + + void formatAccessFlagsWithColumns(const AccessFlags & access_flags, const Strings & columns, bool any_column, String & result) + { + String columns_as_str; if (!any_column) { if (columns.empty()) - return "USAGE"; - for (const auto & column : columns) { - columns_in_parentheses += columns_in_parentheses.empty() ? 
"(" : ", "; - columns_in_parentheses += backQuoteIfNeed(column); + result += "USAGE"; + return; } - columns_in_parentheses += ")"; + formatColumnNames(columns, columns_as_str); } auto keywords = access_flags.toKeywords(); if (keywords.empty()) - return "USAGE"; + { + result += "USAGE"; + return; + } - String msg; + bool need_comma = false; for (const std::string_view & keyword : keywords) { - if (!msg.empty()) - msg += ", "; - msg += String{keyword} + columns_in_parentheses; + if (need_comma) + result.append(", "); + need_comma = true; + result += keyword; + result += columns_as_str; } - return msg; } -} - -String AccessRightsElement::toString() const -{ - return formatAccessFlagsWithColumns(access_flags, columns, any_column) + " " + formatONClause(database, any_database, table, any_table); -} - -String AccessRightsElementWithOptions::toString() const -{ - return formatOptions(grant_option, kind, AccessRightsElement::toString()); -} - -String AccessRightsElements::toString() const -{ - if (empty()) - return "USAGE ON *.*"; - - String res; - String inner_part; - - for (size_t i = 0; i != size(); ++i) + String toStringImpl(const AccessRightsElement & element, bool with_options) { - const auto & element = (*this)[i]; - - if (!inner_part.empty()) - inner_part += ", "; - inner_part += formatAccessFlagsWithColumns(element.access_flags, element.columns, element.any_column); - - bool next_element_uses_same_table = false; - if (i != size() - 1) - { - const auto & next_element = (*this)[i + 1]; - if (element.sameDatabaseAndTable(next_element)) - next_element_uses_same_table = true; - } - - if (!next_element_uses_same_table) - { - if (!res.empty()) - res += ", "; - res += inner_part + " " + formatONClause(element.database, element.any_database, element.table, element.any_table); - inner_part.clear(); - } + String result; + formatAccessFlagsWithColumns(element.access_flags, element.columns, element.any_column, result); + result += " "; + formatONClause(element.database, element.any_database, element.table, element.any_table, result); + if (with_options) + formatOptions(element.grant_option, element.is_partial_revoke, result); + return result; } - return res; -} - -String AccessRightsElementsWithOptions::toString() const -{ - if (empty()) - return "GRANT USAGE ON *.*"; - - String res; - String inner_part; - - for (size_t i = 0; i != size(); ++i) + String toStringImpl(const AccessRightsElements & elements, bool with_options) { - const auto & element = (*this)[i]; + if (elements.empty()) + return with_options ? 
"GRANT USAGE ON *.*" : "USAGE ON *.*"; - if (!inner_part.empty()) - inner_part += ", "; - inner_part += formatAccessFlagsWithColumns(element.access_flags, element.columns, element.any_column); + String result; + String part; - bool next_element_uses_same_mode_and_table = false; - if (i != size() - 1) + for (size_t i = 0; i != elements.size(); ++i) { - const auto & next_element = (*this)[i + 1]; - if (element.sameDatabaseAndTable(next_element) && element.sameOptions(next_element)) - next_element_uses_same_mode_and_table = true; + const auto & element = elements[i]; + + if (!part.empty()) + part += ", "; + formatAccessFlagsWithColumns(element.access_flags, element.columns, element.any_column, part); + + bool next_element_uses_same_table_and_options = false; + if (i != elements.size() - 1) + { + const auto & next_element = elements[i + 1]; + if (element.sameDatabaseAndTable(next_element) && element.sameOptions(next_element)) + next_element_uses_same_table_and_options = true; + } + + if (!next_element_uses_same_table_and_options) + { + part += " "; + formatONClause(element.database, element.any_database, element.table, element.any_table, part); + if (with_options) + formatOptions(element.grant_option, element.is_partial_revoke, part); + if (result.empty()) + result = std::move(part); + else + result.append(", ").append(part); + part.clear(); + } } - if (!next_element_uses_same_mode_and_table) - { - if (!res.empty()) - res += ", "; - res += formatOptions( - element.grant_option, - element.kind, - inner_part + " " + formatONClause(element.database, element.any_database, element.table, element.any_table)); - inner_part.clear(); - } + return result; } +} - return res; + +String AccessRightsElement::toString() const { return toStringImpl(*this, true); } +String AccessRightsElement::toStringWithoutOptions() const { return toStringImpl(*this, false); } +String AccessRightsElements::toString() const { return toStringImpl(*this, true); } +String AccessRightsElements::toStringWithoutOptions() const { return toStringImpl(*this, false); } + +void AccessRightsElements::eraseNonGrantable() +{ + boost::range::remove_erase_if(*this, [](AccessRightsElement & element) + { + element.eraseNonGrantable(); + return element.empty(); + }); } } diff --git a/src/Access/AccessRightsElement.h b/src/Access/AccessRightsElement.h index 36cb64e6eba..c76f019bc61 100644 --- a/src/Access/AccessRightsElement.h +++ b/src/Access/AccessRightsElement.h @@ -16,6 +16,8 @@ struct AccessRightsElement bool any_database = true; bool any_table = true; bool any_column = true; + bool grant_option = false; + bool is_partial_revoke = false; AccessRightsElement() = default; AccessRightsElement(const AccessRightsElement &) = default; @@ -73,7 +75,7 @@ struct AccessRightsElement bool empty() const { return !access_flags || (!any_column && columns.empty()); } - auto toTuple() const { return std::tie(access_flags, any_database, database, any_table, table, any_column, columns); } + auto toTuple() const { return std::tie(access_flags, any_database, database, any_table, table, any_column, columns, grant_option, is_partial_revoke); } friend bool operator==(const AccessRightsElement & left, const AccessRightsElement & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const AccessRightsElement & left, const AccessRightsElement & right) { return !(left == right); } @@ -83,44 +85,36 @@ struct AccessRightsElement && (any_table == other.any_table); } - bool isEmptyDatabase() const { return !any_database && database.empty(); } - - 
/// If the database is empty, replaces it with `new_database`. Otherwise does nothing. - void replaceEmptyDatabase(const String & new_database); - - /// Resets flags which cannot be granted. - void removeNonGrantableFlags(); - - /// Returns a human-readable representation like "SELECT, UPDATE(x, y) ON db.table". - String toString() const; -}; - - -struct AccessRightsElementWithOptions : public AccessRightsElement -{ - bool grant_option = false; - - enum class Kind + bool sameOptions(const AccessRightsElement & other) const { - GRANT, - REVOKE, - }; - Kind kind = Kind::GRANT; - - bool sameOptions(const AccessRightsElementWithOptions & other) const - { - return (grant_option == other.grant_option) && (kind == other.kind); + return (grant_option == other.grant_option) && (is_partial_revoke == other.is_partial_revoke); } - auto toTuple() const { return std::tie(access_flags, any_database, database, any_table, table, any_column, columns, grant_option, kind); } - friend bool operator==(const AccessRightsElementWithOptions & left, const AccessRightsElementWithOptions & right) { return left.toTuple() == right.toTuple(); } - friend bool operator!=(const AccessRightsElementWithOptions & left, const AccessRightsElementWithOptions & right) { return !(left == right); } - /// Resets flags which cannot be granted. - void removeNonGrantableFlags(); + void eraseNonGrantable() + { + if (!any_column) + access_flags &= AccessFlags::allFlagsGrantableOnColumnLevel(); + else if (!any_table) + access_flags &= AccessFlags::allFlagsGrantableOnTableLevel(); + else if (!any_database) + access_flags &= AccessFlags::allFlagsGrantableOnDatabaseLevel(); + else + access_flags &= AccessFlags::allFlagsGrantableOnGlobalLevel(); + } + + bool isEmptyDatabase() const { return !any_database && database.empty(); } + + /// If the database is empty, replaces it with `current_database`. Otherwise does nothing. + void replaceEmptyDatabase(const String & current_database) + { + if (isEmptyDatabase()) + database = current_database; + } /// Returns a human-readable representation like "GRANT SELECT, UPDATE(x, y) ON db.table". String toString() const; + String toStringWithoutOptions() const; }; @@ -130,77 +124,29 @@ class AccessRightsElements : public std::vector public: bool empty() const { return std::all_of(begin(), end(), [](const AccessRightsElement & e) { return e.empty(); }); } - /// Replaces the empty database with `new_database`. - void replaceEmptyDatabase(const String & new_database); + bool sameDatabaseAndTable() const + { + return (size() < 2) || std::all_of(std::next(begin()), end(), [this](const AccessRightsElement & e) { return e.sameDatabaseAndTable(front()); }); + } + + bool sameOptions() const + { + return (size() < 2) || std::all_of(std::next(begin()), end(), [this](const AccessRightsElement & e) { return e.sameOptions(front()); }); + } /// Resets flags which cannot be granted. - void removeNonGrantableFlags(); + void eraseNonGrantable(); + + /// If the database is empty, replaces it with `current_database`. Otherwise does nothing. + void replaceEmptyDatabase(const String & current_database) + { + for (auto & element : *this) + element.replaceEmptyDatabase(current_database); + } /// Returns a human-readable representation like "GRANT SELECT, UPDATE(x, y) ON db.table". String toString() const; + String toStringWithoutOptions() const; }; - -class AccessRightsElementsWithOptions : public std::vector -{ -public: - /// Replaces the empty database with `new_database`. 
- void replaceEmptyDatabase(const String & new_database); - - /// Resets flags which cannot be granted. - void removeNonGrantableFlags(); - - /// Returns a human-readable representation like "GRANT SELECT, UPDATE(x, y) ON db.table". - String toString() const; -}; - - -inline void AccessRightsElement::replaceEmptyDatabase(const String & new_database) -{ - if (isEmptyDatabase()) - database = new_database; -} - -inline void AccessRightsElements::replaceEmptyDatabase(const String & new_database) -{ - for (auto & element : *this) - element.replaceEmptyDatabase(new_database); -} - -inline void AccessRightsElementsWithOptions::replaceEmptyDatabase(const String & new_database) -{ - for (auto & element : *this) - element.replaceEmptyDatabase(new_database); -} - -inline void AccessRightsElement::removeNonGrantableFlags() -{ - if (!any_column) - access_flags &= AccessFlags::allFlagsGrantableOnColumnLevel(); - else if (!any_table) - access_flags &= AccessFlags::allFlagsGrantableOnTableLevel(); - else if (!any_database) - access_flags &= AccessFlags::allFlagsGrantableOnDatabaseLevel(); - else - access_flags &= AccessFlags::allFlagsGrantableOnGlobalLevel(); -} - -inline void AccessRightsElementWithOptions::removeNonGrantableFlags() -{ - if (kind == Kind::GRANT) - AccessRightsElement::removeNonGrantableFlags(); -} - -inline void AccessRightsElements::removeNonGrantableFlags() -{ - for (auto & element : *this) - element.removeNonGrantableFlags(); -} - -inline void AccessRightsElementsWithOptions::removeNonGrantableFlags() -{ - for (auto & element : *this) - element.removeNonGrantableFlags(); -} - } diff --git a/src/Access/AccessType.h b/src/Access/AccessType.h index 5a84aa66739..d5185b9931d 100644 --- a/src/Access/AccessType.h +++ b/src/Access/AccessType.h @@ -62,8 +62,8 @@ enum class AccessType enabled implicitly by the grant ALTER_TABLE */\ M(ALTER_SETTINGS, "ALTER SETTING, ALTER MODIFY SETTING, MODIFY SETTING", TABLE, ALTER_TABLE) /* allows to execute ALTER MODIFY SETTING */\ M(ALTER_MOVE_PARTITION, "ALTER MOVE PART, MOVE PARTITION, MOVE PART", TABLE, ALTER_TABLE) \ - M(ALTER_FETCH_PARTITION, "FETCH PARTITION", TABLE, ALTER_TABLE) \ - M(ALTER_FREEZE_PARTITION, "FREEZE PARTITION", TABLE, ALTER_TABLE) \ + M(ALTER_FETCH_PARTITION, "ALTER FETCH PART, FETCH PARTITION", TABLE, ALTER_TABLE) \ + M(ALTER_FREEZE_PARTITION, "FREEZE PARTITION, UNFREEZE", TABLE, ALTER_TABLE) \ \ M(ALTER_TABLE, "", GROUP, ALTER) \ \ @@ -124,11 +124,13 @@ enum class AccessType M(SYSTEM_DROP_DNS_CACHE, "SYSTEM DROP DNS, DROP DNS CACHE, DROP DNS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_MARK_CACHE, "SYSTEM DROP MARK, DROP MARK CACHE, DROP MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_UNCOMPRESSED_CACHE, "SYSTEM DROP UNCOMPRESSED, DROP UNCOMPRESSED CACHE, DROP UNCOMPRESSED", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_DROP_MMAP_CACHE, "SYSTEM DROP MMAP, DROP MMAP CACHE, DROP MMAP", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_CACHE, "DROP CACHE", GROUP, SYSTEM) \ M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \ M(SYSTEM_RELOAD_SYMBOLS, "RELOAD SYMBOLS", GLOBAL, SYSTEM_RELOAD) \ M(SYSTEM_RELOAD_DICTIONARY, "SYSTEM RELOAD DICTIONARIES, RELOAD DICTIONARY, RELOAD DICTIONARIES", GLOBAL, SYSTEM_RELOAD) \ + M(SYSTEM_RELOAD_MODEL, "SYSTEM RELOAD MODELS, RELOAD MODEL, RELOAD MODELS", GLOBAL, SYSTEM_RELOAD) \ M(SYSTEM_RELOAD_EMBEDDED_DICTIONARIES, "RELOAD EMBEDDED 
DICTIONARIES", GLOBAL, SYSTEM_RELOAD) /* implicitly enabled by the grant SYSTEM_RELOAD_DICTIONARY ON *.* */\ M(SYSTEM_RELOAD, "", GROUP, SYSTEM) \ M(SYSTEM_MERGES, "SYSTEM STOP MERGES, SYSTEM START MERGES, STOP_MERGES, START MERGES", TABLE, SYSTEM) \ diff --git a/src/Access/AllowedClientHosts.h b/src/Access/AllowedClientHosts.h index 615782d75a2..a6895b120e0 100644 --- a/src/Access/AllowedClientHosts.h +++ b/src/Access/AllowedClientHosts.h @@ -14,7 +14,7 @@ namespace DB using Strings = std::vector; -/// Represents lists of hosts an user is allowed to connect to server from. +/// Represents lists of hosts a user is allowed to connect to server from. class AllowedClientHosts { public: diff --git a/src/Access/Authentication.cpp b/src/Access/Authentication.cpp index 19c40c068b4..54d94d905ae 100644 --- a/src/Access/Authentication.cpp +++ b/src/Access/Authentication.cpp @@ -1,5 +1,8 @@ #include +#include #include +#include +#include #include #include @@ -8,8 +11,8 @@ namespace DB { namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } @@ -32,14 +35,13 @@ Authentication::Digest Authentication::getPasswordDoubleSHA1() const return engine.digest(); } - case SHA256_PASSWORD: - throw Exception("Cannot get password double SHA1 for user with 'SHA256_PASSWORD' authentication", ErrorCodes::BAD_ARGUMENTS); - case DOUBLE_SHA1_PASSWORD: return password_hash; - case LDAP_SERVER: - throw Exception("Cannot get password double SHA1 for user with 'LDAP_SERVER' authentication", ErrorCodes::BAD_ARGUMENTS); + case SHA256_PASSWORD: + case LDAP: + case KERBEROS: + throw Exception("Cannot get password double SHA1 hash for authentication type " + toString(type), ErrorCodes::LOGICAL_ERROR); case MAX_TYPE: break; @@ -48,44 +50,76 @@ Authentication::Digest Authentication::getPasswordDoubleSHA1() const } -bool Authentication::isCorrectPassword(const String & user_, const String & password_, const ExternalAuthenticators & external_authenticators) const +bool Authentication::areCredentialsValid(const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const { - switch (type) + if (!credentials.isReady()) + return false; + + if (const auto * gss_acceptor_context = dynamic_cast(&credentials)) { - case NO_PASSWORD: - return true; - - case PLAINTEXT_PASSWORD: + switch (type) { - if (password_ == std::string_view{reinterpret_cast(password_hash.data()), password_hash.size()}) - return true; + case NO_PASSWORD: + case PLAINTEXT_PASSWORD: + case SHA256_PASSWORD: + case DOUBLE_SHA1_PASSWORD: + case LDAP: + throw Require("ClickHouse Basic Authentication"); - // For compatibility with MySQL clients which support only native authentication plugin, SHA1 can be passed instead of password. - auto password_sha1 = encodeSHA1(password_hash); - return password_ == std::string_view{reinterpret_cast(password_sha1.data()), password_sha1.size()}; + case KERBEROS: + return external_authenticators.checkKerberosCredentials(kerberos_realm, *gss_acceptor_context); + + case MAX_TYPE: + break; } - - case SHA256_PASSWORD: - return encodeSHA256(password_) == password_hash; - - case DOUBLE_SHA1_PASSWORD: - { - auto first_sha1 = encodeSHA1(password_); - - /// If it was MySQL compatibility server, then first_sha1 already contains double SHA1. 
- if (first_sha1 == password_hash) - return true; - - return encodeSHA1(first_sha1) == password_hash; - } - - case LDAP_SERVER: - return external_authenticators.checkLDAPCredentials(server_name, user_, password_); - - case MAX_TYPE: - break; } - throw Exception("Cannot check if the password is correct for authentication type " + toString(type), ErrorCodes::NOT_IMPLEMENTED); + + if (const auto * basic_credentials = dynamic_cast(&credentials)) + { + switch (type) + { + case NO_PASSWORD: + return true; // N.B. even if the password is not empty! + + case PLAINTEXT_PASSWORD: + { + if (basic_credentials->getPassword() == std::string_view{reinterpret_cast(password_hash.data()), password_hash.size()}) + return true; + + // For compatibility with MySQL clients which support only native authentication plugin, SHA1 can be passed instead of password. + const auto password_sha1 = encodeSHA1(password_hash); + return basic_credentials->getPassword() == std::string_view{reinterpret_cast(password_sha1.data()), password_sha1.size()}; + } + + case SHA256_PASSWORD: + return encodeSHA256(basic_credentials->getPassword()) == password_hash; + + case DOUBLE_SHA1_PASSWORD: + { + const auto first_sha1 = encodeSHA1(basic_credentials->getPassword()); + + /// If it was MySQL compatibility server, then first_sha1 already contains double SHA1. + if (first_sha1 == password_hash) + return true; + + return encodeSHA1(first_sha1) == password_hash; + } + + case LDAP: + return external_authenticators.checkLDAPCredentials(ldap_server_name, *basic_credentials); + + case KERBEROS: + throw Require(kerberos_realm); + + case MAX_TYPE: + break; + } + } + + if ([[maybe_unused]] const auto * always_allow_credentials = dynamic_cast(&credentials)) + return true; + + throw Exception("areCredentialsValid(): authentication type " + toString(type) + " not supported", ErrorCodes::NOT_IMPLEMENTED); } } diff --git a/src/Access/Authentication.h b/src/Access/Authentication.h index bd22b7f0ff5..54542cb504a 100644 --- a/src/Access/Authentication.h +++ b/src/Access/Authentication.h @@ -6,8 +6,6 @@ #include #include #include -#include -#include namespace DB @@ -20,13 +18,10 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +class Credentials; class ExternalAuthenticators; -struct LDAPSearchParams; -using LDAPSearchParamsList = std::vector; -using LDAPSearchResults = std::set; -using LDAPSearchResultsList = std::vector; -/// Authentication type and encrypted password for checking when an user logins. +/// Authentication type and encrypted password for checking when a user logins. class Authentication { public: @@ -46,7 +41,10 @@ public: DOUBLE_SHA1_PASSWORD, /// Password is checked by a [remote] LDAP server. Connection will be made at each authentication attempt. - LDAP_SERVER, + LDAP, + + /// Kerberos authentication performed through GSS-API negotiation loop. + KERBEROS, MAX_TYPE, }; @@ -58,6 +56,18 @@ public: static const TypeInfo & get(Type type_); }; + // A signaling class used to communicate requirements for credentials. + template + class Require : public Exception + { + public: + explicit Require(const String & realm_); + const String & getRealm() const; + + private: + const String realm; + }; + using Digest = std::vector; Authentication(Authentication::Type type_ = NO_PASSWORD) : type(type_) {} @@ -88,14 +98,16 @@ public: /// Allowed to use for Type::NO_PASSWORD, Type::PLAINTEXT_PASSWORD, Type::DOUBLE_SHA1_PASSWORD. Digest getPasswordDoubleSHA1() const; - /// Sets an external authentication server name. 
- /// When authentication type is LDAP_SERVER, server name is expected to be the name of a preconfigured LDAP server. - const String & getServerName() const; - void setServerName(const String & server_name_); + /// Sets the server name for authentication type LDAP. + const String & getLDAPServerName() const; + void setLDAPServerName(const String & name); - /// Checks if the provided password is correct. Returns false if not. - /// User name and external authenticators are used by the specific authentication types only (e.g., LDAP_SERVER). - bool isCorrectPassword(const String & user_, const String & password_, const ExternalAuthenticators & external_authenticators) const; + /// Sets the realm name for authentication type KERBEROS. + const String & getKerberosRealm() const; + void setKerberosRealm(const String & realm); + + /// Checks the credentials (passwords, readiness, etc.) + bool areCredentialsValid(const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; friend bool operator ==(const Authentication & lhs, const Authentication & rhs) { return (lhs.type == rhs.type) && (lhs.password_hash == rhs.password_hash); } friend bool operator !=(const Authentication & lhs, const Authentication & rhs) { return !(lhs == rhs); } @@ -109,7 +121,8 @@ private: Type type = Type::NO_PASSWORD; Digest password_hash; - String server_name; + String ldap_server_name; + String kerberos_realm; }; @@ -144,16 +157,35 @@ inline const Authentication::TypeInfo & Authentication::TypeInfo::get(Type type_ static const auto info = make_info("DOUBLE_SHA1_PASSWORD"); return info; } - case LDAP_SERVER: + case LDAP: { - static const auto info = make_info("LDAP_SERVER"); + static const auto info = make_info("LDAP"); return info; } - case MAX_TYPE: break; + case KERBEROS: + { + static const auto info = make_info("KERBEROS"); + return info; + } + case MAX_TYPE: + break; } throw Exception("Unknown authentication type: " + std::to_string(static_cast(type_)), ErrorCodes::LOGICAL_ERROR); } +template +Authentication::Require::Require(const String & realm_) + : Exception("Credentials required", ErrorCodes::BAD_ARGUMENTS) + , realm(realm_) +{ +} + +template +const String & Authentication::Require::getRealm() const +{ + return realm; +} + inline String toString(Authentication::Type type_) { return Authentication::TypeInfo::get(type_).raw_name; @@ -186,9 +218,6 @@ inline void Authentication::setPassword(const String & password_) { switch (type) { - case NO_PASSWORD: - throw Exception("Cannot specify password for the 'NO_PASSWORD' authentication type", ErrorCodes::LOGICAL_ERROR); - case PLAINTEXT_PASSWORD: return setPasswordHashBinary(encodePlainText(password_)); @@ -198,10 +227,13 @@ inline void Authentication::setPassword(const String & password_) case DOUBLE_SHA1_PASSWORD: return setPasswordHashBinary(encodeDoubleSHA1(password_)); - case LDAP_SERVER: - throw Exception("Cannot specify password for the 'LDAP_SERVER' authentication type", ErrorCodes::LOGICAL_ERROR); + case NO_PASSWORD: + case LDAP: + case KERBEROS: + throw Exception("Cannot specify password for authentication type " + toString(type), ErrorCodes::LOGICAL_ERROR); - case MAX_TYPE: break; + case MAX_TYPE: + break; } throw Exception("setPassword(): authentication type " + toString(type) + " not supported", ErrorCodes::NOT_IMPLEMENTED); } @@ -225,8 +257,9 @@ inline void Authentication::setPasswordHashHex(const String & hash) inline String Authentication::getPasswordHashHex() const { - if (type == LDAP_SERVER) - throw Exception("Cannot 
get password of a user with the 'LDAP_SERVER' authentication type", ErrorCodes::LOGICAL_ERROR); + if (type == LDAP || type == KERBEROS) + throw Exception("Cannot get password hex hash for authentication type " + toString(type), ErrorCodes::LOGICAL_ERROR); + String hex; hex.resize(password_hash.size() * 2); boost::algorithm::hex(password_hash.begin(), password_hash.end(), hex.data()); @@ -238,9 +271,6 @@ inline void Authentication::setPasswordHashBinary(const Digest & hash) { switch (type) { - case NO_PASSWORD: - throw Exception("Cannot specify password for the 'NO_PASSWORD' authentication type", ErrorCodes::LOGICAL_ERROR); - case PLAINTEXT_PASSWORD: { password_hash = hash; @@ -269,22 +299,35 @@ inline void Authentication::setPasswordHashBinary(const Digest & hash) return; } - case LDAP_SERVER: - throw Exception("Cannot specify password for the 'LDAP_SERVER' authentication type", ErrorCodes::LOGICAL_ERROR); + case NO_PASSWORD: + case LDAP: + case KERBEROS: + throw Exception("Cannot specify password binary hash for authentication type " + toString(type), ErrorCodes::LOGICAL_ERROR); - case MAX_TYPE: break; + case MAX_TYPE: + break; } throw Exception("setPasswordHashBinary(): authentication type " + toString(type) + " not supported", ErrorCodes::NOT_IMPLEMENTED); } -inline const String & Authentication::getServerName() const +inline const String & Authentication::getLDAPServerName() const { - return server_name; + return ldap_server_name; } -inline void Authentication::setServerName(const String & server_name_) +inline void Authentication::setLDAPServerName(const String & name) { - server_name = server_name_; + ldap_server_name = name; +} + +inline const String & Authentication::getKerberosRealm() const +{ + return kerberos_realm; +} + +inline void Authentication::setKerberosRealm(const String & realm) +{ + kerberos_realm = realm; } } diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 494da4eaeae..0bcaef1e441 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -177,28 +177,18 @@ void ContextAccess::setUser(const UserPtr & user_) const user_name = user->getName(); trace_log = &Poco::Logger::get("ContextAccess (" + user_name + ")"); - boost::container::flat_set current_roles, current_roles_with_admin_option; + std::vector current_roles, current_roles_with_admin_option; if (params.use_default_roles) { - for (const UUID & id : user->granted_roles.roles) - { - if (user->default_roles.match(id)) - current_roles.emplace(id); - } + current_roles = user->granted_roles.findGranted(user->default_roles); + current_roles_with_admin_option = user->granted_roles.findGrantedWithAdminOption(user->default_roles); } else { - boost::range::set_intersection( - params.current_roles, - user->granted_roles.roles, - std::inserter(current_roles, current_roles.end())); + current_roles = user->granted_roles.findGranted(params.current_roles); + current_roles_with_admin_option = user->granted_roles.findGrantedWithAdminOption(params.current_roles); } - boost::range::set_intersection( - current_roles, - user->granted_roles.roles_with_admin_option, - std::inserter(current_roles_with_admin_option, current_roles_with_admin_option.end())); - subscription_for_roles_changes = {}; enabled_roles = manager->getEnabledRoles(current_roles, current_roles_with_admin_option); subscription_for_roles_changes = enabled_roles->subscribeForChanges([this](const std::shared_ptr & roles_info_) @@ -331,47 +321,13 @@ std::shared_ptr ContextAccess::getAccessRightsWithImplicit() } -template 
-bool ContextAccess::checkAccessImpl(const AccessFlags & flags) const -{ - return checkAccessImpl2(flags); -} - template -bool ContextAccess::checkAccessImpl(const AccessFlags & flags, const std::string_view & database, const Args &... args) const -{ - return checkAccessImpl2(flags, database.empty() ? params.current_database : database, args...); -} - -template -bool ContextAccess::checkAccessImpl(const AccessRightsElement & element) const -{ - if (element.any_database) - return checkAccessImpl(element.access_flags); - else if (element.any_table) - return checkAccessImpl(element.access_flags, element.database); - else if (element.any_column) - return checkAccessImpl(element.access_flags, element.database, element.table); - else - return checkAccessImpl(element.access_flags, element.database, element.table, element.columns); -} - -template -bool ContextAccess::checkAccessImpl(const AccessRightsElements & elements) const -{ - for (const auto & element : elements) - if (!checkAccessImpl(element)) - return false; - return true; -} - -template -bool ContextAccess::checkAccessImpl2(const AccessFlags & flags, const Args &... args) const +bool ContextAccess::checkAccessImplHelper(const AccessFlags & flags, const Args &... args) const { auto access_granted = [&] { if (trace_log) - LOG_TRACE(trace_log, "Access granted: {}{}", (AccessRightsElement{flags, args...}.toString()), + LOG_TRACE(trace_log, "Access granted: {}{}", (AccessRightsElement{flags, args...}.toStringWithoutOptions()), (grant_option ? " WITH GRANT OPTION" : "")); return true; }; @@ -379,7 +335,7 @@ bool ContextAccess::checkAccessImpl2(const AccessFlags & flags, const Args &... auto access_denied = [&](const String & error_msg, int error_code [[maybe_unused]]) { if (trace_log) - LOG_TRACE(trace_log, "Access denied: {}{}", (AccessRightsElement{flags, args...}.toString()), + LOG_TRACE(trace_log, "Access denied: {}{}", (AccessRightsElement{flags, args...}.toStringWithoutOptions()), (grant_option ? " WITH GRANT OPTION" : "")); if constexpr (throw_if_denied) throw Exception(getUserName() + ": " + error_msg, error_code); @@ -415,13 +371,13 @@ bool ContextAccess::checkAccessImpl2(const AccessFlags & flags, const Args &... "Not enough privileges. " "The required privileges have been granted, but without grant option. " "To execute this query it's necessary to have grant " - + AccessRightsElement{flags, args...}.toString() + " WITH GRANT OPTION", + + AccessRightsElement{flags, args...}.toStringWithoutOptions() + " WITH GRANT OPTION", ErrorCodes::ACCESS_DENIED); } return access_denied( "Not enough privileges. To execute this query it's necessary to have grant " - + AccessRightsElement{flags, args...}.toString() + (grant_option ? " WITH GRANT OPTION" : ""), + + AccessRightsElement{flags, args...}.toStringWithoutOptions() + (grant_option ? " WITH GRANT OPTION" : ""), ErrorCodes::ACCESS_DENIED); } @@ -478,6 +434,56 @@ bool ContextAccess::checkAccessImpl2(const AccessFlags & flags, const Args &... return access_granted(); } +template +bool ContextAccess::checkAccessImpl(const AccessFlags & flags) const +{ + return checkAccessImplHelper(flags); +} + +template +bool ContextAccess::checkAccessImpl(const AccessFlags & flags, const std::string_view & database, const Args &... args) const +{ + return checkAccessImplHelper(flags, database.empty() ? 
params.current_database : database, args...); +} + +template +bool ContextAccess::checkAccessImplHelper(const AccessRightsElement & element) const +{ + assert(!element.grant_option || grant_option); + if (element.any_database) + return checkAccessImpl(element.access_flags); + else if (element.any_table) + return checkAccessImpl(element.access_flags, element.database); + else if (element.any_column) + return checkAccessImpl(element.access_flags, element.database, element.table); + else + return checkAccessImpl(element.access_flags, element.database, element.table, element.columns); +} + +template +bool ContextAccess::checkAccessImpl(const AccessRightsElement & element) const +{ + if constexpr (grant_option) + { + return checkAccessImplHelper(element); + } + else + { + if (element.grant_option) + return checkAccessImplHelper(element); + else + return checkAccessImplHelper(element); + } +} + +template +bool ContextAccess::checkAccessImpl(const AccessRightsElements & elements) const +{ + for (const auto & element : elements) + if (!checkAccessImpl(element)) + return false; + return true; +} bool ContextAccess::isGranted(const AccessFlags & flags) const { return checkAccessImpl(flags); } bool ContextAccess::isGranted(const AccessFlags & flags, const std::string_view & database) const { return checkAccessImpl(flags, database); } @@ -516,44 +522,8 @@ void ContextAccess::checkGrantOption(const AccessRightsElement & element) const void ContextAccess::checkGrantOption(const AccessRightsElements & elements) const { checkAccessImpl(elements); } -template -bool ContextAccess::checkAdminOptionImpl(const UUID & role_id) const -{ - return checkAdminOptionImpl2(to_array(role_id), [this](const UUID & id, size_t) { return manager->tryReadName(id); }); -} - -template -bool ContextAccess::checkAdminOptionImpl(const UUID & role_id, const String & role_name) const -{ - return checkAdminOptionImpl2(to_array(role_id), [&role_name](const UUID &, size_t) { return std::optional{role_name}; }); -} - -template -bool ContextAccess::checkAdminOptionImpl(const UUID & role_id, const std::unordered_map & names_of_roles) const -{ - return checkAdminOptionImpl2(to_array(role_id), [&names_of_roles](const UUID & id, size_t) { auto it = names_of_roles.find(id); return (it != names_of_roles.end()) ? it->second : std::optional{}; }); -} - -template -bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids) const -{ - return checkAdminOptionImpl2(role_ids, [this](const UUID & id, size_t) { return manager->tryReadName(id); }); -} - -template -bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids, const Strings & names_of_roles) const -{ - return checkAdminOptionImpl2(role_ids, [&names_of_roles](const UUID &, size_t i) { return std::optional{names_of_roles[i]}; }); -} - -template -bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids, const std::unordered_map & names_of_roles) const -{ - return checkAdminOptionImpl2(role_ids, [&names_of_roles](const UUID & id, size_t) { auto it = names_of_roles.find(id); return (it != names_of_roles.end()) ? 
it->second : std::optional{}; }); -} - template -bool ContextAccess::checkAdminOptionImpl2(const Container & role_ids, const GetNameFunction & get_name_function) const +bool ContextAccess::checkAdminOptionImplHelper(const Container & role_ids, const GetNameFunction & get_name_function) const { if (!std::size(role_ids) || is_full_access) return true; @@ -605,6 +575,42 @@ bool ContextAccess::checkAdminOptionImpl2(const Container & role_ids, const GetN return true; } +template +bool ContextAccess::checkAdminOptionImpl(const UUID & role_id) const +{ + return checkAdminOptionImplHelper(to_array(role_id), [this](const UUID & id, size_t) { return manager->tryReadName(id); }); +} + +template +bool ContextAccess::checkAdminOptionImpl(const UUID & role_id, const String & role_name) const +{ + return checkAdminOptionImplHelper(to_array(role_id), [&role_name](const UUID &, size_t) { return std::optional{role_name}; }); +} + +template +bool ContextAccess::checkAdminOptionImpl(const UUID & role_id, const std::unordered_map & names_of_roles) const +{ + return checkAdminOptionImplHelper(to_array(role_id), [&names_of_roles](const UUID & id, size_t) { auto it = names_of_roles.find(id); return (it != names_of_roles.end()) ? it->second : std::optional{}; }); +} + +template +bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids) const +{ + return checkAdminOptionImplHelper(role_ids, [this](const UUID & id, size_t) { return manager->tryReadName(id); }); +} + +template +bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids, const Strings & names_of_roles) const +{ + return checkAdminOptionImplHelper(role_ids, [&names_of_roles](const UUID &, size_t i) { return std::optional{names_of_roles[i]}; }); +} + +template +bool ContextAccess::checkAdminOptionImpl(const std::vector & role_ids, const std::unordered_map & names_of_roles) const +{ + return checkAdminOptionImplHelper(role_ids, [&names_of_roles](const UUID & id, size_t) { auto it = names_of_roles.find(id); return (it != names_of_roles.end()) ? it->second : std::optional{}; }); +} + bool ContextAccess::hasAdminOption(const UUID & role_id) const { return checkAdminOptionImpl(role_id); } bool ContextAccess::hasAdminOption(const UUID & role_id, const String & role_name) const { return checkAdminOptionImpl(role_id, role_name); } bool ContextAccess::hasAdminOption(const UUID & role_id, const std::unordered_map & names_of_roles) const { return checkAdminOptionImpl(role_id, names_of_roles); } diff --git a/src/Access/ContextAccess.h b/src/Access/ContextAccess.h index 43e9f60a4c6..320c2566769 100644 --- a/src/Access/ContextAccess.h +++ b/src/Access/ContextAccess.h @@ -99,25 +99,6 @@ public: std::shared_ptr getAccessRights() const; std::shared_ptr getAccessRightsWithImplicit() const; - /// Checks if a specified access is granted. 
- bool isGranted(const AccessFlags & flags) const; - bool isGranted(const AccessFlags & flags, const std::string_view & database) const; - bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table) const; - bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::string_view & column) const; - bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::vector & columns) const; - bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const Strings & columns) const; - bool isGranted(const AccessRightsElement & element) const; - bool isGranted(const AccessRightsElements & elements) const; - - bool hasGrantOption(const AccessFlags & flags) const; - bool hasGrantOption(const AccessFlags & flags, const std::string_view & database) const; - bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table) const; - bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::string_view & column) const; - bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::vector & columns) const; - bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const Strings & columns) const; - bool hasGrantOption(const AccessRightsElement & element) const; - bool hasGrantOption(const AccessRightsElements & elements) const; - /// Checks if a specified access is granted, and throws an exception if not. /// Empty database means the current database. void checkAccess(const AccessFlags & flags) const; @@ -138,6 +119,26 @@ public: void checkGrantOption(const AccessRightsElement & element) const; void checkGrantOption(const AccessRightsElements & elements) const; + /// Checks if a specified access is granted, and returns false if not. + /// Empty database means the current database. 
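All of the reshuffled isGranted/hasGrantOption/checkAccess/checkGrantOption declarations are thin wrappers over one family of templates, checkAccessImpl<throw_if_denied, grant_option>: the first parameter selects between throwing ACCESS_DENIED and returning false, and the runtime element.grant_option flag is folded into the second compile-time parameter. A standalone sketch of that dispatch pattern with simplified, hypothetical names (the real code consults the access tree instead of the placeholder below):

    #include <iostream>
    #include <stdexcept>
    #include <string>

    struct Element { std::string what; bool grant_option = false; };

    template <bool throw_if_denied, bool grant_option>
    bool checkImplHelper(const Element & element)
    {
        const bool granted = false;   /// placeholder for the real lookup in the access tree
        if (granted)
            return true;
        if constexpr (throw_if_denied)
            throw std::runtime_error(
                "Not enough privileges. To execute this query it's necessary to have grant "
                + element.what + (grant_option ? " WITH GRANT OPTION" : ""));
        return false;
    }

    /// Folds the runtime element.grant_option flag into the compile-time parameter,
    /// mirroring checkAccessImpl() in the .cpp hunk above.
    template <bool throw_if_denied, bool grant_option>
    bool checkImpl(const Element & element)
    {
        if constexpr (grant_option)
            return checkImplHelper<throw_if_denied, true>(element);
        else
            return element.grant_option ? checkImplHelper<throw_if_denied, true>(element)
                                        : checkImplHelper<throw_if_denied, false>(element);
    }

    int main()
    {
        Element e{"SELECT ON db.table", /*grant_option=*/ true};
        std::cout << std::boolalpha << checkImpl<false, false>(e) << "\n";   /// isGranted(): prints false
        try { checkImpl<true, false>(e); }                                   /// checkAccess(): throws
        catch (const std::exception & ex) { std::cout << ex.what() << "\n"; }
        return 0;
    }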
+ bool isGranted(const AccessFlags & flags) const; + bool isGranted(const AccessFlags & flags, const std::string_view & database) const; + bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table) const; + bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::string_view & column) const; + bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::vector & columns) const; + bool isGranted(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const Strings & columns) const; + bool isGranted(const AccessRightsElement & element) const; + bool isGranted(const AccessRightsElements & elements) const; + + bool hasGrantOption(const AccessFlags & flags) const; + bool hasGrantOption(const AccessFlags & flags, const std::string_view & database) const; + bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table) const; + bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::string_view & column) const; + bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const std::vector & columns) const; + bool hasGrantOption(const AccessFlags & flags, const std::string_view & database, const std::string_view & table, const Strings & columns) const; + bool hasGrantOption(const AccessRightsElement & element) const; + bool hasGrantOption(const AccessRightsElements & elements) const; + /// Checks if a specified role is granted with admin option, and throws an exception if not. void checkAdminOption(const UUID & role_id) const; void checkAdminOption(const UUID & role_id, const String & role_name) const; @@ -146,6 +147,7 @@ public: void checkAdminOption(const std::vector & role_ids, const Strings & names_of_roles) const; void checkAdminOption(const std::vector & role_ids, const std::unordered_map & names_of_roles) const; + /// Checks if a specified role is granted with admin option, and returns false if not. bool hasAdminOption(const UUID & role_id) const; bool hasAdminOption(const UUID & role_id, const String & role_name) const; bool hasAdminOption(const UUID & role_id, const std::unordered_map & names_of_roles) const; @@ -180,7 +182,10 @@ private: bool checkAccessImpl(const AccessRightsElements & elements) const; template - bool checkAccessImpl2(const AccessFlags & flags, const Args &... args) const; + bool checkAccessImplHelper(const AccessFlags & flags, const Args &... 
args) const; + + template + bool checkAccessImplHelper(const AccessRightsElement & element) const; template bool checkAdminOptionImpl(const UUID & role_id) const; @@ -201,7 +206,7 @@ private: bool checkAdminOptionImpl(const std::vector & role_ids, const std::unordered_map & names_of_roles) const; template - bool checkAdminOptionImpl2(const Container & role_ids, const GetNameFunction & get_name_function) const; + bool checkAdminOptionImplHelper(const Container & role_ids, const GetNameFunction & get_name_function) const; const AccessControlManager * manager = nullptr; const Params params; diff --git a/src/Access/Credentials.cpp b/src/Access/Credentials.cpp new file mode 100644 index 00000000000..c2850ad4d4f --- /dev/null +++ b/src/Access/Credentials.cpp @@ -0,0 +1,86 @@ +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +Credentials::Credentials(const String & user_name_) + : user_name(user_name_) +{ +} + +const String & Credentials::getUserName() const +{ + if (!isReady()) + throwNotReady(); + return user_name; +} + +bool Credentials::isReady() const +{ + return is_ready; +} + +void Credentials::throwNotReady() +{ + throw Exception("Credentials are not ready", ErrorCodes::LOGICAL_ERROR); +} + +AlwaysAllowCredentials::AlwaysAllowCredentials() +{ + is_ready = true; +} + +AlwaysAllowCredentials::AlwaysAllowCredentials(const String & user_name_) + : Credentials(user_name_) +{ + is_ready = true; +} + +void AlwaysAllowCredentials::setUserName(const String & user_name_) +{ + user_name = user_name_; +} + +BasicCredentials::BasicCredentials() +{ + is_ready = true; +} + +BasicCredentials::BasicCredentials(const String & user_name_) + : Credentials(user_name_) +{ + is_ready = true; +} + +BasicCredentials::BasicCredentials(const String & user_name_, const String & password_) + : Credentials(user_name_) + , password(password_) +{ + is_ready = true; +} + +void BasicCredentials::setUserName(const String & user_name_) +{ + user_name = user_name_; +} + +void BasicCredentials::setPassword(const String & password_) +{ + password = password_; +} + +const String & BasicCredentials::getPassword() const +{ + if (!isReady()) + throwNotReady(); + return password; +} + +} diff --git a/src/Access/Credentials.h b/src/Access/Credentials.h new file mode 100644 index 00000000000..5e9fd1589e0 --- /dev/null +++ b/src/Access/Credentials.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +class Credentials +{ +public: + explicit Credentials() = default; + explicit Credentials(const String & user_name_); + + virtual ~Credentials() = default; + + const String & getUserName() const; + bool isReady() const; + +protected: + [[noreturn]] static void throwNotReady(); + +protected: + bool is_ready = false; + String user_name; +}; + +class AlwaysAllowCredentials + : public Credentials +{ +public: + explicit AlwaysAllowCredentials(); + explicit AlwaysAllowCredentials(const String & user_name_); + + void setUserName(const String & user_name_); +}; + +class BasicCredentials + : public Credentials +{ +public: + explicit BasicCredentials(); + explicit BasicCredentials(const String & user_name_); + explicit BasicCredentials(const String & user_name_, const String & password_); + + void setUserName(const String & user_name_); + void setPassword(const String & password_); + const String & getPassword() const; + +private: + String password; +}; + +} diff --git a/src/Access/ExternalAuthenticators.cpp b/src/Access/ExternalAuthenticators.cpp index 
6f66f4303e1..1cade973724 100644 --- a/src/Access/ExternalAuthenticators.cpp +++ b/src/Access/ExternalAuthenticators.cpp @@ -20,14 +20,14 @@ namespace ErrorCodes namespace { -auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const String & ldap_server_name) +auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const String & name) { - if (ldap_server_name.empty()) + if (name.empty()) throw Exception("LDAP server name cannot be empty", ErrorCodes::BAD_ARGUMENTS); - LDAPServerParams params; + LDAPClient::Params params; - const String ldap_server_config = "ldap_servers." + ldap_server_name; + const String ldap_server_config = "ldap_servers." + name; const bool has_host = config.has(ldap_server_config + ".host"); const bool has_port = config.has(ldap_server_config + ".port"); @@ -75,11 +75,11 @@ auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const Str boost::to_lower(enable_tls_lc_str); if (enable_tls_lc_str == "starttls") - params.enable_tls = LDAPServerParams::TLSEnable::YES_STARTTLS; + params.enable_tls = LDAPClient::Params::TLSEnable::YES_STARTTLS; else if (config.getBool(ldap_server_config + ".enable_tls")) - params.enable_tls = LDAPServerParams::TLSEnable::YES; + params.enable_tls = LDAPClient::Params::TLSEnable::YES; else - params.enable_tls = LDAPServerParams::TLSEnable::NO; + params.enable_tls = LDAPClient::Params::TLSEnable::NO; } if (has_tls_minimum_protocol_version) @@ -88,15 +88,15 @@ auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const Str boost::to_lower(tls_minimum_protocol_version_lc_str); if (tls_minimum_protocol_version_lc_str == "ssl2") - params.tls_minimum_protocol_version = LDAPServerParams::TLSProtocolVersion::SSL2; + params.tls_minimum_protocol_version = LDAPClient::Params::TLSProtocolVersion::SSL2; else if (tls_minimum_protocol_version_lc_str == "ssl3") - params.tls_minimum_protocol_version = LDAPServerParams::TLSProtocolVersion::SSL3; + params.tls_minimum_protocol_version = LDAPClient::Params::TLSProtocolVersion::SSL3; else if (tls_minimum_protocol_version_lc_str == "tls1.0") - params.tls_minimum_protocol_version = LDAPServerParams::TLSProtocolVersion::TLS1_0; + params.tls_minimum_protocol_version = LDAPClient::Params::TLSProtocolVersion::TLS1_0; else if (tls_minimum_protocol_version_lc_str == "tls1.1") - params.tls_minimum_protocol_version = LDAPServerParams::TLSProtocolVersion::TLS1_1; + params.tls_minimum_protocol_version = LDAPClient::Params::TLSProtocolVersion::TLS1_1; else if (tls_minimum_protocol_version_lc_str == "tls1.2") - params.tls_minimum_protocol_version = LDAPServerParams::TLSProtocolVersion::TLS1_2; + params.tls_minimum_protocol_version = LDAPClient::Params::TLSProtocolVersion::TLS1_2; else throw Exception("Bad value for 'tls_minimum_protocol_version' entry, allowed values are: 'ssl2', 'ssl3', 'tls1.0', 'tls1.1', 'tls1.2'", ErrorCodes::BAD_ARGUMENTS); } @@ -107,13 +107,13 @@ auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const Str boost::to_lower(tls_require_cert_lc_str); if (tls_require_cert_lc_str == "never") - params.tls_require_cert = LDAPServerParams::TLSRequireCert::NEVER; + params.tls_require_cert = LDAPClient::Params::TLSRequireCert::NEVER; else if (tls_require_cert_lc_str == "allow") - params.tls_require_cert = LDAPServerParams::TLSRequireCert::ALLOW; + params.tls_require_cert = LDAPClient::Params::TLSRequireCert::ALLOW; else if (tls_require_cert_lc_str == "try") - params.tls_require_cert = LDAPServerParams::TLSRequireCert::TRY; + 
params.tls_require_cert = LDAPClient::Params::TLSRequireCert::TRY; else if (tls_require_cert_lc_str == "demand") - params.tls_require_cert = LDAPServerParams::TLSRequireCert::DEMAND; + params.tls_require_cert = LDAPClient::Params::TLSRequireCert::DEMAND; else throw Exception("Bad value for 'tls_require_cert' entry, allowed values are: 'never', 'allow', 'try', 'demand'", ErrorCodes::BAD_ARGUMENTS); } @@ -142,7 +142,44 @@ auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const Str params.port = port; } else - params.port = (params.enable_tls == LDAPServerParams::TLSEnable::YES ? 636 : 389); + params.port = (params.enable_tls == LDAPClient::Params::TLSEnable::YES ? 636 : 389); + + return params; +} + +auto parseKerberosParams(const Poco::Util::AbstractConfiguration & config) +{ + GSSAcceptorContext::Params params; + + Poco::Util::AbstractConfiguration::Keys keys; + config.keys("kerberos", keys); + + std::size_t reealm_key_count = 0; + std::size_t principal_keys_count = 0; + + for (auto key : keys) + { + const auto bracket_pos = key.find('['); + if (bracket_pos != std::string::npos) + key.resize(bracket_pos); + + boost::algorithm::to_lower(key); + + reealm_key_count += (key == "realm"); + principal_keys_count += (key == "principal"); + } + + if (reealm_key_count > 0 && principal_keys_count > 0) + throw Exception("Realm and principal name cannot be specified simultaneously", ErrorCodes::BAD_ARGUMENTS); + + if (reealm_key_count > 1) + throw Exception("Multiple realm sections are not allowed", ErrorCodes::BAD_ARGUMENTS); + + if (principal_keys_count > 1) + throw Exception("Multiple principal sections are not allowed", ErrorCodes::BAD_ARGUMENTS); + + params.realm = config.getString("kerberos.realm", ""); + params.principal = config.getString("kerberos.principal", ""); return params; } @@ -152,48 +189,82 @@ auto parseLDAPServer(const Poco::Util::AbstractConfiguration & config, const Str void ExternalAuthenticators::reset() { std::scoped_lock lock(mutex); - ldap_server_params.clear(); - ldap_server_caches.clear(); + ldap_client_params_blueprint.clear(); + ldap_caches.clear(); + kerberos_params.reset(); } void ExternalAuthenticators::setConfiguration(const Poco::Util::AbstractConfiguration & config, Poco::Logger * log) { std::scoped_lock lock(mutex); - reset(); + Poco::Util::AbstractConfiguration::Keys all_keys; + config.keys("", all_keys); + + std::size_t ldap_servers_key_count = 0; + std::size_t kerberos_keys_count = 0; + + for (auto key : all_keys) + { + const auto bracket_pos = key.find('['); + if (bracket_pos != std::string::npos) + key.resize(bracket_pos); + + boost::algorithm::to_lower(key); + + ldap_servers_key_count += (key == "ldap_servers"); + kerberos_keys_count += (key == "kerberos"); + } + + if (ldap_servers_key_count > 1) + throw Exception("Multiple ldap_servers sections are not allowed", ErrorCodes::BAD_ARGUMENTS); + + if (kerberos_keys_count > 1) + throw Exception("Multiple kerberos sections are not allowed", ErrorCodes::BAD_ARGUMENTS); + Poco::Util::AbstractConfiguration::Keys ldap_server_names; config.keys("ldap_servers", ldap_server_names); for (const auto & ldap_server_name : ldap_server_names) { try { - ldap_server_params.insert_or_assign(ldap_server_name, parseLDAPServer(config, ldap_server_name)); + ldap_client_params_blueprint.insert_or_assign(ldap_server_name, parseLDAPServer(config, ldap_server_name)); } catch (...) 
{ tryLogCurrentException(log, "Could not parse LDAP server " + backQuote(ldap_server_name)); } } + + try + { + if (kerberos_keys_count > 0) + kerberos_params = parseKerberosParams(config); + } + catch (...) + { + tryLogCurrentException(log, "Could not parse Kerberos section"); + } } -bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const String & user_name, const String & password, - const LDAPSearchParamsList * search_params, LDAPSearchResultsList * search_results) const +bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const BasicCredentials & credentials, + const LDAPClient::SearchParamsList * search_params, LDAPClient::SearchResultsList * search_results) const { - std::optional params; + std::optional params; std::size_t params_hash = 0; { std::scoped_lock lock(mutex); // Retrieve the server parameters. - const auto pit = ldap_server_params.find(server); - if (pit == ldap_server_params.end()) + const auto pit = ldap_client_params_blueprint.find(server); + if (pit == ldap_client_params_blueprint.end()) throw Exception("LDAP server '" + server + "' is not configured", ErrorCodes::BAD_ARGUMENTS); params = pit->second; - params->user = user_name; - params->password = password; + params->user = credentials.getUserName(); + params->password = credentials.getPassword(); params->combineCoreHash(params_hash); if (search_params) @@ -207,12 +278,12 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const S // Check the cache, but only if the caching is enabled at all. if (params->verification_cooldown > std::chrono::seconds{0}) { - const auto cit = ldap_server_caches.find(server); - if (cit != ldap_server_caches.end()) + const auto cit = ldap_caches.find(server); + if (cit != ldap_caches.end()) { auto & cache = cit->second; - const auto eit = cache.find(user_name); + const auto eit = cache.find(credentials.getUserName()); if (eit != cache.end()) { const auto & entry = eit->second; @@ -249,7 +320,7 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const S // Erase the cache, if empty. if (cache.empty()) - ldap_server_caches.erase(cit); + ldap_caches.erase(cit); } } } @@ -264,13 +335,13 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const S std::scoped_lock lock(mutex); // If the server was removed from the config while we were checking the password, we discard the current result. 
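// --- Illustrative sketch (editor's addition, not part of the patch) ---------------------------
// The cooldown cache used by checkLDAPCredentials() above, reduced to a standalone form:
// a successful bind is remembered per user as (params_hash, timestamp), and a later request
// carrying the same params hash inside the verification_cooldown window is accepted without
// another round trip to the LDAP server. CacheEntry/shouldSkipBind are hypothetical names.
#include <chrono>
#include <cstddef>

struct CacheEntry
{
    std::size_t last_successful_params_hash = 0;
    std::chrono::steady_clock::time_point last_successful_authentication_timestamp;
};

bool shouldSkipBind(const CacheEntry & entry, std::size_t params_hash,
                    std::chrono::seconds verification_cooldown,
                    std::chrono::steady_clock::time_point now)
{
    return verification_cooldown > std::chrono::seconds{0}      // caching enabled at all
        && entry.last_successful_params_hash == params_hash     // same server/user/password params
        && now >= entry.last_successful_authentication_timestamp
        && now - entry.last_successful_authentication_timestamp <= verification_cooldown;
}
// ----------------------------------------------------------------------------------------------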
- const auto pit = ldap_server_params.find(server); - if (pit == ldap_server_params.end()) + const auto pit = ldap_client_params_blueprint.find(server); + if (pit == ldap_client_params_blueprint.end()) return false; auto new_params = pit->second; - new_params.user = user_name; - new_params.password = password; + new_params.user = credentials.getUserName(); + new_params.password = credentials.getPassword(); std::size_t new_params_hash = 0; new_params.combineCoreHash(new_params_hash); @@ -286,7 +357,7 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const S if (params_hash != new_params_hash) return false; - auto & entry = ldap_server_caches[server][user_name]; + auto & entry = ldap_caches[server][credentials.getUserName()]; if (entry.last_successful_authentication_timestamp < current_check_timestamp) { entry.last_successful_params_hash = params_hash; @@ -314,4 +385,33 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const S return result; } +bool ExternalAuthenticators::checkKerberosCredentials(const String & realm, const GSSAcceptorContext & credentials) const +{ + std::scoped_lock lock(mutex); + + if (!kerberos_params.has_value()) + throw Exception("Kerberos is not enabled", ErrorCodes::BAD_ARGUMENTS); + + if (!credentials.isReady()) + return false; + + if (credentials.isFailed()) + return false; + + if (!realm.empty() && realm != credentials.getRealm()) + return false; + + return true; +} + +GSSAcceptorContext::Params ExternalAuthenticators::getKerberosParams() const +{ + std::scoped_lock lock(mutex); + + if (!kerberos_params.has_value()) + throw Exception("Kerberos is not enabled", ErrorCodes::BAD_ARGUMENTS); + + return kerberos_params.value(); +} + } diff --git a/src/Access/ExternalAuthenticators.h b/src/Access/ExternalAuthenticators.h index abcc8e8d10d..c8feea7eada 100644 --- a/src/Access/ExternalAuthenticators.h +++ b/src/Access/ExternalAuthenticators.h @@ -1,11 +1,14 @@ #pragma once -#include +#include +#include +#include #include #include #include #include +#include #include @@ -28,25 +31,31 @@ class ExternalAuthenticators public: void reset(); void setConfiguration(const Poco::Util::AbstractConfiguration & config, Poco::Logger * log); - bool checkLDAPCredentials(const String & server, const String & user_name, const String & password, - const LDAPSearchParamsList * search_params = nullptr, LDAPSearchResultsList * search_results = nullptr) const; + + // The name and readiness of the credentials must be verified before calling these. 
+ bool checkLDAPCredentials(const String & server, const BasicCredentials & credentials, + const LDAPClient::SearchParamsList * search_params = nullptr, LDAPClient::SearchResultsList * search_results = nullptr) const; + bool checkKerberosCredentials(const String & realm, const GSSAcceptorContext & credentials) const; + + GSSAcceptorContext::Params getKerberosParams() const; private: struct LDAPCacheEntry { std::size_t last_successful_params_hash = 0; std::chrono::steady_clock::time_point last_successful_authentication_timestamp; - LDAPSearchResultsList last_successful_search_results; + LDAPClient::SearchResultsList last_successful_search_results; }; - using LDAPServerCache = std::unordered_map; // user name -> cache entry - using LDAPServerCaches = std::map; // server name -> cache - using LDAPServersParams = std::map; // server name -> params + using LDAPCache = std::unordered_map; // user name -> cache entry + using LDAPCaches = std::map; // server name -> cache + using LDAPParams = std::map; // server name -> params private: mutable std::recursive_mutex mutex; - LDAPServersParams ldap_server_params; - mutable LDAPServerCaches ldap_server_caches; + LDAPParams ldap_client_params_blueprint; + mutable LDAPCaches ldap_caches; + std::optional kerberos_params; }; } diff --git a/src/Access/GSSAcceptor.cpp b/src/Access/GSSAcceptor.cpp new file mode 100644 index 00000000000..49b83cc883c --- /dev/null +++ b/src/Access/GSSAcceptor.cpp @@ -0,0 +1,469 @@ +#include +#include +#include + +#include + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME; + extern const int KERBEROS_ERROR; +} + +GSSAcceptorContext::GSSAcceptorContext(const GSSAcceptorContext::Params& params_) + : params(params_) +{ +} + +GSSAcceptorContext::~GSSAcceptorContext() +{ + resetHandles(); +} + +const String & GSSAcceptorContext::getRealm() const +{ + if (!isReady()) + throwNotReady(); + return realm; +} + +bool GSSAcceptorContext::isFailed() const +{ + return is_failed; +} + +#if USE_KRB5 + +namespace +{ + +std::recursive_mutex gss_global_mutex; + +struct PrincipalName +{ + explicit PrincipalName(String principal); +// operator String() const; + + String name; + std::vector instances; + String realm; +}; + +PrincipalName::PrincipalName(String principal) +{ + const auto at_pos = principal.find('@'); + if (at_pos != std::string::npos) + { + realm = principal.substr(at_pos + 1); + principal.resize(at_pos); + } + + Poco::StringTokenizer st(principal, "/"); + auto it = st.begin(); + if (it != st.end()) + { + name = *it; + instances.assign(++it, st.end()); + } +} + +/* +PrincipalName::operator String() const +{ + String principal = name; + + for (const auto & instance : instances) + { + principal += '/'; + principal += instance; + } + + principal += '@'; + principal += realm; + + return principal; +} +*/ + +String bufferToString(const gss_buffer_desc & buf) +{ + String str; + + if (buf.length > 0 && buf.value != nullptr) + { + str.assign(static_cast(buf.value), buf.length); + while (!str.empty() && str.back() == '\0') { str.pop_back(); } + } + + return str; +} + +String extractSpecificStatusMessages(OM_uint32 status_code, int status_type, const gss_OID & mech_type) +{ + std::scoped_lock lock(gss_global_mutex); + + String messages; + OM_uint32 message_context = 0; + + do + { + gss_buffer_desc status_string_buf; + status_string_buf.length = 0; + status_string_buf.value = nullptr; + + SCOPE_EXIT({ + OM_uint32 minor_status = 0; + 
[[maybe_unused]] OM_uint32 major_status = gss_release_buffer( + &minor_status, + &status_string_buf + ); + }); + + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_display_status( + &minor_status, + status_code, + status_type, + mech_type, + &message_context, + &status_string_buf + ); + + const auto message = bufferToString(status_string_buf); + + if (!message.empty()) + { + if (!messages.empty()) + messages += ", "; + + messages += message; + } + } while (message_context != 0); + + return messages; +} + +String extractStatusMessages(OM_uint32 major_status_code, OM_uint32 minor_status_code, const gss_OID & mech_type) +{ + std::scoped_lock lock(gss_global_mutex); + + const auto gss_messages = extractSpecificStatusMessages(major_status_code, GSS_C_GSS_CODE, mech_type); + const auto mech_messages = extractSpecificStatusMessages(minor_status_code, GSS_C_MECH_CODE, mech_type); + + String messages; + + if (!gss_messages.empty()) + messages += "Majors: " + gss_messages; + + if (!mech_messages.empty()) + { + if (!messages.empty()) + messages += "; "; + + messages += "Minors: " + mech_messages; + } + + return messages; +} + +std::pair extractNameAndRealm(const gss_name_t & name) +{ + std::scoped_lock lock(gss_global_mutex); + + gss_buffer_desc name_buf; + name_buf.length = 0; + name_buf.value = nullptr; + + SCOPE_EXIT({ + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_buffer( + &minor_status, + &name_buf + ); + }); + + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_display_name( + &minor_status, + name, + &name_buf, + nullptr + ); + + const PrincipalName principal(bufferToString(name_buf)); + return { principal.name, principal.realm }; +} + +bool equalMechanisms(const String & left_str, const gss_OID & right_oid) +{ + std::scoped_lock lock(gss_global_mutex); + + gss_buffer_desc left_buf; + left_buf.length = left_str.size(); + left_buf.value = const_cast(left_str.c_str()); + + gss_OID left_oid = GSS_C_NO_OID; + + SCOPE_EXIT({ + if (left_oid != GSS_C_NO_OID) + { + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_oid( + &minor_status, + &left_oid + ); + left_oid = GSS_C_NO_OID; + } + }); + + OM_uint32 minor_status = 0; + OM_uint32 major_status = gss_str_to_oid( + &minor_status, + &left_buf, + &left_oid + ); + + if (GSS_ERROR(major_status)) + return false; + + return gss_oid_equal(left_oid, right_oid); +} + +} + +void GSSAcceptorContext::reset() +{ + is_ready = false; + is_failed = false; + user_name.clear(); + realm.clear(); + initHandles(); +} + +void GSSAcceptorContext::resetHandles() noexcept +{ + std::scoped_lock lock(gss_global_mutex); + + if (acceptor_credentials_handle != GSS_C_NO_CREDENTIAL) + { + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_cred( + &minor_status, + &acceptor_credentials_handle + ); + acceptor_credentials_handle = GSS_C_NO_CREDENTIAL; + } + + if (context_handle != GSS_C_NO_CONTEXT) + { + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_delete_sec_context( + &minor_status, + &context_handle, + GSS_C_NO_BUFFER + ); + context_handle = GSS_C_NO_CONTEXT; + } +} + +void GSSAcceptorContext::initHandles() +{ + std::scoped_lock lock(gss_global_mutex); + + resetHandles(); + + if (!params.principal.empty()) + { + if (!params.realm.empty()) + throw Exception("Realm and principal name cannot be specified simultaneously", ErrorCodes::BAD_ARGUMENTS); + + gss_buffer_desc acceptor_name_buf; + 
acceptor_name_buf.length = params.principal.size(); + acceptor_name_buf.value = const_cast(params.principal.c_str()); + + gss_name_t acceptor_name = GSS_C_NO_NAME; + + SCOPE_EXIT({ + if (acceptor_name != GSS_C_NO_NAME) + { + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_name( + &minor_status, + &acceptor_name + ); + acceptor_name = GSS_C_NO_NAME; + } + }); + + OM_uint32 minor_status = 0; + OM_uint32 major_status = gss_import_name( + &minor_status, + &acceptor_name_buf, + GSS_C_NT_HOSTBASED_SERVICE, + &acceptor_name + ); + + if (GSS_ERROR(major_status)) + { + const auto messages = extractStatusMessages(major_status, minor_status, GSS_C_NO_OID); + throw Exception("gss_import_name() failed" + (messages.empty() ? "" : ": " + messages), ErrorCodes::KERBEROS_ERROR); + } + + minor_status = 0; + major_status = gss_acquire_cred( + &minor_status, + acceptor_name, + GSS_C_INDEFINITE, + GSS_C_NO_OID_SET, + GSS_C_ACCEPT, + &acceptor_credentials_handle, + nullptr, + nullptr + ); + + if (GSS_ERROR(major_status)) + { + const auto messages = extractStatusMessages(major_status, minor_status, GSS_C_NO_OID); + throw Exception("gss_acquire_cred() failed" + (messages.empty() ? "" : ": " + messages), ErrorCodes::KERBEROS_ERROR); + } + } +} + +String GSSAcceptorContext::processToken(const String & input_token, Poco::Logger * log) +{ + std::scoped_lock lock(gss_global_mutex); + + String output_token; + + try + { + if (is_ready || is_failed || context_handle == GSS_C_NO_CONTEXT) + reset(); + + gss_buffer_desc input_token_buf; + input_token_buf.length = input_token.size(); + input_token_buf.value = const_cast(input_token.c_str()); + + gss_buffer_desc output_token_buf; + output_token_buf.length = 0; + output_token_buf.value = nullptr; + + gss_name_t initiator_name = GSS_C_NO_NAME; + gss_OID mech_type = GSS_C_NO_OID; + OM_uint32 flags = 0; + + SCOPE_EXIT({ + if (initiator_name != GSS_C_NO_NAME) + { + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_name( + &minor_status, + &initiator_name + ); + initiator_name = GSS_C_NO_NAME; + } + + OM_uint32 minor_status = 0; + [[maybe_unused]] OM_uint32 major_status = gss_release_buffer( + &minor_status, + &output_token_buf + ); + }); + + OM_uint32 minor_status = 0; + OM_uint32 major_status = gss_accept_sec_context( + &minor_status, + &context_handle, + acceptor_credentials_handle, + &input_token_buf, + GSS_C_NO_CHANNEL_BINDINGS, + &initiator_name, + &mech_type, + &output_token_buf, + &flags, + nullptr, + nullptr + ); + + if (major_status == GSS_S_COMPLETE) + { + if (!params.mechanism.empty() && !equalMechanisms(params.mechanism, mech_type)) + throw Exception("gss_accept_sec_context() succeeded, but: the authentication mechanism is not what was expected", ErrorCodes::KERBEROS_ERROR); + + if (flags & GSS_C_ANON_FLAG) + throw Exception("gss_accept_sec_context() succeeded, but: the initiator does not wish to be authenticated", ErrorCodes::KERBEROS_ERROR); + + std::tie(user_name, realm) = extractNameAndRealm(initiator_name); + + if (user_name.empty()) + throw Exception("gss_accept_sec_context() succeeded, but: the initiator name cannot be extracted", ErrorCodes::KERBEROS_ERROR); + + if (realm.empty()) + throw Exception("gss_accept_sec_context() succeeded, but: the initiator realm cannot be extracted", ErrorCodes::KERBEROS_ERROR); + + if (!params.realm.empty() && params.realm != realm) + throw Exception("gss_accept_sec_context() succeeded, but: the initiator realm is not what was expected (expected: " + 
params.realm + ", actual: " + realm + ")", ErrorCodes::KERBEROS_ERROR); + + output_token = bufferToString(output_token_buf); + + is_ready = true; + is_failed = false; + + resetHandles(); + } + else if (!GSS_ERROR(major_status) && (major_status & GSS_S_CONTINUE_NEEDED)) + { + output_token = bufferToString(output_token_buf); + + is_ready = false; + is_failed = false; + } + else + { + const auto messages = extractStatusMessages(major_status, minor_status, mech_type); + throw Exception("gss_accept_sec_context() failed" + (messages.empty() ? "" : ": " + messages), ErrorCodes::KERBEROS_ERROR); + } + } + catch (...) + { + tryLogCurrentException(log, "Could not process GSS token"); + + is_ready = true; + is_failed = true; + + resetHandles(); + } + + return output_token; +} + +#else // USE_KRB5 + +void GSSAcceptorContext::reset() +{ +} + +void GSSAcceptorContext::resetHandles() noexcept +{ +} + +void GSSAcceptorContext::initHandles() +{ +} + +String GSSAcceptorContext::processToken(const String &, Poco::Logger *) +{ + throw Exception("ClickHouse was built without GSS-API/Kerberos support", ErrorCodes::FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME); +} + +#endif // USE_KRB5 + +} diff --git a/src/Access/GSSAcceptor.h b/src/Access/GSSAcceptor.h new file mode 100644 index 00000000000..8d207c59c01 --- /dev/null +++ b/src/Access/GSSAcceptor.h @@ -0,0 +1,66 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#include +#include +#include + +#if USE_KRB5 +# include +# include +# define MAYBE_NORETURN +#else +# define MAYBE_NORETURN [[noreturn]] +#endif + +namespace Poco { class Logger; } + +namespace DB +{ + +class GSSAcceptorContext + : public Credentials +{ +public: + struct Params + { + String mechanism = "1.2.840.113554.1.2.2"; // OID: krb5 + String principal; + String realm; + }; + + explicit GSSAcceptorContext(const Params& params_); + virtual ~GSSAcceptorContext() override; + + GSSAcceptorContext(const GSSAcceptorContext &) = delete; + GSSAcceptorContext(GSSAcceptorContext &&) = delete; + GSSAcceptorContext & operator= (const GSSAcceptorContext &) = delete; + GSSAcceptorContext & operator= (GSSAcceptorContext &&) = delete; + + const String & getRealm() const; + bool isFailed() const; + MAYBE_NORETURN String processToken(const String & input_token, Poco::Logger * log); + +private: + void reset(); + void resetHandles() noexcept; + void initHandles(); + +private: + const Params params; + + bool is_failed = false; + String realm; + +#if USE_KRB5 + gss_ctx_id_t context_handle = GSS_C_NO_CONTEXT; + gss_cred_id_t acceptor_credentials_handle = GSS_C_NO_CREDENTIAL; +#endif +}; + +} + +#undef MAYBE_NORETURN diff --git a/src/Access/GrantedRoles.cpp b/src/Access/GrantedRoles.cpp index 4d7007c4db6..7930b56e44d 100644 --- a/src/Access/GrantedRoles.cpp +++ b/src/Access/GrantedRoles.cpp @@ -1,37 +1,38 @@ #include +#include #include +#include namespace DB { -void GrantedRoles::grant(const UUID & role) +void GrantedRoles::grant(const UUID & role_) { - roles.insert(role); + roles.insert(role_); } void GrantedRoles::grant(const std::vector & roles_) { - for (const UUID & role : roles_) - grant(role); + roles.insert(roles_.begin(), roles_.end()); } -void GrantedRoles::grantWithAdminOption(const UUID & role) +void GrantedRoles::grantWithAdminOption(const UUID & role_) { - roles.insert(role); - roles_with_admin_option.insert(role); + roles.insert(role_); + roles_with_admin_option.insert(role_); } void GrantedRoles::grantWithAdminOption(const std::vector & roles_) { - for (const UUID & role : 
roles_) - grantWithAdminOption(role); + roles.insert(roles_.begin(), roles_.end()); + roles_with_admin_option.insert(roles_.begin(), roles_.end()); } -void GrantedRoles::revoke(const UUID & role) +void GrantedRoles::revoke(const UUID & role_) { - roles.erase(role); - roles_with_admin_option.erase(role); + roles.erase(role_); + roles_with_admin_option.erase(role_); } void GrantedRoles::revoke(const std::vector & roles_) @@ -40,9 +41,9 @@ void GrantedRoles::revoke(const std::vector & roles_) revoke(role); } -void GrantedRoles::revokeAdminOption(const UUID & role) +void GrantedRoles::revokeAdminOption(const UUID & role_) { - roles_with_admin_option.erase(role); + roles_with_admin_option.erase(role_); } void GrantedRoles::revokeAdminOption(const std::vector & roles_) @@ -52,13 +53,118 @@ void GrantedRoles::revokeAdminOption(const std::vector & roles_) } -GrantedRoles::Grants GrantedRoles::getGrants() const +bool GrantedRoles::isGranted(const UUID & role_) const { - Grants res; - res.grants_with_admin_option.insert(res.grants_with_admin_option.end(), roles_with_admin_option.begin(), roles_with_admin_option.end()); - res.grants.reserve(roles.size() - roles_with_admin_option.size()); - boost::range::set_difference(roles, roles_with_admin_option, std::back_inserter(res.grants)); + return roles.count(role_); +} + +bool GrantedRoles::isGrantedWithAdminOption(const UUID & role_) const +{ + return roles_with_admin_option.count(role_); +} + + +std::vector GrantedRoles::findGranted(const std::vector & ids) const +{ + std::vector res; + res.reserve(ids.size()); + for (const UUID & id : ids) + { + if (isGranted(id)) + res.push_back(id); + } return res; } +std::vector GrantedRoles::findGranted(const boost::container::flat_set & ids) const +{ + std::vector res; + res.reserve(ids.size()); + boost::range::set_difference(ids, roles, std::back_inserter(res)); + return res; +} + +std::vector GrantedRoles::findGranted(const RolesOrUsersSet & ids) const +{ + std::vector res; + for (const UUID & id : roles) + { + if (ids.match(id)) + res.emplace_back(id); + } + return res; +} + +std::vector GrantedRoles::findGrantedWithAdminOption(const std::vector & ids) const +{ + std::vector res; + res.reserve(ids.size()); + for (const UUID & id : ids) + { + if (isGrantedWithAdminOption(id)) + res.push_back(id); + } + return res; +} + +std::vector GrantedRoles::findGrantedWithAdminOption(const boost::container::flat_set & ids) const +{ + std::vector res; + res.reserve(ids.size()); + boost::range::set_difference(ids, roles_with_admin_option, std::back_inserter(res)); + return res; +} + +std::vector GrantedRoles::findGrantedWithAdminOption(const RolesOrUsersSet & ids) const +{ + std::vector res; + for (const UUID & id : roles_with_admin_option) + { + if (ids.match(id)) + res.emplace_back(id); + } + return res; +} + + +GrantedRoles::Elements GrantedRoles::getElements() const +{ + Elements elements; + + Element element; + element.ids.reserve(roles.size()); + boost::range::set_difference(roles, roles_with_admin_option, std::back_inserter(element.ids)); + if (!element.empty()) + { + element.admin_option = false; + elements.emplace_back(std::move(element)); + } + + if (!roles_with_admin_option.empty()) + { + element = {}; + element.ids.insert(element.ids.end(), roles_with_admin_option.begin(), roles_with_admin_option.end()); + element.admin_option = true; + elements.emplace_back(std::move(element)); + } + + return elements; +} + + +void GrantedRoles::makeUnion(const GrantedRoles & other) +{ + roles.insert(other.roles.begin(), 
other.roles.end()); + roles_with_admin_option.insert(other.roles_with_admin_option.begin(), other.roles_with_admin_option.end()); +} + +void GrantedRoles::makeIntersection(const GrantedRoles & other) +{ + boost::range::remove_erase_if(roles, [&other](const UUID & id) { return other.roles.find(id) == other.roles.end(); }); + + boost::range::remove_erase_if(roles_with_admin_option, [&other](const UUID & id) + { + return other.roles_with_admin_option.find(id) == other.roles_with_admin_option.end(); + }); +} } diff --git a/src/Access/GrantedRoles.h b/src/Access/GrantedRoles.h index fd091755a80..75ea56aba96 100644 --- a/src/Access/GrantedRoles.h +++ b/src/Access/GrantedRoles.h @@ -7,33 +7,55 @@ namespace DB { +struct RolesOrUsersSet; + /// Roles when they are granted to a role or user. /// Stores both the roles themselves and the roles with admin option. -struct GrantedRoles +class GrantedRoles { - boost::container::flat_set roles; - boost::container::flat_set roles_with_admin_option; - - void grant(const UUID & role); +public: + void grant(const UUID & role_); void grant(const std::vector & roles_); - void grantWithAdminOption(const UUID & role); + void grantWithAdminOption(const UUID & role_); void grantWithAdminOption(const std::vector & roles_); - void revoke(const UUID & role); + void revoke(const UUID & role_); void revoke(const std::vector & roles_); - void revokeAdminOption(const UUID & role); + void revokeAdminOption(const UUID & role_); void revokeAdminOption(const std::vector & roles_); - struct Grants + bool isGranted(const UUID & role_) const; + bool isGrantedWithAdminOption(const UUID & role_) const; + + const boost::container::flat_set & getGranted() const { return roles; } + const boost::container::flat_set & getGrantedWithAdminOption() const { return roles_with_admin_option; } + + std::vector findGranted(const std::vector & ids) const; + std::vector findGranted(const boost::container::flat_set & ids) const; + std::vector findGranted(const RolesOrUsersSet & ids) const; + std::vector findGrantedWithAdminOption(const std::vector & ids) const; + std::vector findGrantedWithAdminOption(const boost::container::flat_set & ids) const; + std::vector findGrantedWithAdminOption(const RolesOrUsersSet & ids) const; + + struct Element { - std::vector grants; - std::vector grants_with_admin_option; + std::vector ids; + bool admin_option = false; + bool empty() const { return ids.empty(); } }; + using Elements = std::vector; /// Retrieves the information about grants. 
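// --- Illustrative sketch (editor's addition, not part of the patch) ---------------------------
// The splitting performed by getElements()/findGranted() above: both flat_sets are kept sorted,
// so the roles granted *without* admin option are simply the set difference
// roles \ roles_with_admin_option. UUIDLike is a stand-in for DB::UUID in this sketch.
#include <iterator>
#include <vector>
#include <boost/container/flat_set.hpp>
#include <boost/range/algorithm/set_algorithm.hpp>

using UUIDLike = unsigned long long;

std::vector<UUIDLike> plainGrants(const boost::container::flat_set<UUIDLike> & roles,
                                  const boost::container::flat_set<UUIDLike> & roles_with_admin_option)
{
    std::vector<UUIDLike> res;
    res.reserve(roles.size());
    boost::range::set_difference(roles, roles_with_admin_option, std::back_inserter(res));
    return res;
}
// ----------------------------------------------------------------------------------------------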
- Grants getGrants() const; + Elements getElements() const; + + void makeUnion(const GrantedRoles & other); + void makeIntersection(const GrantedRoles & other); friend bool operator ==(const GrantedRoles & left, const GrantedRoles & right) { return (left.roles == right.roles) && (left.roles_with_admin_option == right.roles_with_admin_option); } friend bool operator !=(const GrantedRoles & left, const GrantedRoles & right) { return !(left == right); } + +private: + boost::container::flat_set roles; + boost::container::flat_set roles_with_admin_option; }; } diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index c68f5f55ef5..7b42abb2737 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -417,53 +418,60 @@ void IAccessStorage::notify(const Notifications & notifications) UUID IAccessStorage::login( - const String & user_name, - const String & password, + const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool replace_exception_with_cannot_authenticate) const { try { - return loginImpl(user_name, password, address, external_authenticators); + return loginImpl(credentials, address, external_authenticators); } catch (...) { if (!replace_exception_with_cannot_authenticate) throw; - tryLogCurrentException(getLogger(), user_name + ": Authentication failed"); - throwCannotAuthenticate(user_name); + tryLogCurrentException(getLogger(), credentials.getUserName() + ": Authentication failed"); + throwCannotAuthenticate(credentials.getUserName()); } } UUID IAccessStorage::loginImpl( - const String & user_name, - const String & password, + const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const { - if (auto id = find(user_name)) + if (auto id = find(credentials.getUserName())) { if (auto user = tryRead(*id)) { - if (!isPasswordCorrectImpl(*user, password, external_authenticators)) - throwInvalidPassword(); - if (!isAddressAllowedImpl(*user, address)) throwAddressNotAllowed(address); + if (!areCredentialsValidImpl(*user, credentials, external_authenticators)) + throwInvalidCredentials(); + return *id; } } - throwNotFound(EntityType::USER, user_name); + throwNotFound(EntityType::USER, credentials.getUserName()); } -bool IAccessStorage::isPasswordCorrectImpl(const User & user, const String & password, const ExternalAuthenticators & external_authenticators) const +bool IAccessStorage::areCredentialsValidImpl( + const User & user, + const Credentials & credentials, + const ExternalAuthenticators & external_authenticators) const { - return user.authentication.isCorrectPassword(user.getName(), password, external_authenticators); + if (!credentials.isReady()) + return false; + + if (credentials.getUserName() != user.getName()) + return false; + + return user.authentication.areCredentialsValid(credentials, external_authenticators); } @@ -472,6 +480,7 @@ bool IAccessStorage::isAddressAllowedImpl(const User & user, const Poco::Net::IP return user.allowed_client_hosts.contains(address); } + UUID IAccessStorage::getIDOfLoggedUser(const String & user_name) const { return getIDOfLoggedUserImpl(user_name); @@ -578,9 +587,9 @@ void IAccessStorage::throwAddressNotAllowed(const Poco::Net::IPAddress & address throw Exception("Connections from " + address.toString() + " are not allowed", ErrorCodes::IP_ADDRESS_NOT_ALLOWED); } -void 
IAccessStorage::throwInvalidPassword() +void IAccessStorage::throwInvalidCredentials() { - throw Exception("Invalid password", ErrorCodes::WRONG_PASSWORD); + throw Exception("Invalid credentials", ErrorCodes::WRONG_PASSWORD); } void IAccessStorage::throwCannotAuthenticate(const String & user_name) diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index ecf6b260712..2cdd8eabf73 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -16,6 +16,7 @@ namespace Poco::Net { class IPAddress; } namespace DB { struct User; +class Credentials; class ExternalAuthenticators; /// Contains entities, i.e. instances of classes derived from IAccessEntity. @@ -142,11 +143,11 @@ public: bool hasSubscription(EntityType type) const; bool hasSubscription(const UUID & id) const; - /// Finds an user, check its password and returns the ID of the user. - /// Throws an exception if no such user or password is incorrect. - UUID login(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool replace_exception_with_cannot_authenticate = true) const; + /// Finds a user, check the provided credentials and returns the ID of the user if they are valid. + /// Throws an exception if no such user or credentials are invalid. + UUID login(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool replace_exception_with_cannot_authenticate = true) const; - /// Returns the ID of an user who has logged in (maybe on another node). + /// Returns the ID of a user who has logged in (maybe on another node). /// The function assumes that the password has been already checked somehow, so we can skip checking it now. 
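// --- Illustrative sketch (editor's addition, not part of the patch) ---------------------------
// The guard order used by areCredentialsValidImpl() above: credentials that are not ready
// (e.g. an unfinished GSS-API negotiation) or that carry a different user name are rejected
// before any storage-specific check runs. The type and callback below are hypothetical
// stand-ins for DB::Credentials and Authentication::areCredentialsValid().
#include <functional>
#include <string>

struct CredentialsSketch
{
    bool ready = false;
    std::string user_name;
};

bool credentialsLookValid(const std::string & stored_user_name, const CredentialsSketch & credentials,
                          const std::function<bool()> & storage_specific_check)
{
    if (!credentials.ready)
        return false;
    if (credentials.user_name != stored_user_name)
        return false;
    return storage_specific_check();   // password hash, LDAP bind, Kerberos realm, ...
}
// ----------------------------------------------------------------------------------------------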
UUID getIDOfLoggedUser(const String & user_name) const; @@ -164,8 +165,8 @@ protected: virtual ext::scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const = 0; virtual bool hasSubscriptionImpl(const UUID & id) const = 0; virtual bool hasSubscriptionImpl(EntityType type) const = 0; - virtual UUID loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const; - virtual bool isPasswordCorrectImpl(const User & user, const String & password, const ExternalAuthenticators & external_authenticators) const; + virtual UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const; + virtual bool areCredentialsValidImpl(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; virtual bool isAddressAllowedImpl(const User & user, const Poco::Net::IPAddress & address) const; virtual UUID getIDOfLoggedUserImpl(const String & user_name) const; @@ -183,7 +184,7 @@ protected: [[noreturn]] void throwReadonlyCannotUpdate(EntityType type, const String & name) const; [[noreturn]] void throwReadonlyCannotRemove(EntityType type, const String & name) const; [[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address); - [[noreturn]] static void throwInvalidPassword(); + [[noreturn]] static void throwInvalidCredentials(); [[noreturn]] static void throwCannotAuthenticate(const String & user_name); using Notification = std::tuple; diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index 2602422a59a..b47a9b3e041 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ LDAPAccessStorage::LDAPAccessStorage(const String & storage_name_, AccessControl String LDAPAccessStorage::getLDAPServerName() const { - return ldap_server; + return ldap_server_name; } @@ -53,8 +54,8 @@ void LDAPAccessStorage::setConfiguration(AccessControlManager * access_control_m if (!has_server) throw Exception("Missing 'server' field for LDAP user directory", ErrorCodes::BAD_ARGUMENTS); - const auto ldap_server_cfg = config.getString(prefix_str + "server"); - if (ldap_server_cfg.empty()) + const auto ldap_server_name_cfg = config.getString(prefix_str + "server"); + if (ldap_server_name_cfg.empty()) throw Exception("Empty 'server' field for LDAP user directory", ErrorCodes::BAD_ARGUMENTS); std::set common_roles_cfg; @@ -67,7 +68,7 @@ void LDAPAccessStorage::setConfiguration(AccessControlManager * access_control_m common_roles_cfg.insert(role_names.begin(), role_names.end()); } - LDAPSearchParamsList role_search_params_cfg; + LDAPClient::SearchParamsList role_search_params_cfg; if (has_role_mapping) { Poco::Util::AbstractConfiguration::Keys all_keys; @@ -89,17 +90,17 @@ void LDAPAccessStorage::setConfiguration(AccessControlManager * access_control_m auto scope = config.getString(rm_prefix_str + "scope", "subtree"); boost::algorithm::to_lower(scope); - if (scope == "base") rm_params.scope = LDAPSearchParams::Scope::BASE; - else if (scope == "one_level") rm_params.scope = LDAPSearchParams::Scope::ONE_LEVEL; - else if (scope == "subtree") rm_params.scope = LDAPSearchParams::Scope::SUBTREE; - else if (scope == "children") rm_params.scope = LDAPSearchParams::Scope::CHILDREN; + if (scope == "base") 
rm_params.scope = LDAPClient::SearchParams::Scope::BASE; + else if (scope == "one_level") rm_params.scope = LDAPClient::SearchParams::Scope::ONE_LEVEL; + else if (scope == "subtree") rm_params.scope = LDAPClient::SearchParams::Scope::SUBTREE; + else if (scope == "children") rm_params.scope = LDAPClient::SearchParams::Scope::CHILDREN; else throw Exception("Invalid value of 'scope' field in '" + key + "' section of LDAP user directory, must be one of 'base', 'one_level', 'subtree', or 'children'", ErrorCodes::BAD_ARGUMENTS); } } access_control_manager = access_control_manager_; - ldap_server = ldap_server_cfg; + ldap_server_name = ldap_server_name_cfg; role_search_params.swap(role_search_params_cfg); common_role_names.swap(common_roles_cfg); @@ -186,13 +187,10 @@ void LDAPAccessStorage::applyRoleChangeNoLock(bool grant, const UUID & role_id, if (auto user = typeid_cast>(entity_)) { auto changed_user = typeid_cast>(user->clone()); - auto & granted_roles = changed_user->granted_roles.roles; - if (grant) - granted_roles.insert(role_id); + changed_user->granted_roles.grant(role_id); else - granted_roles.erase(role_id); - + changed_user->granted_roles.revoke(role_id); return changed_user; } return entity_; @@ -218,17 +216,17 @@ void LDAPAccessStorage::applyRoleChangeNoLock(bool grant, const UUID & role_id, } -void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPSearchResultsList & external_roles) const +void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles) const { - const auto external_roles_hash = boost::hash{}(external_roles); + const auto external_roles_hash = boost::hash{}(external_roles); return assignRolesNoLock(user, external_roles, external_roles_hash); } -void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPSearchResultsList & external_roles, const std::size_t external_roles_hash) const +void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, const std::size_t external_roles_hash) const { const auto & user_name = user.getName(); - auto & granted_roles = user.granted_roles.roles; + auto & granted_roles = user.granted_roles; const auto local_role_names = mapExternalRolesNoLock(external_roles); auto grant_role = [this, &user_name, &granted_roles] (const String & role_name, const bool common) @@ -246,7 +244,7 @@ void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPSearchResultsLi if (it != granted_role_ids.end()) { const auto & role_id = it->second; - granted_roles.insert(role_id); + granted_roles.grant(role_id); } else { @@ -255,7 +253,7 @@ void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPSearchResultsLi }; external_role_hashes.erase(user_name); - granted_roles.clear(); + granted_roles = {}; const auto old_role_names = std::move(roles_per_users[user_name]); // Grant the common roles first. @@ -312,10 +310,10 @@ void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPSearchResultsLi } -void LDAPAccessStorage::updateAssignedRolesNoLock(const UUID & id, const String & user_name, const LDAPSearchResultsList & external_roles) const +void LDAPAccessStorage::updateAssignedRolesNoLock(const UUID & id, const String & user_name, const LDAPClient::SearchResultsList & external_roles) const { // No need to include common_role_names in this hash each time, since they don't change. 
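// --- Illustrative sketch (editor's addition, not part of the patch) ---------------------------
// The change-detection idea used by updateAssignedRolesNoLock() above: hash the whole list of
// externally mapped roles and re-run the (comparatively expensive) role mapping only when the
// hash differs from the one remembered for this user. The container aliases and names here are
// sketch-local stand-ins for LDAPClient::SearchResultsList.
#include <cstddef>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <boost/functional/hash.hpp>

using SearchResults = std::set<std::string>;
using SearchResultsList = std::vector<SearchResults>;

bool externalRolesChanged(std::map<std::string, std::size_t> & cached_hashes,
                          const std::string & user_name, const SearchResultsList & external_roles)
{
    const auto hash = boost::hash<SearchResultsList>{}(external_roles);
    const auto it = cached_hashes.find(user_name);
    if (it != cached_hashes.end() && it->second == hash)
        return false;                  // same external roles as last time, nothing to re-map
    cached_hashes[user_name] = hash;
    return true;
}
// ----------------------------------------------------------------------------------------------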
- const auto external_roles_hash = boost::hash{}(external_roles); + const auto external_roles_hash = boost::hash{}(external_roles); // Map and grant the roles from scratch only if the list of external role has changed. const auto it = external_role_hashes.find(user_name); @@ -337,7 +335,7 @@ void LDAPAccessStorage::updateAssignedRolesNoLock(const UUID & id, const String } -std::set LDAPAccessStorage::mapExternalRolesNoLock(const LDAPSearchResultsList & external_roles) const +std::set LDAPAccessStorage::mapExternalRolesNoLock(const LDAPClient::SearchResultsList & external_roles) const { std::set role_names; @@ -365,10 +363,19 @@ std::set LDAPAccessStorage::mapExternalRolesNoLock(const LDAPSearchResul } -bool LDAPAccessStorage::isPasswordCorrectLDAPNoLock(const String & user_name, const String & password, - const ExternalAuthenticators & external_authenticators, LDAPSearchResultsList & search_results) const +bool LDAPAccessStorage::areLDAPCredentialsValidNoLock(const User & user, const Credentials & credentials, + const ExternalAuthenticators & external_authenticators, LDAPClient::SearchResultsList & search_results) const { - return external_authenticators.checkLDAPCredentials(ldap_server, user_name, password, &role_search_params, &search_results); + if (!credentials.isReady()) + return false; + + if (credentials.getUserName() != user.getName()) + return false; + + if (const auto * basic_credentials = dynamic_cast(&credentials)) + return external_authenticators.checkLDAPCredentials(ldap_server_name, *basic_credentials, &role_search_params, &search_results); + + return false; } @@ -383,7 +390,7 @@ String LDAPAccessStorage::getStorageParamsJSON() const std::scoped_lock lock(mutex); Poco::JSON::Object params_json; - params_json.set("server", ldap_server); + params_json.set("server", ldap_server_name); Poco::JSON::Array common_role_names_json; for (const auto & role : common_role_names) @@ -405,10 +412,10 @@ String LDAPAccessStorage::getStorageParamsJSON() const String scope; switch (role_mapping.scope) { - case LDAPSearchParams::Scope::BASE: scope = "base"; break; - case LDAPSearchParams::Scope::ONE_LEVEL: scope = "one_level"; break; - case LDAPSearchParams::Scope::SUBTREE: scope = "subtree"; break; - case LDAPSearchParams::Scope::CHILDREN: scope = "children"; break; + case LDAPClient::SearchParams::Scope::BASE: scope = "base"; break; + case LDAPClient::SearchParams::Scope::ONE_LEVEL: scope = "one_level"; break; + case LDAPClient::SearchParams::Scope::SUBTREE: scope = "subtree"; break; + case LDAPClient::SearchParams::Scope::CHILDREN: scope = "children"; break; } role_mapping_json.set("scope", scope); @@ -514,23 +521,23 @@ bool LDAPAccessStorage::hasSubscriptionImpl(EntityType type) const return memory_storage.hasSubscription(type); } -UUID LDAPAccessStorage::loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const +UUID LDAPAccessStorage::loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const { std::scoped_lock lock(mutex); - LDAPSearchResultsList external_roles; - auto id = memory_storage.find(user_name); + LDAPClient::SearchResultsList external_roles; + auto id = memory_storage.find(credentials.getUserName()); if (id) { auto user = memory_storage.read(*id); - if (!isPasswordCorrectLDAPNoLock(user->getName(), password, external_authenticators, external_roles)) - throwInvalidPassword(); - if 
(!isAddressAllowedImpl(*user, address)) throwAddressNotAllowed(address); + if (!areLDAPCredentialsValidNoLock(*user, credentials, external_authenticators, external_roles)) + throwInvalidCredentials(); + // Just in case external_roles are changed. This will be no-op if they are not. - updateAssignedRolesNoLock(*id, user_name, external_roles); + updateAssignedRolesNoLock(*id, user->getName(), external_roles); return *id; } @@ -538,16 +545,16 @@ UUID LDAPAccessStorage::loginImpl(const String & user_name, const String & passw { // User does not exist, so we create one, and will add it if authentication is successful. auto user = std::make_shared(); - user->setName(user_name); - user->authentication = Authentication(Authentication::Type::LDAP_SERVER); - user->authentication.setServerName(ldap_server); - - if (!isPasswordCorrectLDAPNoLock(user->getName(), password, external_authenticators, external_roles)) - throwInvalidPassword(); + user->setName(credentials.getUserName()); + user->authentication = Authentication(Authentication::Type::LDAP); + user->authentication.setLDAPServerName(ldap_server_name); if (!isAddressAllowedImpl(*user, address)) throwAddressNotAllowed(address); + if (!areLDAPCredentialsValidNoLock(*user, credentials, external_authenticators, external_roles)) + throwInvalidCredentials(); + assignRolesNoLock(*user, external_roles); return memory_storage.insert(user); @@ -567,10 +574,10 @@ UUID LDAPAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const // User does not exist, so we create one, and add it pretending that the authentication is successful. auto user = std::make_shared(); user->setName(user_name); - user->authentication = Authentication(Authentication::Type::LDAP_SERVER); - user->authentication.setServerName(ldap_server); + user->authentication = Authentication(Authentication::Type::LDAP); + user->authentication.setLDAPServerName(ldap_server_name); - LDAPSearchResultsList external_roles; + LDAPClient::SearchResultsList external_roles; // TODO: mapped external roles are not available here. Without a password we can't authenticate and retrieve roles from LDAP server. diff --git a/src/Access/LDAPAccessStorage.h b/src/Access/LDAPAccessStorage.h index b3d82d1e86b..ea0ab47c225 100644 --- a/src/Access/LDAPAccessStorage.h +++ b/src/Access/LDAPAccessStorage.h @@ -1,7 +1,9 @@ #pragma once #include -#include +#include +#include +#include #include #include #include @@ -21,14 +23,10 @@ namespace Poco namespace DB { class AccessControlManager; -struct LDAPSearchParams; -using LDAPSearchParamsList = std::vector; -using LDAPSearchResults = std::set; -using LDAPSearchResultsList = std::vector; /// Implementation of IAccessStorage which allows attaching users from a remote LDAP server. /// Currently, any user name will be treated as a name of an existing remote user, -/// a user info entity will be created, with LDAP_SERVER authentication type. +/// a user info entity will be created, with LDAP authentication type. class LDAPAccessStorage : public IAccessStorage { public: @@ -57,7 +55,7 @@ private: // IAccessStorage implementations. 
virtual ext::scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const override; virtual bool hasSubscriptionImpl(const UUID & id) const override; virtual bool hasSubscriptionImpl(EntityType type) const override; - virtual UUID loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; + virtual UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; virtual UUID getIDOfLoggedUserImpl(const String & user_name) const override; private: @@ -65,19 +63,19 @@ private: void processRoleChange(const UUID & id, const AccessEntityPtr & entity); void applyRoleChangeNoLock(bool grant, const UUID & role_id, const String & role_name); - void assignRolesNoLock(User & user, const LDAPSearchResultsList & external_roles) const; - void assignRolesNoLock(User & user, const LDAPSearchResultsList & external_roles, const std::size_t external_roles_hash) const; - void updateAssignedRolesNoLock(const UUID & id, const String & user_name, const LDAPSearchResultsList & external_roles) const; - std::set mapExternalRolesNoLock(const LDAPSearchResultsList & external_roles) const; - bool isPasswordCorrectLDAPNoLock(const String & user_name, const String & password, - const ExternalAuthenticators & external_authenticators, LDAPSearchResultsList & search_results) const; + void assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles) const; + void assignRolesNoLock(User & user, const LDAPClient::SearchResultsList & external_roles, const std::size_t external_roles_hash) const; + void updateAssignedRolesNoLock(const UUID & id, const String & user_name, const LDAPClient::SearchResultsList & external_roles) const; + std::set mapExternalRolesNoLock(const LDAPClient::SearchResultsList & external_roles) const; + bool areLDAPCredentialsValidNoLock(const User & user, const Credentials & credentials, + const ExternalAuthenticators & external_authenticators, LDAPClient::SearchResultsList & search_results) const; mutable std::recursive_mutex mutex; AccessControlManager * access_control_manager = nullptr; - String ldap_server; - LDAPSearchParamsList role_search_params; + String ldap_server_name; + LDAPClient::SearchParamsList role_search_params; std::set common_role_names; // role name that should be granted to all users at all times - mutable std::map external_role_hashes; // user name -> LDAPSearchResultsList hash (most recently retrieved and processed) + mutable std::map external_role_hashes; // user name -> LDAPClient::SearchResultsList hash (most recently retrieved and processed) mutable std::map> users_per_roles; // role name -> user names (...it should be granted to; may but don't have to exist for common roles) mutable std::map> roles_per_users; // user name -> role names (...that should be granted to it; may but don't have to include common roles) mutable std::map granted_role_names; // (currently granted) role id -> its name diff --git a/src/Access/LDAPClient.cpp b/src/Access/LDAPClient.cpp index 41756aebb9a..5c4b7dd8d99 100644 --- a/src/Access/LDAPClient.cpp +++ b/src/Access/LDAPClient.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -25,7 +26,25 @@ namespace ErrorCodes extern const int LDAP_ERROR; } -LDAPClient::LDAPClient(const LDAPServerParams & params_) +void LDAPClient::SearchParams::combineHash(std::size_t & seed) const +{ + 
boost::hash_combine(seed, base_dn); + boost::hash_combine(seed, static_cast(scope)); + boost::hash_combine(seed, search_filter); + boost::hash_combine(seed, attribute); + boost::hash_combine(seed, prefix); +} + +void LDAPClient::Params::combineCoreHash(std::size_t & seed) const +{ + boost::hash_combine(seed, host); + boost::hash_combine(seed, port); + boost::hash_combine(seed, bind_dn); + boost::hash_combine(seed, user); + boost::hash_combine(seed, password); +} + +LDAPClient::LDAPClient(const Params & params_) : params(params_) { } @@ -143,7 +162,7 @@ void LDAPClient::openConnection() LDAPURLDesc url; std::memset(&url, 0, sizeof(url)); - url.lud_scheme = const_cast(params.enable_tls == LDAPServerParams::TLSEnable::YES ? "ldaps" : "ldap"); + url.lud_scheme = const_cast(params.enable_tls == LDAPClient::Params::TLSEnable::YES ? "ldaps" : "ldap"); url.lud_host = const_cast(params.host.c_str()); url.lud_port = params.port; url.lud_scope = LDAP_SCOPE_DEFAULT; @@ -163,8 +182,8 @@ void LDAPClient::openConnection() int value = 0; switch (params.protocol_version) { - case LDAPServerParams::ProtocolVersion::V2: value = LDAP_VERSION2; break; - case LDAPServerParams::ProtocolVersion::V3: value = LDAP_VERSION3; break; + case LDAPClient::Params::ProtocolVersion::V2: value = LDAP_VERSION2; break; + case LDAPClient::Params::ProtocolVersion::V3: value = LDAP_VERSION3; break; } diag(ldap_set_option(handle, LDAP_OPT_PROTOCOL_VERSION, &value)); } @@ -208,11 +227,11 @@ void LDAPClient::openConnection() int value = 0; switch (params.tls_minimum_protocol_version) { - case LDAPServerParams::TLSProtocolVersion::SSL2: value = LDAP_OPT_X_TLS_PROTOCOL_SSL2; break; - case LDAPServerParams::TLSProtocolVersion::SSL3: value = LDAP_OPT_X_TLS_PROTOCOL_SSL3; break; - case LDAPServerParams::TLSProtocolVersion::TLS1_0: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_0; break; - case LDAPServerParams::TLSProtocolVersion::TLS1_1: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_1; break; - case LDAPServerParams::TLSProtocolVersion::TLS1_2: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_2; break; + case LDAPClient::Params::TLSProtocolVersion::SSL2: value = LDAP_OPT_X_TLS_PROTOCOL_SSL2; break; + case LDAPClient::Params::TLSProtocolVersion::SSL3: value = LDAP_OPT_X_TLS_PROTOCOL_SSL3; break; + case LDAPClient::Params::TLSProtocolVersion::TLS1_0: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_0; break; + case LDAPClient::Params::TLSProtocolVersion::TLS1_1: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_1; break; + case LDAPClient::Params::TLSProtocolVersion::TLS1_2: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_2; break; } diag(ldap_set_option(handle, LDAP_OPT_X_TLS_PROTOCOL_MIN, &value)); } @@ -223,10 +242,10 @@ void LDAPClient::openConnection() int value = 0; switch (params.tls_require_cert) { - case LDAPServerParams::TLSRequireCert::NEVER: value = LDAP_OPT_X_TLS_NEVER; break; - case LDAPServerParams::TLSRequireCert::ALLOW: value = LDAP_OPT_X_TLS_ALLOW; break; - case LDAPServerParams::TLSRequireCert::TRY: value = LDAP_OPT_X_TLS_TRY; break; - case LDAPServerParams::TLSRequireCert::DEMAND: value = LDAP_OPT_X_TLS_DEMAND; break; + case LDAPClient::Params::TLSRequireCert::NEVER: value = LDAP_OPT_X_TLS_NEVER; break; + case LDAPClient::Params::TLSRequireCert::ALLOW: value = LDAP_OPT_X_TLS_ALLOW; break; + case LDAPClient::Params::TLSRequireCert::TRY: value = LDAP_OPT_X_TLS_TRY; break; + case LDAPClient::Params::TLSRequireCert::DEMAND: value = LDAP_OPT_X_TLS_DEMAND; break; } diag(ldap_set_option(handle, LDAP_OPT_X_TLS_REQUIRE_CERT, &value)); } @@ -264,12 +283,12 @@ void LDAPClient::openConnection() } 
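// --- Illustrative sketch (editor's addition, not part of the patch) ---------------------------
// How the URL scheme and default port follow the TLS mode in parseLDAPServer()/openConnection()
// above: only TLSEnable::YES uses "ldaps" on 636; both NO and YES_STARTTLS start as plain
// "ldap" on 389, and STARTTLS upgrades the already-open connection via ldap_start_tls_s().
// The enum and function below are local to the sketch.
#include <cstdint>
#include <string>
#include <utility>

enum class TLSEnableSketch { NO, YES_STARTTLS, YES };

std::pair<std::string, std::uint16_t> schemeAndDefaultPort(TLSEnableSketch enable_tls)
{
    if (enable_tls == TLSEnableSketch::YES)
        return {"ldaps", 636};
    return {"ldap", 389};
}
// ----------------------------------------------------------------------------------------------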
#endif - if (params.enable_tls == LDAPServerParams::TLSEnable::YES_STARTTLS) + if (params.enable_tls == LDAPClient::Params::TLSEnable::YES_STARTTLS) diag(ldap_start_tls_s(handle, nullptr, nullptr)); switch (params.sasl_mechanism) { - case LDAPServerParams::SASLMechanism::SIMPLE: + case LDAPClient::Params::SASLMechanism::SIMPLE: { const auto escaped_user_name = escapeForLDAP(params.user); const auto bind_dn = replacePlaceholders(params.bind_dn, { {"{user_name}", escaped_user_name} }); @@ -299,19 +318,19 @@ void LDAPClient::closeConnection() noexcept handle = nullptr; } -LDAPSearchResults LDAPClient::search(const LDAPSearchParams & search_params) +LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params) { std::scoped_lock lock(ldap_global_mutex); - LDAPSearchResults result; + SearchResults result; int scope = 0; switch (search_params.scope) { - case LDAPSearchParams::Scope::BASE: scope = LDAP_SCOPE_BASE; break; - case LDAPSearchParams::Scope::ONE_LEVEL: scope = LDAP_SCOPE_ONELEVEL; break; - case LDAPSearchParams::Scope::SUBTREE: scope = LDAP_SCOPE_SUBTREE; break; - case LDAPSearchParams::Scope::CHILDREN: scope = LDAP_SCOPE_CHILDREN; break; + case SearchParams::Scope::BASE: scope = LDAP_SCOPE_BASE; break; + case SearchParams::Scope::ONE_LEVEL: scope = LDAP_SCOPE_ONELEVEL; break; + case SearchParams::Scope::SUBTREE: scope = LDAP_SCOPE_SUBTREE; break; + case SearchParams::Scope::CHILDREN: scope = LDAP_SCOPE_CHILDREN; break; } const auto escaped_user_name = escapeForLDAP(params.user); @@ -452,7 +471,7 @@ LDAPSearchResults LDAPClient::search(const LDAPSearchParams & search_params) return result; } -bool LDAPSimpleAuthClient::authenticate(const LDAPSearchParamsList * search_params, LDAPSearchResultsList * search_results) +bool LDAPSimpleAuthClient::authenticate(const SearchParamsList * search_params, SearchResultsList * search_results) { if (params.user.empty()) throw Exception("LDAP authentication of a user with empty name is not allowed", ErrorCodes::BAD_ARGUMENTS); @@ -508,12 +527,12 @@ void LDAPClient::closeConnection() noexcept { } -LDAPSearchResults LDAPClient::search(const LDAPSearchParams &) +LDAPClient::SearchResults LDAPClient::search(const SearchParams &) { throw Exception("ClickHouse was built without LDAP support", ErrorCodes::FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME); } -bool LDAPSimpleAuthClient::authenticate(const LDAPSearchParamsList *, LDAPSearchResultsList *) +bool LDAPSimpleAuthClient::authenticate(const SearchParamsList *, SearchResultsList *) { throw Exception("ClickHouse was built without LDAP support", ErrorCodes::FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME); } diff --git a/src/Access/LDAPClient.h b/src/Access/LDAPClient.h index f0ace69649b..4fc97bb957b 100644 --- a/src/Access/LDAPClient.h +++ b/src/Access/LDAPClient.h @@ -4,7 +4,6 @@ # include "config_core.h" #endif -#include #include #if USE_LDAP @@ -14,6 +13,10 @@ # define MAYBE_NORETURN [[noreturn]] #endif +#include +#include +#include + namespace DB { @@ -21,7 +24,98 @@ namespace DB class LDAPClient { public: - explicit LDAPClient(const LDAPServerParams & params_); + struct SearchParams + { + enum class Scope + { + BASE, + ONE_LEVEL, + SUBTREE, + CHILDREN + }; + + String base_dn; + Scope scope = Scope::SUBTREE; + String search_filter; + String attribute = "cn"; + String prefix; + + void combineHash(std::size_t & seed) const; + }; + + using SearchParamsList = std::vector; + using SearchResults = std::set; + using SearchResultsList = std::vector; + + struct Params + { + enum class ProtocolVersion + { + V2, + 
V3 + }; + + enum class TLSEnable + { + NO, + YES_STARTTLS, + YES + }; + + enum class TLSProtocolVersion + { + SSL2, + SSL3, + TLS1_0, + TLS1_1, + TLS1_2 + }; + + enum class TLSRequireCert + { + NEVER, + ALLOW, + TRY, + DEMAND + }; + + enum class SASLMechanism + { + UNKNOWN, + SIMPLE + }; + + ProtocolVersion protocol_version = ProtocolVersion::V3; + + String host; + std::uint16_t port = 636; + + TLSEnable enable_tls = TLSEnable::YES; + TLSProtocolVersion tls_minimum_protocol_version = TLSProtocolVersion::TLS1_2; + TLSRequireCert tls_require_cert = TLSRequireCert::DEMAND; + String tls_cert_file; + String tls_key_file; + String tls_ca_cert_file; + String tls_ca_cert_dir; + String tls_cipher_suite; + + SASLMechanism sasl_mechanism = SASLMechanism::SIMPLE; + + String bind_dn; + String user; + String password; + + std::chrono::seconds verification_cooldown{0}; + + std::chrono::seconds operation_timeout{40}; + std::chrono::seconds network_timeout{30}; + std::chrono::seconds search_timeout{20}; + std::uint32_t search_limit = 100; + + void combineCoreHash(std::size_t & seed) const; + }; + + explicit LDAPClient(const Params & params_); ~LDAPClient(); LDAPClient(const LDAPClient &) = delete; @@ -33,10 +127,10 @@ protected: MAYBE_NORETURN void diag(const int rc, String text = ""); MAYBE_NORETURN void openConnection(); void closeConnection() noexcept; - LDAPSearchResults search(const LDAPSearchParams & search_params); + SearchResults search(const SearchParams & search_params); protected: - const LDAPServerParams params; + const Params params; #if USE_LDAP LDAP * handle = nullptr; #endif @@ -47,7 +141,7 @@ class LDAPSimpleAuthClient { public: using LDAPClient::LDAPClient; - bool authenticate(const LDAPSearchParamsList * search_params, LDAPSearchResultsList * search_results); + bool authenticate(const SearchParamsList * search_params, SearchResultsList * search_results); }; } diff --git a/src/Access/LDAPParams.h b/src/Access/LDAPParams.h deleted file mode 100644 index 5181b2d1621..00000000000 --- a/src/Access/LDAPParams.h +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include -#include - - -namespace DB -{ - -struct LDAPSearchParams -{ - enum class Scope - { - BASE, - ONE_LEVEL, - SUBTREE, - CHILDREN - }; - - String base_dn; - Scope scope = Scope::SUBTREE; - String search_filter; - String attribute = "cn"; - String prefix; - - void combineHash(std::size_t & seed) const - { - boost::hash_combine(seed, base_dn); - boost::hash_combine(seed, static_cast(scope)); - boost::hash_combine(seed, search_filter); - boost::hash_combine(seed, attribute); - boost::hash_combine(seed, prefix); - } -}; - -using LDAPSearchParamsList = std::vector; -using LDAPSearchResults = std::set; -using LDAPSearchResultsList = std::vector; - -struct LDAPServerParams -{ - enum class ProtocolVersion - { - V2, - V3 - }; - - enum class TLSEnable - { - NO, - YES_STARTTLS, - YES - }; - - enum class TLSProtocolVersion - { - SSL2, - SSL3, - TLS1_0, - TLS1_1, - TLS1_2 - }; - - enum class TLSRequireCert - { - NEVER, - ALLOW, - TRY, - DEMAND - }; - - enum class SASLMechanism - { - UNKNOWN, - SIMPLE - }; - - ProtocolVersion protocol_version = ProtocolVersion::V3; - - String host; - std::uint16_t port = 636; - - TLSEnable enable_tls = TLSEnable::YES; - TLSProtocolVersion tls_minimum_protocol_version = TLSProtocolVersion::TLS1_2; - TLSRequireCert tls_require_cert = TLSRequireCert::DEMAND; - String tls_cert_file; - String tls_key_file; - String tls_ca_cert_file; - String tls_ca_cert_dir; - String tls_cipher_suite; 
- - SASLMechanism sasl_mechanism = SASLMechanism::SIMPLE; - - String bind_dn; - String user; - String password; - - std::chrono::seconds verification_cooldown{0}; - - std::chrono::seconds operation_timeout{40}; - std::chrono::seconds network_timeout{30}; - std::chrono::seconds search_timeout{20}; - std::uint32_t search_limit = 100; - - void combineCoreHash(std::size_t & seed) const - { - boost::hash_combine(seed, host); - boost::hash_combine(seed, port); - boost::hash_combine(seed, bind_dn); - boost::hash_combine(seed, user); - boost::hash_combine(seed, password); - } -}; - -} diff --git a/src/Access/MultipleAccessStorage.cpp b/src/Access/MultipleAccessStorage.cpp index a8ce3f602ed..15281d8c471 100644 --- a/src/Access/MultipleAccessStorage.cpp +++ b/src/Access/MultipleAccessStorage.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -382,6 +383,7 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock /// Lock the mutex again to store added subscriptions to the nested storages. lock.lock(); + for (auto type : ext::range(EntityType::MAX)) { if (!added_subscriptions[static_cast(type)].empty()) @@ -399,25 +401,24 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock } lock.unlock(); - added_subscriptions->clear(); } -UUID MultipleAccessStorage::loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const +UUID MultipleAccessStorage::loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const { auto storages = getStoragesInternal(); for (const auto & storage : *storages) { try { - auto id = storage->login(user_name, password, address, external_authenticators, /* replace_exception_with_cannot_authenticate = */ false); + auto id = storage->login(credentials, address, external_authenticators, /* replace_exception_with_cannot_authenticate = */ false); std::lock_guard lock{mutex}; ids_cache.set(id, storage); return id; } catch (...) { - if (!storage->find(EntityType::USER, user_name)) + if (!storage->find(EntityType::USER, credentials.getUserName())) { /// The authentication failed because there no users with such name in the `storage` /// thus we can try to search in other nested storages. 
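The loginImpl() change above switches from a (user name, password) pair to a single Credentials object but keeps the same fallback strategy: try every nested storage in order, and swallow an authentication failure only when that storage has no user with the given name at all. Below is a minimal standalone sketch of that strategy; Storage, Credentials and the helper names are simplified stand-ins, not the real ClickHouse classes.

#include <stdexcept>
#include <string>
#include <vector>

struct Credentials { std::string user_name; };

struct Storage
{
    virtual ~Storage() = default;
    virtual int login(const Credentials & credentials) const = 0;    /// throws on authentication failure
    virtual bool hasUser(const std::string & user_name) const = 0;   /// name lookup only
};

/// Try each nested storage; fall through only when the current one does not know the user.
int loginAcrossStorages(const std::vector<const Storage *> & storages, const Credentials & credentials)
{
    for (const auto * storage : storages)
    {
        try
        {
            return storage->login(credentials);
        }
        catch (...)
        {
            if (!storage->hasUser(credentials.user_name))
                continue;   /// unknown here - another storage may still authenticate it
            throw;          /// known user but bad credentials - report immediately
        }
    }
    throw std::runtime_error("no user named '" + credentials.user_name + "' in any nested storage");
}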
@@ -426,7 +427,7 @@ UUID MultipleAccessStorage::loginImpl(const String & user_name, const String & p throw; } } - throwNotFound(EntityType::USER, user_name); + throwNotFound(EntityType::USER, credentials.getUserName()); } diff --git a/src/Access/MultipleAccessStorage.h b/src/Access/MultipleAccessStorage.h index 8844de8c029..610bf2fadcc 100644 --- a/src/Access/MultipleAccessStorage.h +++ b/src/Access/MultipleAccessStorage.h @@ -48,7 +48,7 @@ protected: ext::scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const override; bool hasSubscriptionImpl(const UUID & id) const override; bool hasSubscriptionImpl(EntityType type) const override; - UUID loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; + UUID loginImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override; UUID getIDOfLoggedUserImpl(const String & user_name) const override; private: diff --git a/src/Access/RoleCache.cpp b/src/Access/RoleCache.cpp index f386044bbf7..8fa3d51f867 100644 --- a/src/Access/RoleCache.cpp +++ b/src/Access/RoleCache.cpp @@ -46,10 +46,10 @@ namespace roles_info.access.makeUnion(role->access); roles_info.settings_from_enabled_roles.merge(role->settings); - for (const auto & granted_role : role->granted_roles.roles) + for (const auto & granted_role : role->granted_roles.getGranted()) collectRoles(roles_info, skip_ids, get_role_function, granted_role, false, false); - for (const auto & granted_role : role->granted_roles.roles_with_admin_option) + for (const auto & granted_role : role->granted_roles.getGrantedWithAdminOption()) collectRoles(roles_info, skip_ids, get_role_function, granted_role, false, true); } } @@ -63,15 +63,15 @@ RoleCache::~RoleCache() = default; std::shared_ptr -RoleCache::getEnabledRoles(const boost::container::flat_set & roles, const boost::container::flat_set & roles_with_admin_option) +RoleCache::getEnabledRoles(const std::vector & roles, const std::vector & roles_with_admin_option) { /// Declared before `lock` to send notifications after the mutex will be unlocked. 
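The comment above relies on plain C++ destruction order: local objects are destroyed in reverse order of declaration, so a guard object declared before the lock runs its callbacks only after the mutex has been released. A small self-contained illustration of the idea; the Notifications class below is a simplified stand-in for ext::scope_guard, not its real interface.

#include <functional>
#include <iostream>
#include <mutex>
#include <vector>

/// Runs the collected callbacks on destruction (simplified stand-in for ext::scope_guard).
class Notifications
{
public:
    ~Notifications() { for (auto & f : callbacks) f(); }
    void add(std::function<void()> f) { callbacks.push_back(std::move(f)); }
private:
    std::vector<std::function<void()>> callbacks;
};

std::mutex mutex;

void updateAndNotify()
{
    Notifications notifications;     /// declared first => destroyed last => callbacks run after the unlock
    std::lock_guard lock{mutex};     /// declared second => destroyed first => mutex released before the callbacks

    /// ... mutate shared state under the mutex and collect the handlers to call ...
    notifications.add([] { std::cout << "notified after the mutex was released\n"; });
}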
ext::scope_guard notifications; std::lock_guard lock{mutex}; EnabledRoles::Params params; - params.current_roles = roles; - params.current_roles_with_admin_option = roles_with_admin_option; + params.current_roles.insert(roles.begin(), roles.end()); + params.current_roles_with_admin_option.insert(roles_with_admin_option.begin(), roles_with_admin_option.end()); auto it = enabled_roles.find(params); if (it != enabled_roles.end()) { diff --git a/src/Access/RoleCache.h b/src/Access/RoleCache.h index cc6c8599f27..b3f426debcb 100644 --- a/src/Access/RoleCache.h +++ b/src/Access/RoleCache.h @@ -20,7 +20,8 @@ public: ~RoleCache(); std::shared_ptr getEnabledRoles( - const boost::container::flat_set & current_roles, const boost::container::flat_set & current_roles_with_admin_option); + const std::vector & current_roles, + const std::vector & current_roles_with_admin_option); private: void collectEnabledRoles(ext::scope_guard & notifications); diff --git a/src/Access/RolesOrUsersSet.cpp b/src/Access/RolesOrUsersSet.cpp index cb0beb42700..ebd4f0f7a40 100644 --- a/src/Access/RolesOrUsersSet.cpp +++ b/src/Access/RolesOrUsersSet.cpp @@ -72,20 +72,20 @@ void RolesOrUsersSet::init(const ASTRolesOrUsersSet & ast, const AccessControlMa if (ast.id_mode) return parse(name); assert(manager); - if (ast.allow_user_names && ast.allow_role_names) + if (ast.allow_users && ast.allow_roles) { auto id = manager->find(name); if (id) return *id; return manager->getID(name); } - else if (ast.allow_user_names) + else if (ast.allow_users) { return manager->getID(name); } else { - assert(ast.allow_role_names); + assert(ast.allow_roles); return manager->getID(name); } }; @@ -106,8 +106,8 @@ void RolesOrUsersSet::init(const ASTRolesOrUsersSet & ast, const AccessControlMa if (!ast.except_names.empty()) { except_ids.reserve(ast.except_names.size()); - for (const String & except_name : ast.except_names) - except_ids.insert(name_to_id(except_name)); + for (const String & name : ast.except_names) + except_ids.insert(name_to_id(name)); } if (ast.except_current_user) @@ -116,8 +116,8 @@ void RolesOrUsersSet::init(const ASTRolesOrUsersSet & ast, const AccessControlMa except_ids.insert(*current_user_id); } - for (const UUID & except_id : except_ids) - ids.erase(except_id); + for (const UUID & id : except_ids) + ids.erase(id); } @@ -127,7 +127,7 @@ std::shared_ptr RolesOrUsersSet::toAST() const ast->id_mode = true; ast->all = all; - if (!ids.empty()) + if (!ids.empty() && !all) { ast->names.reserve(ids.size()); for (const UUID & id : ids) @@ -152,7 +152,7 @@ std::shared_ptr RolesOrUsersSet::toASTWithNames(const Access auto ast = std::make_shared(); ast->all = all; - if (!ids.empty()) + if (!ids.empty() && !all) { ast->names.reserve(ids.size()); for (const UUID & id : ids) @@ -194,44 +194,6 @@ String RolesOrUsersSet::toStringWithNames(const AccessControlManager & manager) } -Strings RolesOrUsersSet::toStringsWithNames(const AccessControlManager & manager) const -{ - if (!all && ids.empty()) - return {}; - - Strings res; - res.reserve(ids.size() + except_ids.size()); - - if (all) - res.emplace_back("ALL"); - else - { - for (const UUID & id : ids) - { - auto name = manager.tryReadName(id); - if (name) - res.emplace_back(std::move(*name)); - } - std::sort(res.begin(), res.end()); - } - - if (!except_ids.empty()) - { - res.emplace_back("EXCEPT"); - size_t old_size = res.size(); - for (const UUID & id : except_ids) - { - auto name = manager.tryReadName(id); - if (name) - res.emplace_back(std::move(*name)); - } - std::sort(res.begin() + 
old_size, res.end()); - } - - return res; -} - - bool RolesOrUsersSet::empty() const { return ids.empty() && !all; @@ -248,14 +210,18 @@ void RolesOrUsersSet::clear() void RolesOrUsersSet::add(const UUID & id) { - ids.insert(id); + if (!all) + ids.insert(id); + except_ids.erase(id); } void RolesOrUsersSet::add(const std::vector & ids_) { + if (!all) + ids.insert(ids_.begin(), ids_.end()); for (const auto & id : ids_) - add(id); + except_ids.erase(id); } diff --git a/src/Access/RolesOrUsersSet.h b/src/Access/RolesOrUsersSet.h index bae7f52a574..0d8983c2ec3 100644 --- a/src/Access/RolesOrUsersSet.h +++ b/src/Access/RolesOrUsersSet.h @@ -13,7 +13,8 @@ class AccessControlManager; /// Represents a set of users/roles like -/// {user_name | role_name | CURRENT_USER} [,...] | NONE | ALL | ALL EXCEPT {user_name | role_name | CURRENT_USER} [,...] +/// {user_name | role_name | CURRENT_USER | ALL | NONE} [,...] +/// [EXCEPT {user_name | role_name | CURRENT_USER | ALL | NONE} [,...]] /// Similar to ASTRolesOrUsersSet, but with IDs instead of names. struct RolesOrUsersSet { @@ -60,8 +61,8 @@ struct RolesOrUsersSet friend bool operator ==(const RolesOrUsersSet & lhs, const RolesOrUsersSet & rhs); friend bool operator !=(const RolesOrUsersSet & lhs, const RolesOrUsersSet & rhs) { return !(lhs == rhs); } - boost::container::flat_set ids; bool all = false; + boost::container::flat_set ids; boost::container::flat_set except_ids; private: diff --git a/src/Access/User.cpp b/src/Access/User.cpp index f57ec7c1359..016f378e83f 100644 --- a/src/Access/User.cpp +++ b/src/Access/User.cpp @@ -11,7 +11,7 @@ bool User::equal(const IAccessEntity & other) const const auto & other_user = typeid_cast(other); return (authentication == other_user.authentication) && (allowed_client_hosts == other_user.allowed_client_hosts) && (access == other_user.access) && (granted_roles == other_user.granted_roles) && (default_roles == other_user.default_roles) - && (settings == other_user.settings); + && (settings == other_user.settings) && (grantees == other_user.grantees); } } diff --git a/src/Access/User.h b/src/Access/User.h index 13f1e532015..5b10d953fc0 100644 --- a/src/Access/User.h +++ b/src/Access/User.h @@ -21,6 +21,7 @@ struct User : public IAccessEntity GrantedRoles granted_roles; RolesOrUsersSet default_roles = RolesOrUsersSet::AllTag{}; SettingsProfileElements settings; + RolesOrUsersSet grantees = RolesOrUsersSet::AllTag{}; bool equal(const IAccessEntity & other) const override; std::shared_ptr clone() const override { return cloneImpl(); } diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index b3f151c3030..407c3679e9f 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -62,14 +62,15 @@ namespace bool has_password_sha256_hex = config.has(user_config + ".password_sha256_hex"); bool has_password_double_sha1_hex = config.has(user_config + ".password_double_sha1_hex"); bool has_ldap = config.has(user_config + ".ldap"); + bool has_kerberos = config.has(user_config + ".kerberos"); - size_t num_password_fields = has_no_password + has_password_plaintext + has_password_sha256_hex + has_password_double_sha1_hex + has_ldap; + size_t num_password_fields = has_no_password + has_password_plaintext + has_password_sha256_hex + has_password_double_sha1_hex + has_ldap + has_kerberos; if (num_password_fields > 1) - throw Exception("More than one field of 'password', 'password_sha256_hex', 'password_double_sha1_hex', 'no_password', 'ldap' are used 
to specify password for user " + user_name + ". Must be only one of them.", + throw Exception("More than one field of 'password', 'password_sha256_hex', 'password_double_sha1_hex', 'no_password', 'ldap', 'kerberos' are used to specify password for user " + user_name + ". Must be only one of them.", ErrorCodes::BAD_ARGUMENTS); if (num_password_fields < 1) - throw Exception("Either 'password' or 'password_sha256_hex' or 'password_double_sha1_hex' or 'no_password' or 'ldap' must be specified for user " + user_name + ".", ErrorCodes::BAD_ARGUMENTS); + throw Exception("Either 'password' or 'password_sha256_hex' or 'password_double_sha1_hex' or 'no_password' or 'ldap' or 'kerberos' must be specified for user " + user_name + ".", ErrorCodes::BAD_ARGUMENTS); if (has_password_plaintext) { @@ -96,8 +97,15 @@ namespace if (ldap_server_name.empty()) throw Exception("LDAP server name cannot be empty for user " + user_name + ".", ErrorCodes::BAD_ARGUMENTS); - user->authentication = Authentication{Authentication::LDAP_SERVER}; - user->authentication.setServerName(ldap_server_name); + user->authentication = Authentication{Authentication::LDAP}; + user->authentication.setLDAPServerName(ldap_server_name); + } + else if (has_kerberos) + { + const auto realm = config.getString(user_config + ".kerberos.realm", ""); + + user->authentication = Authentication{Authentication::KERBEROS}; + user->authentication.setKerberosRealm(realm); } const auto profile_name_config = user_config + ".profile"; @@ -518,7 +526,7 @@ void UsersConfigAccessStorage::load( preprocessed_dir, zkutil::ZooKeeperNodeCache(get_zookeeper_function), std::make_shared(), - [&](Poco::AutoPtr new_config) + [&](Poco::AutoPtr new_config, bool /*initial_loading*/) { parseFromConfig(*new_config); Settings::checkNoSettingNamesAtTopLevel(*new_config, users_config_path); diff --git a/src/Access/ya.make b/src/Access/ya.make index b4469aa3167..e8584230538 100644 --- a/src/Access/ya.make +++ b/src/Access/ya.make @@ -15,6 +15,7 @@ SRCS( AllowedClientHosts.cpp Authentication.cpp ContextAccess.cpp + Credentials.cpp DiskAccessStorage.cpp EnabledQuota.cpp EnabledRoles.cpp @@ -22,6 +23,7 @@ SRCS( EnabledRowPolicies.cpp EnabledSettings.cpp ExternalAuthenticators.cpp + GSSAcceptor.cpp GrantedRoles.cpp IAccessEntity.cpp IAccessStorage.cpp diff --git a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp index c699dd4f217..09e343b2dc5 100644 --- a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp +++ b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp @@ -60,6 +60,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void create(AggregateDataPtr __restrict place) const override { if (std::uniform_real_distribution<>(0.0, 1.0)(thread_local_rng) <= throw_probability) diff --git a/src/AggregateFunctions/AggregateFunctionArgMinMax.h b/src/AggregateFunctions/AggregateFunctionArgMinMax.h index b559c1c8a7e..9efc907aed3 100644 --- a/src/AggregateFunctions/AggregateFunctionArgMinMax.h +++ b/src/AggregateFunctions/AggregateFunctionArgMinMax.h @@ -39,6 +39,8 @@ class AggregateFunctionArgMinMax final : public IAggregateFunctionTupleArgHelper private: const DataTypePtr & type_res; const DataTypePtr & type_val; + const SerializationPtr serialization_res; + const SerializationPtr serialization_val; bool tuple_argument; using Base = IAggregateFunctionTupleArgHelper, 2>; @@ -48,6 +50,8 @@ public: : Base({type_res_, type_val_}, {}, tuple_argument_) , 
type_res(this->argument_types[0]) , type_val(this->argument_types[1]) + , serialization_res(type_res->getDefaultSerialization()) + , serialization_val(type_val->getDefaultSerialization()) { if (!type_val->isComparable()) throw Exception( @@ -84,14 +88,14 @@ public: void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override { - this->data(place).result.write(buf, *type_res); - this->data(place).value.write(buf, *type_val); + this->data(place).result.write(buf, *serialization_res); + this->data(place).value.write(buf, *serialization_val); } void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const override { - this->data(place).result.read(buf, *type_res, arena); - this->data(place).value.read(buf, *type_val, arena); + this->data(place).result.read(buf, *serialization_res, arena); + this->data(place).value.read(buf, *serialization_val, arena); } bool allocatesMemoryInArena() const override { return Data::allocatesMemoryInArena(); } diff --git a/src/AggregateFunctions/AggregateFunctionAvg.h b/src/AggregateFunctions/AggregateFunctionAvg.h index 7bf742294b4..8a6491d9b61 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.h +++ b/src/AggregateFunctions/AggregateFunctionAvg.h @@ -96,7 +96,9 @@ public: UInt32 num_scale_ = 0, UInt32 denom_scale_ = 0) : Base(argument_types_, {}), num_scale(num_scale_), denom_scale(denom_scale_) {} - DataTypePtr getReturnType() const final { return std::make_shared>(); } + DataTypePtr getReturnType() const override { return std::make_shared>(); } + + bool allocatesMemoryInArena() const override { return false; } void NO_SANITIZE_UNDEFINED merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override { diff --git a/src/AggregateFunctions/AggregateFunctionBitwise.h b/src/AggregateFunctions/AggregateFunctionBitwise.h index 3ba8e045069..b48b1960329 100644 --- a/src/AggregateFunctions/AggregateFunctionBitwise.h +++ b/src/AggregateFunctions/AggregateFunctionBitwise.h @@ -54,6 +54,8 @@ public: return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).update(assert_cast &>(*columns[0]).getData()[row_num]); diff --git a/src/AggregateFunctions/AggregateFunctionBoundingRatio.h b/src/AggregateFunctions/AggregateFunctionBoundingRatio.h index 32ae22fd573..0cfb33efc10 100644 --- a/src/AggregateFunctions/AggregateFunctionBoundingRatio.h +++ b/src/AggregateFunctions/AggregateFunctionBoundingRatio.h @@ -127,6 +127,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override { /// NOTE Slightly inefficient. 
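Two mechanical changes repeat across the aggregate-function headers in this file and the following ones: serialize()/deserialize() now go through an ISerialization object resolved once in the constructor via getDefaultSerialization() instead of asking the data type on every call, and allocatesMemoryInArena() is spelled out explicitly because it becomes pure virtual in IAggregateFunction.h further down. A condensed sketch of the first pattern, using simplified stand-in types rather than the real ClickHouse interfaces:

#include <iostream>
#include <memory>
#include <string>

/// Simplified stand-ins for ISerialization and IDataType.
struct Serialization
{
    void serializeBinary(const std::string & value, std::ostream & out) const { out << value.size() << ':' << value; }
};

struct DataType
{
    std::shared_ptr<const Serialization> getDefaultSerialization() const { return std::make_shared<Serialization>(); }
};

class SingleValueFunction
{
public:
    explicit SingleValueFunction(std::shared_ptr<const DataType> type_)
        : type(std::move(type_))
        , serialization(type->getDefaultSerialization())   /// resolved once here, reused below
    {
    }

    void serialize(const std::string & state, std::ostream & out) const
    {
        serialization->serializeBinary(state, out);         /// no per-call lookup through the data type
    }

private:
    std::shared_ptr<const DataType> type;
    std::shared_ptr<const Serialization> serialization;
};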
diff --git a/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.h b/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.h index ba8acb208ea..fb3e35fbcf1 100644 --- a/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.h +++ b/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.h @@ -33,6 +33,8 @@ public: return "categoricalInformationValue"; } + bool allocatesMemoryInArena() const override { return false; } + void create(AggregateDataPtr __restrict place) const override { memset(place, 0, sizeOfData()); diff --git a/src/AggregateFunctions/AggregateFunctionCombinatorFactory.cpp b/src/AggregateFunctions/AggregateFunctionCombinatorFactory.cpp index a20d355bb2f..e4ff8c134c5 100644 --- a/src/AggregateFunctions/AggregateFunctionCombinatorFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionCombinatorFactory.cpp @@ -13,17 +13,25 @@ namespace ErrorCodes void AggregateFunctionCombinatorFactory::registerCombinator(const AggregateFunctionCombinatorPtr & value) { - if (!dict.emplace(value->getName(), value).second) - throw Exception("AggregateFunctionCombinatorFactory: the name '" + value->getName() + "' is not unique", - ErrorCodes::LOGICAL_ERROR); + CombinatorPair pair{ + .name = value->getName(), + .combinator_ptr = value, + }; + + /// lower_bound() cannot be used since sort order of the dict is by length of the combinator + /// but there are just a few combiners, so not a problem. + if (std::find(dict.begin(), dict.end(), pair) != dict.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionCombinatorFactory: the name '{}' is not unique", + value->getName()); + dict.emplace(std::lower_bound(dict.begin(), dict.end(), pair), pair); } AggregateFunctionCombinatorPtr AggregateFunctionCombinatorFactory::tryFindSuffix(const std::string & name) const { /// O(N) is ok for just a few combinators. for (const auto & suffix_value : dict) - if (endsWith(name, suffix_value.first)) - return suffix_value.second; + if (endsWith(name, suffix_value.name)) + return suffix_value.combinator_ptr; return {}; } diff --git a/src/AggregateFunctions/AggregateFunctionCombinatorFactory.h b/src/AggregateFunctions/AggregateFunctionCombinatorFactory.h index b535475d111..5f7658c16af 100644 --- a/src/AggregateFunctions/AggregateFunctionCombinatorFactory.h +++ b/src/AggregateFunctions/AggregateFunctionCombinatorFactory.h @@ -15,7 +15,17 @@ namespace DB class AggregateFunctionCombinatorFactory final: private boost::noncopyable { private: - using Dict = std::unordered_map; + struct CombinatorPair + { + std::string name; + AggregateFunctionCombinatorPtr combinator_ptr; + + bool operator==(const CombinatorPair & rhs) const { return name == rhs.name; } + /// Sort by the length of the combinator name for proper tryFindSuffix() + /// for combiners with common prefix (i.e. "State" and "SimpleState"). 
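To make that ordering concrete: because the dictionary is now a vector sorted by descending name length, a lookup such as tryFindSuffix("sumSimpleState") tests "SimpleState" before the shorter "State" and resolves to the intended combinator. A tiny self-contained model of the lookup follows; the names and structure are illustrative only, not the actual factory code.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

/// A suffix dictionary kept sorted by descending name length,
/// so the longest matching combinator wins ("SimpleState" before "State").
struct Combinator { std::string name; };

bool endsWith(const std::string & s, const std::string & suffix)
{
    return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

const Combinator * tryFindSuffix(const std::vector<Combinator> & dict, const std::string & name)
{
    /// O(N) scan is fine: there are only a handful of combinators.
    for (const auto & c : dict)
        if (endsWith(name, c.name))
            return &c;
    return nullptr;
}

int main()
{
    std::vector<Combinator> dict{{"State"}, {"SimpleState"}, {"If"}};
    std::sort(dict.begin(), dict.end(),
              [](const Combinator & a, const Combinator & b) { return a.name.size() > b.name.size(); });

    /// "sumSimpleState" must resolve to -SimpleState, not to the shorter -State suffix.
    assert(tryFindSuffix(dict, "sumSimpleState")->name == "SimpleState");
    return 0;
}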
+ bool operator<(const CombinatorPair & rhs) const { return name.length() > rhs.name.length(); } + }; + using Dict = std::vector; Dict dict; public: diff --git a/src/AggregateFunctions/AggregateFunctionCount.h b/src/AggregateFunctions/AggregateFunctionCount.h index 1b3a0acb528..71db28390af 100644 --- a/src/AggregateFunctions/AggregateFunctionCount.h +++ b/src/AggregateFunctions/AggregateFunctionCount.h @@ -38,6 +38,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn **, size_t, Arena *) const override { ++data(place).count; @@ -126,6 +128,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { data(place).count += !assert_cast(*columns[0]).isNullAt(row_num); diff --git a/src/AggregateFunctions/AggregateFunctionDeltaSum.h b/src/AggregateFunctions/AggregateFunctionDeltaSum.h index d5760de84ae..99406618da5 100644 --- a/src/AggregateFunctions/AggregateFunctionDeltaSum.h +++ b/src/AggregateFunctions/AggregateFunctionDeltaSum.h @@ -43,6 +43,8 @@ public: DataTypePtr getReturnType() const override { return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { auto value = assert_cast &>(*columns[0]).getData()[row_num]; diff --git a/src/AggregateFunctions/AggregateFunctionEntropy.h b/src/AggregateFunctions/AggregateFunctionEntropy.h index 9bb1bc039c5..587f5aa3fc7 100644 --- a/src/AggregateFunctions/AggregateFunctionEntropy.h +++ b/src/AggregateFunctions/AggregateFunctionEntropy.h @@ -103,6 +103,8 @@ public: return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { if constexpr (!std::is_same_v) diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 061077dd8fa..eec8b374424 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -106,6 +106,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( bool has_null_arguments) const { String name = getAliasToOrName(name_param); + bool is_case_insensitive = false; Value found; /// Find by exact match. @@ -115,9 +116,12 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( } if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end()) + { found = jt->second; + is_case_insensitive = true; + } - const Context * query_context = nullptr; + ContextPtr query_context; if (CurrentThread::isInitialized()) query_context = CurrentThread::get().getQueryContext(); @@ -126,7 +130,8 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( out_properties = found.properties; if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::AggregateFunction, name); + query_context->addQueryFactoriesInfo( + Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? Poco::toLower(name) : name); /// The case when aggregate function should return NULL on NULL arguments. 
This case is handled in "get" method. if (!out_properties.returns_default_when_only_null && has_null_arguments) diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h index 42005659a36..723ee7140bc 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h @@ -55,7 +55,8 @@ class AggregateFunctionGroupArrayInsertAtGeneric final : public IAggregateFunctionDataHelper { private: - DataTypePtr & type; + DataTypePtr type; + SerializationPtr serialization; Field default_value; UInt64 length_to_resize = 0; /// zero means - do not do resizing. @@ -63,6 +64,7 @@ public: AggregateFunctionGroupArrayInsertAtGeneric(const DataTypes & arguments, const Array & params) : IAggregateFunctionDataHelper(arguments, params) , type(argument_types[0]) + , serialization(type->getDefaultSerialization()) { if (!params.empty()) { @@ -102,6 +104,8 @@ public: return std::make_shared(type); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { /// TODO Do positions need to be 1-based for this function? @@ -154,7 +158,7 @@ public: else { writeBinary(UInt8(0), buf); - type->serializeBinary(elem, buf); + serialization->serializeBinary(elem, buf); } } } @@ -175,7 +179,7 @@ public: UInt8 is_null = 0; readBinary(is_null, buf); if (!is_null) - type->deserializeBinary(arr[i], buf); + serialization->deserializeBinary(arr[i], buf); } } diff --git a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h index 4628410286d..83db274752b 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h @@ -22,6 +22,8 @@ public: DataTypePtr getReturnType() const override { return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).rbs.add(assert_cast &>(*columns[0]).getData()[row_num]); @@ -56,6 +58,8 @@ public: DataTypePtr getReturnType() const override { return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { Data & data_lhs = this->data(place); diff --git a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index 435efdd2373..16911184f31 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -59,6 +59,8 @@ public: return std::make_shared(this->argument_types[0]); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { if (limit_num_elems && this->data(place).value.size() >= max_elems) diff --git a/src/AggregateFunctions/AggregateFunctionHistogram.h b/src/AggregateFunctions/AggregateFunctionHistogram.h index 76aa96ba663..c44cb61b275 100644 --- a/src/AggregateFunctions/AggregateFunctionHistogram.h +++ b/src/AggregateFunctions/AggregateFunctionHistogram.h @@ -332,6 +332,8 @@ public: return std::make_shared(tuple); } + bool allocatesMemoryInArena() const 
override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { auto val = assert_cast &>(*columns[0]).getData()[row_num]; diff --git a/src/AggregateFunctions/AggregateFunctionMLMethod.cpp b/src/AggregateFunctions/AggregateFunctionMLMethod.cpp index 6c5c5af2f1d..145660e881b 100644 --- a/src/AggregateFunctions/AggregateFunctionMLMethod.cpp +++ b/src/AggregateFunctions/AggregateFunctionMLMethod.cpp @@ -146,7 +146,7 @@ void LinearModelData::predict( const ColumnsWithTypeAndName & arguments, size_t offset, size_t limit, - const Context & context) const + ContextPtr context) const { gradient_computer->predict(container, arguments, offset, limit, weights, bias, context); } @@ -453,7 +453,7 @@ void LogisticRegression::predict( size_t limit, const std::vector & weights, Float64 bias, - const Context & /*context*/) const + ContextPtr /*context*/) const { size_t rows_num = arguments.front().column->size(); @@ -521,7 +521,7 @@ void LinearRegression::predict( size_t limit, const std::vector & weights, Float64 bias, - const Context & /*context*/) const + ContextPtr /*context*/) const { if (weights.size() + 1 != arguments.size()) { diff --git a/src/AggregateFunctions/AggregateFunctionMLMethod.h b/src/AggregateFunctions/AggregateFunctionMLMethod.h index 0c88f9d877d..6d97feb3193 100644 --- a/src/AggregateFunctions/AggregateFunctionMLMethod.h +++ b/src/AggregateFunctions/AggregateFunctionMLMethod.h @@ -3,10 +3,10 @@ #include #include #include -#include -#include -#include #include +#include +#include +#include #include "IAggregateFunction.h" namespace DB @@ -44,7 +44,7 @@ public: size_t limit, const std::vector & weights, Float64 bias, - const Context & context) const = 0; + ContextPtr context) const = 0; }; @@ -69,7 +69,7 @@ public: size_t limit, const std::vector & weights, Float64 bias, - const Context & context) const override; + ContextPtr context) const override; }; @@ -94,7 +94,7 @@ public: size_t limit, const std::vector & weights, Float64 bias, - const Context & context) const override; + ContextPtr context) const override; }; @@ -264,7 +264,7 @@ public: const ColumnsWithTypeAndName & arguments, size_t offset, size_t limit, - const Context & context) const; + ContextPtr context) const; void returnWeights(IColumn & to) const; private: @@ -323,6 +323,8 @@ public: return std::make_shared(std::make_shared()); } + bool allocatesMemoryInArena() const override { return false; } + /// This function is called from evalMLMethod function for correct predictValues call DataTypePtr getReturnTypeToPredict() const override { @@ -363,7 +365,7 @@ public: const ColumnsWithTypeAndName & arguments, size_t offset, size_t limit, - const Context & context) const override + ContextPtr context) const override { if (arguments.size() != param_num + 1) throw Exception( diff --git a/src/AggregateFunctions/AggregateFunctionMannWhitney.h b/src/AggregateFunctions/AggregateFunctionMannWhitney.h index 1451536d519..7efdebf1639 100644 --- a/src/AggregateFunctions/AggregateFunctionMannWhitney.h +++ b/src/AggregateFunctions/AggregateFunctionMannWhitney.h @@ -174,6 +174,8 @@ public: return "mannWhitneyUTest"; } + bool allocatesMemoryInArena() const override { return true; } + DataTypePtr getReturnType() const override { DataTypes types @@ -208,7 +210,7 @@ public: void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override { auto & a = this->data(place); - auto & b = this->data(rhs); + const auto & b = this->data(rhs); 
a.merge(b, arena); } diff --git a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h index d4946ad2c9d..94509a40ada 100644 --- a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h +++ b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h @@ -87,6 +87,8 @@ public: return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override { PointType left = assert_cast &>(*columns[0]).getData()[row_num]; diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index a39d9af000b..919026a78c1 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -50,14 +50,14 @@ public: assert_cast(to).insertDefault(); } - void write(WriteBuffer & buf, const IDataType & /*data_type*/) const + void write(WriteBuffer & buf, const ISerialization & /*serialization*/) const { writeBinary(has(), buf); if (has()) writeBinary(value, buf); } - void read(ReadBuffer & buf, const IDataType & /*data_type*/, Arena *) + void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena *) { readBinary(has_value, buf); if (has()) @@ -221,14 +221,14 @@ public: assert_cast(to).insertDefault(); } - void write(WriteBuffer & buf, const IDataType & /*data_type*/) const + void write(WriteBuffer & buf, const ISerialization & /*serialization*/) const { writeBinary(size, buf); if (has()) buf.write(getData(), size); } - void read(ReadBuffer & buf, const IDataType & /*data_type*/, Arena * arena) + void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena * arena) { Int32 rhs_size; readBinary(rhs_size, buf); @@ -427,24 +427,24 @@ public: to.insertDefault(); } - void write(WriteBuffer & buf, const IDataType & data_type) const + void write(WriteBuffer & buf, const ISerialization & serialization) const { if (!value.isNull()) { writeBinary(true, buf); - data_type.serializeBinary(value, buf); + serialization.serializeBinary(value, buf); } else writeBinary(false, buf); } - void read(ReadBuffer & buf, const IDataType & data_type, Arena *) + void read(ReadBuffer & buf, const ISerialization & serialization, Arena *) { bool is_not_null; readBinary(is_not_null, buf); if (is_not_null) - data_type.deserializeBinary(value, buf); + serialization.deserializeBinary(value, buf); } void change(const IColumn & column, size_t row_num, Arena *) @@ -678,15 +678,15 @@ struct AggregateFunctionAnyHeavyData : Data return false; } - void write(WriteBuffer & buf, const IDataType & data_type) const + void write(WriteBuffer & buf, const ISerialization & serialization) const { - Data::write(buf, data_type); + Data::write(buf, serialization); writeBinary(counter, buf); } - void read(ReadBuffer & buf, const IDataType & data_type, Arena * arena) + void read(ReadBuffer & buf, const ISerialization & serialization, Arena * arena) { - Data::read(buf, data_type, arena); + Data::read(buf, serialization, arena); readBinary(counter, buf); } @@ -698,12 +698,14 @@ template class AggregateFunctionsSingleValue final : public IAggregateFunctionDataHelper> { private: - DataTypePtr & type; + DataTypePtr type; + SerializationPtr serialization; public: AggregateFunctionsSingleValue(const DataTypePtr & type_) : IAggregateFunctionDataHelper>({type_}, {}) , type(this->argument_types[0]) + , 
serialization(type->getDefaultSerialization()) { if (StringRef(Data::name()) == StringRef("min") || StringRef(Data::name()) == StringRef("max")) @@ -733,12 +735,12 @@ public: void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override { - this->data(place).write(buf, *type.get()); + this->data(place).write(buf, *serialization); } void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const override { - this->data(place).read(buf, *type.get(), arena); + this->data(place).read(buf, *serialization, arena); } bool allocatesMemoryInArena() const override diff --git a/src/AggregateFunctions/AggregateFunctionNothing.h b/src/AggregateFunctions/AggregateFunctionNothing.h index f373b3b55b0..a094c1e0fac 100644 --- a/src/AggregateFunctions/AggregateFunctionNothing.h +++ b/src/AggregateFunctions/AggregateFunctionNothing.h @@ -28,6 +28,8 @@ public: return argument_types.front(); } + bool allocatesMemoryInArena() const override { return false; } + void create(AggregateDataPtr) const override { } diff --git a/src/AggregateFunctions/AggregateFunctionOrFill.h b/src/AggregateFunctions/AggregateFunctionOrFill.h index 93fe84a036a..4bb25e0d4de 100644 --- a/src/AggregateFunctions/AggregateFunctionOrFill.h +++ b/src/AggregateFunctions/AggregateFunctionOrFill.h @@ -110,7 +110,7 @@ public: const auto & flags = assert_cast(*columns[if_argument_pos]).getData(); for (size_t i = 0; i < batch_size; ++i) { - if (flags[i]) + if (flags[i] && places[i]) add(places[i] + place_offset, columns, i, arena); } } @@ -118,7 +118,8 @@ public: { nested_function->addBatch(batch_size, places, place_offset, columns, arena, if_argument_pos); for (size_t i = 0; i < batch_size; ++i) - (places[i] + place_offset)[size_of_data] = 1; + if (places[i]) + (places[i] + place_offset)[size_of_data] = 1; } } diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.h b/src/AggregateFunctions/AggregateFunctionQuantile.h index edd24add736..209784361dd 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantile.h +++ b/src/AggregateFunctions/AggregateFunctionQuantile.h @@ -103,6 +103,8 @@ public: return res; } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { auto value = static_cast(*columns[0]).getData()[row_num]; diff --git a/src/AggregateFunctions/AggregateFunctionRankCorrelation.h b/src/AggregateFunctions/AggregateFunctionRankCorrelation.h index a7e0852378c..fbba1b99ab9 100644 --- a/src/AggregateFunctions/AggregateFunctionRankCorrelation.h +++ b/src/AggregateFunctions/AggregateFunctionRankCorrelation.h @@ -58,6 +58,8 @@ public: return "rankCorr"; } + bool allocatesMemoryInArena() const override { return true; } + DataTypePtr getReturnType() const override { return std::make_shared>(); diff --git a/src/AggregateFunctions/AggregateFunctionRetention.h b/src/AggregateFunctions/AggregateFunctionRetention.h index 5f0d9907280..aee74cb7324 100644 --- a/src/AggregateFunctions/AggregateFunctionRetention.h +++ b/src/AggregateFunctions/AggregateFunctionRetention.h @@ -94,6 +94,8 @@ public: return std::make_shared(std::make_shared()); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override { for (const auto i : ext::range(0, events_size)) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h 
b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 48015a6d282..2c2a4d4c75a 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -560,6 +560,8 @@ public: DataTypePtr getReturnType() const override { return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { this->data(place).sort(); @@ -588,6 +590,8 @@ public: DataTypePtr getReturnType() const override { return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { this->data(place).sort(); diff --git a/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.h b/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.h index 8c029855a26..61f10895de6 100644 --- a/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.h +++ b/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.h @@ -168,6 +168,8 @@ public: ); } + bool allocatesMemoryInArena() const override { return false; } + void insertResultInto( AggregateDataPtr place, IColumn & to, diff --git a/src/AggregateFunctions/AggregateFunctionStatistics.h b/src/AggregateFunctions/AggregateFunctionStatistics.h index 76b6e843c15..473bce1d89a 100644 --- a/src/AggregateFunctions/AggregateFunctionStatistics.h +++ b/src/AggregateFunctions/AggregateFunctionStatistics.h @@ -123,6 +123,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).update(*columns[0], row_num); @@ -375,6 +377,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).update(*columns[0], *columns[1], row_num); diff --git a/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h b/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h index 9903e2f6eaa..a5d6bbb7dc8 100644 --- a/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h +++ b/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h @@ -121,6 +121,8 @@ public: return std::make_shared>(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { if constexpr (StatFunc::num_args == 2) diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index bd1f9fc302e..1748458f6d8 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -314,6 +314,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { const auto & column = assert_cast(*columns[0]); diff --git a/src/AggregateFunctions/AggregateFunctionSumCount.cpp b/src/AggregateFunctions/AggregateFunctionSumCount.cpp new file mode 100644 index 00000000000..b979779d907 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionSumCount.cpp @@ -0,0 +1,49 @@ +#include +#include +#include 
+#include +#include "registerAggregateFunctions.h" + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ +bool allowType(const DataTypePtr& type) noexcept +{ + const WhichDataType t(type); + return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal(); +} + +AggregateFunctionPtr createAggregateFunctionSumCount(const std::string & name, const DataTypes & argument_types, const Array & parameters) +{ + assertNoParameters(name, parameters); + assertUnary(name, argument_types); + + AggregateFunctionPtr res; + DataTypePtr data_type = argument_types[0]; + if (!allowType(data_type)) + throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name, + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (isDecimal(data_type)) + res.reset(createWithDecimalType( + *data_type, argument_types, getDecimalScale(*data_type))); + else + res.reset(createWithNumericType(*data_type, argument_types)); + + return res; +} + +} + +void registerAggregateFunctionSumCount(AggregateFunctionFactory & factory) +{ + factory.registerFunction("sumCount", createAggregateFunctionSumCount); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionSumCount.h b/src/AggregateFunctions/AggregateFunctionSumCount.h new file mode 100644 index 00000000000..1026b6272ba --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionSumCount.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ +template +using DecimalOrNumberDataType = std::conditional_t, DataTypeDecimal>, DataTypeNumber>>; +template +class AggregateFunctionSumCount final : public AggregateFunctionAvgBase, UInt64, AggregateFunctionSumCount> +{ +public: + using Base = AggregateFunctionAvgBase, UInt64, AggregateFunctionSumCount>; + + AggregateFunctionSumCount(const DataTypes & argument_types_, UInt32 num_scale_ = 0) + : Base(argument_types_, num_scale_), scale(num_scale_) {} + + DataTypePtr getReturnType() const override + { + DataTypes types; + if constexpr (IsDecimalNumber) + types.emplace_back(std::make_shared>(DecimalOrNumberDataType::maxPrecision(), scale)); + else + types.emplace_back(std::make_shared>()); + + types.emplace_back(std::make_shared()); + + return std::make_shared(types); + } + + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const final + { + assert_cast> &>((assert_cast(to)).getColumn(0)).getData().push_back( + this->data(place).numerator); + + assert_cast((assert_cast(to)).getColumn(1)).getData().push_back( + this->data(place).denominator); + } + + void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final + { + this->data(place).numerator += static_cast &>(*columns[0]).getData()[row_num]; + ++this->data(place).denominator; + } + + String getName() const final { return "sumCount"; } + +private: + UInt32 scale; +}; + +} diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index f6a473546f9..7819bb8752c 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -64,7 +64,9 @@ class AggregateFunctionMapBase : public IAggregateFunctionDataHelper< { private: DataTypePtr keys_type; + SerializationPtr keys_serialization; DataTypes values_types; + Serializations values_serializations; public: using Base = IAggregateFunctionDataHelper< @@ -72,9 +74,14 @@ public: AggregateFunctionMapBase(const DataTypePtr & keys_type_, 
const DataTypes & values_types_, const DataTypes & argument_types_) - : Base(argument_types_, {} /* parameters */), keys_type(keys_type_), - values_types(values_types_) + : Base(argument_types_, {} /* parameters */) + , keys_type(keys_type_) + , keys_serialization(keys_type->getDefaultSerialization()) + , values_types(values_types_) { + values_serializations.reserve(values_types.size()); + for (const auto & type : values_types) + values_serializations.emplace_back(type->getDefaultSerialization()); } DataTypePtr getReturnType() const override @@ -118,6 +125,8 @@ public: WhichDataType value_type_to_check(value_type); /// Do not promote decimal because of implementation issues of this function design + /// Currently we cannot get result column type in case of decimal we cannot get decimal scale + /// in method void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override /// If we decide to make this function more efficient we should promote decimal type during summ if (value_type_to_check.isDecimal()) result_type = value_type_without_nullable; @@ -131,6 +140,8 @@ public: return std::make_shared(types); } + bool allocatesMemoryInArena() const override { return false; } + static const auto & getArgumentColumns(const IColumn**& columns) { if constexpr (tuple_argument) @@ -246,9 +257,9 @@ public: for (const auto & elem : merged_maps) { - keys_type->serializeBinary(elem.first, buf); + keys_serialization->serializeBinary(elem.first, buf); for (size_t col = 0; col < values_types.size(); ++col) - values_types[col]->serializeBinary(elem.second[col], buf); + values_serializations[col]->serializeBinary(elem.second[col], buf); } } @@ -261,12 +272,12 @@ public: for (size_t i = 0; i < size; ++i) { Field key; - keys_type->deserializeBinary(key, buf); + keys_serialization->deserializeBinary(key, buf); Array values; values.resize(values_types.size()); for (size_t col = 0; col < values_types.size(); ++col) - values_types[col]->deserializeBinary(values[col], buf); + values_serializations[col]->deserializeBinary(values[col], buf); if constexpr (IsDecimalNumber) merged_maps[key.get>()] = values; @@ -411,7 +422,7 @@ public: for (const Field & f : keys_to_keep_) { - keys_to_keep.emplace(f.safeGet>()); + keys_to_keep.emplace(f.safeGet()); } } diff --git a/src/AggregateFunctions/AggregateFunctionTTest.h b/src/AggregateFunctions/AggregateFunctionTTest.h index 3c9873ebd1e..5617adf38dd 100644 --- a/src/AggregateFunctions/AggregateFunctionTTest.h +++ b/src/AggregateFunctions/AggregateFunctionTTest.h @@ -109,6 +109,8 @@ public: ); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { Float64 value = columns[0]->getFloat64(row_num); diff --git a/src/AggregateFunctions/AggregateFunctionTopK.h b/src/AggregateFunctions/AggregateFunctionTopK.h index 43320a96b99..dbc471687ea 100644 --- a/src/AggregateFunctions/AggregateFunctionTopK.h +++ b/src/AggregateFunctions/AggregateFunctionTopK.h @@ -50,6 +50,8 @@ public: return std::make_shared(this->argument_types[0]); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { auto & set = this->data(place).value; diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h index 4e27922ba7c..253af6e2895 100644 --- 
a/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/src/AggregateFunctions/AggregateFunctionUniq.h @@ -210,6 +210,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + /// ALWAYS_INLINE is required to have better code layout for uniqHLL12 function void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { @@ -265,6 +267,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).set.insert(typename Data::Set::value_type( diff --git a/src/AggregateFunctions/AggregateFunctionUniqCombined.h b/src/AggregateFunctions/AggregateFunctionUniqCombined.h index c9327594670..31bd8021dcf 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqCombined.h +++ b/src/AggregateFunctions/AggregateFunctionUniqCombined.h @@ -141,6 +141,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { if constexpr (!std::is_same_v) @@ -211,6 +213,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).set.insert(typename AggregateFunctionUniqCombinedData::Set::value_type( diff --git a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h index d2c765137bc..853e1cb6447 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h +++ b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h @@ -17,7 +17,7 @@ #include -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif @@ -184,6 +184,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + /// ALWAYS_INLINE is required to have better code layout for uniqUpTo function void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { @@ -247,6 +249,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { this->data(place).insert(UInt64(UniqVariadicHash::apply(num_args, columns, row_num)), threshold); @@ -276,7 +280,7 @@ public: } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/AggregateFunctions/AggregateFunctionWindowFunnel.cpp b/src/AggregateFunctions/AggregateFunctionWindowFunnel.cpp index 1e9f2782d95..ed732a197a1 100644 --- a/src/AggregateFunctions/AggregateFunctionWindowFunnel.cpp +++ b/src/AggregateFunctions/AggregateFunctionWindowFunnel.cpp @@ -6,7 +6,6 @@ #include #include -#include "registerAggregateFunctions.h" namespace DB diff --git a/src/AggregateFunctions/AggregateFunctionWindowFunnel.h b/src/AggregateFunctions/AggregateFunctionWindowFunnel.h index c765024507e..9f399a9f25b 100644 --- a/src/AggregateFunctions/AggregateFunctionWindowFunnel.h +++ b/src/AggregateFunctions/AggregateFunctionWindowFunnel.h @@ -19,22 +19,13 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -struct 
ComparePair final -{ - template - bool operator()(const std::pair & lhs, const std::pair & rhs) const - { - return lhs.first == rhs.first ? lhs.second < rhs.second : lhs.first < rhs.first; - } -}; - static constexpr auto max_events = 32; + template struct AggregateFunctionWindowFunnelData { using TimestampEvent = std::pair; using TimestampEvents = PODArrayWithStackMemory; - using Comparator = ComparePair; bool sorted = true; TimestampEvents events_list; @@ -46,7 +37,7 @@ struct AggregateFunctionWindowFunnelData void add(T timestamp, UInt8 event) { - // Since most events should have already been sorted by timestamp. + /// Since most events should have already been sorted by timestamp. if (sorted && events_list.size() > 0) { if (events_list.back().first == timestamp) @@ -68,7 +59,7 @@ struct AggregateFunctionWindowFunnelData /// either sort whole container or do so partially merging ranges afterwards if (!sorted && !other.sorted) - std::stable_sort(std::begin(events_list), std::end(events_list), Comparator{}); + std::stable_sort(std::begin(events_list), std::end(events_list)); else { const auto begin = std::begin(events_list); @@ -76,12 +67,12 @@ struct AggregateFunctionWindowFunnelData const auto end = std::end(events_list); if (!sorted) - std::stable_sort(begin, middle, Comparator{}); + std::stable_sort(begin, middle); if (!other.sorted) - std::stable_sort(middle, end, Comparator{}); + std::stable_sort(middle, end); - std::inplace_merge(begin, middle, end, Comparator{}); + std::inplace_merge(begin, middle, end); } sorted = true; @@ -91,7 +82,7 @@ struct AggregateFunctionWindowFunnelData { if (!sorted) { - std::stable_sort(std::begin(events_list), std::end(events_list), Comparator{}); + std::stable_sort(std::begin(events_list), std::end(events_list)); sorted = true; } } @@ -145,14 +136,20 @@ class AggregateFunctionWindowFunnel final private: UInt64 window; UInt8 events_size; - UInt8 strict; // When the 'strict' is set, it applies conditions only for the not repeating values. - UInt8 strict_order; // When the 'strict_order' is set, it doesn't allow interventions of other events. - // In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2. + /// When the 'strict' is set, it applies conditions only for the not repeating values. + bool strict; - // Loop through the entire events_list, update the event timestamp value - // The level path must be 1---2---3---...---check_events_size, find the max event level that satisfied the path in the sliding window. - // If found, returns the max event level, else return 0. - // The Algorithm complexity is O(n). + /// When the 'strict_order' is set, it doesn't allow interventions of other events. + /// In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2. + bool strict_order; + + /// Applies conditions only to events with strictly increasing timestamps + bool strict_increase; + + /// Loop through the entire events_list, update the event timestamp value + /// The level path must be 1---2---3---...---check_events_size, find the max event level that satisfied the path in the sliding window. + /// If found, returns the max event level, else return 0. + /// The Algorithm complexity is O(n). UInt8 getEventLevel(Data & data) const { if (data.size() == 0) @@ -162,16 +159,13 @@ private: data.sort(); - /// events_timestamp stores the timestamp that latest i-th level event happen within time window after previous level event. 
- /// timestamp defaults to -1, which unsigned timestamp value never meet - /// there may be some bugs when UInt64 type timstamp overflows Int64, but it works on most cases. - std::vector events_timestamp(events_size, -1); + /// events_timestamp stores the timestamp of the first and previous i-th level event happen within time window + std::vector>> events_timestamp(events_size); bool first_event = false; for (const auto & pair : data.events_list) { const T & timestamp = pair.first; const auto & event_idx = pair.second - 1; - if (strict_order && event_idx == -1) { if (first_event) @@ -181,31 +175,39 @@ private: } else if (event_idx == 0) { - events_timestamp[0] = timestamp; + events_timestamp[0] = std::make_pair(timestamp, timestamp); first_event = true; } - else if (strict && events_timestamp[event_idx] >= 0) + else if (strict && events_timestamp[event_idx].has_value()) { return event_idx + 1; } - else if (strict_order && first_event && events_timestamp[event_idx - 1] == -1) + else if (strict_order && first_event && !events_timestamp[event_idx - 1].has_value()) { for (size_t event = 0; event < events_timestamp.size(); ++event) { - if (events_timestamp[event] == -1) + if (!events_timestamp[event].has_value()) return event; } } - else if (events_timestamp[event_idx - 1] >= 0 && timestamp <= events_timestamp[event_idx - 1] + window) + else if (events_timestamp[event_idx - 1].has_value()) { - events_timestamp[event_idx] = events_timestamp[event_idx - 1]; - if (event_idx + 1 == events_size) - return events_size; + auto first_timestamp = events_timestamp[event_idx - 1]->first; + bool time_matched = timestamp <= first_timestamp + window; + if (strict_increase) + time_matched = time_matched && events_timestamp[event_idx - 1]->second < timestamp; + if (time_matched) + { + events_timestamp[event_idx] = std::make_pair(first_timestamp, timestamp); + if (event_idx + 1 == events_size) + return events_size; + } } } + for (size_t event = events_timestamp.size(); event > 0; --event) { - if (events_timestamp[event - 1] >= 0) + if (events_timestamp[event - 1].has_value()) return event; } return 0; @@ -223,15 +225,18 @@ public: events_size = arguments.size() - 1; window = params.at(0).safeGet(); - strict = 0; - strict_order = 0; + strict = false; + strict_order = false; + strict_increase = false; for (size_t i = 1; i < params.size(); ++i) { String option = params.at(i).safeGet(); - if (option.compare("strict") == 0) - strict = 1; - else if (option.compare("strict_order") == 0) - strict_order = 1; + if (option == "strict") + strict = true; + else if (option == "strict_order") + strict_order = true; + else if (option == "strict_increase") + strict_increase = true; else throw Exception{"Aggregate function " + getName() + " doesn't support a parameter: " + option, ErrorCodes::BAD_ARGUMENTS}; } @@ -242,6 +247,8 @@ public: return std::make_shared(); } + bool allocatesMemoryInArena() const override { return false; } + AggregateFunctionPtr getOwnNullAdapter( const AggregateFunctionPtr & nested_function, const DataTypes & arguments, const Array & params, const AggregateFunctionProperties & /*properties*/) const override @@ -253,7 +260,7 @@ public: { bool has_event = false; const auto timestamp = assert_cast *>(columns[0])->getData()[row_num]; - // reverse iteration and stable sorting are needed for events that are qualified by more than one condition. + /// reverse iteration and stable sorting are needed for events that are qualified by more than one condition. 
for (auto i = events_size; i > 0; --i) { auto event = assert_cast *>(columns[i])->getData()[row_num]; diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index d15ff4e8a78..dbb0a9f4349 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -1,21 +1,23 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include + #include #include #include #include -#include -#include -#include -#include -#include -#include -#include - namespace DB { + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; @@ -104,7 +106,7 @@ public: virtual void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena * arena) const = 0; /// Returns true if a function requires Arena to handle own states (see add(), merge(), deserialize()). - virtual bool allocatesMemoryInArena() const { return false; } + virtual bool allocatesMemoryInArena() const = 0; /// Inserts results into a column. This method might modify the state (e.g. /// sort an array), so must be called once, from single thread. The state @@ -122,7 +124,7 @@ public: const ColumnsWithTypeAndName & /*arguments*/, size_t /*offset*/, size_t /*limit*/, - const Context & /*context*/) const + ContextPtr /*context*/) const { throw Exception("Method predictValues is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } @@ -267,14 +269,15 @@ public: const auto & flags = assert_cast(*columns[if_argument_pos]).getData(); for (size_t i = 0; i < batch_size; ++i) { - if (flags[i]) + if (flags[i] && places[i]) static_cast(this)->add(places[i] + place_offset, columns, i, arena); } } else { for (size_t i = 0; i < batch_size; ++i) - static_cast(this)->add(places[i] + place_offset, columns, i, arena); + if (places[i]) + static_cast(this)->add(places[i] + place_offset, columns, i, arena); } } @@ -349,7 +352,8 @@ public: { size_t next_offset = offsets[i]; for (size_t j = current_offset; j < next_offset; ++j) - static_cast(this)->add(places[i] + place_offset, columns, j, arena); + if (places[i]) + static_cast(this)->add(places[i] + place_offset, columns, j, arena); current_offset = next_offset; } } diff --git a/src/AggregateFunctions/QuantileExact.h b/src/AggregateFunctions/QuantileExact.h index bc85d5c1c28..aa7b4bbe250 100644 --- a/src/AggregateFunctions/QuantileExact.h +++ b/src/AggregateFunctions/QuantileExact.h @@ -121,7 +121,7 @@ struct QuantileExact : QuantileExactBase> /// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0) template -/// There is no virtual-like functions. So we don't inherit from QuantileExactBase. +/// There are no virtual-like functions. So we don't inherit from QuantileExactBase. struct QuantileExactExclusive : public QuantileExact { using QuantileExact::array; @@ -189,7 +189,7 @@ struct QuantileExactExclusive : public QuantileExact /// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1) template -/// There is no virtual-like functions. So we don't inherit from QuantileExactBase. +/// There are no virtual-like functions. So we don't inherit from QuantileExactBase. 
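The `if (places[i])` checks added to the batch methods above mean that rows whose aggregation state pointer is null are simply skipped. A toy illustration of that convention, with a made-up `SumState` instead of a real aggregate function state:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct SumState { int64_t sum = 0; };

    /// Add one column batch into per-row aggregation states.
    /// A null entry in `places` means "this row has no state" and is skipped,
    /// mirroring the `if (places[i])` checks added to addBatch/addBatchArray.
    static void addBatch(const std::vector<int64_t> & column, const std::vector<SumState *> & places)
    {
        for (size_t i = 0; i < column.size(); ++i)
            if (places[i])
                places[i]->sum += column[i];
    }

    int main()
    {
        SumState a, b;
        std::vector<SumState *> places{&a, nullptr, &b, &a};   /// row 1 has no state
        addBatch({10, 20, 30, 40}, places);
        std::cout << a.sum << ' ' << b.sum << '\n';            /// prints: 50 30
    }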
struct QuantileExactInclusive : public QuantileExact { using QuantileExact::array; diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 3b7817e9308..9c62160b964 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -13,6 +13,7 @@ #include #include + namespace DB { namespace ErrorCodes @@ -56,7 +57,7 @@ class ReservoirSamplerDeterministic { bool good(const UInt32 hash) { - return hash == ((hash >> skip_degree) << skip_degree); + return !(hash & skip_mask); } public: @@ -135,11 +136,8 @@ public: throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different max sample size"); sorted = false; - if (b.skip_degree > skip_degree) - { - skip_degree = b.skip_degree; - thinOut(); - } + if (skip_degree < b.skip_degree) + setSkipDegree(b.skip_degree); for (const auto & sample : b.samples) if (good(sample.second)) @@ -165,6 +163,11 @@ public: sorted = false; } +#if !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + void write(DB::WriteBuffer & buf) const { size_t size = samples.size(); @@ -172,9 +175,26 @@ public: DB::writeIntBinary(total_values, buf); for (size_t i = 0; i < size; ++i) - DB::writePODBinary(samples[i], buf); + { + /// There was a mistake in this function. + /// Instead of correctly serializing the elements, + /// it was writing them with uninitialized padding. + /// Here we ensure that padding is zero without changing the protocol. + /// TODO: After implementation of "versioning aggregate function state", + /// change the serialization format. + + Element elem; + memset(&elem, 0, sizeof(elem)); + elem = samples[i]; + + DB::writePODBinary(elem, buf); + } } +#if !defined(__clang__) +#pragma GCC diagnostic pop +#endif + private: /// We allocate some memory on the stack to avoid allocations when there are many objects with a small number of elements. using Element = std::pair; @@ -184,22 +204,39 @@ private: size_t total_values = 0; /// How many values were inserted (regardless if they remain in sample or not). bool sorted = false; Array samples; - UInt8 skip_degree = 0; /// The number N determining that we save only one per 2^N elements in average. + + /// The number N determining that we store only one per 2^N elements in average. + UInt8 skip_degree = 0; + + /// skip_mask is calculated as (2 ^ skip_degree - 1). We store an element only if (hash & skip_mask) == 0. + /// For example, if skip_degree==0 then skip_mask==0 means we store each element; + /// if skip_degree==1 then skip_mask==0b0001 means we store one per 2 elements in average; + /// if skip_degree==4 then skip_mask==0b1111 means we store one per 16 elements in average. + UInt32 skip_mask = 0; void insertImpl(const T & v, const UInt32 hash) { /// Make a room for plus one element. 
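The `skip_mask` comment above fully determines the sampling rule; the sketch below restates it as a tiny self-contained reservoir (illustrative names, no serialization), including the doubling of `skip_degree` when the sample overflows.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <utility>
    #include <vector>

    /// Deterministic subsampling: keep an element only if the low skip_degree bits
    /// of its hash are zero, i.e. (hash & skip_mask) == 0 -- roughly one element
    /// per 2^skip_degree on average.
    struct Reservoir
    {
        size_t max_size;
        uint8_t skip_degree = 0;
        uint32_t skip_mask = 0;                                /// (1 << skip_degree) - 1
        std::vector<std::pair<uint64_t, uint32_t>> samples;    /// (value, hash)

        bool good(uint32_t hash) const { return !(hash & skip_mask); }

        void setSkipDegree(uint8_t degree)
        {
            if (degree > 32)
                throw std::runtime_error("skip_degree exceeds maximum value");
            skip_degree = degree;
            skip_mask = (degree == 32) ? ~uint32_t(0) : (uint32_t(1) << degree) - 1;
            /// Thin out: elements that fail the stricter mask are dropped.
            samples.erase(
                std::remove_if(samples.begin(), samples.end(),
                               [this](const auto & s) { return !good(s.second); }),
                samples.end());
        }

        void insert(uint64_t value, uint32_t hash)
        {
            if (!good(hash))
                return;
            while (samples.size() >= max_size)
                setSkipDegree(skip_degree + 1);   /// each step halves the expected sample size
            samples.emplace_back(value, hash);
        }
    };

    int main()
    {
        Reservoir reservoir{4};
        for (uint32_t i = 0; i < 1000; ++i)
            reservoir.insert(i, i * 2654435761u); /// multiplicative hash as a stand-in
        std::cout << reservoir.samples.size() << " samples kept, skip_degree = "
                  << int(reservoir.skip_degree) << '\n';
    }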
while (samples.size() >= max_sample_size) - { - ++skip_degree; - if (skip_degree > detail::MAX_SKIP_DEGREE) - throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; - thinOut(); - } + setSkipDegree(skip_degree + 1); samples.emplace_back(v, hash); } + void setSkipDegree(UInt8 skip_degree_) + { + if (skip_degree_ == skip_degree) + return; + if (skip_degree_ > detail::MAX_SKIP_DEGREE) + throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; + skip_degree = skip_degree_; + if (skip_degree == detail::MAX_SKIP_DEGREE) + skip_mask = static_cast(-1); + else + skip_mask = (1 << skip_degree) - 1; + thinOut(); + } + void thinOut() { samples.resize(std::distance(samples.begin(), diff --git a/src/AggregateFunctions/registerAggregateFunctions.cpp b/src/AggregateFunctions/registerAggregateFunctions.cpp index ae26fdc5d40..ceba1531e03 100644 --- a/src/AggregateFunctions/registerAggregateFunctions.cpp +++ b/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -25,6 +25,7 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory &); void registerAggregateFunctionsStatisticsStable(AggregateFunctionFactory &); void registerAggregateFunctionsStatisticsSimple(AggregateFunctionFactory &); void registerAggregateFunctionSum(AggregateFunctionFactory &); +void registerAggregateFunctionSumCount(AggregateFunctionFactory &); void registerAggregateFunctionSumMap(AggregateFunctionFactory &); void registerAggregateFunctionsUniq(AggregateFunctionFactory &); void registerAggregateFunctionUniqCombined(AggregateFunctionFactory &); @@ -83,6 +84,7 @@ void registerAggregateFunctions() registerAggregateFunctionsStatisticsStable(factory); registerAggregateFunctionsStatisticsSimple(factory); registerAggregateFunctionSum(factory); + registerAggregateFunctionSumCount(factory); registerAggregateFunctionSumMap(factory); registerAggregateFunctionsUniq(factory); registerAggregateFunctionUniqCombined(factory); diff --git a/src/AggregateFunctions/ya.make b/src/AggregateFunctions/ya.make index 3a8f0ad9fba..64605aee659 100644 --- a/src/AggregateFunctions/ya.make +++ b/src/AggregateFunctions/ya.make @@ -50,6 +50,7 @@ SRCS( AggregateFunctionStatisticsSimple.cpp AggregateFunctionStudentTTest.cpp AggregateFunctionSum.cpp + AggregateFunctionSumCount.cpp AggregateFunctionSumMap.cpp AggregateFunctionTopK.cpp AggregateFunctionUniq.cpp diff --git a/src/Bridge/IBridgeHelper.cpp b/src/Bridge/IBridgeHelper.cpp new file mode 100644 index 00000000000..16886bae4fe --- /dev/null +++ b/src/Bridge/IBridgeHelper.cpp @@ -0,0 +1,132 @@ +#include "IBridgeHelper.h" + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EXTERNAL_SERVER_IS_NOT_RESPONDING; +} + + +Poco::URI IBridgeHelper::getMainURI() const +{ + auto uri = createBaseURI(); + uri.setPath(MAIN_HANDLER); + return uri; +} + + +Poco::URI IBridgeHelper::getPingURI() const +{ + auto uri = createBaseURI(); + uri.setPath(PING_HANDLER); + return uri; +} + + +bool IBridgeHelper::checkBridgeIsRunning() const +{ + try + { + ReadWriteBufferFromHTTP buf( + getPingURI(), Poco::Net::HTTPRequest::HTTP_GET, {}, ConnectionTimeouts::getHTTPTimeouts(getContext())); + return checkString(PING_OK_ANSWER, buf); + } + catch (...) 
+ { + return false; + } +} + + +void IBridgeHelper::startBridgeSync() const +{ + if (!checkBridgeIsRunning()) + { + LOG_TRACE(getLog(), "{} is not running, will try to start it", serviceAlias()); + startBridge(startBridgeCommand()); + bool started = false; + + uint64_t milliseconds_to_wait = 10; /// Exponential backoff + uint64_t counter = 0; + + while (milliseconds_to_wait < 10000) + { + ++counter; + LOG_TRACE(getLog(), "Checking {} is running, try {}", serviceAlias(), counter); + + if (checkBridgeIsRunning()) + { + started = true; + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds_to_wait)); + milliseconds_to_wait *= 2; + } + + if (!started) + throw Exception("BridgeHelper: " + serviceAlias() + " is not responding", + ErrorCodes::EXTERNAL_SERVER_IS_NOT_RESPONDING); + } +} + + +std::unique_ptr IBridgeHelper::startBridgeCommand() const +{ + if (startBridgeManually()) + throw Exception(serviceAlias() + " is not running. Please, start it manually", ErrorCodes::EXTERNAL_SERVER_IS_NOT_RESPONDING); + + const auto & config = getConfig(); + /// Path to executable folder + Poco::Path path{config.getString("application.dir", "/usr/bin")}; + + std::vector cmd_args; + path.setFileName(serviceFileName()); + + cmd_args.push_back("--http-port"); + cmd_args.push_back(std::to_string(config.getUInt(configPrefix() + ".port", getDefaultPort()))); + cmd_args.push_back("--listen-host"); + cmd_args.push_back(config.getString(configPrefix() + ".listen_host", DEFAULT_HOST)); + cmd_args.push_back("--http-timeout"); + cmd_args.push_back(std::to_string(getHTTPTimeout().totalMicroseconds())); + if (config.has("logger." + configPrefix() + "_log")) + { + cmd_args.push_back("--log-path"); + cmd_args.push_back(config.getString("logger." + configPrefix() + "_log")); + } + if (config.has("logger." + configPrefix() + "_errlog")) + { + cmd_args.push_back("--err-log-path"); + cmd_args.push_back(config.getString("logger." + configPrefix() + "_errlog")); + } + if (config.has("logger." + configPrefix() + "_stdout")) + { + cmd_args.push_back("--stdout-path"); + cmd_args.push_back(config.getString("logger." + configPrefix() + "_stdout")); + } + if (config.has("logger." + configPrefix() + "_stderr")) + { + cmd_args.push_back("--stderr-path"); + cmd_args.push_back(config.getString("logger." + configPrefix() + "_stderr")); + } + if (config.has("logger." + configPrefix() + "_level")) + { + cmd_args.push_back("--log-level"); + cmd_args.push_back(config.getString("logger." + configPrefix() + "_level")); + } + + LOG_TRACE(getLog(), "Starting {}", serviceAlias()); + + return ShellCommand::executeDirect(path.toString(), cmd_args, ShellCommandDestructorStrategy(true)); +} + +} diff --git a/src/Bridge/IBridgeHelper.h b/src/Bridge/IBridgeHelper.h new file mode 100644 index 00000000000..3473e24047e --- /dev/null +++ b/src/Bridge/IBridgeHelper.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +/// Common base class for XDBC and Library bridge helpers. +/// Contains helper methods to check/start bridge sync. 
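`startBridgeSync` above polls the freshly started bridge with an exponentially growing pause, giving up once the pause would reach 10 seconds. The same waiting pattern in isolation, with a placeholder `ping` callable instead of the real HTTP ping:

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <thread>

    /// Poll `ping` with exponentially growing pauses (10 ms, 20 ms, 40 ms, ...)
    /// until it succeeds or the next pause would exceed ~10 seconds.
    template <typename Ping>
    bool waitUntilRunning(Ping && ping)
    {
        uint64_t milliseconds_to_wait = 10;
        uint64_t counter = 0;
        while (milliseconds_to_wait < 10000)
        {
            ++counter;
            std::cout << "check #" << counter << '\n';
            if (ping())
                return true;
            std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds_to_wait));
            milliseconds_to_wait *= 2;
        }
        return false;
    }

    int main()
    {
        int attempts = 0;
        /// Hypothetical ping that starts answering after the third attempt.
        if (!waitUntilRunning([&] { return ++attempts >= 3; }))
            throw std::runtime_error("bridge is not responding");
    }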
+class IBridgeHelper: protected WithContext +{ + +public: + static constexpr inline auto DEFAULT_HOST = "127.0.0.1"; + static constexpr inline auto PING_HANDLER = "/ping"; + static constexpr inline auto MAIN_HANDLER = "/"; + static constexpr inline auto DEFAULT_FORMAT = "RowBinary"; + static constexpr inline auto PING_OK_ANSWER = "Ok."; + + static const inline std::string PING_METHOD = Poco::Net::HTTPRequest::HTTP_GET; + static const inline std::string MAIN_METHOD = Poco::Net::HTTPRequest::HTTP_POST; + + explicit IBridgeHelper(ContextPtr context_) : WithContext(context_) {} + virtual ~IBridgeHelper() = default; + + void startBridgeSync() const; + + Poco::URI getMainURI() const; + + Poco::URI getPingURI() const; + + +protected: + /// clickhouse-odbc-bridge, clickhouse-library-bridge + virtual String serviceAlias() const = 0; + + virtual String serviceFileName() const = 0; + + virtual size_t getDefaultPort() const = 0; + + virtual bool startBridgeManually() const = 0; + + virtual void startBridge(std::unique_ptr cmd) const = 0; + + virtual String configPrefix() const = 0; + + virtual const Poco::Util::AbstractConfiguration & getConfig() const = 0; + + virtual Poco::Logger * getLog() const = 0; + + virtual const Poco::Timespan & getHTTPTimeout() const = 0; + + virtual Poco::URI createBaseURI() const = 0; + + +private: + bool checkBridgeIsRunning() const; + + std::unique_ptr startBridgeCommand() const; +}; + +} diff --git a/src/Bridge/LibraryBridgeHelper.cpp b/src/Bridge/LibraryBridgeHelper.cpp new file mode 100644 index 00000000000..3ab01e18b11 --- /dev/null +++ b/src/Bridge/LibraryBridgeHelper.cpp @@ -0,0 +1,182 @@ +#include "LibraryBridgeHelper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +LibraryBridgeHelper::LibraryBridgeHelper( + ContextPtr context_, + const Block & sample_block_, + const Field & dictionary_id_) + : IBridgeHelper(context_) + , log(&Poco::Logger::get("LibraryBridgeHelper")) + , sample_block(sample_block_) + , config(context_->getConfigRef()) + , http_timeout(context_->getSettingsRef().http_receive_timeout.value.totalSeconds()) + , dictionary_id(dictionary_id_) +{ + bridge_port = config.getUInt("library_bridge.port", DEFAULT_PORT); + bridge_host = config.getString("library_bridge.host", DEFAULT_HOST); +} + + +Poco::URI LibraryBridgeHelper::createRequestURI(const String & method) const +{ + auto uri = getMainURI(); + uri.addQueryParameter("dictionary_id", toString(dictionary_id)); + uri.addQueryParameter("method", method); + return uri; +} + + +Poco::URI LibraryBridgeHelper::createBaseURI() const +{ + Poco::URI uri; + uri.setHost(bridge_host); + uri.setPort(bridge_port); + uri.setScheme("http"); + return uri; +} + + +void LibraryBridgeHelper::startBridge(std::unique_ptr cmd) const +{ + getContext()->addBridgeCommand(std::move(cmd)); +} + + +bool LibraryBridgeHelper::initLibrary(const std::string & library_path, const std::string library_settings, const std::string attributes_names) +{ + startBridgeSync(); + auto uri = createRequestURI(LIB_NEW_METHOD); + + /// Sample block must contain null values + WriteBufferFromOwnString out; + auto output_stream = getContext()->getOutputStream(LibraryBridgeHelper::DEFAULT_FORMAT, out, sample_block); + formatBlock(output_stream, sample_block); + auto block_string = out.str(); + + auto out_stream_callback = [library_path, library_settings, attributes_names, block_string, this](std::ostream & os) + { + os << 
"library_path=" << escapeForFileName(library_path) << "&"; + os << "library_settings=" << escapeForFileName(library_settings) << "&"; + os << "attributes_names=" << escapeForFileName(attributes_names) << "&"; + os << "sample_block=" << escapeForFileName(sample_block.getNamesAndTypesList().toString()) << "&"; + os << "null_values=" << escapeForFileName(block_string); + }; + return executeRequest(uri, out_stream_callback); +} + + +bool LibraryBridgeHelper::cloneLibrary(const Field & other_dictionary_id) +{ + startBridgeSync(); + auto uri = createRequestURI(LIB_CLONE_METHOD); + uri.addQueryParameter("from_dictionary_id", toString(other_dictionary_id)); + return executeRequest(uri); +} + + +bool LibraryBridgeHelper::removeLibrary() +{ + startBridgeSync(); + auto uri = createRequestURI(LIB_DELETE_METHOD); + return executeRequest(uri); +} + + +bool LibraryBridgeHelper::isModified() +{ + startBridgeSync(); + auto uri = createRequestURI(IS_MODIFIED_METHOD); + return executeRequest(uri); +} + + +bool LibraryBridgeHelper::supportsSelectiveLoad() +{ + startBridgeSync(); + auto uri = createRequestURI(SUPPORTS_SELECTIVE_LOAD_METHOD); + return executeRequest(uri); +} + + +BlockInputStreamPtr LibraryBridgeHelper::loadAll() +{ + startBridgeSync(); + auto uri = createRequestURI(LOAD_ALL_METHOD); + return loadBase(uri); +} + + +BlockInputStreamPtr LibraryBridgeHelper::loadIds(const std::string ids_string) +{ + startBridgeSync(); + auto uri = createRequestURI(LOAD_IDS_METHOD); + return loadBase(uri, [ids_string](std::ostream & os) { os << "ids=" << ids_string; }); +} + + +BlockInputStreamPtr LibraryBridgeHelper::loadKeys(const Block & requested_block) +{ + startBridgeSync(); + auto uri = createRequestURI(LOAD_KEYS_METHOD); + /// Sample block to parse block from callback + uri.addQueryParameter("requested_block_sample", requested_block.getNamesAndTypesList().toString()); + ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = [requested_block, this](std::ostream & os) + { + WriteBufferFromOStream out_buffer(os); + auto output_stream = getContext()->getOutputStream(LibraryBridgeHelper::DEFAULT_FORMAT, out_buffer, sample_block); + formatBlock(output_stream, requested_block); + }; + return loadBase(uri, out_stream_callback); +} + + +bool LibraryBridgeHelper::executeRequest(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback) +{ + ReadWriteBufferFromHTTP buf( + uri, + Poco::Net::HTTPRequest::HTTP_POST, + std::move(out_stream_callback), + ConnectionTimeouts::getHTTPTimeouts(getContext())); + + bool res; + readBoolText(res, buf); + return res; +} + + +BlockInputStreamPtr LibraryBridgeHelper::loadBase(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback) +{ + auto read_buf_ptr = std::make_unique( + uri, + Poco::Net::HTTPRequest::HTTP_POST, + std::move(out_stream_callback), + ConnectionTimeouts::getHTTPTimeouts(getContext()), + 0, + Poco::Net::HTTPBasicCredentials{}, + DBMS_DEFAULT_BUFFER_SIZE, + ReadWriteBufferFromHTTP::HTTPHeaderEntries{}); + + auto input_stream = getContext()->getInputFormat(LibraryBridgeHelper::DEFAULT_FORMAT, *read_buf_ptr, sample_block, DEFAULT_BLOCK_SIZE); + return std::make_shared>(input_stream, std::move(read_buf_ptr)); +} + +} diff --git a/src/Bridge/LibraryBridgeHelper.h b/src/Bridge/LibraryBridgeHelper.h new file mode 100644 index 00000000000..dc3f9020e71 --- /dev/null +++ b/src/Bridge/LibraryBridgeHelper.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + 
+namespace DB +{ + +class LibraryBridgeHelper : public IBridgeHelper +{ + +public: + static constexpr inline size_t DEFAULT_PORT = 9012; + + LibraryBridgeHelper(ContextPtr context_, const Block & sample_block, const Field & dictionary_id_); + + bool initLibrary(const std::string & library_path, std::string library_settings, std::string attributes_names); + + bool cloneLibrary(const Field & other_dictionary_id); + + bool removeLibrary(); + + bool isModified(); + + bool supportsSelectiveLoad(); + + BlockInputStreamPtr loadAll(); + + BlockInputStreamPtr loadIds(std::string ids_string); + + BlockInputStreamPtr loadKeys(const Block & requested_block); + + BlockInputStreamPtr loadBase(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = {}); + + bool executeRequest(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = {}); + + +protected: + void startBridge(std::unique_ptr cmd) const override; + + String serviceAlias() const override { return "clickhouse-library-bridge"; } + + String serviceFileName() const override { return serviceAlias(); } + + size_t getDefaultPort() const override { return DEFAULT_PORT; } + + bool startBridgeManually() const override { return false; } + + String configPrefix() const override { return "library_bridge"; } + + const Poco::Util::AbstractConfiguration & getConfig() const override { return config; } + + Poco::Logger * getLog() const override { return log; } + + const Poco::Timespan & getHTTPTimeout() const override { return http_timeout; } + + Poco::URI createBaseURI() const override; + +private: + static constexpr inline auto LIB_NEW_METHOD = "libNew"; + static constexpr inline auto LIB_CLONE_METHOD = "libClone"; + static constexpr inline auto LIB_DELETE_METHOD = "libDelete"; + static constexpr inline auto LOAD_ALL_METHOD = "loadAll"; + static constexpr inline auto LOAD_IDS_METHOD = "loadIds"; + static constexpr inline auto LOAD_KEYS_METHOD = "loadKeys"; + static constexpr inline auto IS_MODIFIED_METHOD = "isModified"; + static constexpr inline auto SUPPORTS_SELECTIVE_LOAD_METHOD = "supportsSelectiveLoad"; + + Poco::URI createRequestURI(const String & method) const; + + Poco::Logger * log; + const Block sample_block; + const Poco::Util::AbstractConfiguration & config; + const Poco::Timespan http_timeout; + + Field dictionary_id; + std::string bridge_host; + size_t bridge_port; +}; + +} diff --git a/src/Bridge/XDBCBridgeHelper.h b/src/Bridge/XDBCBridgeHelper.h new file mode 100644 index 00000000000..a5f21e28204 --- /dev/null +++ b/src/Bridge/XDBCBridgeHelper.h @@ -0,0 +1,265 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(ARCADIA_BUILD) +# include +#endif + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +/// Class for Helpers for XDBC-bridges, provide utility methods, not main request. 
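LibraryBridgeHelper talks to the bridge over plain HTTP: each call is a URI of the form `http://host:port/?dictionary_id=...&method=...`, and `libNew` additionally URL-encodes its parameters into the POST body. A dependency-free sketch of that request construction; the encoding helper and all values below are illustrative, not the real `escapeForFileName` or a real dictionary id.

    #include <cctype>
    #include <cstdio>
    #include <iostream>
    #include <string>

    /// Minimal percent-encoding, good enough for the sketch.
    static std::string urlEncode(const std::string & in)
    {
        std::string out;
        for (unsigned char c : in)
        {
            if (std::isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~')
                out += c;
            else
            {
                char buf[4];
                std::snprintf(buf, sizeof(buf), "%%%02X", c);
                out += buf;
            }
        }
        return out;
    }

    int main()
    {
        std::string host = "127.0.0.1";
        int port = 9012;                                /// LibraryBridgeHelper::DEFAULT_PORT
        std::string dictionary_id = "test-dictionary";  /// placeholder

        /// Request line, in the shape createRequestURI produces.
        std::string uri = "http://" + host + ':' + std::to_string(port)
            + "/?dictionary_id=" + urlEncode(dictionary_id) + "&method=libNew";

        /// POST body for libNew, in the shape the out_stream_callback writes.
        std::string body = "library_path=" + urlEncode("/usr/lib/libdict.so")
            + "&library_settings=" + urlEncode("max_rows=100")
            + "&attributes_names=" + urlEncode("id,value");

        std::cout << uri << '\n' << body << '\n';
    }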
+class IXDBCBridgeHelper : public IBridgeHelper +{ + +public: + explicit IXDBCBridgeHelper(ContextPtr context_) : IBridgeHelper(context_) {} + + virtual std::vector> getURLParams(UInt64 max_block_size) const = 0; + + virtual Poco::URI getColumnsInfoURI() const = 0; + + virtual IdentifierQuotingStyle getIdentifierQuotingStyle() = 0; + + virtual bool isSchemaAllowed() = 0; + + virtual String getName() const = 0; +}; + +using BridgeHelperPtr = std::shared_ptr; + + +template +class XDBCBridgeHelper : public IXDBCBridgeHelper +{ + +public: + static constexpr inline auto DEFAULT_PORT = BridgeHelperMixin::DEFAULT_PORT; + static constexpr inline auto COL_INFO_HANDLER = "/columns_info"; + static constexpr inline auto IDENTIFIER_QUOTE_HANDLER = "/identifier_quote"; + static constexpr inline auto SCHEMA_ALLOWED_HANDLER = "/schema_allowed"; + + XDBCBridgeHelper( + ContextPtr global_context_, + const Poco::Timespan & http_timeout_, + const std::string & connection_string_) + : IXDBCBridgeHelper(global_context_) + , log(&Poco::Logger::get(BridgeHelperMixin::getName() + "BridgeHelper")) + , connection_string(connection_string_) + , http_timeout(http_timeout_) + , config(global_context_->getConfigRef()) +{ + bridge_host = config.getString(BridgeHelperMixin::configPrefix() + ".host", DEFAULT_HOST); + bridge_port = config.getUInt(BridgeHelperMixin::configPrefix() + ".port", DEFAULT_PORT); +} + + +protected: + auto getConnectionString() const { return connection_string; } + + String getName() const override { return BridgeHelperMixin::getName(); } + + size_t getDefaultPort() const override { return DEFAULT_PORT; } + + String serviceAlias() const override { return BridgeHelperMixin::serviceAlias(); } + + /// Same for odbc and jdbc + String serviceFileName() const override { return "clickhouse-odbc-bridge"; } + + String configPrefix() const override { return BridgeHelperMixin::configPrefix(); } + + const Poco::Timespan & getHTTPTimeout() const override { return http_timeout; } + + const Poco::Util::AbstractConfiguration & getConfig() const override { return config; } + + Poco::Logger * getLog() const override { return log; } + + bool startBridgeManually() const override { return BridgeHelperMixin::startBridgeManually(); } + + Poco::URI createBaseURI() const override + { + Poco::URI uri; + uri.setHost(bridge_host); + uri.setPort(bridge_port); + uri.setScheme("http"); + return uri; + } + + void startBridge(std::unique_ptr cmd) const override + { + getContext()->addBridgeCommand(std::move(cmd)); + } + + +private: + using Configuration = Poco::Util::AbstractConfiguration; + + Poco::Logger * log; + std::string connection_string; + const Poco::Timespan & http_timeout; + std::string bridge_host; + size_t bridge_port; + + const Configuration & config; + + std::optional quote_style; + std::optional is_schema_allowed; + + +protected: + using URLParams = std::vector>; + + Poco::URI getColumnsInfoURI() const override + { + auto uri = createBaseURI(); + uri.setPath(COL_INFO_HANDLER); + return uri; + } + + URLParams getURLParams(UInt64 max_block_size) const override + { + std::vector> result; + + result.emplace_back("connection_string", connection_string); /// already validated + result.emplace_back("max_block_size", std::to_string(max_block_size)); + + return result; + } + + bool isSchemaAllowed() override + { + if (!is_schema_allowed.has_value()) + { + startBridgeSync(); + + auto uri = createBaseURI(); + uri.setPath(SCHEMA_ALLOWED_HANDLER); + uri.addQueryParameter("connection_string", getConnectionString()); + + 
ReadWriteBufferFromHTTP buf(uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(getContext())); + + bool res; + readBoolText(res, buf); + is_schema_allowed = res; + } + + return *is_schema_allowed; + } + + IdentifierQuotingStyle getIdentifierQuotingStyle() override + { + if (!quote_style.has_value()) + { + startBridgeSync(); + + auto uri = createBaseURI(); + uri.setPath(IDENTIFIER_QUOTE_HANDLER); + uri.addQueryParameter("connection_string", getConnectionString()); + + ReadWriteBufferFromHTTP buf(uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(getContext())); + + std::string character; + readStringBinary(character, buf); + if (character.length() > 1) + throw Exception( + "Failed to parse quoting style from '" + character + "' for service " + BridgeHelperMixin::serviceAlias(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else if (character.length() == 0) + quote_style = IdentifierQuotingStyle::None; + else if (character[0] == '`') + quote_style = IdentifierQuotingStyle::Backticks; + else if (character[0] == '"') + quote_style = IdentifierQuotingStyle::DoubleQuotes; + else + throw Exception("Can not map quote identifier '" + character + "' to enum value", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + return *quote_style; + } +}; + + +struct JDBCBridgeMixin +{ + static constexpr inline auto DEFAULT_PORT = 9019; + + static String configPrefix() + { + return "jdbc_bridge"; + } + + static String serviceAlias() + { + return "clickhouse-jdbc-bridge"; + } + + static String getName() + { + return "JDBC"; + } + + static AccessType getSourceAccessType() + { + return AccessType::JDBC; + } + + static bool startBridgeManually() + { + return true; + } +}; + + +struct ODBCBridgeMixin +{ + static constexpr inline auto DEFAULT_PORT = 9018; + + static String configPrefix() + { + return "odbc_bridge"; + } + + static String serviceAlias() + { + return "clickhouse-odbc-bridge"; + } + + static String getName() + { + return "ODBC"; + } + + static AccessType getSourceAccessType() + { + return AccessType::ODBC; + } + + static bool startBridgeManually() + { + return false; + } +}; + +} diff --git a/src/Bridge/ya.make b/src/Bridge/ya.make new file mode 100644 index 00000000000..e900bab036e --- /dev/null +++ b/src/Bridge/ya.make @@ -0,0 +1,17 @@ +# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it. 
+OWNER(g:clickhouse) + +LIBRARY() + +PEERDIR( + clickhouse/src/Common +) + + +SRCS( + IBridgeHelper.cpp + LibraryBridgeHelper.cpp + +) + +END() diff --git a/src/Bridge/ya.make.in b/src/Bridge/ya.make.in new file mode 100644 index 00000000000..01edf8dca82 --- /dev/null +++ b/src/Bridge/ya.make.in @@ -0,0 +1,14 @@ +OWNER(g:clickhouse) + +LIBRARY() + +PEERDIR( + clickhouse/src/Common +) + + +SRCS( + +) + +END() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 215a13cce1a..43f6ae8fea5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,6 +27,11 @@ configure_file (Common/config.h.in ${CONFIG_COMMON}) configure_file (Common/config_version.h.in ${CONFIG_VERSION}) configure_file (Core/config_core.h.in ${CMAKE_CURRENT_BINARY_DIR}/Core/include/config_core.h) +if (USE_DEBUG_HELPERS) + set (INCLUDE_DEBUG_HELPERS "-I${ClickHouse_SOURCE_DIR}/base -include ${ClickHouse_SOURCE_DIR}/src/Core/iostream_debug_helpers.h") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${INCLUDE_DEBUG_HELPERS}") +endif () + if (COMPILER_GCC) # If we leave this optimization enabled, gcc-7 replaces a pair of SSE intrinsics (16 byte load, store) with a call to memcpy. # It leads to slow code. This is compiler bug. It looks like this: @@ -101,8 +106,8 @@ endif() list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD}) list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON}) -list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp) -list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h) +list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp Functions/FunctionsLogical.cpp) +list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h Functions/FunctionsLogical.h) list (APPEND dbms_sources AggregateFunctions/AggregateFunctionFactory.cpp @@ -158,7 +163,11 @@ macro(add_object_library name common_path) list (APPEND all_modules ${name}) add_headers_and_sources(${name} ${common_path}) add_library(${name} SHARED ${${name}_sources} ${${name}_headers}) - target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + if (OS_DARWIN) + target_link_libraries (${name} PRIVATE -Wl,-undefined,dynamic_lookup) + else() + target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + endif() endif () endmacro() @@ -168,6 +177,7 @@ add_object_library(clickhouse_core_mysql Core/MySQL) add_object_library(clickhouse_compression Compression) add_object_library(clickhouse_datastreams DataStreams) add_object_library(clickhouse_datatypes DataTypes) +add_object_library(clickhouse_datatypes_serializations DataTypes/Serializations) add_object_library(clickhouse_databases Databases) add_object_library(clickhouse_databases_mysql Databases/MySQL) add_object_library(clickhouse_disks Disks) @@ -180,6 +190,7 @@ add_object_library(clickhouse_storages_distributed Storages/Distributed) add_object_library(clickhouse_storages_mergetree Storages/MergeTree) add_object_library(clickhouse_storages_liveview Storages/LiveView) add_object_library(clickhouse_client Client) +add_object_library(clickhouse_bridge Bridge) 
add_object_library(clickhouse_server Server) add_object_library(clickhouse_server_http Server/HTTP) add_object_library(clickhouse_formats Formats) @@ -215,7 +226,11 @@ else() target_link_libraries (clickhouse_interpreters PRIVATE clickhouse_parsers_new jemalloc libdivide) list (APPEND all_modules dbms) # force all split libs to be linked - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + if (OS_DARWIN) + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-undefined,error") + else() + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + endif() endif () macro (dbms_target_include_directories) @@ -316,6 +331,7 @@ if (USE_CYRUS_SASL) endif() if (USE_KRB5) + dbms_target_include_directories(SYSTEM BEFORE PRIVATE ${KRB5_INCLUDE_DIR}) dbms_target_link_libraries(PRIVATE ${KRB5_LIBRARY}) endif() @@ -360,11 +376,9 @@ if (ZSTD_LIBRARY) endif () endif() -set (LZMA_LIBRARY liblzma) -set (LZMA_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/xz/src/liblzma/api) -if (LZMA_LIBRARY) - target_link_libraries (clickhouse_common_io PUBLIC ${LZMA_LIBRARY}) - target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${LZMA_INCLUDE_DIR}) +if (XZ_LIBRARY) + target_link_libraries (clickhouse_common_io PUBLIC ${XZ_LIBRARY}) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${XZ_INCLUDE_DIR}) endif() if (USE_ICU) @@ -456,8 +470,14 @@ if (USE_LIBPQXX) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LIBPQXX_INCLUDE_DIR}) endif() +if (USE_DATASKETCHES) + target_include_directories (clickhouse_aggregate_functions SYSTEM BEFORE PRIVATE ${DATASKETCHES_INCLUDE_DIR}) +endif () + dbms_target_link_libraries(PRIVATE _boost_context) +include (${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake) + if (ENABLE_TESTS AND USE_GTEST) macro (grep_gtest_sources BASE_DIR DST_VAR) # Cold match files that are not in tests/ directories @@ -478,6 +498,20 @@ if (ENABLE_TESTS AND USE_GTEST) -Wno-gnu-zero-variadic-macro-arguments ) - target_link_libraries(unit_tests_dbms PRIVATE ${GTEST_BOTH_LIBRARIES} clickhouse_functions clickhouse_aggregate_functions clickhouse_parsers dbms clickhouse_common_zookeeper string_utils) + target_link_libraries(unit_tests_dbms PRIVATE + ${GTEST_BOTH_LIBRARIES} + clickhouse_functions + clickhouse_aggregate_functions + clickhouse_parsers + clickhouse_storages_system + dbms + clickhouse_common_zookeeper + string_utils) + + # For __udivmodti4 referenced in Core/tests/gtest_DecimalFunctions.cpp + if (OS_DARWIN AND COMPILER_GCC) + target_link_libraries(unit_tests_dbms PRIVATE gcc) + endif () + add_check(unit_tests_dbms) endif () diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 164b9565633..70d8109545b 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -109,6 +109,8 @@ void Connection::connect(const ConnectionTimeouts & timeouts) } in = std::make_shared(*socket); + in->setAsyncCallback(std::move(async_callback)); + out = std::make_shared(*socket); connected = true; @@ -549,6 +551,15 @@ void Connection::sendIgnoredPartUUIDs(const std::vector & uuids) out->next(); } + +void Connection::sendReadTaskResponse(const String & response) +{ + writeVarUInt(Protocol::Client::ReadTaskResponse, *out); + writeVarUInt(DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION, *out); + writeStringBinary(response, *out); + out->next(); +} + void 
Connection::sendPreparedData(ReadBuffer & input, size_t size, const String & name) { /// NOTE 'Throttler' is not used in this method (could use, but it's not important right now). @@ -753,15 +764,8 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) } -Packet Connection::receivePacket(std::function async_callback) +Packet Connection::receivePacket() { - in->setAsyncCallback(std::move(async_callback)); - SCOPE_EXIT({ - /// disconnect() will reset "in". - if (in) - in->setAsyncCallback({}); - }); - try { Packet res; @@ -812,6 +816,9 @@ Packet Connection::receivePacket(std::function async_ readVectorBinary(res.part_uuids, *in); return res; + case Protocol::Server::ReadTaskRequest: + return res; + default: /// In unknown state, disconnect - to not leave unsynchronised connection. disconnect(); @@ -912,13 +919,13 @@ void Connection::setDescription() } -std::unique_ptr Connection::receiveException() +std::unique_ptr Connection::receiveException() const { return std::make_unique(readException(*in, "Received from " + getDescription(), true /* remote */)); } -std::vector Connection::receiveMultistringMessage(UInt64 msg_type) +std::vector Connection::receiveMultistringMessage(UInt64 msg_type) const { size_t num = Protocol::Server::stringsInMessage(msg_type); std::vector strings(num); @@ -928,7 +935,7 @@ std::vector Connection::receiveMultistringMessage(UInt64 msg_type) } -Progress Connection::receiveProgress() +Progress Connection::receiveProgress() const { Progress progress; progress.read(*in, server_revision); @@ -936,7 +943,7 @@ Progress Connection::receiveProgress() } -BlockStreamProfileInfo Connection::receiveProfileInfo() +BlockStreamProfileInfo Connection::receiveProfileInfo() const { BlockStreamProfileInfo profile_info; profile_info.read(*in); diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 2d24b143d7a..6c7edfb2761 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -21,13 +21,13 @@ #include #include +#include #include #include #include - namespace DB { @@ -53,8 +53,6 @@ class Connection; using ConnectionPtr = std::shared_ptr; using Connections = std::vector; -using Scalars = std::map; - /// Packet that could be received from server. struct Packet @@ -89,9 +87,9 @@ public: const String & user_, const String & password_, const String & cluster_, const String & cluster_secret_, - const String & client_name_ = "client", - Protocol::Compression compression_ = Protocol::Compression::Enable, - Protocol::Secure secure_ = Protocol::Secure::Disable, + const String & client_name_, + Protocol::Compression compression_, + Protocol::Secure secure_, Poco::Timespan sync_request_timeout_ = Poco::Timespan(DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC, 0)) : host(host_), port(port_), default_database(default_database_), @@ -112,7 +110,7 @@ public: setDescription(); } - virtual ~Connection() {} + virtual ~Connection() = default; /// Set throttler of network traffic. One throttler could be used for multiple connections to limit total traffic. void setThrottler(const ThrottlerPtr & throttler_) @@ -141,6 +139,8 @@ public: UInt16 getPort() const; const String & getDefaultDatabase() const; + Protocol::Compression getCompression() const { return compression; } + /// If last flag is true, you need to call sendExternalTablesData after. 
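`sendReadTaskResponse` above frames its message as a varint packet type, a varint protocol version and a length-prefixed string. A standalone base-128 ("varint") encoder in the spirit of `writeVarUInt`/`writeStringBinary`, writing into a `std::string` instead of a `WriteBuffer`; the packet id and version constants here are placeholders, not the real protocol values.

    #include <cstdint>
    #include <iostream>
    #include <string>

    /// Unsigned varint: 7 bits per byte, high bit set while more bytes follow.
    static void writeVarUInt(uint64_t x, std::string & out)
    {
        while (x >= 0x80)
        {
            out.push_back(static_cast<char>(x | 0x80));
            x >>= 7;
        }
        out.push_back(static_cast<char>(x));
    }

    /// Strings are framed as a varint length followed by the raw bytes.
    static void writeStringBinary(const std::string & s, std::string & out)
    {
        writeVarUInt(s.size(), out);
        out.append(s);
    }

    int main()
    {
        std::string out;
        const uint64_t read_task_response_packet = 4;            /// illustrative packet id
        const uint64_t cluster_processing_protocol_version = 1;  /// illustrative version

        writeVarUInt(read_task_response_packet, out);
        writeVarUInt(cluster_processing_protocol_version, out);
        writeStringBinary("part_0_0_0", out);

        std::cout << out.size() << " bytes on the wire\n";
    }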
void sendQuery( const ConnectionTimeouts & timeouts, @@ -161,6 +161,8 @@ public: /// Send parts' uuids to excluded them from query processing void sendIgnoredPartUUIDs(const std::vector & uuids); + void sendReadTaskResponse(const String &); + /// Send prepared block of data (serialized and, if need, compressed), that will be read from 'input'. /// You could pass size of serialized/compressed block. void sendPreparedData(ReadBuffer & input, size_t size, const String & name = ""); @@ -175,8 +177,7 @@ public: std::optional checkPacket(size_t timeout_microseconds = 0); /// Receive packet from server. - /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it. - Packet receivePacket(std::function async_callback = {}); + Packet receivePacket(); /// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception. void forceConnected(const ConnectionTimeouts & timeouts); @@ -195,6 +196,16 @@ public: size_t outBytesCount() const { return out ? out->count() : 0; } size_t inBytesCount() const { return in ? in->count() : 0; } + Poco::Net::Socket * getSocket() { return socket.get(); } + + /// Each time read from socket blocks and async_callback is set, it will be called. You can poll socket inside it. + void setAsyncCallback(AsyncCallback async_callback_) + { + async_callback = std::move(async_callback_); + if (in) + in->setAsyncCallback(std::move(async_callback)); + } + private: String host; UInt16 port; @@ -262,7 +273,7 @@ private: class LoggerWrapper { public: - LoggerWrapper(Connection & parent_) + explicit LoggerWrapper(Connection & parent_) : log(nullptr), parent(parent_) { } @@ -282,6 +293,8 @@ private: LoggerWrapper log_wrapper; + AsyncCallback async_callback = {}; + void connect(const ConnectionTimeouts & timeouts); void sendHello(); void receiveHello(); @@ -295,10 +308,10 @@ private: Block receiveLogData(); Block receiveDataImpl(BlockInputStreamPtr & stream); - std::vector receiveMultistringMessage(UInt64 msg_type); - std::unique_ptr receiveException(); - Progress receiveProgress(); - BlockStreamProfileInfo receiveProfileInfo(); + std::vector receiveMultistringMessage(UInt64 msg_type) const; + std::unique_ptr receiveException() const; + Progress receiveProgress() const; + BlockStreamProfileInfo receiveProfileInfo() const; void initInputBuffers(); void initBlockInput(); @@ -307,4 +320,20 @@ private: [[noreturn]] void throwUnexpectedPacket(UInt64 packet_type, const char * expected) const; }; +class AsyncCallbackSetter +{ +public: + AsyncCallbackSetter(Connection * connection_, AsyncCallback async_callback) : connection(connection_) + { + connection->setAsyncCallback(std::move(async_callback)); + } + + ~AsyncCallbackSetter() + { + connection->setAsyncCallback({}); + } +private: + Connection * connection; +}; + } diff --git a/src/Client/ConnectionEstablisher.cpp b/src/Client/ConnectionEstablisher.cpp new file mode 100644 index 00000000000..2532035fabd --- /dev/null +++ b/src/Client/ConnectionEstablisher.cpp @@ -0,0 +1,239 @@ +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event DistributedConnectionMissingTable; + extern const Event DistributedConnectionStaleReplica; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int NETWORK_ERROR; + extern const int SOCKET_TIMEOUT; +} + +ConnectionEstablisher::ConnectionEstablisher( + IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + 
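`AsyncCallbackSetter` above is a small RAII guard: it installs the callback on construction and clears it on destruction, so the connection never keeps a stale callback once the caller's scope ends. The same idea with a stand-in connection type (all names here are illustrative):

    #include <functional>
    #include <iostream>
    #include <utility>

    using AsyncCallback = std::function<void(int /*fd*/)>;

    /// A toy connection-like object: just stores one callback.
    struct FakeConnection
    {
        AsyncCallback async_callback;
        void setAsyncCallback(AsyncCallback cb) { async_callback = std::move(cb); }
    };

    /// RAII setter: the callback is guaranteed to be cleared when the scope ends,
    /// even if an exception is thrown in between.
    class ScopedAsyncCallback
    {
    public:
        ScopedAsyncCallback(FakeConnection * connection_, AsyncCallback cb) : connection(connection_)
        {
            connection->setAsyncCallback(std::move(cb));
        }
        ~ScopedAsyncCallback() { connection->setAsyncCallback({}); }

    private:
        FakeConnection * connection;
    };

    int main()
    {
        FakeConnection conn;
        {
            ScopedAsyncCallback guard(&conn, [](int fd) { std::cout << "blocked on fd " << fd << '\n'; });
            if (conn.async_callback)
                conn.async_callback(42);   /// would be invoked when a read blocks
        }
        std::cout << "callback installed after scope: " << bool(conn.async_callback) << '\n';
    }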
Poco::Logger * log_, + const QualifiedTableName * table_to_check_) + : pool(pool_), timeouts(timeouts_), settings(settings_), log(log_), table_to_check(table_to_check_), is_finished(false) +{ +} + +void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::string & fail_message) +{ + is_finished = false; + SCOPE_EXIT(is_finished = true); + try + { + result.entry = pool->get(*timeouts, settings, /* force_connected = */ false); + AsyncCallbackSetter async_setter(&*result.entry, std::move(async_callback)); + + UInt64 server_revision = 0; + if (table_to_check) + server_revision = result.entry->getServerRevision(*timeouts); + + if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) + { + result.entry->forceConnected(*timeouts); + result.is_usable = true; + result.is_up_to_date = true; + return; + } + + /// Only status of the remote table corresponding to the Distributed table is taken into account. + /// TODO: request status for joined tables also. + TablesStatusRequest status_request; + status_request.tables.emplace(*table_to_check); + + TablesStatusResponse status_response = result.entry->getTablesStatus(*timeouts, status_request); + auto table_status_it = status_response.table_states_by_id.find(*table_to_check); + if (table_status_it == status_response.table_states_by_id.end()) + { + const char * message_pattern = "There is no table {}.{} on server: {}"; + fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); + LOG_WARNING(log, fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); + return; + } + + result.is_usable = true; + + UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0; + if (!max_allowed_delay) + { + result.is_up_to_date = true; + return; + } + + UInt32 delay = table_status_it->second.absolute_delay; + + if (delay < max_allowed_delay) + result.is_up_to_date = true; + else + { + result.is_up_to_date = false; + result.staleness = delay; + + LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); + ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); + } + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT + && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throw; + + fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); + + if (!result.entry.isNull()) + { + result.entry->disconnect(); + result.reset(); + } + } +} + +#if defined(OS_LINUX) + +ConnectionEstablisherAsync::ConnectionEstablisherAsync( + IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + Poco::Logger * log_, + const QualifiedTableName * table_to_check_) + : connection_establisher(pool_, timeouts_, settings_, log_, table_to_check_) +{ + epoll.add(receive_timeout.getDescriptor()); +} + +void ConnectionEstablisherAsync::Routine::ReadCallback::operator()(int fd, const Poco::Timespan & timeout, const std::string &) +{ + /// Check if it's the first time and we need to add socket fd to epoll. 
+ if (connection_establisher_async.socket_fd == -1) + { + connection_establisher_async.epoll.add(fd); + connection_establisher_async.socket_fd = fd; + } + + connection_establisher_async.receive_timeout.setRelative(timeout); + fiber = std::move(fiber).resume(); + connection_establisher_async.receive_timeout.reset(); +} + +Fiber ConnectionEstablisherAsync::Routine::operator()(Fiber && sink) +{ + try + { + connection_establisher_async.connection_establisher.setAsyncCallback(ReadCallback{connection_establisher_async, sink}); + connection_establisher_async.connection_establisher.run(connection_establisher_async.result, connection_establisher_async.fail_message); + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) + { + connection_establisher_async.exception = std::current_exception(); + } + + return std::move(sink); +} + +std::variant ConnectionEstablisherAsync::resume() +{ + if (!fiber_created) + { + reset(); + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + fiber_created = true; + } else if (!checkReceiveTimeout()) + return result; + + fiber = std::move(fiber).resume(); + + if (exception) + std::rethrow_exception(std::move(exception)); + + if (connection_establisher.isFinished()) + { + destroyFiber(); + return result; + } + + return epoll.getFileDescriptor(); +} + +bool ConnectionEstablisherAsync::checkReceiveTimeout() +{ + bool is_socket_ready = false; + bool is_receive_timeout_alarmed = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd = -1; + size_t ready_count = epoll.getManyReady(2, events, false); + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == socket_fd) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_alarmed = true; + } + + if (is_receive_timeout_alarmed && !is_socket_ready) + { + destroyFiber(); + /// In not async case this exception would be thrown and caught in ConnectionEstablisher::run, + /// but in async case we process timeout outside and cannot throw exception. So, we just save fail message. + fail_message = "Timeout exceeded while reading from socket (" + result.entry->getDescription() + ")"; + epoll.remove(socket_fd); + resetResult(); + return false; + } + + return true; +} + +void ConnectionEstablisherAsync::cancel() +{ + destroyFiber(); + reset(); +} + +void ConnectionEstablisherAsync::reset() +{ + resetResult(); + fail_message.clear(); + socket_fd = -1; +} + +void ConnectionEstablisherAsync::resetResult() +{ + if (!result.entry.isNull()) + { + result.entry->disconnect(); + result.reset(); + } +} + +void ConnectionEstablisherAsync::destroyFiber() +{ + Fiber to_destroy = std::move(fiber); + fiber_created = false; +} + +#endif + +} diff --git a/src/Client/ConnectionEstablisher.h b/src/Client/ConnectionEstablisher.h new file mode 100644 index 00000000000..1096452ebce --- /dev/null +++ b/src/Client/ConnectionEstablisher.h @@ -0,0 +1,131 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Class for establishing connection to the replica. It supports setting up +/// an async callback that will be called when reading from socket blocks. 
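`checkReceiveTimeout` above tells "the socket became readable" apart from "the receive timeout fired" by registering both a socket descriptor and a timer descriptor in one epoll. A self-contained Linux sketch of that pattern, using a pipe as a stand-in socket and raw `timerfd`/`epoll` calls instead of the `TimerDescriptor`/`Epoll` wrappers:

    #include <sys/epoll.h>
    #include <sys/timerfd.h>
    #include <time.h>
    #include <unistd.h>
    #include <iostream>

    int main()
    {
        int pipe_fds[2];
        if (pipe(pipe_fds) != 0)
            return 1;
        int socket_fd = pipe_fds[0];                   /// read end stands in for the socket

        /// One-shot receive timeout of 100 ms on a timerfd.
        int timeout_fd = timerfd_create(CLOCK_MONOTONIC, 0);
        itimerspec spec{};
        spec.it_value.tv_nsec = 100 * 1000 * 1000;
        timerfd_settime(timeout_fd, 0, &spec, nullptr);

        /// Both descriptors go into the same epoll.
        int epoll_fd = epoll_create1(0);
        epoll_event ev{};
        ev.events = EPOLLIN;
        ev.data.fd = socket_fd;
        epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &ev);
        ev.data.fd = timeout_fd;
        epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timeout_fd, &ev);

        /// Wait; nobody writes into the pipe, so only the timer can fire.
        epoll_event events[2];
        int ready = epoll_wait(epoll_fd, events, 2, -1);

        bool is_socket_ready = false;
        bool is_receive_timeout_alarmed = false;
        for (int i = 0; i < ready; ++i)
        {
            if (events[i].data.fd == socket_fd)
                is_socket_ready = true;
            if (events[i].data.fd == timeout_fd)
                is_receive_timeout_alarmed = true;
        }

        if (is_receive_timeout_alarmed && !is_socket_ready)
            std::cout << "Timeout exceeded while reading from socket\n";

        close(epoll_fd);
        close(timeout_fd);
        close(pipe_fds[0]);
        close(pipe_fds[1]);
    }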
+class ConnectionEstablisher +{ +public: + using TryResult = PoolWithFailoverBase::TryResult; + + ConnectionEstablisher(IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + Poco::Logger * log, + const QualifiedTableName * table_to_check = nullptr); + + /// Establish connection and save it in result, write possible exception message in fail_message. + void run(TryResult & result, std::string & fail_message); + + /// Set async callback that will be called when reading from socket blocks. + void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } + + bool isFinished() const { return is_finished; } + +private: + IConnectionPool * pool; + const ConnectionTimeouts * timeouts; + const Settings * settings; + Poco::Logger * log; + const QualifiedTableName * table_to_check; + + bool is_finished; + AsyncCallback async_callback = {}; + +}; + +#if defined(OS_LINUX) + +/// Class for nonblocking establishing connection to the replica. +/// It runs establishing connection process in fiber and sets special +/// read callback which is called when reading from socket blocks. +/// When read callback is called, socket and receive timeout are added in epoll +/// and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume. +class ConnectionEstablisherAsync +{ +public: + using TryResult = PoolWithFailoverBase::TryResult; + + ConnectionEstablisherAsync(IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + Poco::Logger * log_, + const QualifiedTableName * table_to_check = nullptr); + + /// Resume establishing connection. If the process was not finished, + /// return file descriptor (you can add it in epoll and poll it, + /// when this fd become ready, call resume again), + /// if the process was failed or finished, return it's result, + std::variant resume(); + + /// Cancel establishing connections. Fiber will be destroyed, + /// class will be set in initial stage. + void cancel(); + + TryResult getResult() const { return result; } + + const std::string & getFailMessage() const { return fail_message; } + +private: + /// When epoll file descriptor is ready, check if it's an expired timeout. + /// Return false if receive timeout expired and socket is not ready, return true otherwise. + bool checkReceiveTimeout(); + + struct Routine + { + ConnectionEstablisherAsync & connection_establisher_async; + + struct ReadCallback + { + ConnectionEstablisherAsync & connection_establisher_async; + Fiber & fiber; + + void operator()(int fd, const Poco::Timespan & timeout, const std::string &); + }; + + Fiber operator()(Fiber && sink); + }; + + void reset(); + + void resetResult(); + + void destroyFiber(); + + ConnectionEstablisher connection_establisher; + TryResult result; + std::string fail_message; + + Fiber fiber; + FiberStack fiber_stack; + + /// We use timer descriptor for checking socket receive timeout. + TimerDescriptor receive_timeout; + + /// In read callback we add socket file descriptor and timer descriptor with receive timeout + /// in epoll, so we can return epoll file descriptor outside for polling. + Epoll epoll; + int socket_fd = -1; + std::string socket_description; + + /// If and exception occurred in fiber resume, we save it and rethrow. 
+ std::exception_ptr exception; + + bool fiber_created = false; +}; + +#endif + +} diff --git a/src/Client/ConnectionPool.h b/src/Client/ConnectionPool.h index 2389cc6755d..bf73e9756d2 100644 --- a/src/Client/ConnectionPool.h +++ b/src/Client/ConnectionPool.h @@ -26,7 +26,7 @@ public: using Entry = PoolBase::Entry; public: - virtual ~IConnectionPool() {} + virtual ~IConnectionPool() = default; /// Selects the connection to work. /// If force_connected is false, the client must manually ensure that returned connection is good. @@ -56,9 +56,9 @@ public: const String & password_, const String & cluster_, const String & cluster_secret_, - const String & client_name_ = "client", - Protocol::Compression compression_ = Protocol::Compression::Enable, - Protocol::Secure secure_ = Protocol::Secure::Disable, + const String & client_name_, + Protocol::Compression compression_, + Protocol::Secure secure_, Int64 priority_ = 1) : Base(max_connections_, &Poco::Logger::get("ConnectionPool (" + host_ + ":" + toString(port_) + ")")), diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 1ca61dc8059..eff589edaa8 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -12,20 +13,12 @@ #include -namespace ProfileEvents -{ - extern const Event DistributedConnectionMissingTable; - extern const Event DistributedConnectionStaleReplica; -} namespace DB { namespace ErrorCodes { - extern const int ATTEMPT_TO_READ_AFTER_EOF; - extern const int NETWORK_ERROR; - extern const int SOCKET_TIMEOUT; extern const int LOGICAL_ERROR; } @@ -112,15 +105,20 @@ ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const ConnectionPoolWithFailover::Status result; result.reserve(states.size()); const time_t since_last_error_decrease = time(nullptr) - error_decrease_time; - + /// Update error_count and slowdown_count in states to return actual information. + auto updated_states = states; + auto updated_error_decrease_time = error_decrease_time; + Base::updateErrorCounts(updated_states, updated_error_decrease_time); for (size_t i = 0; i < states.size(); ++i) { const auto rounds_to_zero_errors = states[i].error_count ? bitScanReverse(states[i].error_count) + 1 : 0; - const auto seconds_to_zero_errors = std::max(static_cast(0), rounds_to_zero_errors * decrease_error_period - since_last_error_decrease); + const auto rounds_to_zero_slowdowns = states[i].slowdown_count ? bitScanReverse(states[i].slowdown_count) + 1 : 0; + const auto seconds_to_zero_errors = std::max(static_cast(0), std::max(rounds_to_zero_errors, rounds_to_zero_slowdowns) * decrease_error_period - since_last_error_decrease); result.emplace_back(NestedPoolStatus{ pools[i], - states[i].error_count, + updated_states[i].error_count, + updated_states[i].slowdown_count, std::chrono::seconds{seconds_to_zero_errors} }); } @@ -172,6 +170,43 @@ std::vector ConnectionPoolWithFailover::g return getManyImpl(settings, pool_mode, try_get_entry); } +ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::makeGetPriorityFunc(const Settings * settings) +{ + size_t offset = 0; + if (settings) + offset = settings->load_balancing_first_offset % nested_pools.size(); + + GetPriorityFunc get_priority; + switch (settings ? 
LoadBalancing(settings->load_balancing) : default_load_balancing) + { + case LoadBalancing::NEAREST_HOSTNAME: + get_priority = [&](size_t i) { return hostname_differences[i]; }; + break; + case LoadBalancing::IN_ORDER: + get_priority = [](size_t i) { return i; }; + break; + case LoadBalancing::RANDOM: + break; + case LoadBalancing::FIRST_OR_RANDOM: + get_priority = [offset](size_t i) -> size_t { return i != offset; }; + break; + case LoadBalancing::ROUND_ROBIN: + if (last_used >= nested_pools.size()) + last_used = 0; + ++last_used; + /* Consider nested_pools.size() equals to 5 + * last_used = 1 -> get_priority: 0 1 2 3 4 + * last_used = 2 -> get_priority: 5 0 1 2 3 + * last_used = 3 -> get_priority: 5 4 0 1 2 + * ... + * */ + get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; }; + break; + } + + return get_priority; +} + std::vector ConnectionPoolWithFailover::getManyImpl( const Settings * settings, PoolMode pool_mode, @@ -194,36 +229,7 @@ std::vector ConnectionPoolWithFailover::g else throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR); - size_t offset = 0; - if (settings) - offset = settings->load_balancing_first_offset % nested_pools.size(); - GetPriorityFunc get_priority; - switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing) - { - case LoadBalancing::NEAREST_HOSTNAME: - get_priority = [&](size_t i) { return hostname_differences[i]; }; - break; - case LoadBalancing::IN_ORDER: - get_priority = [](size_t i) { return i; }; - break; - case LoadBalancing::RANDOM: - break; - case LoadBalancing::FIRST_OR_RANDOM: - get_priority = [offset](size_t i) -> size_t { return i != offset; }; - break; - case LoadBalancing::ROUND_ROBIN: - if (last_used >= nested_pools.size()) - last_used = 0; - ++last_used; - /* Consider nested_pools.size() equals to 5 - * last_used = 1 -> get_priority: 0 1 2 3 4 - * last_used = 2 -> get_priority: 5 0 1 2 3 - * last_used = 3 -> get_priority: 5 4 0 1 2 - * ... - * */ - get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; }; - break; - } + GetPriorityFunc get_priority = makeGetPriorityFunc(settings); UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0; bool fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries.value : true; @@ -241,77 +247,17 @@ ConnectionPoolWithFailover::tryGetEntry( const Settings * settings, const QualifiedTableName * table_to_check) { + ConnectionEstablisher connection_establisher(&pool, &timeouts, settings, log, table_to_check); TryResult result; - try - { - result.entry = pool.get(timeouts, settings, /* force_connected = */ false); - - UInt64 server_revision = 0; - if (table_to_check) - server_revision = result.entry->getServerRevision(timeouts); - - if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) - { - result.entry->forceConnected(timeouts); - result.is_usable = true; - result.is_up_to_date = true; - return result; - } - - /// Only status of the remote table corresponding to the Distributed table is taken into account. - /// TODO: request status for joined tables also. 
- TablesStatusRequest status_request; - status_request.tables.emplace(*table_to_check); - - TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request); - auto table_status_it = status_response.table_states_by_id.find(*table_to_check); - if (table_status_it == status_response.table_states_by_id.end()) - { - const char * message_pattern = "There is no table {}.{} on server: {}"; - fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); - LOG_WARNING(log, fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); - - return result; - } - - result.is_usable = true; - - UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0; - if (!max_allowed_delay) - { - result.is_up_to_date = true; - return result; - } - - UInt32 delay = table_status_it->second.absolute_delay; - - if (delay < max_allowed_delay) - result.is_up_to_date = true; - else - { - result.is_up_to_date = false; - result.staleness = delay; - - LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); - ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); - } - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT - && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throw; - - fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); - - if (!result.entry.isNull()) - { - result.entry->disconnect(); - result.reset(); - } - } + connection_establisher.run(result, fail_message); return result; } +std::vector ConnectionPoolWithFailover::getShuffledPools(const Settings * settings) +{ + GetPriorityFunc get_priority = makeGetPriorityFunc(settings); + UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0; + return Base::getShuffledPools(max_ignored_errors, get_priority); +} + } diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 7d5f713f6a9..ce70c27838b 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -74,12 +74,22 @@ public: { const Base::NestedPoolPtr pool; size_t error_count; + size_t slowdown_count; std::chrono::seconds estimated_recovery_time; }; using Status = std::vector; Status getStatus() const; + std::vector getShuffledPools(const Settings * settings); + + size_t getMaxErrorCup() const { return Base::max_error_cap; } + + void updateSharedError(std::vector & shuffled_pools) + { + Base::updateSharedErrorCounts(shuffled_pools); + } + private: /// Get the values of relevant settings and call Base::getMany() std::vector getManyImpl( @@ -97,6 +107,8 @@ private: const Settings * settings, const QualifiedTableName * table_to_check = nullptr); + GetPriorityFunc makeGetPriorityFunc(const Settings * settings); + private: std::vector hostname_differences; /// Distances from name of this host to the names of hosts of pools. size_t last_used = 0; /// Last used for round_robin policy. 
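makeGetPriorityFunc() above turns the configured load_balancing mode into a function from replica index to priority, where lower values are tried first. A simplified standalone sketch with hypothetical names; the real code also handles NEAREST_HOSTNAME and RANDOM and advances the last_used member on every call:

#include <cstddef>
#include <functional>
#include <iostream>

/// Hypothetical subset of the load-balancing modes shown above.
enum class Mode { InOrder, FirstOrRandom, RoundRobin };

std::function<size_t(size_t)> makePriority(Mode mode, size_t pool_count, size_t offset, size_t last_used)
{
    switch (mode)
    {
        case Mode::InOrder:
            return [](size_t i) { return i; };                       /// lower index wins
        case Mode::FirstOrRandom:
            return [offset](size_t i) -> size_t { return i != offset; };  /// only the chosen offset gets priority 0
        case Mode::RoundRobin:
            return [pool_count, last_used](size_t i)
            {
                ++i;  /// same rotation trick as above: the most recently used pool gets the worst priority
                return i < last_used ? pool_count - i : i - last_used;
            };
    }
    return {};
}

int main()
{
    auto priority = makePriority(Mode::RoundRobin, /*pool_count=*/5, /*offset=*/0, /*last_used=*/2);
    for (size_t i = 0; i != 5; ++i)
        std::cout << priority(i) << ' ';   /// prints the rotated priorities for 5 pools
    std::cout << '\n';
}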
diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp new file mode 100644 index 00000000000..8455ef3117e --- /dev/null +++ b/src/Client/HedgedConnections.cpp @@ -0,0 +1,545 @@ +#if defined(OS_LINUX) + +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event HedgedRequestsChangeReplica; +} + +namespace DB +{ +namespace ErrorCodes +{ + extern const int MISMATCH_REPLICAS_DATA_SOURCES; + extern const int LOGICAL_ERROR; + extern const int SOCKET_TIMEOUT; + extern const int ALL_CONNECTION_TRIES_FAILED; +} + +HedgedConnections::HedgedConnections( + const ConnectionPoolWithFailoverPtr & pool_, + const Settings & settings_, + const ConnectionTimeouts & timeouts_, + const ThrottlerPtr & throttler_, + PoolMode pool_mode, + std::shared_ptr table_to_check_) + : hedged_connections_factory(pool_, &settings_, timeouts_, table_to_check_) + , settings(settings_) + , throttler(throttler_) +{ + std::vector connections = hedged_connections_factory.getManyConnections(pool_mode); + + if (connections.empty()) + return; + + offset_states.reserve(connections.size()); + for (size_t i = 0; i != connections.size(); ++i) + { + offset_states.emplace_back(); + offset_states[i].replicas.emplace_back(connections[i]); + offset_states[i].active_connection_count = 1; + + ReplicaState & replica = offset_states[i].replicas.back(); + replica.connection->setThrottler(throttler_); + + epoll.add(replica.packet_receiver->getFileDescriptor()); + fd_to_replica_location[replica.packet_receiver->getFileDescriptor()] = ReplicaLocation{i, 0}; + + epoll.add(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{i, 0}; + } + + active_connection_count = connections.size(); + offsets_with_disabled_changing_replica = 0; + pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); +} + +void HedgedConnections::Pipeline::add(std::function send_function) +{ + pipeline.push_back(send_function); +} + +void HedgedConnections::Pipeline::run(ReplicaState & replica) +{ + for (auto & send_func : pipeline) + send_func(replica); +} + +void HedgedConnections::sendScalarsData(Scalars & data) +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query) + throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); + + auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); }; + + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) + send_scalars_data(replica); + + pipeline_for_new_replicas.add(send_scalars_data); +} + +void HedgedConnections::sendExternalTablesData(std::vector & data) +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query) + throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); + + if (data.size() != size()) + throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); + + auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); }; + + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) + send_external_tables_data(replica); + + pipeline_for_new_replicas.add(send_external_tables_data); +} + +void HedgedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) +{ + 
std::lock_guard lock(cancel_mutex); + + if (sent_query) + throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); + + auto send_ignored_part_uuids = [&uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); }; + + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) + send_ignored_part_uuids(replica); + + pipeline_for_new_replicas.add(send_ignored_part_uuids); +} + +void HedgedConnections::sendQuery( + const ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) +{ + std::lock_guard lock(cancel_mutex); + + if (sent_query) + throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); + + for (auto & offset_state : offset_states) + { + for (auto & replica : offset_state.replicas) + { + if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + { + disable_two_level_aggregation = true; + break; + } + } + if (disable_two_level_aggregation) + break; + } + + if (!disable_two_level_aggregation) + { + /// Tell hedged_connections_factory to skip replicas that doesn't support two-level aggregation. + hedged_connections_factory.skipReplicasWithTwoLevelAggregationIncompatibility(); + } + + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) + { + Settings modified_settings = settings; + + if (disable_two_level_aggregation) + { + /// Disable two-level aggregation due to version incompatibility. + modified_settings.group_by_two_level_threshold = 0; + modified_settings.group_by_two_level_threshold_bytes = 0; + } + + if (offset_states.size() > 1) + { + modified_settings.parallel_replicas_count = offset_states.size(); + modified_settings.parallel_replica_offset = fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset; + } + + replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); + replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout); + }; + + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) + send_query(replica); + + pipeline_for_new_replicas.add(send_query); + sent_query = true; +} + +void HedgedConnections::disconnect() +{ + std::lock_guard lock(cancel_mutex); + + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) + if (replica.connection) + finishProcessReplica(replica, true); + + if (hedged_connections_factory.hasEventsInProcess()) + { + if (hedged_connections_factory.numberOfProcessingReplicas() > 0) + epoll.remove(hedged_connections_factory.getFileDescriptor()); + + hedged_connections_factory.stopChoosingReplicas(); + } +} + +std::string HedgedConnections::dumpAddresses() const +{ + std::lock_guard lock(cancel_mutex); + + std::string addresses; + bool is_first = true; + + for (const auto & offset_state : offset_states) + { + for (const auto & replica : offset_state.replicas) + { + if (replica.connection) + { + addresses += (is_first ? "" : "; ") + replica.connection->getDescription(); + is_first = false; + } + } + } + + return addresses; +} + +void HedgedConnections::sendCancel() +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query || cancelled) + throw Exception("Cannot cancel. 
Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); + + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) + if (replica.connection) + replica.connection->sendCancel(); + + cancelled = true; +} + +Packet HedgedConnections::drain() +{ + std::lock_guard lock(cancel_mutex); + + if (!cancelled) + throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR); + + Packet res; + res.type = Protocol::Server::EndOfStream; + + while (!epoll.empty()) + { + ReplicaLocation location = getReadyReplicaLocation(); + Packet packet = receivePacketFromReplica(location); + switch (packet.type) + { + case Protocol::Server::PartUUIDs: + case Protocol::Server::Data: + case Protocol::Server::Progress: + case Protocol::Server::ProfileInfo: + case Protocol::Server::Totals: + case Protocol::Server::Extremes: + case Protocol::Server::EndOfStream: + break; + + case Protocol::Server::Exception: + default: + /// If we receive an exception or an unknown packet, we save it. + res = std::move(packet); + break; + } + } + + return res; +} + +Packet HedgedConnections::receivePacket() +{ + std::lock_guard lock(cancel_mutex); + return receivePacketUnlocked({}); +} + +Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) +{ + if (!sent_query) + throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR); + if (!hasActiveConnections()) + throw Exception("No more packets are available.", ErrorCodes::LOGICAL_ERROR); + + if (epoll.empty()) + throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR); + + ReplicaLocation location = getReadyReplicaLocation(std::move(async_callback)); + return receivePacketFromReplica(location); +} + +HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback) +{ + /// Firstly, resume replica with the last received packet if it has pending data. + if (replica_with_last_received_packet) + { + ReplicaLocation location = replica_with_last_received_packet.value(); + replica_with_last_received_packet.reset(); + if (offset_states[location.offset].replicas[location.index].connection->hasReadPendingData() && resumePacketReceiver(location)) + return location; + } + + int event_fd; + while (true) + { + /// Get ready file descriptor from epoll and process it. 
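Every replica socket and timer descriptor is registered in one epoll instance, and the receive loop simply asks for a single ready descriptor, blocking or with a zero timeout when an async callback is installed. A standalone Linux sketch of that pattern, using an eventfd as a stand-in for a replica socket becoming readable:

#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <cstdint>
#include <iostream>

int main()
{
    int epoll_fd = epoll_create1(0);
    int event_fd = eventfd(0, 0);            /// stand-in for "a replica socket has data"

    epoll_event registration{};
    registration.events = EPOLLIN;
    registration.data.fd = event_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event_fd, &registration);

    uint64_t value = 1;
    write(event_fd, &value, sizeof(value));  /// make the descriptor ready

    epoll_event ready{};
    int timeout_ms = -1;                     /// -1 blocks; 0 would poll without waiting
    int count = epoll_wait(epoll_fd, &ready, 1, timeout_ms);
    if (count == 1)
        std::cout << "ready fd: " << ready.data.fd << '\n';

    close(event_fd);
    close(epoll_fd);
}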
+ event_fd = getReadyFileDescriptor(async_callback); + + if (event_fd == hedged_connections_factory.getFileDescriptor()) + checkNewReplica(); + else if (fd_to_replica_location.contains(event_fd)) + { + ReplicaLocation location = fd_to_replica_location[event_fd]; + if (resumePacketReceiver(location)) + return location; + } + else if (timeout_fd_to_replica_location.contains(event_fd)) + { + ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; + offset_states[location.offset].replicas[location.index].change_replica_timeout.reset(); + offset_states[location.offset].replicas[location.index].is_change_replica_timeout_expired = true; + offset_states[location.offset].next_replica_in_process = true; + offsets_queue.push(location.offset); + ProfileEvents::increment(ProfileEvents::HedgedRequestsChangeReplica); + startNewReplica(); + } + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + } +}; + +bool HedgedConnections::resumePacketReceiver(const HedgedConnections::ReplicaLocation & location) +{ + ReplicaState & replica_state = offset_states[location.offset].replicas[location.index]; + auto res = replica_state.packet_receiver->resume(); + + if (std::holds_alternative(res)) + { + last_received_packet = std::move(std::get(res)); + return true; + } + else if (std::holds_alternative(res)) + { + finishProcessReplica(replica_state, true); + + /// Check if there is no more active connections with the same offset and there is no new replica in process. + if (offset_states[location.offset].active_connection_count == 0 && !offset_states[location.offset].next_replica_in_process) + throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); + } + + return false; +} + +int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) +{ + epoll_event event; + event.data.fd = -1; + size_t events_count = 0; + bool blocking = !static_cast(async_callback); + while (events_count == 0) + { + events_count = epoll.getManyReady(1, &event, blocking); + if (!events_count && async_callback) + async_callback(epoll.getFileDescriptor(), 0, epoll.getDescription()); + } + return event.data.fd; +} + +Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location) +{ + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; + Packet packet = std::move(last_received_packet); + switch (packet.type) + { + case Protocol::Server::Data: + /// If we received the first not empty data packet and still can change replica, + /// disable changing replica with this offset. + if (offset_states[replica_location.offset].can_change_replica && packet.block.rows() > 0) + disableChangingReplica(replica_location); + replica_with_last_received_packet = replica_location; + break; + case Protocol::Server::Progress: + /// Check if we have made some progress and still can change replica. + if (offset_states[replica_location.offset].can_change_replica && packet.progress.read_bytes > 0) + { + /// If we are allowed to change replica until the first data packet, + /// just restart timeout (if it hasn't expired yet). Otherwise disable changing replica with this offset. 
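The change_replica_timeout used here is a TimerDescriptor, presumably a timerfd wrapper on Linux, whose descriptor sits in the same epoll as the sockets. The arm / re-arm / disarm cycle described above looks roughly like this standalone sketch (timings are made up):

#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdint>
#include <iostream>

int main()
{
    int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
    if (tfd < 0)
        return 1;

    itimerspec spec{};
    spec.it_value.tv_nsec = 50 * 1000 * 1000;     /// arm: expire in 50 ms (like setRelative)
    timerfd_settime(tfd, 0, &spec, nullptr);

    /// Progress arrived: either re-arm with a fresh relative value,
    /// or disarm completely with an all-zero itimerspec (like reset).
    itimerspec disarm{};
    timerfd_settime(tfd, 0, &disarm, nullptr);

    uint64_t expirations = 0;
    ssize_t n = read(tfd, &expirations, sizeof(expirations));   /// EAGAIN: the timer never fired
    std::cout << (n < 0 ? "timer disarmed before expiring\n" : "timer fired\n");

    close(tfd);
    return 0;
}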
+ if (settings.allow_changing_replica_until_first_data_packet && !replica.is_change_replica_timeout_expired) + replica.change_replica_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_data_timeout); + else + disableChangingReplica(replica_location); + } + replica_with_last_received_packet = replica_location; + break; + case Protocol::Server::PartUUIDs: + case Protocol::Server::ProfileInfo: + case Protocol::Server::Totals: + case Protocol::Server::Extremes: + case Protocol::Server::Log: + replica_with_last_received_packet = replica_location; + break; + + case Protocol::Server::EndOfStream: + /// Check case when we receive EndOfStream before first not empty data packet + /// or positive progress. It may happen if max_parallel_replicas > 1 and + /// there is no way to sample data in this query. + if (offset_states[replica_location.offset].can_change_replica) + disableChangingReplica(replica_location); + finishProcessReplica(replica, false); + break; + + case Protocol::Server::Exception: + default: + /// Check case when we receive Exception before first not empty data packet + /// or positive progress. It may happen if max_parallel_replicas > 1 and + /// there is no way to sample data in this query. + if (offset_states[replica_location.offset].can_change_replica) + disableChangingReplica(replica_location); + finishProcessReplica(replica, true); + break; + } + + return packet; +} + +void HedgedConnections::disableChangingReplica(const ReplicaLocation & replica_location) +{ + /// Stop working with replicas, that are responsible for the same offset. + OffsetState & offset_state = offset_states[replica_location.offset]; + offset_state.replicas[replica_location.index].change_replica_timeout.reset(); + ++offsets_with_disabled_changing_replica; + offset_state.can_change_replica = false; + + for (size_t i = 0; i != offset_state.replicas.size(); ++i) + { + if (i != replica_location.index && offset_state.replicas[i].connection) + { + offset_state.replicas[i].connection->sendCancel(); + finishProcessReplica(offset_state.replicas[i], true); + } + } + + /// If we disabled changing replica with all offsets, we need to stop choosing new replicas. + if (hedged_connections_factory.hasEventsInProcess() && offsets_with_disabled_changing_replica == offset_states.size()) + { + if (hedged_connections_factory.numberOfProcessingReplicas() > 0) + epoll.remove(hedged_connections_factory.getFileDescriptor()); + hedged_connections_factory.stopChoosingReplicas(); + } +} + +void HedgedConnections::startNewReplica() +{ + Connection * connection = nullptr; + HedgedConnectionsFactory::State state = hedged_connections_factory.startNewConnection(connection); + + /// Check if we need to add hedged_connections_factory file descriptor to epoll. + if (state == HedgedConnectionsFactory::State::NOT_READY && hedged_connections_factory.numberOfProcessingReplicas() == 1) + epoll.add(hedged_connections_factory.getFileDescriptor()); + + processNewReplicaState(state, connection); +} + +void HedgedConnections::checkNewReplica() +{ + Connection * connection = nullptr; + HedgedConnectionsFactory::State state = hedged_connections_factory.waitForReadyConnections(connection); + + processNewReplicaState(state, connection); + + /// Check if we don't need to listen hedged_connections_factory file descriptor in epoll anymore. 
+ if (hedged_connections_factory.numberOfProcessingReplicas() == 0) + epoll.remove(hedged_connections_factory.getFileDescriptor()); +} + +void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection) +{ + switch (state) + { + case HedgedConnectionsFactory::State::READY: + { + size_t offset = offsets_queue.front(); + offsets_queue.pop(); + + offset_states[offset].replicas.emplace_back(connection); + ++offset_states[offset].active_connection_count; + offset_states[offset].next_replica_in_process = false; + ++active_connection_count; + + ReplicaState & replica = offset_states[offset].replicas.back(); + epoll.add(replica.packet_receiver->getFileDescriptor()); + fd_to_replica_location[replica.packet_receiver->getFileDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1}; + epoll.add(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1}; + + pipeline_for_new_replicas.run(replica); + break; + } + case HedgedConnectionsFactory::State::CANNOT_CHOOSE: + { + while (!offsets_queue.empty()) + { + /// Check if there is no active replica with needed offsets. + if (offset_states[offsets_queue.front()].active_connection_count == 0) + throw Exception("Cannot find enough connections to replicas", ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + offset_states[offsets_queue.front()].next_replica_in_process = false; + offsets_queue.pop(); + } + break; + } + case HedgedConnectionsFactory::State::NOT_READY: + break; + } +} + +void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) +{ + /// It's important to remove file descriptor from epoll exactly before cancelling packet_receiver, + /// because otherwise another thread can try to receive a packet, get this file descriptor + /// from epoll and resume cancelled packet_receiver. + epoll.remove(replica.packet_receiver->getFileDescriptor()); + epoll.remove(replica.change_replica_timeout.getDescriptor()); + + replica.packet_receiver->cancel(); + replica.change_replica_timeout.reset(); + + --offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count; + fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor()); + timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor()); + + --active_connection_count; + + if (disconnect) + replica.connection->disconnect(); + replica.connection = nullptr; +} + +} +#endif diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h new file mode 100644 index 00000000000..9f7d8837536 --- /dev/null +++ b/src/Client/HedgedConnections.h @@ -0,0 +1,200 @@ +#pragma once +#if defined(OS_LINUX) + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +/** To receive data from multiple replicas (connections) from one shard asynchronously. + * The principe of Hedged Connections is used to reduce tail latency: + * if we don't receive data from replica and there is no progress in query execution + * for a long time, we try to get new replica and send query to it, + * without cancelling working with previous replica. This class + * supports all functionality that MultipleConnections has. 
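The idea in the comment above, reduced to its essence: send the query to a primary replica, and if nothing arrives within a timeout, send the same query to a backup without cancelling the primary, then take whichever answers first. A toy illustration with std::async (names and timings are made up, and the real class multiplexes sockets with epoll rather than spawning threads; note the losing future is still joined when it leaves scope):

#include <chrono>
#include <future>
#include <iostream>
#include <string>
#include <thread>

using namespace std::chrono_literals;

/// Toy stand-in for "send the query to a replica and wait for data".
std::string queryReplica(std::string name, std::chrono::milliseconds latency)
{
    std::this_thread::sleep_for(latency);
    return "result from " + name;
}

int main()
{
    auto primary = std::async(std::launch::async, queryReplica, "replica-1", 500ms);

    /// Nothing within the "receive data timeout": hedge with a second replica.
    if (primary.wait_for(100ms) == std::future_status::ready)
    {
        std::cout << primary.get() << '\n';
        return 0;
    }

    auto backup = std::async(std::launch::async, queryReplica, "replica-2", 50ms);
    if (backup.wait_for(500ms) == std::future_status::ready)
        std::cout << backup.get() << '\n';   /// the faster backup usually wins here
    else
        std::cout << primary.get() << '\n';
    return 0;
}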
+ */ +class HedgedConnections : public IConnections +{ +public: + using PacketReceiverPtr = std::unique_ptr; + struct ReplicaState + { + explicit ReplicaState(Connection * connection_) : connection(connection_), packet_receiver(std::make_unique(connection_)) + { + } + + Connection * connection = nullptr; + PacketReceiverPtr packet_receiver; + TimerDescriptor change_replica_timeout; + bool is_change_replica_timeout_expired = false; + }; + + struct OffsetState + { + /// Replicas with the same offset. + std::vector replicas; + /// An amount of active replicas. When can_change_replica is false, + /// active_connection_count is always <= 1 (because we stopped working with + /// other replicas with the same offset) + size_t active_connection_count = 0; + bool can_change_replica = true; + + /// This flag is true when this offset is in queue for + /// new replicas. It's needed to process receive timeout + /// (throw an exception when receive timeout expired and there is no + /// new replica in process) + bool next_replica_in_process = false; + }; + + /// We process events in epoll, so we need to determine replica by it's + /// file descriptor. We store map fd -> replica location. To determine + /// where replica is, we need a replica offset + /// (the same as parallel_replica_offset), and index, which is needed because + /// we can have many replicas with same offset (when receive_data_timeout has expired). + struct ReplicaLocation + { + size_t offset; + size_t index; + }; + + HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, + const Settings & settings_, + const ConnectionTimeouts & timeouts_, + const ThrottlerPtr & throttler, + PoolMode pool_mode, + std::shared_ptr table_to_check_ = nullptr); + + void sendScalarsData(Scalars & data) override; + + void sendExternalTablesData(std::vector & data) override; + + void sendQuery( + const ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) override; + + void sendReadTaskResponse(const String &) override + { + throw Exception("sendReadTaskResponse in not supported with HedgedConnections", ErrorCodes::LOGICAL_ERROR); + } + + Packet receivePacket() override; + + Packet receivePacketUnlocked(AsyncCallback async_callback) override; + + void disconnect() override; + + void sendCancel() override; + + void sendIgnoredPartUUIDs(const std::vector & uuids) override; + + Packet drain() override; + + std::string dumpAddresses() const override; + + size_t size() const override { return offset_states.size(); } + + bool hasActiveConnections() const override { return active_connection_count > 0; } + +private: + /// If we don't receive data from replica and there is no progress in query + /// execution for receive_data_timeout, we are trying to get new + /// replica and send query to it. Beside sending query, there are some + /// additional actions like sendScalarsData or sendExternalTablesData and we need + /// to perform these actions in the same order on the new replica. So, we will + /// save actions with replicas in pipeline to perform them on the new replicas. 
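The "pipeline" mentioned in the comment above can be pictured as a recorded list of send actions that a late-joining replica replays in the original order. A minimal sketch with hypothetical types, not the class declared below:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

/// Simplified stand-in for a replica connection.
struct Replica { std::string name; };

class SendPipeline
{
public:
    void add(std::function<void(Replica &)> action) { actions.push_back(std::move(action)); }
    void run(Replica & replica) const
    {
        for (const auto & action : actions)
            action(replica);   /// replay everything already sent, in order
    }
private:
    std::vector<std::function<void(Replica &)>> actions;
};

int main()
{
    SendPipeline pipeline;
    pipeline.add([](Replica & r) { std::cout << "send query to " << r.name << '\n'; });
    pipeline.add([](Replica & r) { std::cout << "send scalars to " << r.name << '\n'; });

    Replica late_replica{"replica-3"};
    pipeline.run(late_replica);   /// the late replica catches up on everything already sent
}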
+ class Pipeline + { + public: + void add(std::function send_function); + + void run(ReplicaState & replica); + private: + std::vector> pipeline; + }; + + Packet receivePacketFromReplica(const ReplicaLocation & replica_location); + + ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {}); + + bool resumePacketReceiver(const ReplicaLocation & replica_location); + + void disableChangingReplica(const ReplicaLocation & replica_location); + + void startNewReplica(); + + void checkNewReplica(); + + void processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection); + + void finishProcessReplica(ReplicaState & replica, bool disconnect); + + int getReadyFileDescriptor(AsyncCallback async_callback = {}); + + HedgedConnectionsFactory hedged_connections_factory; + + /// All replicas in offset_states[offset] is responsible for process query + /// with setting parallel_replica_offset = offset. In common situations + /// replica_states[offset].replicas.size() = 1 (like in MultiplexedConnections). + std::vector offset_states; + + /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). + std::unordered_map fd_to_replica_location; + + /// Map receive data timeout file descriptor to replica location. + std::unordered_map timeout_fd_to_replica_location; + + /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from + /// the replica, we push it's offset to this queue and start trying to get + /// new replica. + std::queue offsets_queue; + + /// The current number of valid connections to the replicas of this shard. + size_t active_connection_count; + + /// We count offsets in which we can't change replica anymore, + /// it's needed to cancel choosing new replicas when we + /// disabled replica changing in all offsets. + size_t offsets_with_disabled_changing_replica; + + Pipeline pipeline_for_new_replicas; + + /// New replica may not support two-level aggregation due to version incompatibility. + /// If we didn't disabled it, we need to skip this replica. + bool disable_two_level_aggregation = false; + + /// We will save replica with last received packet + /// (except cases when packet type is EndOfStream or Exception) + /// to resume it's packet receiver when new packet is needed. 
+ std::optional replica_with_last_received_packet; + + Packet last_received_packet; + + Epoll epoll; + const Settings & settings; + ThrottlerPtr throttler; + bool sent_query = false; + bool cancelled = false; + + mutable std::mutex cancel_mutex; +}; + +} +#endif diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp new file mode 100644 index 00000000000..b0c5a3d22f5 --- /dev/null +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -0,0 +1,409 @@ +#if defined(OS_LINUX) + +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event HedgedRequestsChangeReplica; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ALL_CONNECTION_TRIES_FAILED; + extern const int ALL_REPLICAS_ARE_STALE; + extern const int LOGICAL_ERROR; +} + +HedgedConnectionsFactory::HedgedConnectionsFactory( + const ConnectionPoolWithFailoverPtr & pool_, + const Settings * settings_, + const ConnectionTimeouts & timeouts_, + std::shared_ptr table_to_check_) + : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_), log(&Poco::Logger::get("HedgedConnectionsFactory")) +{ + shuffled_pools = pool->getShuffledPools(settings); + for (size_t i = 0; i != shuffled_pools.size(); ++i) + replicas.emplace_back(ConnectionEstablisherAsync(shuffled_pools[i].pool, &timeouts, settings, log, table_to_check.get())); + + max_tries + = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); + + fallback_to_stale_replicas = settings && settings->fallback_to_stale_replicas_for_distributed_queries; +} + +HedgedConnectionsFactory::~HedgedConnectionsFactory() +{ + /// Stop anything that maybe in progress, + /// to avoid interfer with the subsequent connections. + /// + /// I.e. some replcas may be in the establishing state, + /// this means that hedged connection is waiting for TablesStatusResponse, + /// and if the connection will not be canceled, + /// then next user of the connection will get TablesStatusResponse, + /// while this is not the expected package. + stopChoosingReplicas(); + + pool->updateSharedError(shuffled_pools); +} + +std::vector HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode) +{ + size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; + + size_t max_entries = 1; + switch (pool_mode) + { + case PoolMode::GET_ALL: + { + min_entries = shuffled_pools.size(); + max_entries = shuffled_pools.size(); + break; + } + case PoolMode::GET_ONE: + { + max_entries = 1; + break; + } + case PoolMode::GET_MANY: + { + max_entries = settings ? size_t(settings->max_parallel_replicas) : 1; + break; + } + } + + std::vector connections; + connections.reserve(max_entries); + Connection * connection = nullptr; + + /// Try to start establishing connections with max_entries replicas. + for (size_t i = 0; i != max_entries; ++i) + { + ++requested_connections_count; + State state = startNewConnectionImpl(connection); + if (state == State::READY) + connections.push_back(connection); + if (state == State::CANNOT_CHOOSE) + break; + } + + /// Process connections until we get enough READY connections + /// (work asynchronously with all connections we started). + /// TODO: when we get GET_ALL mode we can start reading packets from ready + /// TODO: connection as soon as we got it, not even waiting for the others. + while (connections.size() < max_entries) + { + /// Set blocking = true to avoid busy-waiting here. 
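For context, getManyConnections() above derives how many replicas to request from the pool mode. A compressed sketch of that mapping, with hypothetical names and the relevant settings passed in explicitly:

#include <cstddef>
#include <iostream>

enum class PoolMode { GET_ONE, GET_MANY, GET_ALL };

struct EntryLimits { size_t min_entries; size_t max_entries; };

EntryLimits limitsFor(PoolMode mode, size_t pool_count, size_t max_parallel_replicas, bool skip_unavailable_shards)
{
    EntryLimits limits{skip_unavailable_shards ? size_t{0} : size_t{1}, 1};
    switch (mode)
    {
        case PoolMode::GET_ALL:  limits.min_entries = limits.max_entries = pool_count; break;
        case PoolMode::GET_ONE:  limits.max_entries = 1; break;
        case PoolMode::GET_MANY: limits.max_entries = max_parallel_replicas; break;
    }
    return limits;
}

int main()
{
    auto limits = limitsFor(PoolMode::GET_MANY, /*pool_count=*/5, /*max_parallel_replicas=*/3, /*skip_unavailable_shards=*/false);
    std::cout << limits.min_entries << " .. " << limits.max_entries << '\n';   /// 1 .. 3
}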
+ auto state = waitForReadyConnectionsImpl(/*blocking = */true, connection); + if (state == State::READY) + connections.push_back(connection); + else if (state == State::CANNOT_CHOOSE) + { + if (connections.size() >= min_entries) + break; + + /// Determine the reason of not enough replicas. + if (!fallback_to_stale_replicas && up_to_date_count < min_entries) + throw Exception( + "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(connections.size()) + + ", needed: " + std::to_string(min_entries), + DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); + if (usable_count < min_entries) + throw NetException( + "All connection tries failed. Log: \n\n" + fail_messages + "\n", + DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + + throw Exception("Unknown reason of not enough replicas.", ErrorCodes::LOGICAL_ERROR); + } + } + + return connections; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnection(Connection *& connection_out) +{ + ++requested_connections_count; + State state = startNewConnectionImpl(connection_out); + /// If we cannot start new connection but there are connections in epoll, return NOT_READY. + if (state == State::CANNOT_CHOOSE && !epoll.empty()) + state = State::NOT_READY; + + return state; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnections(Connection *& connection_out) +{ + return waitForReadyConnectionsImpl(false, connection_out); +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out) +{ + State state = processEpollEvents(blocking, connection_out); + if (state != State::CANNOT_CHOOSE) + return state; + + /// We reach this point only if there was no free up to date replica. + /// We will try to use usable replica. + + /// Check if we are not allowed to use usable replicas or there is no even a free usable replica. + if (!fallback_to_stale_replicas) + return State::CANNOT_CHOOSE; + + return setBestUsableReplica(connection_out); +} + +int HedgedConnectionsFactory::getNextIndex() +{ + /// Check if there is no free replica. + if (entries_count + replicas_in_process_count + failed_pools_count >= shuffled_pools.size()) + return -1; + + /// Check if it's the first time. + if (last_used_index == -1) + { + last_used_index = 0; + return 0; + } + + bool finish = false; + int next_index = last_used_index; + while (!finish) + { + next_index = (next_index + 1) % shuffled_pools.size(); + + /// Check if we can try this replica. + if (replicas[next_index].connection_establisher.getResult().entry.isNull() + && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries)) + finish = true; + + /// If we made a complete round, there is no replica to connect. 
+ else if (next_index == last_used_index) + return -1; + } + + last_used_index = next_index; + return next_index; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnectionImpl(Connection *& connection_out) +{ + int index; + State state; + do + { + index = getNextIndex(); + if (index == -1) + return State::CANNOT_CHOOSE; + + state = resumeConnectionEstablisher(index, connection_out); + } + while (state == State::CANNOT_CHOOSE); + + return state; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out) +{ + int event_fd; + while (!epoll.empty()) + { + event_fd = getReadyFileDescriptor(blocking); + + if (event_fd == -1) + return State::NOT_READY; + + if (fd_to_replica_index.contains(event_fd)) + { + int index = fd_to_replica_index[event_fd]; + State state = resumeConnectionEstablisher(index, connection_out); + if (state == State::NOT_READY) + continue; + + /// Connection establishing not in process now, remove all + /// information about it from epoll. + removeReplicaFromEpoll(index, event_fd); + + if (state == State::READY) + return state; + } + else if (timeout_fd_to_replica_index.contains(event_fd)) + { + int index = timeout_fd_to_replica_index[event_fd]; + replicas[index].change_replica_timeout.reset(); + ++shuffled_pools[index].slowdown_count; + ProfileEvents::increment(ProfileEvents::HedgedRequestsChangeReplica); + } + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + + /// We reach this point only if we need to start new connection + /// (Special timeout expired or one of the previous connections failed). + /// Return only if replica is ready. + if (startNewConnectionImpl(connection_out) == State::READY) + return State::READY; + } + + return State::CANNOT_CHOOSE; +} + +int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) +{ + epoll_event event; + event.data.fd = -1; + epoll.getManyReady(1, &event, blocking); + return event.data.fd; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::resumeConnectionEstablisher(int index, Connection *& connection_out) +{ + auto res = replicas[index].connection_establisher.resume(); + + if (std::holds_alternative(res)) + return processFinishedConnection(index, std::get(res), connection_out); + + int fd = std::get(res); + if (!fd_to_replica_index.contains(fd)) + addNewReplicaToEpoll(index, fd); + + return State::NOT_READY; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::processFinishedConnection(int index, TryResult result, Connection *& connection_out) +{ + const std::string & fail_message = replicas[index].connection_establisher.getFailMessage(); + if (!fail_message.empty()) + fail_messages += fail_message + "\n"; + + if (!result.entry.isNull()) + { + ++entries_count; + + if (result.is_usable) + { + ++usable_count; + if (result.is_up_to_date) + { + ++up_to_date_count; + if (!skip_replicas_with_two_level_aggregation_incompatibility || !isTwoLevelAggregationIncompatible(&*result.entry)) + { + replicas[index].is_ready = true; + ++ready_replicas_count; + connection_out = &*result.entry; + return State::READY; + } + } + } + } + else + { + ShuffledPool & shuffled_pool = shuffled_pools[index]; + LOG_WARNING( + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); + + shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); + 
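getNextIndex() above walks the shuffled replicas as a ring, starting after the last used index and skipping replicas that cannot be tried anymore, returning -1 only after a full round finds nothing. A simplified standalone version of that walk, where a plain "exhausted" flag stands in for the error-count and already-connected checks:

#include <cstddef>
#include <iostream>
#include <vector>

int nextReplicaIndex(const std::vector<bool> & exhausted, int last_used)
{
    const int n = static_cast<int>(exhausted.size());
    if (last_used == -1)
        return exhausted.empty() || exhausted[0] ? -1 : 0;   /// first call, simplified

    int candidate = last_used;
    do
    {
        candidate = (candidate + 1) % n;
        if (!exhausted[candidate])
            return candidate;
    } while (candidate != last_used);

    return -1;   /// made a complete round, no replica left to try
}

int main()
{
    std::vector<bool> exhausted = {false, true, false, true};
    int last_used = 0;
    for (int i = 0; i < 3; ++i)
    {
        last_used = nextReplicaIndex(exhausted, last_used);
        std::cout << last_used << ' ';   /// cycles over the non-exhausted replicas: 2 0 2
    }
    std::cout << '\n';
}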
shuffled_pool.slowdown_count = 0; + + if (shuffled_pool.error_count >= max_tries) + { + ++failed_pools_count; + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); + } + } + + return State::CANNOT_CHOOSE; +} + +void HedgedConnectionsFactory::stopChoosingReplicas() +{ + for (auto & [fd, index] : fd_to_replica_index) + { + --replicas_in_process_count; + epoll.remove(fd); + replicas[index].connection_establisher.cancel(); + } + + for (auto & [timeout_fd, index] : timeout_fd_to_replica_index) + { + replicas[index].change_replica_timeout.reset(); + epoll.remove(timeout_fd); + } + + fd_to_replica_index.clear(); + timeout_fd_to_replica_index.clear(); +} + +void HedgedConnectionsFactory::addNewReplicaToEpoll(int index, int fd) +{ + ++replicas_in_process_count; + epoll.add(fd); + fd_to_replica_index[fd] = index; + + /// Add timeout for changing replica. + replicas[index].change_replica_timeout.setRelative(timeouts.hedged_connection_timeout); + epoll.add(replicas[index].change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_index[replicas[index].change_replica_timeout.getDescriptor()] = index; +} + +void HedgedConnectionsFactory::removeReplicaFromEpoll(int index, int fd) +{ + --replicas_in_process_count; + epoll.remove(fd); + fd_to_replica_index.erase(fd); + + replicas[index].change_replica_timeout.reset(); + epoll.remove(replicas[index].change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_index.erase(replicas[index].change_replica_timeout.getDescriptor()); +} + +int HedgedConnectionsFactory::numberOfProcessingReplicas() const +{ + if (epoll.empty()) + return 0; + + return requested_connections_count - ready_replicas_count; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out) +{ + std::vector indexes; + for (size_t i = 0; i != replicas.size(); ++i) + { + /// Don't add unusable, failed replicas and replicas that are ready or in process. + TryResult result = replicas[i].connection_establisher.getResult(); + if (!result.entry.isNull() + && result.is_usable + && !replicas[i].is_ready + && (!skip_replicas_with_two_level_aggregation_incompatibility || !isTwoLevelAggregationIncompatible(&*result.entry))) + indexes.push_back(i); + } + + if (indexes.empty()) + return State::CANNOT_CHOOSE; + + /// Sort replicas by staleness. + std::stable_sort( + indexes.begin(), + indexes.end(), + [&](size_t lhs, size_t rhs) + { + return replicas[lhs].connection_establisher.getResult().staleness < replicas[rhs].connection_establisher.getResult().staleness; + }); + + replicas[indexes[0]].is_ready = true; + TryResult result = replicas[indexes[0]].connection_establisher.getResult(); + connection_out = &*result.entry; + return State::READY; +} + +bool HedgedConnectionsFactory::isTwoLevelAggregationIncompatible(Connection * connection) +{ + return connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD; +} + +} +#endif diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h new file mode 100644 index 00000000000..c5e8d493efa --- /dev/null +++ b/src/Client/HedgedConnectionsFactory.h @@ -0,0 +1,158 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/** Class for establishing hedged connections with replicas. 
+ * The process of establishing connection is divided on stages, on each stage if + * replica doesn't respond for a long time, we start establishing connection with + * the next replica, without cancelling working with previous one. + * It works with multiple replicas simultaneously without blocking by using epoll. + */ +class HedgedConnectionsFactory +{ +public: + using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool; + using TryResult = PoolWithFailoverBase::TryResult; + + enum class State + { + READY, + NOT_READY, + CANNOT_CHOOSE, + }; + + struct ReplicaStatus + { + explicit ReplicaStatus(ConnectionEstablisherAsync connection_stablisher_) : connection_establisher(std::move(connection_stablisher_)) + { + } + + ConnectionEstablisherAsync connection_establisher; + TimerDescriptor change_replica_timeout; + bool is_ready = false; + }; + + HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_, + const Settings * settings_, + const ConnectionTimeouts & timeouts_, + std::shared_ptr table_to_check_ = nullptr); + + /// Create and return active connections according to pool_mode. + std::vector getManyConnections(PoolMode pool_mode); + + /// Try to get connection to the new replica without blocking. Process all current events in epoll (connections, timeouts), + /// Returned state might be READY (connection established successfully), + /// NOT_READY (there are no ready events now) and CANNOT_CHOOSE (cannot produce new connection anymore). + /// If state is READY, replica connection will be written in connection_out. + State waitForReadyConnections(Connection *& connection_out); + + State startNewConnection(Connection *& connection_out); + + /// Stop working with all replicas that are not READY. + void stopChoosingReplicas(); + + bool hasEventsInProcess() const { return !epoll.empty(); } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; } + + int numberOfProcessingReplicas() const; + + /// Tell Factory to not return connections with two level aggregation incompatibility. + void skipReplicasWithTwoLevelAggregationIncompatibility() { skip_replicas_with_two_level_aggregation_incompatibility = true; } + + ~HedgedConnectionsFactory(); + +private: + State waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out); + + /// Try to start establishing connection to the new replica. Return + /// the index of the new replica or -1 if cannot start new connection. + State startNewConnectionImpl(Connection *& connection_out); + + /// Find an index of the next free replica to start connection. + /// Return -1 if there is no free replica. + int getNextIndex(); + + int getReadyFileDescriptor(bool blocking); + + void processFailedConnection(int index, const std::string & fail_message); + + State resumeConnectionEstablisher(int index, Connection *& connection_out); + + State processFinishedConnection(int index, TryResult result, Connection *& connection_out); + + void removeReplicaFromEpoll(int index, int fd); + + void addNewReplicaToEpoll(int index, int fd); + + /// Return NOT_READY state if there is no ready events, READY if replica is ready + /// and CANNOT_CHOOSE if there is no more events in epoll. 
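From the caller's point of view, the three states above form a small polling protocol: NOT_READY means "nothing happened yet, wait for epoll events", READY hands back one more connection, CANNOT_CHOOSE means no further replica can be produced. A toy consumer loop, where ToyFactory just replays a canned sequence of states instead of deriving them from epoll:

#include <deque>
#include <iostream>
#include <queue>

enum class State { READY, NOT_READY, CANNOT_CHOOSE };

/// Hypothetical stand-in for the factory: a scripted sequence of outcomes.
struct ToyFactory
{
    std::queue<State> script{std::deque<State>{State::NOT_READY, State::NOT_READY, State::READY, State::CANNOT_CHOOSE}};
    State waitForReadyConnections()
    {
        State s = script.front();
        script.pop();
        return s;
    }
};

int main()
{
    ToyFactory factory;
    int connections = 0;
    while (true)
    {
        switch (factory.waitForReadyConnections())
        {
            case State::READY:         ++connections; break;   /// got one more usable replica
            case State::NOT_READY:     continue;               /// nothing ready yet, poll again
            case State::CANNOT_CHOOSE: std::cout << "got " << connections << " connection(s)\n"; return 0;
        }
    }
}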
+ State processEpollEvents(bool blocking, Connection *& connection_out); + + State setBestUsableReplica(Connection *& connection_out); + + bool isTwoLevelAggregationIncompatible(Connection * connection); + + const ConnectionPoolWithFailoverPtr pool; + const Settings * settings; + const ConnectionTimeouts timeouts; + + std::vector shuffled_pools; + std::vector replicas; + + /// Map socket file descriptor to replica index. + std::unordered_map fd_to_replica_index; + + /// Map timeout for changing replica to replica index. + std::unordered_map timeout_fd_to_replica_index; + + /// If this flag is true, don't return connections with + /// two level aggregation incompatibility + bool skip_replicas_with_two_level_aggregation_incompatibility = false; + + std::shared_ptr table_to_check; + int last_used_index = -1; + bool fallback_to_stale_replicas; + Epoll epoll; + Poco::Logger * log; + std::string fail_messages; + + /// The maximum number of attempts to connect to replicas. + size_t max_tries; + /// Total number of established connections. + size_t entries_count = 0; + /// The number of established connections that are usable. + size_t usable_count = 0; + /// The number of established connections that are up to date. + size_t up_to_date_count = 0; + /// The number of failed connections (replica is considered failed after max_tries attempts to connect). + size_t failed_pools_count= 0; + + /// The number of replicas that are in process of connection. + size_t replicas_in_process_count = 0; + /// The number of ready replicas (replica is considered ready when it's + /// connection returns outside). + size_t ready_replicas_count = 0; + + /// The number of requested in startNewConnection replicas (it's needed for + /// checking the number of requested replicas that are still in process). + size_t requested_connections_count = 0; +}; + +} +#endif diff --git a/src/Client/IConnections.h b/src/Client/IConnections.h new file mode 100644 index 00000000000..d251a5fb3ab --- /dev/null +++ b/src/Client/IConnections.h @@ -0,0 +1,62 @@ +#pragma once + +#include + +namespace DB +{ + +/// Base class for working with multiple replicas (connections) +/// from one shard within a single thread +class IConnections : boost::noncopyable +{ +public: + /// Send all scalars to replicas. + virtual void sendScalarsData(Scalars & data) = 0; + /// Send all content of external tables to replicas. + virtual void sendExternalTablesData(std::vector & data) = 0; + + /// Send request to replicas. + virtual void sendQuery( + const ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) = 0; + + virtual void sendReadTaskResponse(const String &) = 0; + + /// Get packet from any replica. + virtual Packet receivePacket() = 0; + + /// Version of `receivePacket` function without locking. + virtual Packet receivePacketUnlocked(AsyncCallback async_callback) = 0; + + /// Break all active connections. + virtual void disconnect() = 0; + + /// Send a request to replicas to cancel the request + virtual void sendCancel() = 0; + + /// Send parts' uuids to replicas to exclude them from query processing + virtual void sendIgnoredPartUUIDs(const std::vector & uuids) = 0; + + /** On each replica, read and skip all packets to EndOfStream or Exception. + * Returns EndOfStream if no exception has been received. Otherwise + * returns the last received packet of type Exception. + */ + virtual Packet drain() = 0; + + /// Get the replica addresses as a string. 
+ virtual std::string dumpAddresses() const = 0; + + /// Returns the number of replicas. + virtual size_t size() const = 0; + + /// Check if there are any valid replicas. + virtual bool hasActiveConnections() const = 0; + + virtual ~IConnections() = default; +}; + +} diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index c50dd7b6454..350beffce28 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -13,6 +13,7 @@ namespace ErrorCodes extern const int MISMATCH_REPLICAS_DATA_SOURCES; extern const int NO_AVAILABLE_REPLICA; extern const int TIMEOUT_EXCEEDED; + extern const int UNKNOWN_PACKET_FROM_SERVER; } @@ -155,10 +156,19 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector & uuid } } + +void MultiplexedConnections::sendReadTaskResponse(const String & response) +{ + std::lock_guard lock(cancel_mutex); + if (cancelled) + return; + current_connection->sendReadTaskResponse(response); +} + Packet MultiplexedConnections::receivePacket() { std::lock_guard lock(cancel_mutex); - Packet packet = receivePacketUnlocked(); + Packet packet = receivePacketUnlocked({}); return packet; } @@ -206,10 +216,11 @@ Packet MultiplexedConnections::drain() while (hasActiveConnections()) { - Packet packet = receivePacketUnlocked(); + Packet packet = receivePacketUnlocked({}); switch (packet.type) { + case Protocol::Server::ReadTaskRequest: case Protocol::Server::PartUUIDs: case Protocol::Server::Data: case Protocol::Server::Progress: @@ -253,7 +264,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const return buf.str(); } -Packet MultiplexedConnections::receivePacketUnlocked(std::function async_callback) +Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callback) { if (!sent_query) throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR); @@ -265,10 +276,30 @@ Packet MultiplexedConnections::receivePacketUnlocked(std::functionreceivePacket(std::move(async_callback)); + Packet packet; + { + AsyncCallbackSetter async_setter(current_connection, std::move(async_callback)); + + try + { + packet = current_connection->receivePacket(); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_SERVER) + { + /// Exception may happen when packet is received, e.g. when got unknown packet. + /// In this case, invalidate replica, so that we would not read from it anymore. + current_connection->disconnect(); + invalidateReplica(state); + } + throw; + } + } switch (packet.type) { + case Protocol::Server::ReadTaskRequest: case Protocol::Server::PartUUIDs: case Protocol::Server::Data: case Protocol::Server::Progress: diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index da0326fa6c0..f642db1c4cd 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -16,7 +17,7 @@ namespace DB * * The interface is almost the same as Connection. */ -class MultiplexedConnections final : private boost::noncopyable +class MultiplexedConnections final : public IConnections { public: /// Accepts ready connection. @@ -27,52 +28,40 @@ public: std::vector && connections, const Settings & settings_, const ThrottlerPtr & throttler_); - /// Send all scalars to replicas. - void sendScalarsData(Scalars & data); - /// Send all content of external tables to replicas. 
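AsyncCallbackSetter in receivePacketUnlocked() above is used as a scope guard, so the callback installed on the connection cannot outlive the receive call even when receivePacket() throws. A generic sketch of that RAII pattern with hypothetical types, not the ClickHouse helper itself:

#include <functional>
#include <iostream>

struct Channel
{
    std::function<void()> on_would_block;
};

class ScopedCallback
{
public:
    ScopedCallback(Channel & channel_, std::function<void()> cb) : channel(channel_)
    {
        channel.on_would_block = std::move(cb);
    }
    ~ScopedCallback() { channel.on_would_block = nullptr; }   /// cleared even if an exception unwinds the scope
private:
    Channel & channel;
};

int main()
{
    Channel channel;
    {
        ScopedCallback guard(channel, [] { std::cout << "would block, yield\n"; });
        if (channel.on_would_block)
            channel.on_would_block();
    }
    std::cout << std::boolalpha << static_cast<bool>(channel.on_would_block) << '\n';   /// false: reset on scope exit
}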
- void sendExternalTablesData(std::vector & data); + void sendScalarsData(Scalars & data) override; + void sendExternalTablesData(std::vector & data) override; - /// Send request to replicas. void sendQuery( const ConnectionTimeouts & timeouts, const String & query, const String & query_id, UInt64 stage, const ClientInfo & client_info, - bool with_pending_data); + bool with_pending_data) override; - /// Get packet from any replica. - Packet receivePacket(); + void sendReadTaskResponse(const String &) override; - /// Break all active connections. - void disconnect(); + Packet receivePacket() override; - /// Send a request to the replica to cancel the request - void sendCancel(); + void disconnect() override; + + void sendCancel() override; /// Send parts' uuids to replicas to exclude them from query processing - void sendIgnoredPartUUIDs(const std::vector & uuids); + void sendIgnoredPartUUIDs(const std::vector & uuids) override; - /** On each replica, read and skip all packets to EndOfStream or Exception. - * Returns EndOfStream if no exception has been received. Otherwise - * returns the last received packet of type Exception. - */ - Packet drain(); + Packet drain() override; - /// Get the replica addresses as a string. - std::string dumpAddresses() const; + std::string dumpAddresses() const override; - /// Returns the number of replicas. /// Without locking, because sendCancel() does not change this number. - size_t size() const { return replica_states.size(); } + size_t size() const override { return replica_states.size(); } - /// Check if there are any valid replicas. /// Without locking, because sendCancel() does not change the state of the replicas. - bool hasActiveConnections() const { return active_connection_count > 0; } + bool hasActiveConnections() const override { return active_connection_count > 0; } private: - /// Internal version of `receivePacket` function without locking. - Packet receivePacketUnlocked(std::function async_callback = {}); + Packet receivePacketUnlocked(AsyncCallback async_callback) override; /// Internal version of `dumpAddresses` function without locking. std::string dumpAddressesUnlocked() const; diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h new file mode 100644 index 00000000000..2252e63a2f6 --- /dev/null +++ b/src/Client/PacketReceiver.h @@ -0,0 +1,161 @@ +#pragma once + +#if defined(OS_LINUX) + +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Class for nonblocking packet receiving. It runs connection->receivePacket +/// in fiber and sets special read callback which is called when +/// reading from socket blocks. When read callback is called, +/// socket and receive timeout are added in epoll and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume +/// packet receiving. +class PacketReceiver +{ +public: + explicit PacketReceiver(Connection * connection_) : connection(connection_) + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(connection->getSocket()->impl()->sockfd()); + + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + } + + /// Resume packet receiving. + std::variant resume() + { + /// If there is no pending data, check receive timeout. + if (!connection->hasReadPendingData() && !checkReceiveTimeout()) + { + /// Receive timeout expired. + return Poco::Timespan(); + } + + /// Resume fiber. 
+ fiber = std::move(fiber).resume(); + if (exception) + std::rethrow_exception(std::move(exception)); + + if (is_read_in_process) + return epoll.getFileDescriptor(); + + /// Receiving packet was finished. + return std::move(packet); + } + + void cancel() + { + Fiber to_destroy = std::move(fiber); + connection = nullptr; + } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + +private: + /// When epoll file descriptor is ready, check if it's an expired timeout. + /// Return false if receive timeout expired and socket is not ready, return true otherwise. + bool checkReceiveTimeout() + { + bool is_socket_ready = false; + bool is_receive_timeout_expired = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd = -1; + size_t ready_count = epoll.getManyReady(2, events, true); + + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_expired = true; + } + + if (is_receive_timeout_expired && !is_socket_ready) + { + receive_timeout.reset(); + return false; + } + + return true; + } + + struct Routine + { + PacketReceiver & receiver; + + struct ReadCallback + { + PacketReceiver & receiver; + Fiber & sink; + + void operator()(int, const Poco::Timespan & timeout, const std::string &) + { + receiver.receive_timeout.setRelative(timeout); + receiver.is_read_in_process = true; + sink = std::move(sink).resume(); + receiver.is_read_in_process = false; + receiver.receive_timeout.reset(); + } + }; + + Fiber operator()(Fiber && sink) + { + try + { + while (true) + { + { + AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); + receiver.packet = receiver.connection->receivePacket(); + } + sink = std::move(sink).resume(); + } + + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) + { + receiver.exception = std::current_exception(); + } + + return std::move(sink); + } + }; + + Connection * connection; + Packet packet; + + Fiber fiber; + FiberStack fiber_stack; + + /// We use timer descriptor for checking socket receive timeout. + TimerDescriptor receive_timeout; + + /// In read callback we add socket file descriptor and timer descriptor with receive timeout + /// in epoll, so we can return epoll file descriptor outside for polling. + Epoll epoll; + + /// If and exception occurred in fiber resume, we save it and rethrow. 
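The Routine above runs connection->receivePacket() inside a boost::context::fiber and suspends whenever the read would block, letting the caller resume it later. A minimal standalone fiber showing just the suspend/resume handshake (link with -lboost_context; the "step" variable stands in for partially received data):

#include <boost/context/fiber.hpp>
#include <iostream>

namespace ctx = boost::context;

int main()
{
    int step = 0;

    /// The routine "blocks" by resuming the sink, handing control back to the caller,
    /// just as the ReadCallback above does when the socket has no data yet.
    ctx::fiber worker{[&step](ctx::fiber && sink)
    {
        for (int i = 1; i <= 3; ++i)
        {
            step = i;                           /// pretend we received part of a packet
            sink = std::move(sink).resume();    /// suspend until the caller resumes us
        }
        return std::move(sink);
    }};

    while (worker)
    {
        worker = std::move(worker).resume();    /// resume until the routine suspends again or finishes
        if (worker)
            std::cout << "suspended at step " << step << '\n';
        else
            std::cout << "routine finished\n";
    }
}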
+ std::exception_ptr exception; + + bool is_read_in_process = false; +}; + +} +#endif diff --git a/src/Client/ya.make b/src/Client/ya.make index 87a0cea102a..4201203a8e9 100644 --- a/src/Client/ya.make +++ b/src/Client/ya.make @@ -11,9 +11,11 @@ PEERDIR( SRCS( Connection.cpp + ConnectionEstablisher.cpp ConnectionPoolWithFailover.cpp + HedgedConnections.cpp + HedgedConnectionsFactory.cpp MultiplexedConnections.cpp - TimeoutSetter.cpp ) diff --git a/src/Columns/ColumnAggregateFunction.cpp b/src/Columns/ColumnAggregateFunction.cpp index d0a5e120a07..8178802f3bd 100644 --- a/src/Columns/ColumnAggregateFunction.cpp +++ b/src/Columns/ColumnAggregateFunction.cpp @@ -24,6 +24,7 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NOT_IMPLEMENTED; } @@ -161,7 +162,7 @@ MutableColumnPtr ColumnAggregateFunction::convertToValues(MutableColumnPtr colum return res; } -MutableColumnPtr ColumnAggregateFunction::predictValues(const ColumnsWithTypeAndName & arguments, const Context & context) const +MutableColumnPtr ColumnAggregateFunction::predictValues(const ColumnsWithTypeAndName & arguments, ContextPtr context) const { MutableColumnPtr res = func->getReturnTypeToPredict()->createColumn(); res->reserve(data.size()); @@ -553,6 +554,11 @@ const char * ColumnAggregateFunction::deserializeAndInsertFromArena(const char * return read_buffer.position(); } +const char * ColumnAggregateFunction::skipSerializedInArena(const char *) const +{ + throw Exception("Method skipSerializedInArena is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); +} + void ColumnAggregateFunction::popBack(size_t n) { size_t size = data.size(); diff --git a/src/Columns/ColumnAggregateFunction.h b/src/Columns/ColumnAggregateFunction.h index cd45cf583a0..8eb1a04b174 100644 --- a/src/Columns/ColumnAggregateFunction.h +++ b/src/Columns/ColumnAggregateFunction.h @@ -82,7 +82,7 @@ private: /// Name of the type to distinguish different aggregation states. String type_string; - ColumnAggregateFunction() {} + ColumnAggregateFunction() = default; /// Create a new column that has another column as a source. 
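// Rough usage sketch for the PacketReceiver added above (illustrative only, not code from this PR).
// The exact alternatives of the std::variant returned by resume() are not visible here; from the
// method body it is assumed to hold either the epoll file descriptor to poll, the received Packet,
// or a Poco::Timespan signalling that the receive timeout expired. The receiver is presumably
// consumed by the HedgedConnections sources added to ya.make above.
#if defined(OS_LINUX)
#include <variant>
#include <optional>
#include <poll.h>
#include <Client/PacketReceiver.h>

namespace DB
{

/// Returns the next packet, or nothing if the receive timeout expired (sketch).
std::optional<Packet> receiveOnePacketSketch(Connection & connection)
{
    PacketReceiver receiver(&connection);
    while (true)
    {
        auto result = receiver.resume();

        if (auto * packet = std::get_if<Packet>(&result))
            return std::move(*packet);          /// receivePacket() finished inside the fiber.

        if (std::get_if<Poco::Timespan>(&result))
            return std::nullopt;                /// receive timeout expired before any data arrived.

        /// Otherwise reading blocked: wait until the receiver's epoll fd (socket + timeout timer)
        /// becomes ready, then resume the fiber again.
        pollfd fd{receiver.getFileDescriptor(), POLLIN, 0};
        ::poll(&fd, 1, -1);
    }
}

}
#endif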
MutablePtr createView() const; @@ -119,7 +119,7 @@ public: const char * getFamilyName() const override { return "AggregateFunction"; } TypeIndex getDataType() const override { return TypeIndex::AggregateFunction; } - MutableColumnPtr predictValues(const ColumnsWithTypeAndName & arguments, const Context & context) const; + MutableColumnPtr predictValues(const ColumnsWithTypeAndName & arguments, ContextPtr context) const; size_t size() const override { @@ -155,6 +155,8 @@ public: const char * deserializeAndInsertFromArena(const char * src_arena) override; + const char * skipSerializedInArena(const char *) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; @@ -198,6 +200,11 @@ public: throw Exception("Method compareColumn is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); } + bool hasEqualValues() const override + { + throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED); + } + void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override; diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index e8a48672435..1b0c9f5162f 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -239,6 +239,16 @@ const char * ColumnArray::deserializeAndInsertFromArena(const char * pos) return pos; } +const char * ColumnArray::skipSerializedInArena(const char * pos) const +{ + size_t array_size = unalignedLoad(pos); + pos += sizeof(array_size); + + for (size_t i = 0; i < array_size; ++i) + pos = getData().skipSerializedInArena(pos); + + return pos; +} void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const { @@ -370,6 +380,10 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } +bool ColumnArray::hasEqualValues() const +{ + return hasEqualValuesImpl(); +} namespace { @@ -1197,7 +1211,6 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const assert_cast(*temporary_arrays.front()).getOffsetsPtr()); } - void ColumnArray::gather(ColumnGathererStream & gatherer) { gatherer.gather(*this); diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 1caaf672d49..75bd4a6dba4 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -61,6 +61,7 @@ public: void insertData(const char * pos, size_t length) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; void updateHashFast(SipHash & hash) const override; @@ -78,6 +79,7 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator & collator) const override; + bool hasEqualValues() const override; void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int 
nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override; void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index f6b6bf22177..3cc2c014732 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -85,6 +85,7 @@ public: void popBack(size_t) override { throwMustBeDecompressed(); } StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeDecompressed(); } const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); } + const char * skipSerializedInArena(const char *) const override { throwMustBeDecompressed(); } void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); } void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); } void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); } @@ -96,6 +97,10 @@ public: { throwMustBeDecompressed(); } + bool hasEqualValues() const override + { + throwMustBeDecompressed(); + } void getPermutation(bool, size_t, int, Permutation &) const override { throwMustBeDecompressed(); } void updatePermutation(bool, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeDecompressed(); } ColumnPtr replicate(const Offsets &) const override { throwMustBeDecompressed(); } diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index 99c997ab269..01d5b235a2b 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -163,6 +163,11 @@ public: return res; } + const char * skipSerializedInArena(const char * pos) const override + { + return data->skipSerializedInArena(pos); + } + void updateHashWithValue(size_t, SipHash & hash) const override { data->updateHashWithValue(0, hash); @@ -206,6 +211,8 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; + bool hasEqualValues() const override { return true; } + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override; void gather(ColumnGathererStream &) override @@ -248,7 +255,7 @@ public: /// The constant value. It is valid even if the size of the column is 0. 
template - T getValue() const { return getField().safeGet>(); } + T getValue() const { return getField().safeGet(); } bool isCollationSupported() const override { return data->isCollationSupported(); } }; diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index bad3a4c3402..ec08db274b3 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -58,6 +58,12 @@ void ColumnDecimal::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } +template +bool ColumnDecimal::hasEqualValues() const +{ + return this->template hasEqualValuesImpl>(); +} + template StringRef ColumnDecimal::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const { @@ -73,6 +79,12 @@ const char * ColumnDecimal::deserializeAndInsertFromArena(const char * pos) return pos + sizeof(T); } +template +const char * ColumnDecimal::skipSerializedInArena(const char * pos) const +{ + return pos + sizeof(T); +} + template UInt64 ColumnDecimal::get64([[maybe_unused]] size_t n) const { diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index 5016ddca791..a4fa15c2d95 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -107,7 +107,7 @@ public: { data.resize_fill(data.size() + length); } - void insert(const Field & x) override { data.push_back(DB::get>(x)); } + void insert(const Field & x) override { data.push_back(DB::get(x)); } void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; void popBack(size_t n) override @@ -129,6 +129,7 @@ public: StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; void updateHashFast(SipHash & hash) const override; @@ -136,6 +137,7 @@ public: void compareColumn(const IColumn & rhs, size_t rhs_row_num, PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; + bool hasEqualValues() const override; void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges& equal_range) const override; diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index c4a7f923867..4d54a46c924 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -100,6 +100,11 @@ const char * ColumnFixedString::deserializeAndInsertFromArena(const char * pos) return pos + n; } +const char * ColumnFixedString::skipSerializedInArena(const char * pos) const +{ + return pos + n; +} + void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const { hash.update(reinterpret_cast(&chars[n * index]), n); @@ -474,19 +479,4 @@ ColumnPtr ColumnFixedString::compress() const }); } - -void ColumnFixedString::alignStringLength(ColumnFixedString::Chars & data, size_t n, size_t old_size) -{ - size_t length = data.size() - old_size; - if (length < n) - { - data.resize_fill(old_size + n); - } - else if (length > n) - { - data.resize_assume_reserved(old_size); - throw Exception("Too large value for FixedString(" + std::to_string(n) + ")", ErrorCodes::TOO_LARGE_STRING_SIZE); - } -} - } diff --git 
a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h index d9f6619b2d1..5fd482aef6e 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -112,6 +112,8 @@ public: const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + void updateHashWithValue(size_t index, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; @@ -132,6 +134,11 @@ public: compare_results, direction, nan_direction_hint); } + bool hasEqualValues() const override + { + return hasEqualValuesImpl(); + } + void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override; @@ -184,8 +191,6 @@ public: const Chars & getChars() const { return chars; } size_t getN() const { return n; } - - static void alignStringLength(ColumnFixedString::Chars & data, size_t n, size_t old_size); }; } diff --git a/src/Columns/ColumnFunction.h b/src/Columns/ColumnFunction.h index f97f41a8627..fa605e741aa 100644 --- a/src/Columns/ColumnFunction.h +++ b/src/Columns/ColumnFunction.h @@ -98,6 +98,11 @@ public: throw Exception("Cannot deserialize to " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + const char * skipSerializedInArena(const char*) const override + { + throw Exception("Cannot skip serialized " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + void updateHashWithValue(size_t, SipHash &) const override { throw Exception("updateHashWithValue is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -128,6 +133,11 @@ public: throw Exception("compareColumn is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + bool hasEqualValues() const override + { + throw Exception("hasEqualValues is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + void getPermutation(bool, size_t, int, Permutation &) const override { throw Exception("getPermutation is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); diff --git a/src/Columns/ColumnLowCardinality.cpp b/src/Columns/ColumnLowCardinality.cpp index 8af3b240cb9..c4d7e75dd2d 100644 --- a/src/Columns/ColumnLowCardinality.cpp +++ b/src/Columns/ColumnLowCardinality.cpp @@ -122,7 +122,7 @@ namespace else if (auto * data_uint64 = getIndexesData(column)) return mapUniqueIndexImpl(*data_uint64); else - throw Exception("Indexes column for getUniqueIndex must be ColumnUInt, got" + column.getName(), + throw Exception("Indexes column for getUniqueIndex must be ColumnUInt, got " + column.getName(), ErrorCodes::LOGICAL_ERROR); } } @@ -151,7 +151,7 @@ void ColumnLowCardinality::insertFrom(const IColumn & src, size_t n) const auto * low_cardinality_src = typeid_cast(&src); if (!low_cardinality_src) - throw Exception("Expected ColumnLowCardinality, got" + src.getName(), ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Expected ColumnLowCardinality, got " + src.getName(), ErrorCodes::ILLEGAL_COLUMN); size_t position = low_cardinality_src->getIndexes().getUInt(n); @@ -247,6 +247,11 @@ const char * ColumnLowCardinality::deserializeAndInsertFromArena(const char * po return new_pos; } +const char * ColumnLowCardinality::skipSerializedInArena(const char * pos) const +{ + return getDictionary().skipSerializedInArena(pos); +} + void ColumnLowCardinality::updateWeakHash32(WeakHash32 & hash) const { auto s = size(); @@ -311,6 +316,13 @@ 
void ColumnLowCardinality::compareColumn(const IColumn & rhs, size_t rhs_row_num compare_results, direction, nan_direction_hint); } +bool ColumnLowCardinality::hasEqualValues() const +{ + if (getDictionary().size() <= 1) + return true; + return getIndexes().hasEqualValues(); +} + void ColumnLowCardinality::getPermutationImpl(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator) const { if (limit == 0) diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index a497be8847d..92bf7ff0f95 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -94,6 +94,8 @@ public: const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override { return getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash); @@ -126,6 +128,8 @@ public: int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator &) const override; + bool hasEqualValues() const override; + void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_range) const override; diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index cc2640a9cf6..05c0e0458d8 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -116,6 +116,11 @@ const char * ColumnMap::deserializeAndInsertFromArena(const char * pos) return nested->deserializeAndInsertFromArena(pos); } +const char * ColumnMap::skipSerializedInArena(const char * pos) const +{ + return nested->skipSerializedInArena(pos); +} + void ColumnMap::updateHashWithValue(size_t n, SipHash & hash) const { nested->updateHashWithValue(n, hash); @@ -187,6 +192,11 @@ void ColumnMap::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } +bool ColumnMap::hasEqualValues() const +{ + return hasEqualValuesImpl(); +} + void ColumnMap::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const { nested->getPermutation(reverse, limit, nan_direction_hint, res); @@ -229,7 +239,21 @@ void ColumnMap::protect() void ColumnMap::getExtremes(Field & min, Field & max) const { - nested->getExtremes(min, max); + Field nested_min; + Field nested_max; + + nested->getExtremes(nested_min, nested_max); + + /// Convert result Array fields to Map fields because client expect min and max field to have type Map + + Array nested_min_value = nested_min.get(); + Array nested_max_value = nested_max.get(); + + Map map_min_value(nested_min_value.begin(), nested_min_value.end()); + Map map_max_value(nested_max_value.begin(), nested_max_value.end()); + + min = std::move(map_min_value); + max = std::move(map_max_value); } void ColumnMap::forEachSubcolumn(ColumnCallback callback) diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index acae1574f4c..17f0ccc422c 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -58,6 +58,7 @@ public: void popBack(size_t n) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash) 
const override; void updateWeakHash32(WeakHash32 & hash) const override; void updateHashFast(SipHash & hash) const override; @@ -72,6 +73,7 @@ public: void compareColumn(const IColumn & rhs, size_t rhs_row_num, PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; + bool hasEqualValues() const override; void getExtremes(Field & min, Field & max) const override; void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const override; diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 4e5cc2b4cf7..1e529751437 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -152,6 +152,17 @@ const char * ColumnNullable::deserializeAndInsertFromArena(const char * pos) return pos; } +const char * ColumnNullable::skipSerializedInArena(const char * pos) const +{ + UInt8 val = unalignedLoad(pos); + pos += sizeof(val); + + if (val == 0) + return getNestedColumn().skipSerializedInArena(pos); + + return pos; +} + void ColumnNullable::insertRangeFrom(const IColumn & src, size_t start, size_t length) { const ColumnNullable & nullable_col = assert_cast(src); @@ -271,6 +282,11 @@ void ColumnNullable::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } +bool ColumnNullable::hasEqualValues() const +{ + return hasEqualValuesImpl(); +} + void ColumnNullable::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const { /// Cannot pass limit because of unknown amount of NULLs. 
diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 8d267de8644..963b3e1e8fa 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -71,6 +71,7 @@ public: void insertData(const char * pos, size_t length) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; void insert(const Field & x) override; void insertFrom(const IColumn & src, size_t n) override; @@ -94,6 +95,7 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator &) const override; + bool hasEqualValues() const override; void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override; void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override; diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 8fd22e85e10..fcd0516d465 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -237,6 +237,12 @@ const char * ColumnString::deserializeAndInsertFromArena(const char * pos) return pos + string_size; } +const char * ColumnString::skipSerializedInArena(const char * pos) const +{ + const size_t string_size = unalignedLoad(pos); + pos += sizeof(string_size); + return pos + string_size; +} ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const { @@ -287,6 +293,11 @@ void ColumnString::compareColumn( compare_results, direction, nan_direction_hint); } +bool ColumnString::hasEqualValues() const +{ + return hasEqualValuesImpl(); +} + template struct ColumnString::Cmp { @@ -525,7 +536,6 @@ void ColumnString::getExtremes(Field & min, Field & max) const get(max_idx, max); } - ColumnPtr ColumnString::compress() const { size_t source_chars_size = chars.size(); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 843e445d1a0..0814ebaa826 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -111,7 +111,7 @@ public: } /// Suppress gcc 7.3.1 warning: '*((void*)& +8)' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -128,7 +128,7 @@ public: offsets.push_back(new_size); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -189,6 +189,8 @@ public: const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override { size_t string_size = sizeAt(n); @@ -240,6 +242,8 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; + bool hasEqualValues() const override; + /// Variant of compareAt for string comparison with respect of collation. 
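// Illustrative sketch of the hasEqualValues() method being added to the column classes in this
// change (see the IColumn declaration further below): it reports whether every row holds the same
// value, with an empty column counting as "all equal". This is an example of calling it, not code
// from the PR itself.
#include <Columns/ColumnString.h>

static bool allRowsEqualExample()
{
    auto col = DB::ColumnString::create();
    col->insertData("abc", 3);
    col->insertData("abc", 3);

    /// ColumnString::hasEqualValues() uses the generic hasEqualValuesImpl<ColumnString>(),
    /// which compares every row against row 0 via compareAt().
    return col->hasEqualValues();   /// true here; inserting a different string would make it false.
}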
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const override; @@ -275,7 +279,6 @@ public: return typeid(rhs) == typeid(ColumnString); } - Chars & getChars() { return chars; } const Chars & getChars() const { return chars; } diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index c7c5f7b97c6..bb59d58b75d 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -180,6 +180,14 @@ const char * ColumnTuple::deserializeAndInsertFromArena(const char * pos) return pos; } +const char * ColumnTuple::skipSerializedInArena(const char * pos) const +{ + for (const auto & column : columns) + pos = column->skipSerializedInArena(pos); + + return pos; +} + void ColumnTuple::updateHashWithValue(size_t n, SipHash & hash) const { for (const auto & column : columns) @@ -312,6 +320,11 @@ int ColumnTuple::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, return compareAtImpl(n, m, rhs, nan_direction_hint, &collator); } +bool ColumnTuple::hasEqualValues() const +{ + return hasEqualValuesImpl(); +} + template struct ColumnTuple::Less { diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 818b29937bd..3f5422c7719 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -61,6 +61,7 @@ public: void popBack(size_t n) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; void updateHashFast(SipHash & hash) const override; @@ -76,6 +77,7 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const override; int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const override; + bool hasEqualValues() const override; void getExtremes(Field & min, Field & max) const override; void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override; diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index fbd3c3641b5..652487c2b09 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -26,6 +26,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int ILLEGAL_COLUMN; + extern const int NOT_IMPLEMENTED; } /** Stores another column with unique values @@ -78,6 +79,7 @@ public: bool getBool(size_t n) const override { return getNestedColumn()->getBool(n); } bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); } StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; + const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash_func) const override { return getNestedColumn()->updateHashWithValue(n, hash_func); @@ -373,6 +375,12 @@ size_t ColumnUnique::uniqueDeserializeAndInsertFromArena(const char return uniqueInsertData(pos, string_size - 1); } +template +const char * ColumnUnique::skipSerializedInArena(const char *) const +{ + throw Exception("Method skipSerializedInArena is not 
supported for " + this->getName(), ErrorCodes::NOT_IMPLEMENTED); +} + template int ColumnUnique::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const { diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 19ba86c5120..a64906ba257 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -50,6 +50,12 @@ const char * ColumnVector::deserializeAndInsertFromArena(const char * pos) return pos + sizeof(T); } +template +const char * ColumnVector::skipSerializedInArena(const char * pos) const +{ + return pos + sizeof(T); +} + template void ColumnVector::updateHashWithValue(size_t n, SipHash & hash) const { diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index f0aa4a3bab5..30ab38ed1c2 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -154,6 +154,8 @@ public: const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; @@ -205,6 +207,11 @@ public: compare_results, direction, nan_direction_hint); } + bool hasEqualValues() const override + { + return this->template hasEqualValuesImpl(); + } + void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override; void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_range) const override; @@ -254,7 +261,7 @@ public: void insert(const Field & x) override { - data.push_back(DB::get>(x)); + data.push_back(DB::get(x)); } void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; diff --git a/src/Columns/ColumnsCommon.h b/src/Columns/ColumnsCommon.h index 7655edffa71..71f2884bf86 100644 --- a/src/Columns/ColumnsCommon.h +++ b/src/Columns/ColumnsCommon.h @@ -66,7 +66,7 @@ ColumnPtr selectIndexImpl(const Column & column, const IColumn & indexes, size_t else if (auto * data_uint64 = detail::getIndexesData(indexes)) return column.template indexImpl(*data_uint64, limit); else - throw Exception("Indexes column for IColumn::select must be ColumnUInt, got" + indexes.getName(), + throw Exception("Indexes column for IColumn::select must be ColumnUInt, got " + indexes.getName(), ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 2b4b633f9a5..23acc81e63d 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -26,6 +26,9 @@ class ColumnGathererStream; class Field; class WeakHash32; +class ISerialization; +using SerializationPtr = std::shared_ptr; + /* * Represents a set of equal ranges in previous column to perform sorting in current column. @@ -207,6 +210,10 @@ public: /// Returns pointer to the position after the read data. virtual const char * deserializeAndInsertFromArena(const char * pos) = 0; + /// Skip previously serialized value that was serialized using IColumn::serializeValueIntoArena method. + /// Returns a pointer to the position after the deserialized data. + virtual const char * skipSerializedInArena(const char *) const = 0; + /// Update state of hash function with value of n-th element. /// On subsequent calls of this method for sequence of column values of arbitrary types, /// passed bytes to hash must identify sequence of values unambiguously. 
@@ -266,6 +273,9 @@ public: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const = 0; + /// Check if all elements in the column have equal values. Return true if column is empty. + virtual bool hasEqualValues() const = 0; + /** Returns a permutation that sorts elements of this column, * i.e. perm[i]-th element of source column should be i-th element of sorted column. * reverse - reverse ordering (acsending). @@ -467,6 +477,9 @@ protected: PaddedPODArray * row_indexes, PaddedPODArray & compare_results, int direction, int nan_direction_hint) const; + + template + bool hasEqualValuesImpl() const; }; using ColumnPtr = IColumn::Ptr; diff --git a/src/Columns/IColumnDummy.h b/src/Columns/IColumnDummy.h index 10ef692dc6a..7e1958f077e 100644 --- a/src/Columns/IColumnDummy.h +++ b/src/Columns/IColumnDummy.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -40,6 +41,8 @@ public: { } + bool hasEqualValues() const override { return true; } + Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); } void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); } void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); } @@ -56,12 +59,20 @@ public: StringRef serializeValueIntoArena(size_t /*n*/, Arena & arena, char const *& begin) const override { - return { arena.allocContinue(0, begin), 0 }; + /// Has to put one useless byte into Arena, because serialization into zero number of bytes is ambiguous. + char * res = arena.allocContinue(1, begin); + *res = 0; + return { res, 1 }; } const char * deserializeAndInsertFromArena(const char * pos) override { ++s; + return pos + 1; + } + + const char * skipSerializedInArena(const char * pos) const override + { return pos; } diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index d2286981ac7..a1ee6a17982 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -127,4 +127,16 @@ void IColumn::doCompareColumn(const Derived & rhs, size_t rhs_row_num, } } +template +bool IColumn::hasEqualValuesImpl() const +{ + size_t num_rows = size(); + for (size_t i = 1; i < num_rows; ++i) + { + if (compareAt(i, 0, static_cast(*this), false) != 0) + return false; + } + return true; +} + } diff --git a/src/Columns/IColumnUnique.h b/src/Columns/IColumnUnique.h index 99e134675f6..5558f493b92 100644 --- a/src/Columns/IColumnUnique.h +++ b/src/Columns/IColumnUnique.h @@ -172,6 +172,11 @@ public: { throw Exception("Method compareColumn is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED); } + + bool hasEqualValues() const override + { + throw Exception("Method hasEqualValues is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED); + } }; using ColumnUniquePtr = IColumnUnique::ColumnUniquePtr; diff --git a/src/Columns/tests/gtest_weak_hash_32.cpp b/src/Columns/tests/gtest_weak_hash_32.cpp index a04bd94124c..0dabfc32b33 100644 --- a/src/Columns/tests/gtest_weak_hash_32.cpp +++ b/src/Columns/tests/gtest_weak_hash_32.cpp @@ -88,7 +88,6 @@ void checkColumn( if (num_collisions <= max_collisions_to_print) { collisions_str << "Collision:\n"; - collisions_str << print_for_row(it->second) << '\n'; collisions_str << print_for_row(i) << std::endl; } diff --git a/src/Common/Allocator.h b/src/Common/Allocator.h index e3c6ddf9ff4..ebfd654d558 100644 --- 
a/src/Common/Allocator.h +++ b/src/Common/Allocator.h @@ -277,7 +277,7 @@ private: * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack. * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this. */ -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfree-nonheap-object" #endif @@ -359,6 +359,6 @@ extern template class Allocator; extern template class Allocator; extern template class Allocator; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/AllocatorWithMemoryTracking.h b/src/Common/AllocatorWithMemoryTracking.h new file mode 100644 index 00000000000..e9597e4bf5a --- /dev/null +++ b/src/Common/AllocatorWithMemoryTracking.h @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +#include + + +/// Implementation of std::allocator interface that tracks memory with MemoryTracker. +/// NOTE We already plug MemoryTracker into new/delete operators. So, everything works even with default allocator. +/// But it is enabled only if jemalloc is used (to obtain the size of the allocation on call to delete). +/// And jemalloc is disabled for builds with sanitizers. In these cases memory was not always tracked. + +template +struct AllocatorWithMemoryTracking +{ + typedef T value_type; + + AllocatorWithMemoryTracking() = default; + + template + constexpr AllocatorWithMemoryTracking(const AllocatorWithMemoryTracking &) noexcept + { + } + + [[nodiscard]] T * allocate(size_t n) + { + if (n > std::numeric_limits::max() / sizeof(T)) + throw std::bad_alloc(); + + size_t bytes = n * sizeof(T); + CurrentMemoryTracker::alloc(bytes); + + T * p = static_cast(malloc(bytes)); + if (!p) + throw std::bad_alloc(); + + return p; + } + + void deallocate(T * p, size_t n) noexcept + { + free(p); + + size_t bytes = n * sizeof(T); + CurrentMemoryTracker::free(bytes); + } +}; + +template +bool operator==(const AllocatorWithMemoryTracking &, const AllocatorWithMemoryTracking &) +{ + return true; +} + +template +bool operator!=(const AllocatorWithMemoryTracking &, const AllocatorWithMemoryTracking &) +{ + return false; +} + diff --git a/src/Common/Arena.h b/src/Common/Arena.h index 0f4f0420c38..4d14c15197d 100644 --- a/src/Common/Arena.h +++ b/src/Common/Arena.h @@ -128,7 +128,7 @@ private: template friend class AlignedArenaAllocator; public: - Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2, size_t linear_growth_threshold_ = 128 * 1024 * 1024) + explicit Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2, size_t linear_growth_threshold_ = 128 * 1024 * 1024) : growth_factor(growth_factor_), linear_growth_threshold(linear_growth_threshold_), head(new MemoryChunk(initial_size_, nullptr)), size_in_bytes(head->size()), page_size(static_cast(::getPageSize())) @@ -160,7 +160,7 @@ public: void * head_pos = head->pos; size_t space = head->end - head->pos; - auto res = static_cast(std::align(alignment, size, head_pos, space)); + auto * res = static_cast(std::align(alignment, size, head_pos, space)); if (res) { head->pos = static_cast(head_pos); diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 1ac753fbae5..b7173b25ce5 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -484,6 +484,20 @@ struct HashMethodKeysFixed std::unique_ptr columns_data; #endif + PaddedPODArray prepared_keys; + + static bool usePreparedKeys(const Sizes & key_sizes) + { + if (has_low_cardinality || has_nullable_keys || 
sizeof(Key) > 16) + return false; + + for (auto size : key_sizes) + if (size != 1 && size != 2 && size != 4 && size != 8 && size != 16) + return false; + + return true; + } + HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &) : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size()) { @@ -505,8 +519,13 @@ struct HashMethodKeysFixed } } + if (usePreparedKeys(key_sizes)) + { + packFixedBatch(keys_size, Base::getActualColumns(), key_sizes, prepared_keys); + } + #if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) - if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) + else if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) { /** The task is to "pack" multiple fixed-size fields into single larger Key. * Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key: @@ -571,6 +590,9 @@ struct HashMethodKeysFixed return packFixed(row, keys_size, low_cardinality_keys.nested_columns, key_sizes, &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes); + if (!prepared_keys.empty()) + return prepared_keys[row]; + #if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) return packFixedShuffle(columns_data.get(), keys_size, key_sizes.data(), row, masks.get()); @@ -578,6 +600,37 @@ struct HashMethodKeysFixed return packFixed(row, keys_size, Base::getActualColumns(), key_sizes); } } + + static std::optional shuffleKeyColumns(std::vector & key_columns, const Sizes & key_sizes) + { + if (!usePreparedKeys(key_sizes)) + return {}; + + std::vector new_columns; + new_columns.reserve(key_columns.size()); + + Sizes new_sizes; + auto fill_size = [&](size_t size) + { + for (size_t i = 0; i < key_sizes.size(); ++i) + { + if (key_sizes[i] == size) + { + new_columns.push_back(key_columns[i]); + new_sizes.push_back(size); + } + } + }; + + fill_size(16); + fill_size(8); + fill_size(4); + fill_size(2); + fill_size(1); + + key_columns.swap(new_columns); + return new_sizes; + } }; /** Hash by concatenating serialized key values. diff --git a/src/Common/ConcurrentBoundedQueue.h b/src/Common/ConcurrentBoundedQueue.h index b888d68a286..cb29efc3349 100644 --- a/src/Common/ConcurrentBoundedQueue.h +++ b/src/Common/ConcurrentBoundedQueue.h @@ -6,38 +6,7 @@ #include #include -#include - - -namespace detail -{ - template > - struct MoveOrCopyIfThrow; - - template - struct MoveOrCopyIfThrow - { - void operator()(T && src, T & dst) const - { - dst = std::forward(src); - } - }; - - template - struct MoveOrCopyIfThrow - { - void operator()(T && src, T & dst) const - { - dst = src; - } - }; - - template - void moveOrCopyIfThrow(T && src, T & dst) - { - MoveOrCopyIfThrow()(std::forward(src), dst); - } -} +#include /** A very simple thread-safe queue of limited size. * If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty. 
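// Worked example for the HashMethodKeysFixed change above (illustrative, not the actual
// packFixedBatch / shuffleKeyColumns code). When every key column is 1, 2, 4, 8 or 16 bytes wide
// and the combined key fits in 16 bytes, the keys are now packed once for the whole block into
// prepared_keys instead of per lookup, and shuffleKeyColumns() reorders the columns by descending
// width (16, 8, 4, 2, 1), so e.g. key sizes {1, 8, 4} become {8, 4, 1} and each row's values can
// simply be copied back to back into one 16-byte key. A self-contained sketch of that per-row
// packing:
#include <cstdint>
#include <cstring>

struct PackedKeySketch { std::uint64_t lo = 0, hi = 0; };   /// stand-in for a 16-byte Key

static PackedKeySketch packRowSketch(std::uint64_t a /*8 bytes*/, std::uint32_t b /*4 bytes*/, std::uint8_t c /*1 byte*/)
{
    PackedKeySketch key;
    char * out = reinterpret_cast<char *>(&key);
    std::memcpy(out, &a, sizeof(a)); out += sizeof(a);   /// widest column first
    std::memcpy(out, &b, sizeof(b)); out += sizeof(b);
    std::memcpy(out, &c, sizeof(c));                     /// 8 + 4 + 1 = 13 of 16 bytes used
    return key;
}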
@@ -53,8 +22,10 @@ private: Poco::Semaphore empty_count; public: - ConcurrentBoundedQueue(size_t max_fill) - : fill_count(0, max_fill), empty_count(max_fill, max_fill) {} + explicit ConcurrentBoundedQueue(size_t max_fill) + : fill_count(0, max_fill) + , empty_count(max_fill, max_fill) + {} void push(const T & x) { diff --git a/src/Common/Config/AbstractConfigurationComparison.cpp b/src/Common/Config/AbstractConfigurationComparison.cpp index 0e603cb1056..eb677debb02 100644 --- a/src/Common/Config/AbstractConfigurationComparison.cpp +++ b/src/Common/Config/AbstractConfigurationComparison.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -26,6 +27,27 @@ bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, const P return isSameConfiguration(left, String(), right, String()); } +bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, const Poco::Util::AbstractConfiguration & right, const String & key) +{ + return isSameConfiguration(left, key, right, key); +} + +bool isSameConfigurationWithMultipleKeys(const Poco::Util::AbstractConfiguration & left, const Poco::Util::AbstractConfiguration & right, const String & root, const String & name) +{ + if (&left == &right) + return true; + + auto left_multiple_keys = getMultipleKeysFromConfig(left, root, name); + auto right_multiple_keys = getMultipleKeysFromConfig(right, root, name); + if (left_multiple_keys.size() != right_multiple_keys.size()) + return false; + + for (auto & key : left_multiple_keys) + if (!isSameConfiguration(left, right, concatKeyAndSubKey(root, key))) + return false; + + return true; +} bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, const String & left_key, const Poco::Util::AbstractConfiguration & right, const String & right_key) diff --git a/src/Common/Config/AbstractConfigurationComparison.h b/src/Common/Config/AbstractConfigurationComparison.h index f825ad4e53d..6e1d8a890bb 100644 --- a/src/Common/Config/AbstractConfigurationComparison.h +++ b/src/Common/Config/AbstractConfigurationComparison.h @@ -13,6 +13,22 @@ namespace DB bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, const Poco::Util::AbstractConfiguration & right); + /// Config may have multiple keys with one name. For example: + /// + /// ... + /// ... + /// + /// Returns true if the specified subview of the two configurations contains + /// the same keys and values for each key with the given name. + bool isSameConfigurationWithMultipleKeys(const Poco::Util::AbstractConfiguration & left, + const Poco::Util::AbstractConfiguration & right, + const String & root, const String & name); + + /// Returns true if the specified subview of the two configurations contains the same keys and values. + bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, + const Poco::Util::AbstractConfiguration & right, + const String & key); + /// Returns true if specified subviews of the two configurations contains the same keys and values. 
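// Hedged usage sketch for the new isSameConfigurationWithMultipleKeys() declared above: a config
// may repeat a key with the same name (for example several <disk> elements under <disks>), and
// this helper compares all occurrences of that key in both configurations. The element names and
// helper below are made up for illustration; only the compared function comes from this PR.
#include <sstream>
#include <Poco/AutoPtr.h>
#include <Poco/Util/XMLConfiguration.h>
#include <Common/Config/AbstractConfigurationComparison.h>

static bool sameDisksExample(const std::string & left_xml, const std::string & right_xml)
{
    std::istringstream left_in(left_xml), right_in(right_xml);
    Poco::AutoPtr<Poco::Util::XMLConfiguration> left(new Poco::Util::XMLConfiguration(left_in));
    Poco::AutoPtr<Poco::Util::XMLConfiguration> right(new Poco::Util::XMLConfiguration(right_in));

    /// Compares every <disk> child of <disks> in both configs, however many there are.
    return DB::isSameConfigurationWithMultipleKeys(*left, *right, "disks", "disk");
}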
bool isSameConfiguration(const Poco::Util::AbstractConfiguration & left, const String & left_key, const Poco::Util::AbstractConfiguration & right, const String & right_key); diff --git a/src/Common/Config/ConfigReloader.cpp b/src/Common/Config/ConfigReloader.cpp index 677448e03ae..afff08e82bb 100644 --- a/src/Common/Config/ConfigReloader.cpp +++ b/src/Common/Config/ConfigReloader.cpp @@ -27,7 +27,7 @@ ConfigReloader::ConfigReloader( , updater(std::move(updater_)) { if (!already_loaded) - reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true); + reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true, /* initial_loading = */ true); } @@ -66,7 +66,7 @@ void ConfigReloader::run() if (quit) return; - reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false); + reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false, /* initial_loading = */ false); } catch (...) { @@ -76,7 +76,7 @@ void ConfigReloader::run() } } -void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed) +void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading) { std::lock_guard lock(reload_mutex); @@ -131,7 +131,7 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac try { - updater(loaded_config.configuration); + updater(loaded_config.configuration, initial_loading); } catch (...) { diff --git a/src/Common/Config/ConfigReloader.h b/src/Common/Config/ConfigReloader.h index 489f062e2fe..2e4399d3c4e 100644 --- a/src/Common/Config/ConfigReloader.h +++ b/src/Common/Config/ConfigReloader.h @@ -27,7 +27,7 @@ class Context; class ConfigReloader { public: - using Updater = std::function; + using Updater = std::function; /** include_from_path is usually /etc/metrika.xml (i.e. value of tag) */ @@ -46,12 +46,12 @@ public: void start(); /// Reload immediately. For SYSTEM RELOAD CONFIG query. - void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false); } + void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false, /* initial_loading = */ false); } private: void run(); - void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed); + void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading); struct FileWithTimestamp; diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 4fb2709c8e4..abbb3c71d72 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -52,7 +52,7 @@ M(RWLockActiveWriters, "Number of threads holding write lock in a table RWLock.") \ M(GlobalThread, "Number of threads in global thread pool.") \ M(GlobalThreadActive, "Number of threads in global thread pool running a task.") \ - M(LocalThread, "Number of threads in local thread pools. Should be similar to GlobalThreadActive.") \ + M(LocalThread, "Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \ M(LocalThreadActive, "Number of threads in local thread pools running a task.") \ M(DistributedFilesToInsert, "Number of pending files to process for asynchronous insertion into Distributed tables. 
Number of files for every shard is summed.") \ M(TablesToDropQueueSize, "Number of dropped tables, that are waiting for background data removal.") \ @@ -66,6 +66,8 @@ M(PartsWide, "Wide parts.") \ M(PartsCompact, "Compact parts.") \ M(PartsInMemory, "In-memory parts.") \ + M(MMappedFiles, "Total number of mmapped files.") \ + M(MMappedFileBytes, "Sum size of mmapped file regions.") \ namespace CurrentMetrics { diff --git a/src/Common/CurrentMetrics.h b/src/Common/CurrentMetrics.h index eabeca7a0e9..7d63b25f0f0 100644 --- a/src/Common/CurrentMetrics.h +++ b/src/Common/CurrentMetrics.h @@ -100,6 +100,12 @@ namespace CurrentMetrics amount -= value; } + void add(Value value = 1) + { + what->fetch_add(value, std::memory_order_relaxed); + amount += value; + } + /// Subtract value before destructor. void destroy() { diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index 7ab57ea7fab..069f9cf2af7 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -1,11 +1,12 @@ #pragma once +#include +#include +#include + #include #include -#include -#include - namespace ProfileEvents { @@ -18,7 +19,6 @@ class MemoryTracker; namespace DB { -class Context; class QueryStatus; struct Progress; class InternalTextLogsQueue; @@ -87,7 +87,7 @@ public: /// Initializes query with current thread as master thread in constructor, and detaches it in destructor struct QueryScope { - explicit QueryScope(Context & query_context); + explicit QueryScope(ContextPtr query_context); ~QueryScope(); void logPeakMemoryUsage(); @@ -99,7 +99,7 @@ private: /// Sets query_context for current thread group /// Can by used only through QueryScope - static void attachQueryContext(Context & query_context); + static void attachQueryContext(ContextPtr query_context); }; } diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp new file mode 100644 index 00000000000..454c1a115f7 --- /dev/null +++ b/src/Common/Epoll.cpp @@ -0,0 +1,86 @@ +#if defined(OS_LINUX) + +#include "Epoll.h" +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EPOLL_ERROR; + extern const int LOGICAL_ERROR; +} + +Epoll::Epoll() : events_count(0) +{ + epoll_fd = epoll_create1(0); + if (epoll_fd == -1) + throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR); +} + +Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count.load()) +{ + other.epoll_fd = -1; +} + +Epoll & Epoll::operator=(Epoll && other) +{ + epoll_fd = other.epoll_fd; + other.epoll_fd = -1; + events_count.store(other.events_count.load()); + return *this; +} + +void Epoll::add(int fd, void * ptr) +{ + epoll_event event; + event.events = EPOLLIN | EPOLLPRI; + if (ptr) + event.data.ptr = ptr; + else + event.data.fd = fd; + + ++events_count; + + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1) + throwFromErrno("Cannot add new descriptor to epoll", DB::ErrorCodes::EPOLL_ERROR); +} + +void Epoll::remove(int fd) +{ + --events_count; + + if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, nullptr) == -1) + throwFromErrno("Cannot remove descriptor from epoll", DB::ErrorCodes::EPOLL_ERROR); +} + +size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking) const +{ + if (events_count == 0) + throw Exception("There are no events in epoll", ErrorCodes::LOGICAL_ERROR); + + int ready_size; + int timeout = blocking ? 
-1 : 0; + do + { + ready_size = epoll_wait(epoll_fd, events_out, max_events, timeout); + + if (ready_size == -1 && errno != EINTR) + throwFromErrno("Error in epoll_wait", DB::ErrorCodes::EPOLL_ERROR); + } + while (ready_size <= 0 && (ready_size != 0 || blocking)); + + return ready_size; +} + +Epoll::~Epoll() +{ + if (epoll_fd != -1) + close(epoll_fd); +} + +} +#endif diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h new file mode 100644 index 00000000000..5d9aef9ef66 --- /dev/null +++ b/src/Common/Epoll.h @@ -0,0 +1,54 @@ +#pragma once +#if defined(OS_LINUX) + +#include +#include +#include +#include + +namespace DB +{ + +using AsyncCallback = std::function; + +class Epoll +{ +public: + Epoll(); + + Epoll(const Epoll &) = delete; + Epoll & operator=(const Epoll &) = delete; + + Epoll & operator=(Epoll && other); + Epoll(Epoll && other); + + /// Add new file descriptor to epoll. If ptr set to nullptr, epoll_event.data.fd = fd, + /// otherwise epoll_event.data.ptr = ptr. + void add(int fd, void * ptr = nullptr); + + /// Remove file descriptor to epoll. + void remove(int fd); + + /// Get events from epoll. Events are written in events_out, this function returns an amount of ready events. + /// If blocking is false and there are no ready events, + /// return empty vector, otherwise wait for ready events. + size_t getManyReady(int max_events, epoll_event * events_out, bool blocking) const; + + int getFileDescriptor() const { return epoll_fd; } + + int size() const { return events_count; } + + bool empty() const { return events_count == 0; } + + const std::string & getDescription() const { return fd_description; } + + ~Epoll(); + +private: + int epoll_fd; + std::atomic events_count; + const std::string fd_description = "epoll"; +}; + +} +#endif diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ba8741efae7..ad0463db889 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -1,4 +1,5 @@ #include +#include /** Previously, these constants were located in one enum. 
* But in this case there is a problem: when you add a new constant, you need to recompile @@ -538,12 +539,22 @@ M(569, MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD) \ M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \ M(571, DATABASE_REPLICATION_FAILED) \ + M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \ + M(573, EPOLL_ERROR) \ + M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \ + M(575, UNKNOWN_SNAPSHOT) \ + M(576, KERBEROS_ERROR) \ + M(577, INVALID_SHARD_ID) \ + M(578, INVALID_FORMAT_INSERT_QUERY_WITH_DATA) \ + M(579, INCORRECT_PART_TYPE) \ + M(580, CANNOT_SET_ROUNDING_MODE) \ + M(581, TOO_LARGE_DISTRIBUTED_DEPTH) \ \ + M(998, POSTGRESQL_CONNECTION_FAILURE) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ M(1002, UNKNOWN_EXCEPTION) \ - M(1003, INVALID_SHARD_ID) /* See END */ @@ -551,12 +562,12 @@ namespace DB { namespace ErrorCodes { -#define M(VALUE, NAME) extern const Value NAME = VALUE; +#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE; APPLY_FOR_ERROR_CODES(M) #undef M - constexpr Value END = 3000; - std::atomic values[END + 1]{}; + constexpr ErrorCode END = 3000; + ErrorPairHolder values[END + 1]{}; struct ErrorCodesNames { @@ -571,12 +582,43 @@ namespace ErrorCodes std::string_view getName(ErrorCode error_code) { - if (error_code >= END) + if (error_code < 0 || error_code >= END) return std::string_view(); return error_codes_names.names[error_code]; } ErrorCode end() { return END + 1; } + + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace) + { + if (error_code < 0 || error_code >= end()) + { + /// For everything outside the range, use END. + /// (end() is the pointer pass the end, while END is the last value that has an element in values array). + error_code = end() - 1; + } + + values[error_code].increment(remote, message, trace); + } + + void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace) + { + const auto now = std::chrono::system_clock::now(); + + std::lock_guard lock(mutex); + + auto & error = remote ? value.remote : value.local; + + ++error.count; + error.message = message; + error.trace = trace; + error.error_time_ms = std::chrono::duration_cast(now.time_since_epoch()).count(); + } + ErrorPair ErrorPairHolder::get() + { + std::lock_guard lock(mutex); + return value; + } } } diff --git a/src/Common/ErrorCodes.h b/src/Common/ErrorCodes.h index cc610c5d927..ffd0b8b8619 100644 --- a/src/Common/ErrorCodes.h +++ b/src/Common/ErrorCodes.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include #include -#include -#include +#include #include +#include +#include /** Allows to count number of simultaneously happening error codes. * See also Exception.cpp for incrementing part. @@ -17,30 +18,51 @@ namespace DB namespace ErrorCodes { /// ErrorCode identifier (index in array). - using ErrorCode = size_t; - using Value = int; + using ErrorCode = int; + using Value = size_t; + using FramePointers = std::vector; /// Get name of error_code by identifier. /// Returns statically allocated string. std::string_view getName(ErrorCode error_code); + struct Error + { + /// Number of times Exception with this ErrorCode had been throw. + Value count; + /// Time of the last error. + UInt64 error_time_ms = 0; + /// Message for the last error. + std::string message; + /// Stacktrace for the last error. 
+ FramePointers trace; + }; + struct ErrorPair + { + Error local; + Error remote; + }; + + /// Thread-safe + struct ErrorPairHolder + { + public: + ErrorPair get(); + void increment(bool remote, const std::string & message, const FramePointers & trace); + + private: + ErrorPair value; + std::mutex mutex; + }; + /// ErrorCode identifier -> current value of error_code. - extern std::atomic values[]; + extern ErrorPairHolder values[]; /// Get index just after last error_code identifier. ErrorCode end(); /// Add value for specified error_code. - inline void increment(ErrorCode error_code) - { - if (error_code >= end()) - { - /// For everything outside the range, use END. - /// (end() is the pointer pass the end, while END is the last value that has an element in values array). - error_code = end() - 1; - } - values[error_code].fetch_add(1, std::memory_order_relaxed); - } + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace); } } diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index f5a40a11d9c..dca19eea7f2 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -34,9 +34,9 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } -/// Aborts the process if error code is LOGICAL_ERROR. -/// Increments error codes statistics. -void handle_error_code([[maybe_unused]] const std::string & msg, int code) +/// - Aborts the process if error code is LOGICAL_ERROR. +/// - Increments error codes statistics. +void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace) { // In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure. // Log the message before we fail. @@ -47,20 +47,21 @@ void handle_error_code([[maybe_unused]] const std::string & msg, int code) abort(); } #endif - ErrorCodes::increment(code); + + ErrorCodes::increment(code, remote, msg, trace); } Exception::Exception(const std::string & msg, int code, bool remote_) : Poco::Exception(msg, code) , remote(remote_) { - handle_error_code(msg, code); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(const std::string & msg, const Exception & nested, int code) : Poco::Exception(msg, nested, code) { - handle_error_code(msg, code); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc) @@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const #endif } +Exception::FramePointers Exception::getStackFramePointers() const +{ + FramePointers frame_pointers; +#ifdef STD_EXCEPTION_HAS_STACK_TRACE + { + frame_pointers.resize(get_stack_trace_size()); + for (size_t i = 0; i < frame_pointers.size(); ++i) + { + frame_pointers[i] = get_stack_trace_frames()[i]; + } + } +#else + { + size_t stack_trace_size = trace.getSize(); + size_t stack_trace_offset = trace.getOffset(); + frame_pointers.reserve(stack_trace_size - stack_trace_offset); + for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) + { + frame_pointers.push_back(trace.getFramePointers()[i]); + } + } +#endif + return frame_pointers; +} + void throwFromErrno(const std::string & s, int code, int the_errno) { @@ -124,7 +150,7 @@ void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_ /// /// And in this case the exception will not be logged, so let's block the /// MemoryTracker until the exception will be logged. 
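// Illustrative sketch of the reworked error-code statistics above: instead of a bare atomic
// counter, every code now has a thread-safe ErrorPairHolder with separate local/remote entries
// (count, time of last error, last message, last stack trace), filled in by handle_error_code()
// whenever an Exception is constructed; per the comment on getStackFramePointers(), this data
// backs the system.errors table. Reading it directly might look like this (example only):
#include <Common/ErrorCodes.h>

static void dumpErrorCountsSketch()
{
    for (DB::ErrorCodes::ErrorCode code = 0; code < DB::ErrorCodes::end(); ++code)
    {
        auto pair = DB::ErrorCodes::values[code].get();   /// takes the holder's mutex internally
        if (pair.local.count || pair.remote.count)
        {
            auto name = DB::ErrorCodes::getName(code);    /// empty string_view for unnamed codes
            (void)name;
            /// pair.local.message / pair.local.trace describe the last error thrown locally with
            /// this code; pair.remote covers exceptions received from remote servers.
        }
    }
}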
- MemoryTracker::LockExceptionInThread lock_memory_tracker; + MemoryTracker::LockExceptionInThread lock_memory_tracker(VariableContext::Global); try { @@ -458,33 +484,25 @@ ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_ return ExecutionStatus(getCurrentExceptionCode(), msg); } -ParsingException::ParsingException() -{ - Exception::message(Exception::message() + "{}"); -} - +ParsingException::ParsingException() = default; ParsingException::ParsingException(const std::string & msg, int code) : Exception(msg, code) { - Exception::message(Exception::message() + "{}"); } - ParsingException::ParsingException(int code, const std::string & message) : Exception(message, code) { - Exception::message(Exception::message() + "{}"); } - /// We use additional field formatted_message_ to make this method const. std::string ParsingException::displayText() const { try { if (line_number_ == -1) - formatted_message_ = fmt::format(message(), ""); + formatted_message_ = message(); else - formatted_message_ = fmt::format(message(), fmt::format(": (at row {})\n", line_number_)); + formatted_message_ = message() + fmt::format(": (at row {})\n", line_number_); } catch (...) {} diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 661d31469fe..79b4394948a 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -24,6 +24,8 @@ namespace DB class Exception : public Poco::Exception { public: + using FramePointers = std::vector; + Exception() = default; Exception(const std::string & msg, int code, bool remote_ = false); Exception(const std::string & msg, const Exception & nested, int code); @@ -66,6 +68,8 @@ public: bool isRemoteException() const { return remote; } std::string getStackTraceString() const; + /// Used for system.errors + FramePointers getStackFramePointers() const; private: #ifndef STD_EXCEPTION_HAS_STACK_TRACE @@ -115,9 +119,7 @@ public: template ParsingException(int code, const std::string & fmt, Args&&... args) : Exception(fmt::format(fmt, std::forward(args)...), code) - { - Exception::message(Exception::message() + "{}"); - } + {} std::string displayText() const diff --git a/src/Common/FieldVisitors.cpp b/src/Common/FieldVisitors.cpp index dae153bd8d2..62f04816032 100644 --- a/src/Common/FieldVisitors.cpp +++ b/src/Common/FieldVisitors.cpp @@ -180,7 +180,17 @@ String FieldVisitorToString::operator() (const Tuple & x) const { WriteBufferFromOwnString wb; - wb << '('; + // For single-element tuples we must use the explicit tuple() function, + // or they will be parsed back as plain literals. + if (x.size() > 1) + { + wb << '('; + } + else + { + wb << "tuple("; + } + for (auto it = x.begin(); it != x.end(); ++it) { if (it != x.begin()) diff --git a/src/Common/FieldVisitors.h b/src/Common/FieldVisitors.h index 954cd004e84..5adbf2b01ac 100644 --- a/src/Common/FieldVisitors.h +++ b/src/Common/FieldVisitors.h @@ -185,12 +185,20 @@ public: /// Conversion of infinite values to integer is undefined. 
throw Exception("Cannot convert infinite value to integer type", ErrorCodes::CANNOT_CONVERT_TYPE); } + else if (x > std::numeric_limits::max() || x < std::numeric_limits::lowest()) + { + throw Exception("Cannot convert out of range floating point value to integer type", ErrorCodes::CANNOT_CONVERT_TYPE); + } } if constexpr (std::is_same_v) + { return Int256(x); + } else + { return T(x); + } } T operator() (const UInt128 &) const diff --git a/src/Common/HashTable/Hash.h b/src/Common/HashTable/Hash.h index ef20b70917d..0abe96497bd 100644 --- a/src/Common/HashTable/Hash.h +++ b/src/Common/HashTable/Hash.h @@ -1,8 +1,9 @@ #pragma once #include -#include #include +#include +#include #include @@ -178,13 +179,19 @@ inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) <= sizeof(UInt64)), T> k } template -inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> key) +static constexpr bool UseDefaultHashForBigInts = + std::is_same_v || + std::is_same_v || + (is_big_int_v && sizeof(T) == 32); + +template +inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64) && UseDefaultHashForBigInts), T> key) { if constexpr (std::is_same_v) { return intHash64(static_cast(key) ^ static_cast(key >> 64)); } - if constexpr (std::is_same_v) + else if constexpr (std::is_same_v) { return intHash64(key.low ^ key.high); } @@ -195,6 +202,8 @@ inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> ke static_cast(key >> 128) ^ static_cast(key >> 256)); } + + assert(false); __builtin_unreachable(); } @@ -341,6 +350,11 @@ struct IntHash32 } else if constexpr (sizeof(T) <= sizeof(UInt64)) return intHash32(key); + + assert(false); __builtin_unreachable(); } }; + +template <> +struct DefaultHash : public StringRefHash {}; diff --git a/src/Common/HashTable/HashMap.h b/src/Common/HashTable/HashMap.h index 99dc5414107..50324fe64c7 100644 --- a/src/Common/HashTable/HashMap.h +++ b/src/Common/HashTable/HashMap.h @@ -24,17 +24,26 @@ struct PairNoInit PairNoInit() {} - template - PairNoInit(First_ && first_, NoInitTag) : first(std::forward(first_)) + template + PairNoInit(FirstValue && first_, NoInitTag) + : first(std::forward(first_)) { } - template - PairNoInit(First_ && first_, Second_ && second_) : first(std::forward(first_)), second(std::forward(second_)) + template + PairNoInit(FirstValue && first_, SecondValue && second_) + : first(std::forward(first_)) + , second(std::forward(second_)) { } }; +template +PairNoInit, std::decay_t> makePairNoInit(First && first, Second && second) +{ + return PairNoInit, std::decay_t>(std::forward(first), std::forward(second)); +} + template struct HashMapCell @@ -48,7 +57,7 @@ struct HashMapCell value_type value; - HashMapCell() {} + HashMapCell() = default; HashMapCell(const Key & key_, const State &) : value(key_, NoInitTag()) {} HashMapCell(const value_type & value_, const State &) : value(value_) {} @@ -114,8 +123,39 @@ struct HashMapCell static void move(HashMapCell * /* old_location */, HashMapCell * /* new_location */) {} + template + auto & get() & { + if constexpr (I == 0) return value.first; + else if constexpr (I == 1) return value.second; + } + + template + auto const & get() const & { + if constexpr (I == 0) return value.first; + else if constexpr (I == 1) return value.second; + } + + template + auto && get() && { + if constexpr (I == 0) return std::move(value.first); + else if constexpr (I == 1) return std::move(value.second); + } + }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + 
template + struct tuple_element<0, HashMapCell> { using type = Key; }; + + template + struct tuple_element<1, HashMapCell> { using type = TMapped; }; +} + template struct HashMapCellWithSavedHash : public HashMapCell { @@ -227,6 +267,19 @@ public: } }; +namespace std +{ + + template + struct tuple_size> : std::integral_constant { }; + + template + struct tuple_element<0, HashMapCellWithSavedHash> { using type = Key; }; + + template + struct tuple_element<1, HashMapCellWithSavedHash> { using type = TMapped; }; +} + template < typename Key, diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index 292006f2438..870fb219523 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -77,7 +77,7 @@ struct LRUHashMapCellNodeTraits static void set_previous(node * __restrict ptr, node * __restrict prev) { ptr->prev = prev; } }; -template +template class LRUHashMapImpl : private HashMapTable< TKey, @@ -108,24 +108,33 @@ public: boost::intrusive::value_traits, boost::intrusive::constant_time_size>; + using LookupResult = typename Base::LookupResult; + using ConstLookupResult = typename Base::ConstLookupResult; + using iterator = typename LRUList::iterator; using const_iterator = typename LRUList::const_iterator; using reverse_iterator = typename LRUList::reverse_iterator; using const_reverse_iterator = typename LRUList::const_reverse_iterator; - LRUHashMapImpl(size_t max_size_, bool preallocate_max_size_in_hash_map = false) + explicit LRUHashMapImpl(size_t max_size_, bool preallocate_max_size_in_hash_map = false, Disposer disposer_ = Disposer()) : Base(preallocate_max_size_in_hash_map ? max_size_ : 32) , max_size(max_size_) + , disposer(std::move(disposer_)) { assert(max_size > 0); } - std::pair insert(const Key & key, const Value & value) + ~LRUHashMapImpl() + { + clear(); + } + + std::pair ALWAYS_INLINE insert(const Key & key, const Value & value) { return emplace(key, value); } - std::pair insert(const Key & key, Value && value) + std::pair ALWAYS_INLINE insert(const Key & key, Value && value) { return emplace(key, std::move(value)); } @@ -147,15 +156,16 @@ public: if (size() == max_size) { /// Erase least recently used element from front of the list - Cell & node = lru_list.front(); + Cell copy_node = lru_list.front(); - const Key & element_to_remove_key = node.getKey(); - size_t key_hash = node.getHash(*this); + const Key & element_to_remove_key = copy_node.getKey(); lru_list.pop_front(); - [[maybe_unused]] bool erased = Base::erase(element_to_remove_key, key_hash); + [[maybe_unused]] bool erased = Base::erase(element_to_remove_key); assert(erased); + + disposer(element_to_remove_key, copy_node.getMapped()); } [[maybe_unused]] bool inserted; @@ -174,46 +184,64 @@ public: return std::make_pair(it, true); } - using Base::find; - - Value & get(const Key & key) + LookupResult ALWAYS_INLINE find(const Key & key) { auto it = Base::find(key); - assert(it); - Value & value = it->getMapped(); + if (!it) + return nullptr; /// Put cell to the end of lru list lru_list.splice(lru_list.end(), lru_list, lru_list.iterator_to(*it)); - return value; + return it; } - const Value & get(const Key & key) const + ConstLookupResult ALWAYS_INLINE find(const Key & key) const + { + return const_cast *>(this)->find(key); + } + + Value & ALWAYS_INLINE get(const Key & key) + { + auto it = find(key); + assert(it); + + return it->getMapped(); + } + + const Value & ALWAYS_INLINE get(const Key & key) const { return const_cast *>(this)->get(key); } - bool 
contains(const Key & key) const + bool ALWAYS_INLINE contains(const Key & key) const { - return Base::has(key); + return find(key) != nullptr; } - bool erase(const Key & key) + bool ALWAYS_INLINE erase(const Key & key) { - auto hash = Base::hash(key); - auto it = Base::find(key, hash); + auto key_hash = Base::hash(key); + auto it = Base::find(key, key_hash); if (!it) return false; lru_list.erase(lru_list.iterator_to(*it)); - return Base::erase(key, hash); + Cell copy_node = *it; + Base::erase(key, key_hash); + disposer(copy_node.getKey(), copy_node.getMapped()); + + return true; } - void clear() + void ALWAYS_INLINE clear() { + for (auto & cell : lru_list) + disposer(cell.getKey(), cell.getMapped()); + lru_list.clear(); Base::clear(); } @@ -222,6 +250,10 @@ public: size_t getMaxSize() const { return max_size; } + size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); } + + using Base::hash; + iterator begin() { return lru_list.begin(); } const_iterator begin() const { return lru_list.cbegin(); } iterator end() { return lru_list.end(); } @@ -235,10 +267,17 @@ public: private: size_t max_size; LRUList lru_list; + Disposer disposer; }; -template > -using LRUHashMap = LRUHashMapImpl; +template +struct DefaultLRUHashMapCellDisposer +{ + void operator()(const Key &, const Mapped &) const {} +}; -template > -using LRUHashMapWithSavedHash = LRUHashMapImpl; +template , typename Hash = DefaultHash> +using LRUHashMap = LRUHashMapImpl; + +template , typename Hash = DefaultHash> +using LRUHashMapWithSavedHash = LRUHashMapImpl; diff --git a/src/Common/IPv6ToBinary.cpp b/src/Common/IPv6ToBinary.cpp index 3c004a5a84e..a8363a46de7 100644 --- a/src/Common/IPv6ToBinary.cpp +++ b/src/Common/IPv6ToBinary.cpp @@ -13,8 +13,7 @@ namespace DB /// Result array could be indexed with all possible uint8 values without extra check. /// For values greater than 128 we will store same value as for 128 (all bits set). constexpr size_t IPV6_MASKS_COUNT = 256; - -using RawMaskArray = std::array; +using RawMaskArrayV6 = std::array; void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res) { @@ -41,33 +40,86 @@ std::array IPv6ToBinary(const Poco::Net::IPAddress & address) return res; } -static constexpr RawMaskArray generateBitMask(size_t prefix) +template +static constexpr RawMaskArrayT generateBitMask(size_t prefix) { - if (prefix >= 128) - prefix = 128; - RawMaskArray arr{0}; + RawMaskArrayT arr{0}; + if (prefix >= arr.size() * 8) + prefix = arr.size() * 8; size_t i = 0; for (; prefix >= 8; ++i, prefix -= 8) arr[i] = 0xff; if (prefix > 0) arr[i++] = ~(0xff >> prefix); - while (i < 16) + while (i < arr.size()) arr[i++] = 0x00; return arr; } -static constexpr std::array generateBitMasks() +template +static constexpr std::array generateBitMasks() { - std::array arr{}; - for (size_t i = 0; i < IPV6_MASKS_COUNT; ++i) - arr[i] = generateBitMask(i); + std::array arr{}; + for (size_t i = 0; i < masksCount; ++i) + arr[i] = generateBitMask(i); return arr; } -const uint8_t * getCIDRMaskIPv6(UInt8 prefix_len) +const std::array & getCIDRMaskIPv6(UInt8 prefix_len) { - static constexpr std::array IPV6_RAW_MASK_ARRAY = generateBitMasks(); - return IPV6_RAW_MASK_ARRAY[prefix_len].data(); + static constexpr auto IPV6_RAW_MASK_ARRAY = generateBitMasks(); + return IPV6_RAW_MASK_ARRAY[prefix_len]; } +bool matchIPv4Subnet(UInt32 addr, UInt32 cidr_addr, UInt8 prefix) +{ + UInt32 mask = (prefix >= 32) ? 
0xffffffffu : ~(0xffffffffu >> prefix); + return (addr & mask) == (cidr_addr & mask); +} + +#if defined(__SSE2__) +#include + +bool matchIPv6Subnet(const uint8_t * addr, const uint8_t * cidr_addr, UInt8 prefix) +{ + uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8( + _mm_loadu_si128(reinterpret_cast(addr)), + _mm_loadu_si128(reinterpret_cast(cidr_addr)))); + mask = ~mask; + + if (mask) + { + auto offset = __builtin_ctz(mask); + + if (prefix / 8 != offset) + return prefix / 8 < offset; + + auto cmpmask = ~(0xff >> (prefix % 8)); + return (addr[offset] & cmpmask) == (cidr_addr[offset] & cmpmask); + } + return true; +} + +# else + +bool matchIPv6Subnet(const uint8_t * addr, const uint8_t * cidr_addr, UInt8 prefix) +{ + if (prefix > IPV6_BINARY_LENGTH * 8U) + prefix = IPV6_BINARY_LENGTH * 8U; + + size_t i = 0; + for (; prefix >= 8; ++i, prefix -= 8) + { + if (addr[i] != cidr_addr[i]) + return false; + } + if (prefix == 0) + return true; + + auto mask = ~(0xff >> prefix); + return (addr[i] & mask) == (cidr_addr[i] & mask); +} + +#endif // __SSE2__ + } diff --git a/src/Common/IPv6ToBinary.h b/src/Common/IPv6ToBinary.h index 2e47238aeba..d766d408359 100644 --- a/src/Common/IPv6ToBinary.h +++ b/src/Common/IPv6ToBinary.h @@ -14,9 +14,13 @@ void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res); /// Convert IP address to 16-byte array with IPv6 data (big endian). If it's an IPv4, map it to IPv6. std::array IPv6ToBinary(const Poco::Net::IPAddress & address); -/// Returns pointer to 16-byte array containing mask with first `prefix_len` bits set to `1` and `128 - prefix_len` to `0`. -/// Pointer is valid during all program execution time and doesn't require freeing. +/// Returns a reference to 16-byte array containing mask with first `prefix_len` bits set to `1` and `128 - prefix_len` to `0`. +/// The reference is valid during all program execution time. /// Values of prefix_len greater than 128 interpreted as 128 exactly. -const uint8_t * getCIDRMaskIPv6(UInt8 prefix_len); +const std::array & getCIDRMaskIPv6(UInt8 prefix_len); + +/// Check that address contained in CIDR range +bool matchIPv4Subnet(UInt32 addr, UInt32 cidr_addr, UInt8 prefix); +bool matchIPv6Subnet(const uint8_t * addr, const uint8_t * cidr_addr, UInt8 prefix); } diff --git a/src/Common/LRUCache.h b/src/Common/LRUCache.h index d75c8caf1fc..012ab7fe6c4 100644 --- a/src/Common/LRUCache.h +++ b/src/Common/LRUCache.h @@ -271,16 +271,23 @@ private: void setImpl(const Key & key, const MappedPtr & mapped, [[maybe_unused]] std::lock_guard & cache_lock) { - auto res = cells.emplace(std::piecewise_construct, + auto [it, inserted] = cells.emplace(std::piecewise_construct, std::forward_as_tuple(key), std::forward_as_tuple()); - Cell & cell = res.first->second; - bool inserted = res.second; + Cell & cell = it->second; if (inserted) { - cell.queue_iterator = queue.insert(queue.end(), key); + try + { + cell.queue_iterator = queue.insert(queue.end(), key); + } + catch (...) + { + cells.erase(it); + throw; + } } else { diff --git a/src/Common/Macros.cpp b/src/Common/Macros.cpp index b8e25499c0b..7882449b595 100644 --- a/src/Common/Macros.cpp +++ b/src/Common/Macros.cpp @@ -78,7 +78,10 @@ String Macros::expand(const String & s, /// Prefer explicit macros over implicit. 
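
The `matchIPv4Subnet`/`matchIPv6Subnet` helpers added above reduce a CIDR check to masking the leading `prefix` bits and comparing. A small standalone sketch of the IPv4 case (a re-implementation for illustration, not a call into the ClickHouse function):

```cpp
#include <cstdint>
#include <iostream>

/// Same idea as matchIPv4Subnet above: keep the first `prefix` bits, then compare.
static bool matchIPv4Subnet(uint32_t addr, uint32_t cidr_addr, uint8_t prefix)
{
    uint32_t mask = (prefix >= 32) ? 0xffffffffu : ~(0xffffffffu >> prefix);
    return (addr & mask) == (cidr_addr & mask);
}

int main()
{
    const uint32_t host = 0xC0A8012A;   /// 192.168.1.42
    const uint32_t net  = 0xC0A80100;   /// 192.168.1.0
    std::cout << matchIPv4Subnet(host, net, 24) << '\n';   /// 1: inside 192.168.1.0/24
    std::cout << matchIPv4Subnet(host, net, 31) << '\n';   /// 0: outside 192.168.1.0/31
}
```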
if (it != macros.end() && !info.expand_special_macros_only) + { res += it->second; + info.expanded_other = true; + } else if (macro_name == "database" && !info.table_id.database_name.empty()) { res += info.table_id.database_name; diff --git a/src/Common/Macros.h b/src/Common/Macros.h index 3082452e297..9298dbfc2d5 100644 --- a/src/Common/Macros.h +++ b/src/Common/Macros.h @@ -40,6 +40,7 @@ public: bool expanded_database = false; bool expanded_table = false; bool expanded_uuid = false; + bool expanded_other = false; bool has_unknown = false; }; diff --git a/src/Common/MemorySanitizer.h b/src/Common/MemorySanitizer.h index 54a92ea3a19..9e34e454090 100644 --- a/src/Common/MemorySanitizer.h +++ b/src/Common/MemorySanitizer.h @@ -1,5 +1,7 @@ #pragma once +#include + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wreserved-id-macro" @@ -9,14 +11,15 @@ #define __msan_test_shadow(X, Y) (false) #define __msan_print_shadow(X, Y) #define __msan_unpoison_string(X) -#if defined(__has_feature) -# if __has_feature(memory_sanitizer) -# undef __msan_unpoison -# undef __msan_test_shadow -# undef __msan_print_shadow -# undef __msan_unpoison_string -# include -# endif + +#if defined(ch_has_feature) +# if ch_has_feature(memory_sanitizer) +# undef __msan_unpoison +# undef __msan_test_shadow +# undef __msan_print_shadow +# undef __msan_unpoison_string +# include +# endif #endif #ifdef __clang__ diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index a584885cf0f..60fb4d06b14 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -24,8 +24,8 @@ namespace /// /// - when it is explicitly blocked with LockExceptionInThread /// -/// - to avoid std::terminate(), when stack unwinding is currently in progress -/// in this thread. +/// - when there are uncaught exceptions objects in the current thread +/// (to avoid std::terminate()) /// /// NOTE: that since C++11 destructor marked with noexcept by default, and /// this means that any throw from destructor (that is not marked with diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 8e05dfea8b3..b1042332cfa 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -117,8 +117,11 @@ protected: template void alloc(size_t bytes, TAllocatorParams &&... 
allocator_params) { - c_start = c_end = reinterpret_cast(TAllocator::alloc(bytes, std::forward(allocator_params)...)) + pad_left; - c_end_of_storage = c_start + bytes - pad_right - pad_left; + char * allocated = reinterpret_cast(TAllocator::alloc(bytes, std::forward(allocator_params)...)); + + c_start = allocated + pad_left; + c_end = c_start; + c_end_of_storage = allocated + bytes - pad_right; if (pad_left) memset(c_start - ELEMENT_SIZE, 0, ELEMENT_SIZE); @@ -147,12 +150,12 @@ protected: ptrdiff_t end_diff = c_end - c_start; - c_start = reinterpret_cast( - TAllocator::realloc(c_start - pad_left, allocated_bytes(), bytes, std::forward(allocator_params)...)) - + pad_left; + char * allocated = reinterpret_cast( + TAllocator::realloc(c_start - pad_left, allocated_bytes(), bytes, std::forward(allocator_params)...)); + c_start = allocated + pad_left; c_end = c_start + end_diff; - c_end_of_storage = c_start + bytes - pad_right - pad_left; + c_end_of_storage = allocated + bytes - pad_right; } bool isInitialized() const @@ -318,11 +321,9 @@ protected: T * t_start() { return reinterpret_cast(this->c_start); } T * t_end() { return reinterpret_cast(this->c_end); } - T * t_end_of_storage() { return reinterpret_cast(this->c_end_of_storage); } const T * t_start() const { return reinterpret_cast(this->c_start); } const T * t_end() const { return reinterpret_cast(this->c_end); } - const T * t_end_of_storage() const { return reinterpret_cast(this->c_end_of_storage); } public: using value_type = T; @@ -334,7 +335,7 @@ public: using const_iterator = const T *; - PODArray() {} + PODArray() = default; PODArray(size_t n) { @@ -430,7 +431,7 @@ public: template void push_back(U && x, TAllocatorParams &&... allocator_params) { - if (unlikely(this->c_end == this->c_end_of_storage)) + if (unlikely(this->c_end + sizeof(T) > this->c_end_of_storage)) this->reserveForNextSize(std::forward(allocator_params)...); new (t_end()) T(std::forward(x)); @@ -443,7 +444,7 @@ public: template void emplace_back(Args &&... args) { - if (unlikely(this->c_end == this->c_end_of_storage)) + if (unlikely(this->c_end + sizeof(T) > this->c_end_of_storage)) this->reserveForNextSize(); new (t_end()) T(std::forward(args)...); @@ -529,6 +530,31 @@ public: this->c_end += bytes_to_copy; } + template + void insertFromItself(iterator from_begin, iterator from_end, TAllocatorParams && ... allocator_params) + { + static_assert(memcpy_can_be_used_for_assignment, std::decay_t>); + + /// Convert iterators to indexes because reserve can invalidate iterators + size_t start_index = from_begin - begin(); + size_t end_index = from_end - begin(); + size_t copy_size = end_index - start_index; + + assert(start_index <= end_index); + + size_t required_capacity = this->size() + copy_size; + if (required_capacity > this->capacity()) + this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward(allocator_params)...); + + size_t bytes_to_copy = this->byte_size(copy_size); + if (bytes_to_copy) + { + auto begin = this->c_start + this->byte_size(start_index); + memcpy(this->c_end, reinterpret_cast(&*begin), bytes_to_copy); + this->c_end += bytes_to_copy; + } + } + template void insert_assume_reserved(It1 from_begin, It2 from_end) { @@ -567,7 +593,7 @@ public: /// arr1 takes ownership of the heap memory of arr2. 
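
`insertFromItself` above exists because a plain range insert is unsafe when the source range points into the array itself: the `reserve` call may reallocate and invalidate the iterators. Converting the iterators to indexes first side-steps that. The same hazard and fix, shown with `std::vector` as a sketch rather than ClickHouse's `PODArray`:

```cpp
#include <cstddef>
#include <vector>

/// Append v[start, start + count) to v itself without dangling-iterator UB:
/// turn the iterators into indexes before reserve() can reallocate.
template <typename T>
void appendFromItself(std::vector<T> & v,
                      typename std::vector<T>::const_iterator from_begin,
                      typename std::vector<T>::const_iterator from_end)
{
    const size_t start = static_cast<size_t>(from_begin - v.cbegin());
    const size_t count = static_cast<size_t>(from_end - from_begin);

    v.reserve(v.size() + count);    /// may reallocate; from_begin/from_end are now dead
    for (size_t i = 0; i < count; ++i)
        v.push_back(v[start + i]);  /// indexes stay valid, capacity is already reserved
}
```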
arr1.c_start = arr2.c_start; - arr1.c_end_of_storage = arr1.c_start + heap_allocated - arr1.pad_right; + arr1.c_end_of_storage = arr1.c_start + heap_allocated - arr2.pad_right - arr2.pad_left; arr1.c_end = arr1.c_start + this->byte_size(heap_size); /// Allocate stack space for arr2. @@ -584,7 +610,7 @@ public: dest.dealloc(); dest.alloc(src.allocated_bytes(), std::forward(allocator_params)...); memcpy(dest.c_start, src.c_start, this->byte_size(src.size())); - dest.c_end = dest.c_start + (src.c_end - src.c_start); + dest.c_end = dest.c_start + this->byte_size(src.size()); src.c_start = Base::null; src.c_end = Base::null; @@ -638,8 +664,8 @@ public: size_t rhs_size = rhs.size(); size_t rhs_allocated = rhs.allocated_bytes(); - this->c_end_of_storage = this->c_start + rhs_allocated - Base::pad_right; - rhs.c_end_of_storage = rhs.c_start + lhs_allocated - Base::pad_right; + this->c_end_of_storage = this->c_start + rhs_allocated - Base::pad_right - Base::pad_left; + rhs.c_end_of_storage = rhs.c_start + lhs_allocated - Base::pad_right - Base::pad_left; this->c_end = this->c_start + this->byte_size(rhs_size); rhs.c_end = rhs.c_start + this->byte_size(lhs_size); @@ -691,35 +717,59 @@ public: assign(from.begin(), from.end()); } - - bool operator== (const PODArray & other) const + void erase(const_iterator first, const_iterator last) { - if (this->size() != other.size()) + iterator first_no_const = const_cast(first); + iterator last_no_const = const_cast(last); + + size_t items_to_move = end() - last; + + while (items_to_move != 0) + { + *first_no_const = *last_no_const; + + ++first_no_const; + ++last_no_const; + + --items_to_move; + } + + this->c_end = reinterpret_cast(first_no_const); + } + + void erase(const_iterator pos) + { + this->erase(pos, pos + 1); + } + + bool operator== (const PODArray & rhs) const + { + if (this->size() != rhs.size()) return false; - const_iterator this_it = begin(); - const_iterator that_it = other.begin(); + const_iterator lhs_it = begin(); + const_iterator rhs_it = rhs.begin(); - while (this_it != end()) + while (lhs_it != end()) { - if (*this_it != *that_it) + if (*lhs_it != *rhs_it) return false; - ++this_it; - ++that_it; + ++lhs_it; + ++rhs_it; } return true; } - bool operator!= (const PODArray & other) const + bool operator!= (const PODArray & rhs) const { - return !operator==(other); + return !operator==(rhs); } }; -template -void swap(PODArray & lhs, PODArray & rhs) +template +void swap(PODArray & lhs, PODArray & rhs) { lhs.swap(rhs); } diff --git a/src/Common/PoolBase.h b/src/Common/PoolBase.h index 43f4fbff9fe..6fc5aee26dd 100644 --- a/src/Common/PoolBase.h +++ b/src/Common/PoolBase.h @@ -51,7 +51,7 @@ private: */ struct PoolEntryHelper { - PoolEntryHelper(PooledObject & data_) : data(data_) { data.in_use = true; } + explicit PoolEntryHelper(PooledObject & data_) : data(data_) { data.in_use = true; } ~PoolEntryHelper() { std::unique_lock lock(data.pool.mutex); @@ -69,7 +69,7 @@ public: public: friend class PoolBase; - Entry() {} /// For deferred initialization. + Entry() = default; /// For deferred initialization. /** The `Entry` object protects the resource from being used by another thread. * The following methods are forbidden for `rvalue`, so you can not write a similar to @@ -99,10 +99,10 @@ public: private: std::shared_ptr data; - Entry(PooledObject & object) : data(std::make_shared(object)) {} + explicit Entry(PooledObject & object) : data(std::make_shared(object)) {} }; - virtual ~PoolBase() {} + virtual ~PoolBase() = default; /** Allocates the object. 
Wait for free object in pool for 'timeout'. With 'timeout' < 0, the timeout is infinite. */ Entry get(Poco::Timespan::TimeDiff timeout) diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h index 7779d18d969..141ac7a7e49 100644 --- a/src/Common/PoolWithFailoverBase.h +++ b/src/Common/PoolWithFailoverBase.h @@ -93,6 +93,19 @@ public: double staleness = 0.0; /// Helps choosing the "least stale" option when all replicas are stale. }; + struct PoolState; + + using PoolStates = std::vector; + + struct ShuffledPool + { + NestedPool * pool{}; + const PoolState * state{}; + size_t index = 0; + size_t error_count = 0; + size_t slowdown_count = 0; + }; + /// This functor must be provided by a client. It must perform a single try that takes a connection /// from the provided pool and checks that it is good. using TryGetEntryFunc = std::function; @@ -113,9 +126,6 @@ public: const GetPriorityFunc & get_priority = GetPriorityFunc()); protected: - struct PoolState; - - using PoolStates = std::vector; /// Returns a single connection. Entry get(size_t max_ignored_errors, bool fallback_to_stale_replicas, @@ -124,6 +134,12 @@ protected: /// This function returns a copy of pool states to avoid race conditions when modifying shared pool states. PoolStates updatePoolStates(size_t max_ignored_errors); + void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const; + + std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority); + + inline void updateSharedErrorCounts(std::vector & shuffled_pools); + auto getPoolExtendedStates() const { std::lock_guard lock(pool_states_mutex); @@ -143,6 +159,47 @@ protected: Poco::Logger * log; }; + +template +std::vector::ShuffledPool> +PoolWithFailoverBase::getShuffledPools( + size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority) +{ + /// Update random numbers and error counts. + PoolStates pool_states = updatePoolStates(max_ignored_errors); + if (get_priority) + { + for (size_t i = 0; i < pool_states.size(); ++i) + pool_states[i].priority = get_priority(i); + } + + /// Sort the pools into order in which they will be tried (based on respective PoolStates). + std::vector shuffled_pools; + shuffled_pools.reserve(nested_pools.size()); + for (size_t i = 0; i < nested_pools.size(); ++i) + shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0}); + std::sort( + shuffled_pools.begin(), shuffled_pools.end(), + [](const ShuffledPool & lhs, const ShuffledPool & rhs) + { + return PoolState::compare(*lhs.state, *rhs.state); + }); + + return shuffled_pools; +} + +template +inline void PoolWithFailoverBase::updateSharedErrorCounts(std::vector & shuffled_pools) +{ + std::lock_guard lock(pool_states_mutex); + for (const ShuffledPool & pool: shuffled_pools) + { + auto & pool_state = shared_pool_states[pool.index]; + pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); + pool_state.slowdown_count += pool.slowdown_count; + } +} + template typename TNestedPool::Entry PoolWithFailoverBase::get(size_t max_ignored_errors, bool fallback_to_stale_replicas, @@ -168,33 +225,7 @@ PoolWithFailoverBase::getMany( const TryGetEntryFunc & try_get_entry, const GetPriorityFunc & get_priority) { - /// Update random numbers and error counts. 
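
The new `getShuffledPools` above decides the order in which replicas are tried by sorting per-pool state copies; the comparison tuple matches the updated `PoolState::compare` shown further below, which now also includes the slowdown counter used by hedged requests. A compact sketch of that ordering with simplified surrounding types:

```cpp
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

/// Simplified replica state: the "smaller" tuple is tried first.
struct PoolState
{
    uint64_t error_count = 0;
    uint64_t slowdown_count = 0;   /// replica changes triggered by hedged requests
    int64_t config_priority = 1;
    int64_t priority = 0;
    uint64_t random = 0;

    static bool compare(const PoolState & lhs, const PoolState & rhs)
    {
        return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
             < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
    }
};

/// Order replica indexes the way getShuffledPools orders ShuffledPool entries.
std::vector<size_t> tryOrder(const std::vector<PoolState> & states)
{
    std::vector<size_t> order(states.size());
    for (size_t i = 0; i < order.size(); ++i)
        order[i] = i;
    std::sort(order.begin(), order.end(),
              [&](size_t a, size_t b) { return PoolState::compare(states[a], states[b]); });
    return order;
}
```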
- PoolStates pool_states = updatePoolStates(max_ignored_errors); - if (get_priority) - { - for (size_t i = 0; i < pool_states.size(); ++i) - pool_states[i].priority = get_priority(i); - } - - struct ShuffledPool - { - NestedPool * pool{}; - const PoolState * state{}; - size_t index = 0; - size_t error_count = 0; - }; - - /// Sort the pools into order in which they will be tried (based on respective PoolStates). - std::vector shuffled_pools; - shuffled_pools.reserve(nested_pools.size()); - for (size_t i = 0; i < nested_pools.size(); ++i) - shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0}); - std::sort( - shuffled_pools.begin(), shuffled_pools.end(), - [](const ShuffledPool & lhs, const ShuffledPool & rhs) - { - return PoolState::compare(*lhs.state, *rhs.state); - }); + std::vector shuffled_pools = getShuffledPools(max_ignored_errors, get_priority); /// We will try to get a connection from each pool until a connection is produced or max_tries is reached. std::vector try_results(shuffled_pools.size()); @@ -206,12 +237,7 @@ PoolWithFailoverBase::getMany( /// At exit update shared error counts with error counts occurred during this call. SCOPE_EXIT( { - std::lock_guard lock(pool_states_mutex); - for (const ShuffledPool & pool: shuffled_pools) - { - auto & pool_state = shared_pool_states[pool.index]; - pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); - } + updateSharedErrorCounts(shuffled_pools); }); std::string fail_messages; @@ -310,6 +336,8 @@ template struct PoolWithFailoverBase::PoolState { UInt64 error_count = 0; + /// The number of slowdowns that led to changing replica in HedgedRequestsFactory + UInt64 slowdown_count = 0; /// Priority from the configuration. Int64 config_priority = 1; /// Priority from the GetPriorityFunc. @@ -323,8 +351,8 @@ struct PoolWithFailoverBase::PoolState static bool compare(const PoolState & lhs, const PoolState & rhs) { - return std::forward_as_tuple(lhs.error_count, lhs.config_priority, lhs.priority, lhs.random) - < std::forward_as_tuple(rhs.error_count, rhs.config_priority, rhs.priority, rhs.random); + return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random) + < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random); } private: @@ -344,39 +372,7 @@ PoolWithFailoverBase::updatePoolStates(size_t max_ignored_errors) for (auto & state : shared_pool_states) state.randomize(); - time_t current_time = time(nullptr); - - if (last_error_decrease_time) - { - time_t delta = current_time - last_error_decrease_time; - - if (delta >= 0) - { - const UInt64 MAX_BITS = sizeof(UInt64) * CHAR_BIT; - size_t shift_amount = MAX_BITS; - /// Divide error counts by 2 every decrease_error_period seconds. - if (decrease_error_period) - shift_amount = delta / decrease_error_period; - /// Update time but don't do it more often than once a period. - /// Else if the function is called often enough, error count will never decrease. 
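
The error-count decay being moved into `updateErrorCounts` (removed here, re-added below) halves both counters every `decrease_error_period` seconds, implemented as a right shift by `delta / decrease_error_period`. A standalone sketch of that decay under the same assumptions:

```cpp
#include <climits>
#include <cstdint>
#include <ctime>

/// Halve error_count and slowdown_count once per decrease_error_period seconds.
void decayCounters(uint64_t & error_count, uint64_t & slowdown_count,
                   time_t & last_decrease_time, time_t decrease_error_period)
{
    const time_t current_time = time(nullptr);
    if (!last_decrease_time)
    {
        last_decrease_time = current_time;
        return;
    }

    const time_t delta = current_time - last_decrease_time;
    if (delta < 0)
        return;

    const uint64_t MAX_BITS = sizeof(uint64_t) * CHAR_BIT;
    uint64_t shift_amount = MAX_BITS;
    if (decrease_error_period)
        shift_amount = delta / decrease_error_period;   /// how many halvings are due

    if (!shift_amount)
        return;   /// called too soon: do not advance the timestamp, or counts never decrease

    last_decrease_time = current_time;
    if (shift_amount >= MAX_BITS)
    {
        error_count = 0;
        slowdown_count = 0;
    }
    else
    {
        error_count >>= shift_amount;
        slowdown_count >>= shift_amount;
    }
}
```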
- if (shift_amount) - last_error_decrease_time = current_time; - - if (shift_amount >= MAX_BITS) - { - for (auto & state : shared_pool_states) - state.error_count = 0; - } - else if (shift_amount) - { - for (auto & state : shared_pool_states) - state.error_count >>= shift_amount; - } - } - } - else - last_error_decrease_time = current_time; - + updateErrorCounts(shared_pool_states, last_error_decrease_time); result.assign(shared_pool_states.begin(), shared_pool_states.end()); } @@ -386,3 +382,46 @@ PoolWithFailoverBase::updatePoolStates(size_t max_ignored_errors) return result; } + +template +void PoolWithFailoverBase::updateErrorCounts(PoolWithFailoverBase::PoolStates & states, time_t & last_decrease_time) const +{ + time_t current_time = time(nullptr); + + if (last_decrease_time) + { + time_t delta = current_time - last_decrease_time; + + if (delta >= 0) + { + const UInt64 MAX_BITS = sizeof(UInt64) * CHAR_BIT; + size_t shift_amount = MAX_BITS; + /// Divide error counts by 2 every decrease_error_period seconds. + if (decrease_error_period) + shift_amount = delta / decrease_error_period; + /// Update time but don't do it more often than once a period. + /// Else if the function is called often enough, error count will never decrease. + if (shift_amount) + last_decrease_time = current_time; + + if (shift_amount >= MAX_BITS) + { + for (auto & state : states) + { + state.error_count = 0; + state.slowdown_count = 0; + } + } + else if (shift_amount) + { + for (auto & state : states) + { + state.error_count >>= shift_amount; + state.slowdown_count >>= shift_amount; + } + } + } + } + else + last_decrease_time = current_time; +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index c459bf41352..162d6e035cc 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -32,6 +32,8 @@ M(UncompressedCacheHits, "") \ M(UncompressedCacheMisses, "") \ M(UncompressedCacheWeightLost, "") \ + M(MMappedFileCacheHits, "") \ + M(MMappedFileCacheMisses, "") \ M(IOBufferAllocs, "") \ M(IOBufferAllocBytes, "") \ M(ArenaAllocChunks, "") \ @@ -68,6 +70,9 @@ M(DelayedInserts, "Number of times the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \ M(RejectedInserts, "Number of times the INSERT of a block to a MergeTree table was rejected with 'Too many parts' exception due to high number of active data parts for partition.") \ M(DelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \ + M(DistributedDelayedInserts, "Number of times the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \ + M(DistributedRejectedInserts, "Number of times the INSERT of a block to a Distributed table was rejected with 'Too many bytes' exception due to high number of pending bytes.") \ + M(DistributedDelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \ M(DuplicatedInsertedBlocks, "Number of times the INSERTed block to a ReplicatedMergeTree table was deduplicated.") \ \ M(ZooKeeperInit, "") \ @@ -94,6 +99,8 @@ M(DistributedConnectionStaleReplica, "") \ M(DistributedConnectionFailAtAll, "Total count when distributed connection fails after all retries finished") \ \ + M(HedgedRequestsChangeReplica, "Total count when timeout for changing replica expired in 
hedged requests.") \ + \ M(CompileFunction, "Number of times a compilation of generated LLVM code (to create fused function for complex expressions) was initiated.") \ M(CompiledFunctionExecute, "Number of times a compiled function was executed.") \ M(CompileExpressionsMicroseconds, "Total time spent for compilation of expressions to LLVM code.") \ @@ -139,6 +146,11 @@ M(StorageBufferPassedTimeMaxThreshold, "") \ M(StorageBufferPassedRowsMaxThreshold, "") \ M(StorageBufferPassedBytesMaxThreshold, "") \ + M(StorageBufferPassedTimeFlushThreshold, "") \ + M(StorageBufferPassedRowsFlushThreshold, "") \ + M(StorageBufferPassedBytesFlushThreshold, "") \ + M(StorageBufferLayerLockReadersWaitMilliseconds, "Time for waiting for Buffer layer during reading") \ + M(StorageBufferLayerLockWritersWaitMilliseconds, "Time for waiting free Buffer layer to write to (can be used to tune Buffer layers)") \ \ M(DictCacheKeysRequested, "") \ M(DictCacheKeysRequestedMiss, "") \ diff --git a/src/Common/RemoteHostFilter.cpp b/src/Common/RemoteHostFilter.cpp index fb6fc4e9bc3..6ea366314e1 100644 --- a/src/Common/RemoteHostFilter.cpp +++ b/src/Common/RemoteHostFilter.cpp @@ -42,6 +42,7 @@ void RemoteHostFilter::setValuesFromConfig(const Poco::Util::AbstractConfigurati else if (startsWith(key, "host")) primary_hosts.insert(config.getString("remote_url_allow_hosts." + key)); } + is_allow_by_default = false; } } @@ -58,6 +59,6 @@ bool RemoteHostFilter::checkForDirectEntry(const std::string & str) const } return true; } - return true; + return is_allow_by_default; } } diff --git a/src/Common/RemoteHostFilter.h b/src/Common/RemoteHostFilter.h index 48d9b2bda7c..a445471a411 100644 --- a/src/Common/RemoteHostFilter.h +++ b/src/Common/RemoteHostFilter.h @@ -24,6 +24,7 @@ public: void checkHostAndPort(const std::string & host, const std::string & port) const; /// Does the same as checkURL, but for host and port. 
private: + bool is_allow_by_default = true; std::unordered_set primary_hosts; /// Allowed primary () URL from config.xml std::vector regexp_hosts; /// Allowed regexp () URL from config.xml diff --git a/src/Common/SharedLibrary.cpp b/src/Common/SharedLibrary.cpp index 689179be7d8..37da308d5af 100644 --- a/src/Common/SharedLibrary.cpp +++ b/src/Common/SharedLibrary.cpp @@ -13,11 +13,11 @@ namespace ErrorCodes extern const int CANNOT_DLSYM; } -SharedLibrary::SharedLibrary(const std::string & path, int flags) +SharedLibrary::SharedLibrary(std::string_view path, int flags) { - handle = dlopen(path.c_str(), flags); + handle = dlopen(path.data(), flags); if (!handle) - throw Exception(std::string("Cannot dlopen: ") + dlerror(), ErrorCodes::CANNOT_DLOPEN); + throw Exception(ErrorCodes::CANNOT_DLOPEN, "Cannot dlopen: ({})", dlerror()); updatePHDRCache(); @@ -31,17 +31,18 @@ SharedLibrary::~SharedLibrary() std::terminate(); } -void * SharedLibrary::getImpl(const std::string & name, bool no_throw) +void * SharedLibrary::getImpl(std::string_view name, bool no_throw) { dlerror(); - auto * res = dlsym(handle, name.c_str()); + auto * res = dlsym(handle, name.data()); if (char * error = dlerror()) { if (no_throw) return nullptr; - throw Exception(std::string("Cannot dlsym: ") + error, ErrorCodes::CANNOT_DLSYM); + + throw Exception(ErrorCodes::CANNOT_DLSYM, "Cannot dlsym: ({})", error); } return res; diff --git a/src/Common/SharedLibrary.h b/src/Common/SharedLibrary.h index 9d2b9bc7843..866e60fbd33 100644 --- a/src/Common/SharedLibrary.h +++ b/src/Common/SharedLibrary.h @@ -14,23 +14,24 @@ namespace DB class SharedLibrary : private boost::noncopyable { public: - explicit SharedLibrary(const std::string & path, int flags = RTLD_LAZY); + explicit SharedLibrary(std::string_view path, int flags = RTLD_LAZY); ~SharedLibrary(); template - Func get(const std::string & name) + Func get(std::string_view name) { return reinterpret_cast(getImpl(name)); } + template - Func tryGet(const std::string & name) + Func tryGet(std::string_view name) { return reinterpret_cast(getImpl(name, true)); } private: - void * getImpl(const std::string & name, bool no_throw = false); + void * getImpl(std::string_view name, bool no_throw = false); void * handle = nullptr; }; diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 069c7774729..6a18cb77346 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -2,15 +2,18 @@ #include #include #include +#include +#include +#include + +#include +#include #include #include #include -#include -#include #include #include -#include -#include + namespace { @@ -36,9 +39,9 @@ namespace ErrorCodes extern const int CANNOT_CREATE_CHILD_PROCESS; } -ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_) +ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_) : pid(pid_) - , terminate_in_destructor(terminate_in_destructor_) + , destructor_strategy(destructor_strategy_) , in(in_fd_) , out(out_fd_) , err(err_fd_) @@ -52,14 +55,24 @@ Poco::Logger * ShellCommand::getLogger() ShellCommand::~ShellCommand() { - if (terminate_in_destructor) + if (wait_called) + return; + + if (destructor_strategy.terminate_in_destructor) { - LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); - int retcode = kill(pid, SIGTERM); - if (retcode != 0) - LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, 
errnoToString(retcode)); + size_t try_wait_timeout = destructor_strategy.wait_for_normal_exit_before_termination_seconds; + bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout); + + if (!process_terminated_normally) + { + LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + + int retcode = kill(pid, SIGTERM); + if (retcode != 0) + LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); + } } - else if (!wait_called) + else { try { @@ -72,6 +85,53 @@ ShellCommand::~ShellCommand() } } +bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) +{ + int status = 0; + + LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds); + + wait_called = true; + struct timespec interval {.tv_sec = 1, .tv_nsec = 0}; + + in.close(); + out.close(); + err.close(); + + if (timeout_in_seconds == 0) + { + /// If there is no timeout before signal try to waitpid 1 time without block so we can avoid sending + /// signal if process is already normally terminated. + + int waitpid_res = waitpid(pid, &status, WNOHANG); + bool process_terminated_normally = (waitpid_res == pid); + return process_terminated_normally; + } + + /// If timeout is positive try waitpid without block in loop until + /// process is normally terminated or waitpid return error + + while (timeout_in_seconds != 0) + { + int waitpid_res = waitpid(pid, &status, WNOHANG); + bool process_terminated_normally = (waitpid_res == pid); + + if (process_terminated_normally) + return true; + else if (waitpid_res == 0) + { + --timeout_in_seconds; + nanosleep(&interval, nullptr); + + continue; + } + else if (waitpid_res == -1 && errno != EINTR) + return false; + } + + return false; +} + void ShellCommand::logCommand(const char * filename, char * const argv[]) { WriteBufferFromOwnString args; @@ -87,7 +147,10 @@ void ShellCommand::logCommand(const char * filename, char * const argv[]) } std::unique_ptr ShellCommand::executeImpl( - const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor) + const char * filename, + char * const argv[], + bool pipe_stdin_only, + ShellCommandDestructorStrategy terminate_in_destructor_strategy) { logCommand(filename, argv); @@ -144,7 +207,7 @@ std::unique_ptr ShellCommand::executeImpl( } std::unique_ptr res(new ShellCommand( - pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor)); + pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor_strategy)); LOG_TRACE(getLogger(), "Started shell command '{}' with pid {}", filename, pid); return res; @@ -152,7 +215,9 @@ std::unique_ptr ShellCommand::executeImpl( std::unique_ptr ShellCommand::execute( - const std::string & command, bool pipe_stdin_only, bool terminate_in_destructor) + const std::string & command, + bool pipe_stdin_only, + ShellCommandDestructorStrategy terminate_in_destructor_strategy) { /// Arguments in non-constant chunks of memory (as required for `execv`). /// Moreover, their copying must be done before calling `vfork`, so after `vfork` do a minimum of things. 
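
`tryWaitProcessWithTimeout` above polls the child once per second with `waitpid(..., WNOHANG)` and only falls back to `SIGTERM` if the process does not exit on its own within the timeout. A standalone sketch of that wait-then-terminate pattern, showing just the system-call sequence rather than the ClickHouse class:

```cpp
#include <cerrno>
#include <csignal>
#include <ctime>
#include <sys/types.h>
#include <sys/wait.h>

/// Wait up to timeout_in_seconds for `pid` to exit on its own; send SIGTERM otherwise.
void waitOrTerminate(pid_t pid, size_t timeout_in_seconds)
{
    int status = 0;
    struct timespec interval{.tv_sec = 1, .tv_nsec = 0};

    while (timeout_in_seconds != 0)
    {
        const pid_t res = waitpid(pid, &status, WNOHANG);
        if (res == pid)
            return;                      /// exited normally, nothing else to do
        if (res == -1 && errno != EINTR)
            break;                       /// real waitpid error: stop waiting
        --timeout_in_seconds;
        nanosleep(&interval, nullptr);   /// still running: sleep one second and retry
    }

    kill(pid, SIGTERM);                  /// did not exit in time: ask it to terminate
}
```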
@@ -162,12 +227,14 @@ std::unique_ptr ShellCommand::execute( char * const argv[] = { argv0.data(), argv1.data(), argv2.data(), nullptr }; - return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor); + return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor_strategy); } std::unique_ptr ShellCommand::executeDirect( - const std::string & path, const std::vector & arguments, bool terminate_in_destructor) + const std::string & path, + const std::vector & arguments, + ShellCommandDestructorStrategy terminate_in_destructor_strategy) { size_t argv_sum_size = path.size() + 1; for (const auto & arg : arguments) @@ -188,7 +255,7 @@ std::unique_ptr ShellCommand::executeDirect( argv[arguments.size() + 1] = nullptr; - return executeImpl(path.data(), argv.data(), false, terminate_in_destructor); + return executeImpl(path.data(), argv.data(), false, terminate_in_destructor_strategy); } @@ -203,8 +270,11 @@ int ShellCommand::tryWait() LOG_TRACE(getLogger(), "Will wait for shell command pid {}", pid); int status = 0; - if (-1 == waitpid(pid, &status, 0)) - throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID); + while (waitpid(pid, &status, 0) < 0) + { + if (errno != EINTR) + throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID); + } LOG_TRACE(getLogger(), "Wait for shell command pid {} completed with status {}", pid, status); diff --git a/src/Common/ShellCommand.h b/src/Common/ShellCommand.h index 1c681227efd..f1d808128ff 100644 --- a/src/Common/ShellCommand.h +++ b/src/Common/ShellCommand.h @@ -23,21 +23,38 @@ namespace DB * The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr of running process, * and also to obtain the return code and completion status. */ -class ShellCommand + +struct ShellCommandDestructorStrategy final +{ + explicit ShellCommandDestructorStrategy(bool terminate_in_destructor_, size_t wait_for_normal_exit_before_termination_seconds_ = 0) + : terminate_in_destructor(terminate_in_destructor_) + , wait_for_normal_exit_before_termination_seconds(wait_for_normal_exit_before_termination_seconds_) + { + } + + bool terminate_in_destructor; + + /// If terminate in destructor is true, command will wait until send SIGTERM signal to created process + size_t wait_for_normal_exit_before_termination_seconds = 0; +}; + +class ShellCommand final { private: pid_t pid; bool wait_called = false; - bool terminate_in_destructor; + ShellCommandDestructorStrategy destructor_strategy; - ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_); + ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_); + + bool tryWaitProcessWithTimeout(size_t timeout_in_seconds); static Poco::Logger * getLogger(); /// Print command name and the list of arguments to log. NOTE: No escaping of arguments is performed. static void logCommand(const char * filename, char * const argv[]); - static std::unique_ptr executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor); + static std::unique_ptr executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, ShellCommandDestructorStrategy terminate_in_destructor_strategy); public: WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there. @@ -48,11 +65,11 @@ public: /// Run the command using /bin/sh -c. 
/// If terminate_in_destructor is true, send terminate signal in destructor and don't wait process. - static std::unique_ptr execute(const std::string & command, bool pipe_stdin_only = false, bool terminate_in_destructor = false); + static std::unique_ptr execute(const std::string & command, bool pipe_stdin_only = false, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false)); /// Run the executable with the specified arguments. `arguments` - without argv[0]. /// If terminate_in_destructor is true, send terminate signal in destructor and don't wait process. - static std::unique_ptr executeDirect(const std::string & path, const std::vector & arguments, bool terminate_in_destructor = false); + static std::unique_ptr executeDirect(const std::string & path, const std::vector & arguments, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false)); /// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself. void wait(); diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index c4cf7f11e68..812f888b284 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -35,7 +35,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext else error << "Address: " << info.si_addr; -#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) +#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__) auto err_mask = context.uc_mcontext.gregs[REG_ERR]; if ((err_mask & 0x02)) error << " Access: write."; @@ -184,8 +184,14 @@ static void * getCallerAddress(const ucontext_t & context) # else return reinterpret_cast(context.uc_mcontext.gregs[REG_RIP]); # endif + +#elif defined(__APPLE__) && defined(__aarch64__) + return reinterpret_cast(context.uc_mcontext->__ss.__pc); + #elif defined(__aarch64__) return reinterpret_cast(context.uc_mcontext.pc); +#elif defined(__powerpc64__) + return reinterpret_cast(context.uc_mcontext.gp_regs[PT_NIP]); #else return nullptr; #endif diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index 58660f9e4da..ef90a0d587d 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -11,7 +11,9 @@ #ifdef __APPLE__ // ucontext is not available without _XOPEN_SOURCE -# pragma clang diagnostic ignored "-Wreserved-id-macro" +# ifdef __clang__ +# pragma clang diagnostic ignored "-Wreserved-id-macro" +# endif # define _XOPEN_SOURCE 700 #endif #include @@ -37,8 +39,12 @@ public: static constexpr size_t capacity = #ifndef NDEBUG - /* The stacks are normally larger in debug version due to less inlining. */ - 64 + /* The stacks are normally larger in debug version due to less inlining. + * + * NOTE: it cannot be larger then 56 right now, since otherwise it will + * not fit into minimal PIPE_BUF (512) in TraceCollector. + */ + 56 #else 32 #endif diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index bd6c5d9eda0..a23184c9c0a 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -60,11 +60,11 @@ Otherwise you will get only exported symbols from program headers. 
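
With the new signatures above, callers opt into graceful shutdown by passing a `ShellCommandDestructorStrategy` instead of a bare bool. A hypothetical usage sketch (the command string and the 10-second timeout are made up for illustration, and this only compiles inside the ClickHouse source tree):

```cpp
#include <Common/ShellCommand.h>   /// ClickHouse header providing ShellCommand

int main()
{
    /// Terminate the child in the destructor, but give it up to 10 seconds
    /// to exit on its own before SIGTERM is sent.
    auto command = DB::ShellCommand::execute(
        "sleep 100",
        /* pipe_stdin_only = */ false,
        DB::ShellCommandDestructorStrategy(true, /* wait_for_normal_exit_before_termination_seconds = */ 10));

    /// ... read from command->out / write to command->in as usual ...
}   /// destructor: tryWaitProcessWithTimeout(10), then SIGTERM if still running
```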
#endif #define __msan_unpoison_string(X) // NOLINT -#if defined(__has_feature) -# if __has_feature(memory_sanitizer) -# undef __msan_unpoison_string -# include -# endif +#if defined(ch_has_feature) +# if ch_has_feature(memory_sanitizer) +# undef __msan_unpoison_string +# include +# endif #endif diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 8c01ed2d46f..1493d30ea01 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -101,7 +101,8 @@ ThreadStatus::~ThreadStatus() #if !defined(ARCADIA_BUILD) /// It may cause segfault if query_context was destroyed, but was not detached - assert((!query_context && query_id.empty()) || (query_context && query_id == query_context->getCurrentQueryId())); + auto query_context_ptr = query_context.lock(); + assert((!query_context_ptr && query_id.empty()) || (query_context_ptr && query_id == query_context_ptr->getCurrentQueryId())); #endif if (deleter) diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index dc5f09c5f3d..3b39e462fa6 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -1,20 +1,20 @@ #pragma once -#include -#include +#include +#include +#include #include #include +#include +#include -#include +#include -#include - -#include +#include #include +#include #include #include -#include -#include namespace Poco @@ -26,7 +26,6 @@ namespace Poco namespace DB { -class Context; class QueryStatus; class ThreadStatus; class QueryProfilerReal; @@ -58,8 +57,8 @@ public: ProfileEvents::Counters performance_counters{VariableContext::Process}; MemoryTracker memory_tracker{VariableContext::Process}; - Context * query_context = nullptr; - Context * global_context = nullptr; + ContextWeakPtr query_context; + ContextWeakPtr global_context; InternalTextLogsQueueWeakPtr logs_queue_ptr; std::function fatal_error_callback; @@ -122,9 +121,9 @@ protected: std::atomic thread_state{ThreadState::DetachedFromQuery}; /// Is set once - Context * global_context = nullptr; + ContextWeakPtr global_context; /// Use it only from current thread - Context * query_context = nullptr; + ContextWeakPtr query_context; String query_id; @@ -178,9 +177,9 @@ public: return query_id; } - const Context * getQueryContext() const + auto getQueryContext() const { - return query_context; + return query_context.lock(); } /// Starts new query and create new thread group for it, current thread becomes master thread of the query @@ -203,7 +202,7 @@ public: /// Sets query context for current master thread and its thread group /// NOTE: query_context have to be alive until detachQuery() is called - void attachQueryContext(Context & query_context); + void attachQueryContext(ContextPtr query_context); /// Update several ProfileEvents counters void updatePerformanceCounters(); diff --git a/src/Common/TimerDescriptor.cpp b/src/Common/TimerDescriptor.cpp index f4c3ec35588..791e6380a89 100644 --- a/src/Common/TimerDescriptor.cpp +++ b/src/Common/TimerDescriptor.cpp @@ -27,10 +27,16 @@ TimerDescriptor::TimerDescriptor(int clockid, int flags) throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL); } +TimerDescriptor::TimerDescriptor(TimerDescriptor && other) : timer_fd(other.timer_fd) +{ + other.timer_fd = -1; +} + TimerDescriptor::~TimerDescriptor() { /// Do not check for result cause cannot throw exception. 
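
Because `query_context` is now a `ContextWeakPtr`, every user has to `lock()` it and handle the case where the query context has already been destroyed, as the updated destructor assertion does. A generic sketch of that pattern with plain `std::weak_ptr`, not the ClickHouse `Context` type:

```cpp
#include <iostream>
#include <memory>
#include <string>

struct QueryContext { std::string query_id = "test-query"; };

void reportQueryId(const std::weak_ptr<QueryContext> & weak_context)
{
    /// lock() yields a shared_ptr that keeps the context alive for this scope,
    /// or an empty pointer if the query context was already destroyed.
    if (auto context = weak_context.lock())
        std::cout << "still attached to " << context->query_id << '\n';
    else
        std::cout << "query context is gone\n";
}

int main()
{
    auto context = std::make_shared<QueryContext>();
    std::weak_ptr<QueryContext> weak = context;
    reportQueryId(weak);   /// still attached
    context.reset();
    reportQueryId(weak);   /// gone
}
```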
- close(timer_fd); + if (timer_fd != -1) + close(timer_fd); } void TimerDescriptor::reset() const @@ -74,7 +80,7 @@ void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const spec.it_interval.tv_nsec = 0; spec.it_interval.tv_sec = 0; spec.it_value.tv_sec = timespan.totalSeconds(); - spec.it_value.tv_nsec = timespan.useconds(); + spec.it_value.tv_nsec = timespan.useconds() * 1000; if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr)) throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD); diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index ddb8f2a1367..42f8eb386af 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -12,12 +12,12 @@ private: int timer_fd; public: - explicit TimerDescriptor(int clockid, int flags); + explicit TimerDescriptor(int clockid = CLOCK_MONOTONIC, int flags = 0); ~TimerDescriptor(); TimerDescriptor(const TimerDescriptor &) = delete; TimerDescriptor & operator=(const TimerDescriptor &) = delete; - TimerDescriptor(TimerDescriptor &&) = default; + TimerDescriptor(TimerDescriptor && other); TimerDescriptor & operator=(TimerDescriptor &&) = default; int getDescriptor() const { return timer_fd; } diff --git a/src/Common/TraceCollector.cpp b/src/Common/TraceCollector.cpp index cbac9cd1a19..ab1845ebbd2 100644 --- a/src/Common/TraceCollector.cpp +++ b/src/Common/TraceCollector.cpp @@ -22,7 +22,9 @@ namespace { /// Normally query_id is a UUID (string with a fixed length) but user can provide custom query_id. /// Thus upper bound on query_id length should be introduced to avoid buffer overflow in signal handler. - constexpr size_t QUERY_ID_MAX_LEN = 1024; + /// + /// And it cannot be large, since otherwise it will not fit into PIPE_BUF. 
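
The small-looking `* 1000` change in `TimerDescriptor::setRelative` above matters because `Poco::Timespan::useconds()` returns the fractional part in microseconds while `itimerspec::tv_nsec` expects nanoseconds. A sketch of the corrected conversion, assuming a microsecond-based timespan like Poco's:

```cpp
#include <cstdint>
#include <time.h>

/// Build a one-shot relative itimerspec (as passed to timerfd_settime) from a
/// duration expressed as whole seconds plus leftover microseconds.
itimerspec toRelativeTimerSpec(int64_t total_seconds, int64_t leftover_microseconds)
{
    itimerspec spec{};
    spec.it_interval.tv_sec = 0;                           /// no repetition
    spec.it_interval.tv_nsec = 0;
    spec.it_value.tv_sec = total_seconds;
    spec.it_value.tv_nsec = leftover_microseconds * 1000;  /// microseconds -> nanoseconds
    return spec;
}
```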
+ constexpr size_t QUERY_ID_MAX_LEN = sizeof("00000000-0000-0000-0000-000000000000") - 1; // 36 } LazyPipeFDs pipe; @@ -60,10 +62,14 @@ void TraceCollector::collect(TraceType trace_type, const StackTrace & stack_trac 8 * sizeof(char) + // maximum VarUInt length for string size QUERY_ID_MAX_LEN * sizeof(char) + // maximum query_id length sizeof(UInt8) + // number of stack frames - sizeof(StackTrace::Frames) + // collected stack trace, maximum capacity + sizeof(StackTrace::FramePointers) + // collected stack trace, maximum capacity sizeof(TraceType) + // trace type sizeof(UInt64) + // thread_id sizeof(Int64); // size + /// Write should be atomic to avoid overlaps + /// (since recursive collect() is possible) + static_assert(buf_size < PIPE_BUF, "Only write of PIPE_BUF to pipe is atomic"); + char buffer[buf_size]; WriteBufferFromFileDescriptorDiscardOnFailure out(pipe.fds_rw[1], buf_size, buffer); diff --git a/src/Common/UInt128.h b/src/Common/UInt128.h index 06fddee8dc9..be96f409673 100644 --- a/src/Common/UInt128.h +++ b/src/Common/UInt128.h @@ -19,7 +19,7 @@ namespace DB struct UInt128 { /// Suppress gcc7 warnings: 'prev_key.DB::UInt128::low' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -92,7 +92,7 @@ struct UInt128 return static_cast(low); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -150,7 +150,7 @@ struct DummyUInt256 { /// Suppress gcc7 warnings: 'prev_key.DB::UInt256::a' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -179,7 +179,7 @@ struct DummyUInt256 bool operator== (const UInt64 rhs) const { return a == rhs && b == 0 && c == 0 && d == 0; } bool operator!= (const UInt64 rhs) const { return !operator==(rhs); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/XDBCBridgeHelper.h b/src/Common/XDBCBridgeHelper.h deleted file mode 100644 index 8da735c6fe3..00000000000 --- a/src/Common/XDBCBridgeHelper.h +++ /dev/null @@ -1,351 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !defined(ARCADIA_BUILD) -# include -#endif - -namespace DB -{ -namespace ErrorCodes -{ - extern const int EXTERNAL_SERVER_IS_NOT_RESPONDING; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - -/** - * Class for Helpers for Xdbc-bridges, provide utility methods, not main request - */ -class IXDBCBridgeHelper -{ -public: - static constexpr inline auto DEFAULT_FORMAT = "RowBinary"; - - virtual std::vector> getURLParams(const std::string & cols, UInt64 max_block_size) const = 0; - virtual void startBridgeSync() const = 0; - virtual Poco::URI getMainURI() const = 0; - virtual Poco::URI getColumnsInfoURI() const = 0; - virtual IdentifierQuotingStyle getIdentifierQuotingStyle() = 0; - virtual bool isSchemaAllowed() = 0; - virtual String getName() const = 0; - - virtual ~IXDBCBridgeHelper() = default; -}; - -using BridgeHelperPtr = std::shared_ptr; - -template -class XDBCBridgeHelper : public IXDBCBridgeHelper -{ -private: - Poco::Timespan http_timeout; - - std::string connection_string; - - Poco::URI ping_url; - - Poco::Logger * log = &Poco::Logger::get(BridgeHelperMixin::getName() + "BridgeHelper"); - - std::optional quote_style; - std::optional 
is_schema_allowed; - -protected: - auto getConnectionString() const - { - return connection_string; - } - -public: - using Configuration = Poco::Util::AbstractConfiguration; - - const Context & context; - const Configuration & config; - - static constexpr inline auto DEFAULT_HOST = "127.0.0.1"; - static constexpr inline auto DEFAULT_PORT = BridgeHelperMixin::DEFAULT_PORT; - static constexpr inline auto PING_HANDLER = "/ping"; - static constexpr inline auto MAIN_HANDLER = "/"; - static constexpr inline auto COL_INFO_HANDLER = "/columns_info"; - static constexpr inline auto IDENTIFIER_QUOTE_HANDLER = "/identifier_quote"; - static constexpr inline auto SCHEMA_ALLOWED_HANDLER = "/schema_allowed"; - static constexpr inline auto PING_OK_ANSWER = "Ok."; - - XDBCBridgeHelper(const Context & global_context_, const Poco::Timespan & http_timeout_, const std::string & connection_string_) - : http_timeout(http_timeout_), connection_string(connection_string_), context(global_context_), config(context.getConfigRef()) - { - size_t bridge_port = config.getUInt(BridgeHelperMixin::configPrefix() + ".port", DEFAULT_PORT); - std::string bridge_host = config.getString(BridgeHelperMixin::configPrefix() + ".host", DEFAULT_HOST); - - ping_url.setHost(bridge_host); - ping_url.setPort(bridge_port); - ping_url.setScheme("http"); - ping_url.setPath(PING_HANDLER); - } - - String getName() const override - { - return BridgeHelperMixin::getName(); - } - - IdentifierQuotingStyle getIdentifierQuotingStyle() override - { - if (!quote_style.has_value()) - { - startBridgeSync(); - - auto uri = createBaseURI(); - uri.setPath(IDENTIFIER_QUOTE_HANDLER); - uri.addQueryParameter("connection_string", getConnectionString()); - - ReadWriteBufferFromHTTP buf( - uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(context)); - std::string character; - readStringBinary(character, buf); - if (character.length() > 1) - throw Exception("Failed to parse quoting style from '" + character + "' for service " + BridgeHelperMixin::serviceAlias(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - else if (character.length() == 0) - quote_style = IdentifierQuotingStyle::None; - else if (character[0] == '`') - quote_style = IdentifierQuotingStyle::Backticks; - else if (character[0] == '"') - quote_style = IdentifierQuotingStyle::DoubleQuotes; - else - throw Exception("Can not map quote identifier '" + character + "' to enum value", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - } - - return *quote_style; - } - - bool isSchemaAllowed() override - { - if (!is_schema_allowed.has_value()) - { - startBridgeSync(); - - auto uri = createBaseURI(); - uri.setPath(SCHEMA_ALLOWED_HANDLER); - uri.addQueryParameter("connection_string", getConnectionString()); - - ReadWriteBufferFromHTTP buf( - uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(context)); - - bool res; - readBoolText(res, buf); - is_schema_allowed = res; - } - - return *is_schema_allowed; - } - - /** - * @todo leaky abstraction - used by external API's - */ - std::vector> getURLParams(const std::string & cols, UInt64 max_block_size) const override - { - std::vector> result; - - result.emplace_back("connection_string", connection_string); /// already validated - result.emplace_back("columns", cols); - result.emplace_back("max_block_size", std::to_string(max_block_size)); - - return result; - } - - /** - * Performs spawn of external daemon - */ - void startBridgeSync() const override - { - if (!checkBridgeIsRunning()) - { - LOG_TRACE(log, "{} is not 
running, will try to start it", BridgeHelperMixin::serviceAlias()); - startBridge(); - bool started = false; - - uint64_t milliseconds_to_wait = 10; /// Exponential backoff - uint64_t counter = 0; - while (milliseconds_to_wait < 10000) - { - ++counter; - LOG_TRACE(log, "Checking {} is running, try {}", BridgeHelperMixin::serviceAlias(), counter); - if (checkBridgeIsRunning()) - { - started = true; - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds_to_wait)); - milliseconds_to_wait *= 2; - } - - if (!started) - throw Exception(BridgeHelperMixin::getName() + "BridgeHelper: " + BridgeHelperMixin::serviceAlias() + " is not responding", - ErrorCodes::EXTERNAL_SERVER_IS_NOT_RESPONDING); - } - } - - /** - * URI to fetch the data from external service - */ - Poco::URI getMainURI() const override - { - auto uri = createBaseURI(); - uri.setPath(MAIN_HANDLER); - return uri; - } - - /** - * URI to retrieve column description from external service - */ - Poco::URI getColumnsInfoURI() const override - { - auto uri = createBaseURI(); - uri.setPath(COL_INFO_HANDLER); - return uri; - } - -protected: - Poco::URI createBaseURI() const - { - Poco::URI uri; - uri.setHost(ping_url.getHost()); - uri.setPort(ping_url.getPort()); - uri.setScheme("http"); - return uri; - } - -private: - bool checkBridgeIsRunning() const - { - try - { - ReadWriteBufferFromHTTP buf( - ping_url, Poco::Net::HTTPRequest::HTTP_GET, {}, ConnectionTimeouts::getHTTPTimeouts(context)); - return checkString(XDBCBridgeHelper::PING_OK_ANSWER, buf); - } - catch (...) - { - return false; - } - } - - /* Contains logic for instantiation of the bridge instance */ - void startBridge() const - { - auto cmd = BridgeHelperMixin::startBridge(config, log, http_timeout); - context.addXDBCBridgeCommand(std::move(cmd)); - } -}; - -struct JDBCBridgeMixin -{ - static constexpr inline auto DEFAULT_PORT = 9019; - static const String configPrefix() - { - return "jdbc_bridge"; - } - static const String serviceAlias() - { - return "clickhouse-jdbc-bridge"; - } - static const String getName() - { - return "JDBC"; - } - static AccessType getSourceAccessType() - { - return AccessType::JDBC; - } - - static std::unique_ptr startBridge(const Poco::Util::AbstractConfiguration &, const Poco::Logger *, const Poco::Timespan &) - { - throw Exception("jdbc-bridge is not running. 
Please, start it manually", ErrorCodes::EXTERNAL_SERVER_IS_NOT_RESPONDING); - } -}; - -struct ODBCBridgeMixin -{ - static constexpr inline auto DEFAULT_PORT = 9018; - - static const String configPrefix() - { - return "odbc_bridge"; - } - static const String serviceAlias() - { - return "clickhouse-odbc-bridge"; - } - static const String getName() - { - return "ODBC"; - } - static AccessType getSourceAccessType() - { - return AccessType::ODBC; - } - - static std::unique_ptr startBridge( - const Poco::Util::AbstractConfiguration & config, Poco::Logger * log, const Poco::Timespan & http_timeout) - { - /// Path to executable folder - Poco::Path path{config.getString("application.dir", "/usr/bin")}; - - std::vector cmd_args; - path.setFileName("clickhouse-odbc-bridge"); - -#if !CLICKHOUSE_SPLIT_BINARY - cmd_args.push_back("odbc-bridge"); -#endif - - cmd_args.push_back("--http-port"); - cmd_args.push_back(std::to_string(config.getUInt(configPrefix() + ".port", DEFAULT_PORT))); - cmd_args.push_back("--listen-host"); - cmd_args.push_back(config.getString(configPrefix() + ".listen_host", XDBCBridgeHelper::DEFAULT_HOST)); - cmd_args.push_back("--http-timeout"); - cmd_args.push_back(std::to_string(http_timeout.totalMicroseconds())); - if (config.has("logger." + configPrefix() + "_log")) - { - cmd_args.push_back("--log-path"); - cmd_args.push_back(config.getString("logger." + configPrefix() + "_log")); - } - if (config.has("logger." + configPrefix() + "_errlog")) - { - cmd_args.push_back("--err-log-path"); - cmd_args.push_back(config.getString("logger." + configPrefix() + "_errlog")); - } - if (config.has("logger." + configPrefix() + "_stdout")) - { - cmd_args.push_back("--stdout-path"); - cmd_args.push_back(config.getString("logger." + configPrefix() + "_stdout")); - } - if (config.has("logger." + configPrefix() + "_stderr")) - { - cmd_args.push_back("--stderr-path"); - cmd_args.push_back(config.getString("logger." + configPrefix() + "_stderr")); - } - if (config.has("logger." + configPrefix() + "_level")) - { - cmd_args.push_back("--log-level"); - cmd_args.push_back(config.getString("logger." 
+ configPrefix() + "_level")); - } - - LOG_TRACE(log, "Starting {}", serviceAlias()); - - return ShellCommand::executeDirect(path.toString(), cmd_args, true); - } -}; -} diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index c53ea60ec7c..2d947bb402c 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -116,6 +116,7 @@ struct Request virtual ~Request() = default; virtual String getPath() const = 0; virtual void addRootPath(const String & /* root_path */) {} + virtual size_t bytesSize() const { return 0; } }; struct Response; @@ -131,6 +132,7 @@ struct Response Response & operator=(const Response &) = default; virtual ~Response() = default; virtual void removeRootPath(const String & /* root_path */) {} + virtual size_t bytesSize() const { return 0; } }; struct WatchResponse : virtual Response @@ -140,6 +142,8 @@ struct WatchResponse : virtual Response String path; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override { return path.size() + sizeof(type) + sizeof(state); } }; using WatchCallback = std::function; @@ -154,6 +158,9 @@ struct CreateRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + data.size() + + sizeof(is_ephemeral) + sizeof(is_sequential) + acls.size() * sizeof(ACL); } }; struct CreateResponse : virtual Response @@ -161,6 +168,8 @@ struct CreateResponse : virtual Response String path_created; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override { return path_created.size(); } }; struct RemoveRequest : virtual Request @@ -170,6 +179,8 @@ struct RemoveRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + sizeof(version); } }; struct RemoveResponse : virtual Response @@ -182,11 +193,15 @@ struct ExistsRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct ExistsResponse : virtual Response { Stat stat; + + size_t bytesSize() const override { return sizeof(Stat); } }; struct GetRequest : virtual Request @@ -195,12 +210,16 @@ struct GetRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct GetResponse : virtual Response { String data; Stat stat; + + size_t bytesSize() const override { return data.size() + sizeof(stat); } }; struct SetRequest : virtual Request @@ -211,11 +230,15 @@ struct SetRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return data.size() + data.size() + sizeof(version); } }; struct SetResponse : virtual Response { Stat stat; + + size_t bytesSize() const override { return sizeof(stat); } }; struct ListRequest : virtual Request @@ -224,12 +247,22 @@ struct ListRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct ListResponse : virtual Response { std::vector names; Stat stat; + + size_t bytesSize() const override + { 
+ size_t size = sizeof(stat); + for (const auto & name : names) + size += name.size(); + return size; + } }; struct CheckRequest : virtual Request @@ -239,6 +272,8 @@ struct CheckRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + sizeof(version); } }; struct CheckResponse : virtual Response @@ -251,6 +286,14 @@ struct MultiRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return {}; } + + size_t bytesSize() const override + { + size_t size = 0; + for (const auto & request : requests) + size += request->bytesSize(); + return size; + } }; struct MultiResponse : virtual Response @@ -258,6 +301,14 @@ struct MultiResponse : virtual Response Responses responses; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override + { + size_t size = 0; + for (const auto & response : responses) + size += response->bytesSize(); + return size; + } }; /// This response may be received only as an element of responses in MultiResponse. @@ -391,6 +442,9 @@ public: virtual void multi( const Requests & requests, MultiCallback callback) = 0; + + /// Expire session and finish all pending requests + virtual void finalize() = 0; }; } diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 5951164f58f..36c875fe325 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -421,26 +421,38 @@ std::pair TestKeeperMultiRequest::process(TestKeeper::Contain try { - for (const auto & request : requests) + auto request_it = requests.begin(); + response.error = Error::ZOK; + while (request_it != requests.end()) { - const TestKeeperRequest & concrete_request = dynamic_cast(*request); + const TestKeeperRequest & concrete_request = dynamic_cast(**request_it); + ++request_it; auto [ cur_response, undo_action ] = concrete_request.process(container, zxid); response.responses.emplace_back(cur_response); if (cur_response->error != Error::ZOK) { response.error = cur_response->error; - - for (auto it = undo_actions.rbegin(); it != undo_actions.rend(); ++it) - if (*it) - (*it)(); - - return { std::make_shared(response), {} }; + break; + } + + undo_actions.emplace_back(std::move(undo_action)); + } + + if (response.error != Error::ZOK) + { + for (auto it = undo_actions.rbegin(); it != undo_actions.rend(); ++it) + if (*it) + (*it)(); + + while (request_it != requests.end()) + { + const TestKeeperRequest & concrete_request = dynamic_cast(**request_it); + ++request_it; + response.responses.emplace_back(concrete_request.createResponse()); + response.responses.back()->error = Error::ZRUNTIMEINCONSISTENCY; } - else - undo_actions.emplace_back(std::move(undo_action)); } - response.error = Error::ZOK; return { std::make_shared(response), {} }; } catch (...) diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index ca9f584304f..b46f98c0074 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -30,7 +30,7 @@ using TestKeeperRequestPtr = std::shared_ptr; * * NOTE: You can add various failure modes for better testing. 
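The reworked `TestKeeperMultiRequest::process` above follows a common transactional pattern: apply sub-requests one by one while collecting undo actions, and on the first failure roll the applied ones back in reverse order while answering the untried ones with `ZRUNTIMEINCONSISTENCY`. A simplified, self-contained sketch of that pattern (the `Op` type and the lambdas are illustrative, not the real TestKeeper request classes):

```cpp
#include <functional>
#include <iostream>
#include <vector>

enum class Error { ZOK, ZNONODE, ZRUNTIMEINCONSISTENCY };

struct Op
{
    std::function<Error()> apply;   /// mutates the (fake) container
    std::function<void()> undo;     /// reverts the mutation if it succeeded
};

/// Apply ops in order; on the first error, undo the applied ops in reverse
/// order and report ZRUNTIMEINCONSISTENCY for the ops that were never attempted.
std::vector<Error> processMulti(const std::vector<Op> & ops)
{
    std::vector<Error> results;
    std::vector<std::function<void()>> undo_actions;

    size_t i = 0;
    Error multi_error = Error::ZOK;
    for (; i < ops.size(); ++i)
    {
        Error err = ops[i].apply();
        results.push_back(err);
        if (err != Error::ZOK)
        {
            multi_error = err;
            break;
        }
        undo_actions.push_back(ops[i].undo);
    }

    if (multi_error != Error::ZOK)
    {
        for (auto it = undo_actions.rbegin(); it != undo_actions.rend(); ++it)
            (*it)();
        for (++i; i < ops.size(); ++i)
            results.push_back(Error::ZRUNTIMEINCONSISTENCY);
    }
    return results;
}

int main()
{
    int value = 0;
    std::vector<Op> ops = {
        { [&] { ++value; return Error::ZOK; }, [&] { --value; } },
        { [] { return Error::ZNONODE; }, [] {} },                   /// fails
        { [&] { ++value; return Error::ZOK; }, [&] { --value; } },  /// never attempted
    };
    auto results = processMulti(ops);
    std::cout << "value after rollback: " << value << ", responses: " << results.size() << '\n';
    /// prints: value after rollback: 0, responses: 3
}
```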
*/ -class TestKeeper : public IKeeper +class TestKeeper final : public IKeeper { public: TestKeeper(const String & root_path_, Poco::Timespan operation_timeout_); @@ -83,6 +83,7 @@ public: const Requests & requests, MultiCallback callback) override; + void finalize() override; struct Node { @@ -130,7 +131,6 @@ private: void pushRequest(RequestInfo && request); - void finalize(); ThreadFromGlobalPool processing_thread; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index a1c6eb9b481..9f59da233fc 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -44,7 +44,7 @@ static void check(Coordination::Error code, const std::string & path) } -void ZooKeeper::init(const std::string & implementation_, const std::string & hosts_, const std::string & identity_, +void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_) { log = &Poco::Logger::get("ZooKeeper"); @@ -60,13 +60,16 @@ void ZooKeeper::init(const std::string & implementation_, const std::string & ho if (hosts.empty()) throw KeeperException("No hosts passed to ZooKeeper constructor.", Coordination::Error::ZBADARGUMENTS); - std::vector hosts_strings; - splitInto<','>(hosts_strings, hosts); Coordination::ZooKeeper::Nodes nodes; - nodes.reserve(hosts_strings.size()); + nodes.reserve(hosts.size()); + + Strings shuffled_hosts = hosts; + /// Shuffle the hosts to distribute the load among ZooKeeper nodes. + pcg64 generator(randomSeed()); + std::shuffle(shuffled_hosts.begin(), shuffled_hosts.end(), generator); bool dns_error = false; - for (auto & host_string : hosts_strings) + for (auto & host_string : shuffled_hosts) { try { @@ -109,9 +112,9 @@ void ZooKeeper::init(const std::string & implementation_, const std::string & ho Poco::Timespan(0, operation_timeout_ms_ * 1000)); if (chroot.empty()) - LOG_TRACE(log, "Initialized, hosts: {}", hosts); + LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(hosts, ",")); else - LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", hosts, chroot); + LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(hosts, ","), chroot); } else if (implementation == "testkeeper") { @@ -128,7 +131,16 @@ void ZooKeeper::init(const std::string & implementation_, const std::string & ho throw KeeperException("Zookeeper root doesn't exist. 
You should create root node " + chroot + " before start.", Coordination::Error::ZNONODE); } -ZooKeeper::ZooKeeper(const std::string & hosts_, const std::string & identity_, int32_t session_timeout_ms_, +ZooKeeper::ZooKeeper(const std::string & hosts_string, const std::string & identity_, int32_t session_timeout_ms_, + int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_) +{ + Strings hosts_strings; + splitInto<','>(hosts_strings, hosts_string); + + init(implementation_, hosts_strings, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_); +} + +ZooKeeper::ZooKeeper(const Strings & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_) { init(implementation_, hosts_, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_); @@ -141,8 +153,6 @@ struct ZooKeeperArgs Poco::Util::AbstractConfiguration::Keys keys; config.keys(config_name, keys); - std::vector hosts_strings; - session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; implementation = "zookeeper"; @@ -150,7 +160,7 @@ struct ZooKeeperArgs { if (startsWith(key, "node")) { - hosts_strings.push_back( + hosts.push_back( (config.getBool(config_name + "." + key + ".secure", false) ? "secure://" : "") + config.getString(config_name + "." + key + ".host") + ":" + config.getString(config_name + "." + key + ".port", "2181") @@ -180,17 +190,6 @@ struct ZooKeeperArgs throw KeeperException(std::string("Unknown key ") + key + " in config file", Coordination::Error::ZBADARGUMENTS); } - /// Shuffle the hosts to distribute the load among ZooKeeper nodes. - pcg64 generator(randomSeed()); - std::shuffle(hosts_strings.begin(), hosts_strings.end(), generator); - - for (auto & host : hosts_strings) - { - if (!hosts.empty()) - hosts += ','; - hosts += host; - } - if (!chroot.empty()) { if (chroot.front() != '/') @@ -200,7 +199,7 @@ struct ZooKeeperArgs } } - std::string hosts; + Strings hosts; std::string identity; int session_timeout_ms; int operation_timeout_ms; @@ -244,6 +243,7 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings auto callback = [&](const Coordination::ListResponse & response) { + SCOPE_EXIT(event.set()); code = response.error; if (code == Coordination::Error::ZOK) { @@ -251,7 +251,6 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings if (stat) *stat = response.stat; } - event.set(); }; impl->list(path, callback, watch_callback); @@ -304,10 +303,10 @@ Coordination::Error ZooKeeper::createImpl(const std::string & path, const std::s auto callback = [&](const Coordination::CreateResponse & response) { + SCOPE_EXIT(event.set()); code = response.error; if (code == Coordination::Error::ZOK) path_created = response.path_created; - event.set(); }; impl->create(path, data, mode & 1, mode & 2, {}, callback); /// TODO better mode @@ -372,9 +371,9 @@ Coordination::Error ZooKeeper::removeImpl(const std::string & path, int32_t vers auto callback = [&](const Coordination::RemoveResponse & response) { + SCOPE_EXIT(event.set()); if (response.error != Coordination::Error::ZOK) code = response.error; - event.set(); }; impl->remove(path, version, callback); @@ -405,10 +404,10 @@ Coordination::Error ZooKeeper::existsImpl(const std::string & path, Coordination auto callback = [&](const Coordination::ExistsResponse & response) { + SCOPE_EXIT(event.set()); 
code = response.error; if (code == Coordination::Error::ZOK && stat) *stat = response.stat; - event.set(); }; impl->exists(path, callback, watch_callback); @@ -437,6 +436,7 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r auto callback = [&](const Coordination::GetResponse & response) { + SCOPE_EXIT(event.set()); code = response.error; if (code == Coordination::Error::ZOK) { @@ -444,7 +444,6 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r if (stat) *stat = response.stat; } - event.set(); }; impl->get(path, callback, watch_callback); @@ -509,10 +508,10 @@ Coordination::Error ZooKeeper::setImpl(const std::string & path, const std::stri auto callback = [&](const Coordination::SetResponse & response) { + SCOPE_EXIT(event.set()); code = response.error; if (code == Coordination::Error::ZOK && stat) *stat = response.stat; - event.set(); }; impl->set(path, data, version, callback); @@ -559,9 +558,9 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests auto callback = [&](const Coordination::MultiResponse & response) { + SCOPE_EXIT(event.set()); code = response.error; responses = response.responses; - event.set(); }; impl->multi(requests, callback); @@ -922,6 +921,10 @@ Coordination::Error ZooKeeper::tryMultiNoThrow(const Coordination::Requests & re } } +void ZooKeeper::finalize() +{ + impl->finalize(); +} size_t KeeperMultiException::getFailedOpIndex(Coordination::Error exception_code, const Coordination::Responses & responses) { @@ -1000,4 +1003,5 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version) request->version = version; return request; } + } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 5b37e4d6024..4a65ff070f7 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -50,7 +50,14 @@ class ZooKeeper public: using Ptr = std::shared_ptr; - ZooKeeper(const std::string & hosts_, const std::string & identity_ = "", + /// hosts_string -- comma separated [secure://]host:port list + ZooKeeper(const std::string & hosts_string, const std::string & identity_ = "", + int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS, + int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS, + const std::string & chroot_ = "", + const std::string & implementation_ = "zookeeper"); + + ZooKeeper(const Strings & hosts_, const std::string & identity_ = "", int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS, int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS, const std::string & chroot_ = "", @@ -247,10 +254,12 @@ public: /// Like the previous one but don't throw any exceptions on future.get() FutureMulti tryAsyncMulti(const Coordination::Requests & ops); + void finalize(); + private: friend class EphemeralNodeHolder; - void init(const std::string & implementation_, const std::string & hosts_, const std::string & identity_, + void init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_); /// The following methods don't throw exceptions but return error codes. 
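With the ZooKeeper client changes above, hosts are kept as a list: the legacy constructor still accepts a comma-separated `hosts_string` and splits it, and the shuffling now happens inside `init()` so every construction path spreads connections across ZooKeeper nodes. A rough sketch of that split-and-shuffle step using only the standard library (ClickHouse itself uses `splitInto<','>` and `pcg64(randomSeed())`):

```cpp
#include <algorithm>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <vector>

using Strings = std::vector<std::string>;

/// Split "host1:2181,secure://host2:2181" into individual endpoints.
Strings splitHosts(const std::string & hosts_string)
{
    Strings result;
    std::istringstream stream(hosts_string);
    std::string host;
    while (std::getline(stream, host, ','))
        if (!host.empty())
            result.push_back(host);
    return result;
}

/// Shuffle a copy of the host list so connection attempts spread the load
/// across ZooKeeper nodes instead of always hitting the first entry.
Strings shuffledHosts(const Strings & hosts)
{
    Strings shuffled = hosts;
    std::mt19937_64 generator(std::random_device{}());
    std::shuffle(shuffled.begin(), shuffled.end(), generator);
    return shuffled;
}

int main()
{
    for (const auto & host : shuffledHosts(splitHosts("zk1:2181,zk2:2181,secure://zk3:2181")))
        std::cout << host << '\n';
}
```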
@@ -266,7 +275,7 @@ private: std::unique_ptr impl; - std::string hosts; + Strings hosts; std::string identity; int32_t session_timeout_ms; int32_t operation_timeout_ms; diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 56f9de31ec8..50bdc6c77ba 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -455,6 +455,39 @@ ZooKeeperResponsePtr ZooKeeperCheckRequest::makeResponse() const { return std::m ZooKeeperResponsePtr ZooKeeperMultiRequest::makeResponse() const { return std::make_shared(requests); } ZooKeeperResponsePtr ZooKeeperCloseRequest::makeResponse() const { return std::make_shared(); } +void ZooKeeperSessionIDRequest::writeImpl(WriteBuffer & out) const +{ + Coordination::write(internal_id, out); + Coordination::write(session_timeout_ms, out); + Coordination::write(server_id, out); +} + +void ZooKeeperSessionIDRequest::readImpl(ReadBuffer & in) +{ + Coordination::read(internal_id, in); + Coordination::read(session_timeout_ms, in); + Coordination::read(server_id, in); +} + +Coordination::ZooKeeperResponsePtr ZooKeeperSessionIDRequest::makeResponse() const +{ + return std::make_shared(); +} + +void ZooKeeperSessionIDResponse::readImpl(ReadBuffer & in) +{ + Coordination::read(internal_id, in); + Coordination::read(session_id, in); + Coordination::read(server_id, in); +} + +void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const +{ + Coordination::write(internal_id, out); + Coordination::write(session_id, out); + Coordination::write(server_id, out); +} + void ZooKeeperRequestFactory::registerRequest(OpNum op_num, Creator creator) { if (!op_num_to_request.try_emplace(op_num, creator).second) @@ -511,6 +544,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); + registerZooKeeperRequest(*this); } } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 92b1e7c9858..dd95eaa6b67 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -84,6 +84,8 @@ struct ZooKeeperSyncRequest final : ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + path.size(); } }; struct ZooKeeperSyncResponse final : ZooKeeperResponse @@ -92,6 +94,8 @@ struct ZooKeeperSyncResponse final : ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Sync; } + + size_t bytesSize() const override { return path.size(); } }; struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse @@ -128,6 +132,9 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + sizeof(xid) + + sizeof(type) + scheme.size() + data.size(); } }; struct ZooKeeperAuthResponse final : ZooKeeperResponse @@ -136,6 +143,8 @@ struct ZooKeeperAuthResponse final : ZooKeeperResponse void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Auth; } + + size_t bytesSize() const override { return ZooKeeperResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct 
ZooKeeperCloseRequest final : ZooKeeperRequest @@ -172,6 +181,8 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return CreateRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse @@ -181,6 +192,8 @@ struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Create; } + + size_t bytesSize() const override { return CreateResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest @@ -194,6 +207,8 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return RemoveRequest::bytesSize() + sizeof(xid); } }; struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse @@ -201,6 +216,8 @@ struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse void readImpl(ReadBuffer &) override {} void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Remove; } + + size_t bytesSize() const override { return RemoveResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest @@ -211,6 +228,8 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return ExistsRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse @@ -218,6 +237,8 @@ struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Exists; } + + size_t bytesSize() const override { return ExistsResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest @@ -228,6 +249,8 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return GetRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse @@ -235,6 +258,8 @@ struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Get; } + + size_t bytesSize() const override { return GetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest @@ -247,6 +272,8 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return SetRequest::bytesSize() + sizeof(xid); } }; struct ZooKeeperSetResponse final : SetResponse, 
ZooKeeperResponse @@ -254,6 +281,8 @@ struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Set; } + + size_t bytesSize() const override { return SetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest @@ -263,6 +292,8 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return ListRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperSimpleListRequest final : ZooKeeperListRequest @@ -275,6 +306,8 @@ struct ZooKeeperListResponse : ListResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::List; } + + size_t bytesSize() const override { return ListResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperSimpleListResponse final : ZooKeeperListResponse @@ -293,6 +326,8 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return CheckRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse @@ -300,6 +335,8 @@ struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse void readImpl(ReadBuffer &) override {} void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Check; } + + size_t bytesSize() const override { return CheckResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; /// This response may be received only as an element of responses in MultiResponse. @@ -309,6 +346,8 @@ struct ZooKeeperErrorResponse final : ErrorResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Error; } + + size_t bytesSize() const override { return ErrorResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest @@ -323,6 +362,8 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override; + + size_t bytesSize() const override { return MultiRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse @@ -346,6 +387,41 @@ struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; + size_t bytesSize() const override { return MultiResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } +}; + +/// Fake internal coordination (keeper) response. Never received from client +/// and never send to client. 
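The `bytesSize()` overrides added throughout `ZooKeeperCommon.h` above all follow the same layering: the protocol payload (path, data, flags) is counted by the base request or response, and each ZooKeeper wire wrapper adds its own fixed-size framing fields such as `xid` and `zxid`. A toy illustration of that layered accounting; the types below are invented for the example and are not the real ZooKeeper hierarchy:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

/// Protocol-level payload: only what the operation itself carries.
struct CreateRequestPayload
{
    std::string path;
    std::string data;
    bool is_ephemeral = false;
    bool is_sequential = false;

    virtual ~CreateRequestPayload() = default;
    virtual size_t bytesSize() const
    {
        return path.size() + data.size() + sizeof(is_ephemeral) + sizeof(is_sequential);
    }
};

/// Wire-format wrapper: adds the framing fields every ZooKeeper packet has.
struct WireCreateRequest : CreateRequestPayload
{
    int32_t xid = 0;
    int64_t zxid = 0;
    bool has_watch = false;

    size_t bytesSize() const override
    {
        return CreateRequestPayload::bytesSize() + sizeof(xid) + sizeof(zxid) + sizeof(has_watch);
    }
};

int main()
{
    WireCreateRequest request;
    request.path = "/clickhouse/tables/01/replicas/r1";
    request.data = "some payload";
    std::cout << "approximate request size: " << request.bytesSize() << " bytes\n";
}
```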
+struct ZooKeeperSessionIDRequest final : ZooKeeperRequest +{ + int64_t internal_id; + int64_t session_timeout_ms; + /// Who requested this session + int32_t server_id; + + Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } + String getPath() const override { return {}; } + void writeImpl(WriteBuffer & out) const override; + void readImpl(ReadBuffer & in) override; + + Coordination::ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } +}; + +/// Fake internal coordination (keeper) response. Never received from client +/// and never send to client. +struct ZooKeeperSessionIDResponse final : ZooKeeperResponse +{ + int64_t internal_id; + int64_t session_id; + /// Who requested this session + int32_t server_id; + + void readImpl(ReadBuffer & in) override; + + void writeImpl(WriteBuffer & out) const override; + + Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } }; class ZooKeeperRequestFactory final : private boost::noncopyable diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index 295094b336b..d2dde4c4cdd 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -21,6 +21,7 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::Check), static_cast(OpNum::Multi), static_cast(OpNum::Auth), + static_cast(OpNum::SessionID), }; std::string toString(OpNum op_num) @@ -55,6 +56,8 @@ std::string toString(OpNum op_num) return "Heartbeat"; case OpNum::Auth: return "Auth"; + case OpNum::SessionID: + return "SessionID"; } int32_t raw_op = static_cast(op_num); throw Exception("Operation " + std::to_string(raw_op) + " is unknown", Error::ZUNIMPLEMENTED); diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 81ca6c6a460..f91204693a0 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -30,6 +30,7 @@ enum class OpNum : int32_t Check = 13, Multi = 14, Auth = 100, + SessionID = 997, /// Special internal request }; std::string toString(OpNum op_num); diff --git a/src/Common/ZooKeeper/ZooKeeperIO.cpp b/src/Common/ZooKeeper/ZooKeeperIO.cpp index 3f0905ea186..55448c9a109 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.cpp +++ b/src/Common/ZooKeeper/ZooKeeperIO.cpp @@ -3,7 +3,6 @@ namespace Coordination { - void write(size_t x, WriteBuffer & out) { x = __builtin_bswap64(x); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 2314139af69..a717052a1ba 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -796,8 +796,17 @@ void ZooKeeper::receiveEvent() /// In case we cannot read the response, we should indicate it as the error of that type /// when the user cannot assume whether the request was processed or not. response->error = Error::ZCONNECTIONLOSS; - if (request_info.callback) - request_info.callback(*response); + + try + { + if (request_info.callback) + request_info.callback(*response); + } + catch (...) + { + /// Throw initial exception, not exception from callback. 
+ tryLogCurrentException(__PRETTY_FUNCTION__); + } throw; } @@ -1003,6 +1012,16 @@ void ZooKeeper::pushRequest(RequestInfo && info) ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions); } +void ZooKeeper::executeGenericRequest( + const ZooKeeperRequestPtr & request, + ResponseCallback callback) +{ + RequestInfo request_info; + request_info.request = request; + request_info.callback = callback; + + pushRequest(std::move(request_info)); +} void ZooKeeper::create( const String & path, diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 71b7cd56149..2210fd98b18 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -88,7 +88,7 @@ using namespace DB; /** Usage scenario: look at the documentation for IKeeper class. */ -class ZooKeeper : public IKeeper +class ZooKeeper final : public IKeeper { public: struct Node @@ -121,6 +121,9 @@ public: /// Useful to check owner of ephemeral node. int64_t getSessionID() const override { return session_id; } + void executeGenericRequest( + const ZooKeeperRequestPtr & request, + ResponseCallback callback); /// See the documentation about semantics of these methods in IKeeper class. @@ -167,6 +170,20 @@ public: const Requests & requests, MultiCallback callback) override; + /// Without forcefully invalidating (finalizing) the ZooKeeper session before + /// establishing a new one, there was a possibility that the server is using + /// two ZooKeeper sessions simultaneously in different parts of the code. + /// This is a strong antipattern and we have always prevented it. + + /// ZooKeeper is linearizable for writes, but not linearizable for + /// reads; it only maintains "sequential consistency": in every session + /// you observe all events in order but possibly with some delay. If you + /// perform a write in one session, then notify a different part of the code + /// that does a read in another session, that read may not see the + /// already performed write. + + void finalize() override { finalize(false, false); } + private: String root_path; ACLs default_acls; diff --git a/src/Common/formatIPv6.h b/src/Common/formatIPv6.h index bd0c68d70f9..1cb9d3d1d81 100644 --- a/src/Common/formatIPv6.h +++ b/src/Common/formatIPv6.h @@ -25,7 +25,7 @@ void formatIPv6(const unsigned char * src, char *& dst, uint8_t zeroed_tail_byte /** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv4 string. * - * Parses the input string `src` and stores binary BE value into buffer pointed by `dst`, + * Parses the input string `src` and stores binary host-endian value into buffer pointed by `dst`, * which should be long enough. * That is "127.0.0.1" becomes 0x7f000001. * @@ -63,7 +63,7 @@ inline bool parseIPv4(const char * src, unsigned char * dst) /** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv6 string. * * Slightly altered implementation from http://svn.apache.org/repos/asf/apr/apr/trunk/network_io/unix/inet_pton.c -* Parses the input string `src` and stores binary LE value into buffer pointed by `dst`, +* Parses the input string `src` and stores binary big-endian value into buffer pointed by `dst`, * which should be long enough. In case of failure zeroes * IPV6_BINARY_LENGTH bytes of buffer pointed by `dst`.
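The corrected `formatIPv6.h` comments above are about byte order: the unsafe `parseIPv4` is documented to produce a host-endian number (so "127.0.0.1" becomes `0x7f000001`), while the IPv6 parser fills its 16-byte buffer in big-endian (network) order. A small sanity check of the IPv4 claim, using `inet_pton` plus `ntohl` instead of ClickHouse's own parser:

```cpp
#include <arpa/inet.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    /// inet_pton stores the address in network byte order: {127, 0, 0, 1}.
    unsigned char network_bytes[4];
    if (inet_pton(AF_INET, "127.0.0.1", network_bytes) != 1)
        return 1;

    uint32_t as_stored = 0;
    std::memcpy(&as_stored, network_bytes, sizeof(as_stored));

    /// Converting to host byte order yields the single number 0x7f000001,
    /// which is the representation the parseIPv4 comment above describes.
    uint32_t host_endian = ntohl(as_stored);
    std::printf("network-order dword: 0x%08x, host-endian value: 0x%08x\n", as_stored, host_endian);
    /// On a little-endian machine this prints 0x0100007f and 0x7f000001.
    return 0;
}
```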
* diff --git a/src/Common/formatReadable.cpp b/src/Common/formatReadable.cpp index fc5c6c19b50..fd8ff9abd86 100644 --- a/src/Common/formatReadable.cpp +++ b/src/Common/formatReadable.cpp @@ -13,7 +13,10 @@ namespace DB } } -static void formatReadable(double size, DB::WriteBuffer & out, int precision, const char ** units, size_t units_size, double delimiter) +// I wanted to make this ALWAYS_INLINE to prevent flappy performance tests, +// but GCC complains it may not be inlined. +static void formatReadable(double size, DB::WriteBuffer & out, + int precision, const char ** units, size_t units_size, double delimiter) { size_t i = 0; for (; i + 1 < units_size && fabs(size) >= delimiter; ++i) diff --git a/src/Common/parseRemoteDescription.cpp b/src/Common/parseRemoteDescription.cpp index cc89af26d99..477f5e0f250 100644 --- a/src/Common/parseRemoteDescription.cpp +++ b/src/Common/parseRemoteDescription.cpp @@ -1,6 +1,9 @@ #include "parseRemoteDescription.h" #include #include +#include +#include + namespace DB { @@ -167,4 +170,27 @@ std::vector parseRemoteDescription(const String & description, size_t l, return res; } + +std::vector> parseRemoteDescriptionForExternalDatabase(const String & description, size_t max_addresses, UInt16 default_port) +{ + auto addresses = parseRemoteDescription(description, 0, description.size(), '|', max_addresses); + std::vector> result; + + for (const auto & address : addresses) + { + size_t colon = address.find(':'); + if (colon == String::npos) + { + LOG_WARNING(&Poco::Logger::get("ParseRemoteDescription"), "Port is not found for host: {}. Using default port {}", address, default_port); + result.emplace_back(std::make_pair(address, default_port)); + } + else + { + result.emplace_back(std::make_pair(address.substr(0, colon), DB::parseFromString(address.substr(colon + 1)))); + } + } + + return result; +} + } diff --git a/src/Common/parseRemoteDescription.h b/src/Common/parseRemoteDescription.h index 6ba0bb4737f..f691a57dc73 100644 --- a/src/Common/parseRemoteDescription.h +++ b/src/Common/parseRemoteDescription.h @@ -17,4 +17,7 @@ namespace DB */ std::vector parseRemoteDescription(const String & description, size_t l, size_t r, char separator, size_t max_addresses); +/// Parse remote description for external database (MySQL or PostgreSQL). +std::vector> parseRemoteDescriptionForExternalDatabase(const String & description, size_t max_addresses, UInt16 default_port); + } diff --git a/src/Common/tests/compact_array.cpp b/src/Common/tests/compact_array.cpp index a63859ac712..af6257e1963 100644 --- a/src/Common/tests/compact_array.cpp +++ b/src/Common/tests/compact_array.cpp @@ -1,5 +1,5 @@ /// Bug in GCC: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif @@ -263,6 +263,6 @@ int main() return 0; } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp new file mode 100644 index 00000000000..74fd7cb6149 --- /dev/null +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -0,0 +1,531 @@ +#include +#include + +#include + +#include +#include + +#include + + +/// For the expansion of gtest macros. +#if defined(__clang__) + #pragma clang diagnostic ignored "-Wused-but-marked-unused" +#endif + +// All timezones present at build time and embedded into ClickHouse binary. 
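`parseRemoteDescriptionForExternalDatabase`, added in `parseRemoteDescription.cpp` above, splits each address on the first colon and falls back to a default port (with a warning) when none is given. A minimal standalone version of just that splitting rule, leaving out the `parseRemoteDescription` pattern expansion and the Poco logging:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

/// Split "host" or "host:port" into a (host, port) pair, using default_port
/// when the address does not specify one.
std::pair<std::string, uint16_t> splitHostPort(const std::string & address, uint16_t default_port)
{
    size_t colon = address.find(':');
    if (colon == std::string::npos)
        return {address, default_port};
    return {address.substr(0, colon), static_cast<uint16_t>(std::stoul(address.substr(colon + 1)))};
}

int main()
{
    const std::vector<std::string> addresses = {"db1.example.com:5433", "db2.example.com"};
    for (const auto & address : addresses)
    {
        auto [host, port] = splitHostPort(address, 5432);
        std::cout << host << " -> " << port << '\n';
    }
    /// db1.example.com -> 5433
    /// db2.example.com -> 5432
}
```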
+extern const char * auto_time_zones[]; + +namespace +{ + +cctz::civil_day YYYYMMDDToDay(unsigned value) +{ + return cctz::civil_day( + value / 10000, // year + (value % 10000) / 100, // month + value % 100); // day +} + +std::vector allTimezones(bool with_weird_offsets = true) +{ + std::vector result; + + const auto * timezone_name = auto_time_zones; + while (*timezone_name) + { + bool weird_offsets = (std::string_view(*timezone_name) == "Africa/Monrovia"); + + if (!weird_offsets || with_weird_offsets) + result.push_back(*timezone_name); + ++timezone_name; + } + + return result; +} + +struct FailuresCount +{ + size_t non_fatal = 0; + size_t fatal = 0; + size_t total = 0; +}; + +FailuresCount countFailures(const ::testing::TestResult & test_result) +{ + FailuresCount failures{0, 0, 0}; + const size_t count = test_result.total_part_count(); + for (size_t i = 0; i < count; ++i) + { + const auto & part = test_result.GetTestPartResult(i); + if (part.nonfatally_failed()) + { + ++failures.non_fatal; + ++failures.total; + } + if (part.fatally_failed()) + { + ++failures.fatal; + ++failures.total; + } + } + + return failures; +} + +} + +TEST(DateLUTTest, makeDayNumTest) +{ + const DateLUTImpl & lut = DateLUT::instance("UTC"); + EXPECT_EQ(0, lut.makeDayNum(2500, 12, 25)); + EXPECT_EQ(0, lut.makeDayNum(1924, 12, 31)); +} + + +TEST(DateLUTTest, TimeValuesInMiddleOfRange) +{ + const DateLUTImpl & lut = DateLUT::instance("Europe/Minsk"); + const time_t time = 1568650811; // 2019-09-16 19:20:11 (Monday) + + EXPECT_EQ(lut.getTimeZone(), "Europe/Minsk"); + EXPECT_EQ(lut.getOffsetAtStartOfEpoch(), 3600*3); // UTC-3 + + EXPECT_EQ(lut.toDate(time), 1568581200); + EXPECT_EQ(lut.toMonth(time), 9); + EXPECT_EQ(lut.toQuarter(time), 3); + EXPECT_EQ(lut.toYear(time), 2019); + EXPECT_EQ(lut.toDayOfMonth(time), 16); + + EXPECT_EQ(lut.toFirstDayOfWeek(time), 1568581200 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfWeek(time), DayNum(18155) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfMonth(time), 1567285200 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfMonth(time), DayNum(18140) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayNumOfQuarter(time), DayNum(18078) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfQuarter(time), 1561928400 /*time_t*/); + EXPECT_EQ(lut.toFirstDayOfYear(time), 1546290000 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfYear(time), DayNum(17897) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfNextMonth(time), 1569877200 /*time_t*/); + EXPECT_EQ(lut.toFirstDayOfPrevMonth(time), 1564606800 /*time_t*/); + EXPECT_EQ(lut.daysInMonth(time), 30 /*UInt8*/); + EXPECT_EQ(lut.toDateAndShift(time, 10), 1569445200 /*time_t*/); + EXPECT_EQ(lut.toTime(time), 58811 /*time_t*/); + EXPECT_EQ(lut.toHour(time), 19 /*unsigned*/); + EXPECT_EQ(lut.toSecond(time), 11 /*unsigned*/); + EXPECT_EQ(lut.toMinute(time), 20 /*unsigned*/); + EXPECT_EQ(lut.toStartOfMinute(time), 1568650800 /*time_t*/); + EXPECT_EQ(lut.toStartOfFiveMinute(time), 1568650800 /*time_t*/); + EXPECT_EQ(lut.toStartOfFifteenMinutes(time), 1568650500 /*time_t*/); + EXPECT_EQ(lut.toStartOfTenMinutes(time), 1568650800 /*time_t*/); + EXPECT_EQ(lut.toStartOfHour(time), 1568649600 /*time_t*/); + EXPECT_EQ(lut.toDayNum(time), DayNum(18155) /*DayNum*/); + EXPECT_EQ(lut.toDayOfYear(time), 259 /*unsigned*/); + EXPECT_EQ(lut.toRelativeWeekNum(time), 2594 /*unsigned*/); + EXPECT_EQ(lut.toISOYear(time), 2019 /*unsigned*/); + EXPECT_EQ(lut.toFirstDayNumOfISOYear(time), DayNum(17896) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfISOYear(time), 1546203600 /*time_t*/); + EXPECT_EQ(lut.toISOWeek(time), 38 
/*unsigned*/); + EXPECT_EQ(lut.toRelativeMonthNum(time), 24237 /*unsigned*/); + EXPECT_EQ(lut.toRelativeQuarterNum(time), 8078 /*unsigned*/); + EXPECT_EQ(lut.toRelativeHourNum(time), 435736 /*time_t*/); + EXPECT_EQ(lut.toRelativeMinuteNum(time), 26144180 /*time_t*/); + EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 1568650680 /*time_t*/); + EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 1568650811 /*time_t*/); + EXPECT_EQ(lut.toNumYYYYMM(time), 201909 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDD(time), 20190916 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDDhhmmss(time), 20190916192011 /*UInt64*/); + EXPECT_EQ(lut.addDays(time, 100), 1577290811 /*time_t*/); + EXPECT_EQ(lut.addWeeks(time, 100), 1629130811 /*time_t*/); + EXPECT_EQ(lut.addMonths(time, 100), 1831652411 /*time_t*/); + EXPECT_EQ(lut.addQuarters(time, 100), 2357655611 /*time_t*/); + EXPECT_EQ(lut.addYears(time, 10), 1884270011 /*time_t*/); + EXPECT_EQ(lut.timeToString(time), "2019-09-16 19:20:11" /*std::string*/); + EXPECT_EQ(lut.dateToString(time), "2019-09-16" /*std::string*/); +} + + +TEST(DateLUTTest, TimeValuesAtLeftBoderOfRange) +{ + const DateLUTImpl & lut = DateLUT::instance("UTC"); + const time_t time = 0; // 1970-01-01 00:00:00 (Thursday) + + EXPECT_EQ(lut.getTimeZone(), "UTC"); + + EXPECT_EQ(lut.toDate(time), 0); + EXPECT_EQ(lut.toMonth(time), 1); + EXPECT_EQ(lut.toQuarter(time), 1); + EXPECT_EQ(lut.toYear(time), 1970); + EXPECT_EQ(lut.toDayOfMonth(time), 1); + + EXPECT_EQ(lut.toFirstDayOfWeek(time), -259200 /*time_t*/); // 1969-12-29 00:00:00 + EXPECT_EQ(lut.toFirstDayNumOfWeek(time), ExtendedDayNum(-3) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfMonth(time), 0 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfMonth(time), DayNum(0) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayNumOfQuarter(time), DayNum(0) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfQuarter(time), 0 /*time_t*/); + EXPECT_EQ(lut.toFirstDayOfYear(time), 0 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfYear(time), DayNum(0) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfNextMonth(time), 2678400 /*time_t*/); + EXPECT_EQ(lut.toFirstDayOfPrevMonth(time), -2678400 /*time_t*/); // 1969-12-01 00:00:00 + EXPECT_EQ(lut.daysInMonth(time), 31 /*UInt8*/); + EXPECT_EQ(lut.toDateAndShift(time, 10), 864000 /*time_t*/); + EXPECT_EQ(lut.toTime(time), 0 /*time_t*/); + EXPECT_EQ(lut.toHour(time), 0 /*unsigned*/); + EXPECT_EQ(lut.toSecond(time), 0 /*unsigned*/); + EXPECT_EQ(lut.toMinute(time), 0 /*unsigned*/); + EXPECT_EQ(lut.toStartOfMinute(time), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfFiveMinute(time), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfFifteenMinutes(time), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfTenMinutes(time), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfHour(time), 0 /*time_t*/); + EXPECT_EQ(lut.toDayNum(time), DayNum(0) /*DayNum*/); + EXPECT_EQ(lut.toDayOfYear(time), 1 /*unsigned*/); + EXPECT_EQ(lut.toRelativeWeekNum(time), 0 /*unsigned*/); + EXPECT_EQ(lut.toISOYear(time), 1970 /*unsigned*/); + EXPECT_EQ(lut.toFirstDayNumOfISOYear(time), ExtendedDayNum(-3) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfISOYear(time), -259200 /*time_t*/); // 1969-12-29 00:00:00 + EXPECT_EQ(lut.toISOWeek(time), 1 /*unsigned*/); + EXPECT_EQ(lut.toRelativeMonthNum(time), 23641 /*unsigned*/); // ? + EXPECT_EQ(lut.toRelativeQuarterNum(time), 7880 /*unsigned*/); // ? 
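The `DayNum` constants expected by these tests are simply the number of whole days between the civil date and 1970-01-01 (for example `DayNum(18155)` for 2019-09-16), which can be double-checked with cctz's civil-time arithmetic, already used by this test file:

```cpp
#include <cctz/civil_time.h>
#include <iostream>

int main()
{
    /// civil_day subtraction yields a day count, i.e. the DayNum used by DateLUTImpl.
    const cctz::civil_day epoch(1970, 1, 1);
    std::cout << (cctz::civil_day(2019, 9, 16) - epoch) << '\n';  /// 18155
    std::cout << (cctz::civil_day(2106, 1, 31) - epoch) << '\n';  /// 49703
    return 0;
}
```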
+ EXPECT_EQ(lut.toRelativeHourNum(time), 0 /*time_t*/); + EXPECT_EQ(lut.toRelativeMinuteNum(time), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 0 /*time_t*/); + EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 0 /*time_t*/); + EXPECT_EQ(lut.toNumYYYYMM(time), 197001 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDD(time), 19700101 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDDhhmmss(time), 19700101000000 /*UInt64*/); + EXPECT_EQ(lut.addDays(time, 100), 8640000 /*time_t*/); + EXPECT_EQ(lut.addWeeks(time, 100), 60480000 /*time_t*/); + EXPECT_EQ(lut.addMonths(time, 100), 262828800 /*time_t*/); + EXPECT_EQ(lut.addQuarters(time, 100), 788918400 /*time_t*/); + EXPECT_EQ(lut.addYears(time, 10), 315532800 /*time_t*/); + EXPECT_EQ(lut.timeToString(time), "1970-01-01 00:00:00" /*std::string*/); + EXPECT_EQ(lut.dateToString(time), "1970-01-01" /*std::string*/); +} + +TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) +{ + // Value is at the right border of the old (small) LUT, and provides meaningful values where old LUT would provide garbage. + const DateLUTImpl & lut = DateLUT::instance("UTC"); + + const time_t time = 4294343873; // 2106-01-31T01:17:53 (Sunday) + + EXPECT_EQ(lut.getTimeZone(), "UTC"); + + EXPECT_EQ(lut.toDate(time), 4294339200); + EXPECT_EQ(lut.toMonth(time), 1); + EXPECT_EQ(lut.toQuarter(time), 1); + EXPECT_EQ(lut.toYear(time), 2106); + EXPECT_EQ(lut.toDayOfMonth(time), 31); + + EXPECT_EQ(lut.toFirstDayOfWeek(time), 4293820800 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfWeek(time), DayNum(49697)); + EXPECT_EQ(lut.toFirstDayOfMonth(time), 4291747200 /*time_t*/); // 2016-01-01 + EXPECT_EQ(lut.toFirstDayNumOfMonth(time), DayNum(49673)); + EXPECT_EQ(lut.toFirstDayNumOfQuarter(time), DayNum(49673) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfQuarter(time), 4291747200 /*time_t*/); + EXPECT_EQ(lut.toFirstDayOfYear(time), 4291747200 /*time_t*/); + EXPECT_EQ(lut.toFirstDayNumOfYear(time), DayNum(49673) /*DayNum*/); + EXPECT_EQ(lut.toFirstDayOfNextMonth(time), 4294425600 /*time_t*/); // 2106-02-01 + EXPECT_EQ(lut.toFirstDayOfPrevMonth(time), 4289068800 /*time_t*/); // 2105-12-01 + EXPECT_EQ(lut.daysInMonth(time), 31 /*UInt8*/); + EXPECT_EQ(lut.toDateAndShift(time, 10), 4295203200 /*time_t*/); // 2106-02-10 + EXPECT_EQ(lut.toTime(time), 4673 /*time_t*/); + EXPECT_EQ(lut.toHour(time), 1 /*unsigned*/); + EXPECT_EQ(lut.toMinute(time), 17 /*unsigned*/); + EXPECT_EQ(lut.toSecond(time), 53 /*unsigned*/); + EXPECT_EQ(lut.toStartOfMinute(time), 4294343820 /*time_t*/); + EXPECT_EQ(lut.toStartOfFiveMinute(time), 4294343700 /*time_t*/); + EXPECT_EQ(lut.toStartOfFifteenMinutes(time), 4294343700 /*time_t*/); + EXPECT_EQ(lut.toStartOfTenMinutes(time), 4294343400 /*time_t*/); + EXPECT_EQ(lut.toStartOfHour(time), 4294342800 /*time_t*/); + EXPECT_EQ(lut.toDayNum(time), DayNum(49703) /*DayNum*/); + EXPECT_EQ(lut.toDayOfYear(time), 31 /*unsigned*/); + EXPECT_EQ(lut.toRelativeWeekNum(time), 7100 /*unsigned*/); + EXPECT_EQ(lut.toISOYear(time), 2106 /*unsigned*/); + EXPECT_EQ(lut.toFirstDayNumOfISOYear(time), DayNum(49676) /*DayNum*/); // 2106-01-04 + EXPECT_EQ(lut.toFirstDayOfISOYear(time), 4292006400 /*time_t*/); + EXPECT_EQ(lut.toISOWeek(time), 4 /*unsigned*/); + EXPECT_EQ(lut.toRelativeMonthNum(time), 25273 /*unsigned*/); + EXPECT_EQ(lut.toRelativeQuarterNum(time), 8424 /*unsigned*/); + EXPECT_EQ(lut.toRelativeHourNum(time), 1192873 /*time_t*/); + EXPECT_EQ(lut.toRelativeMinuteNum(time), 71572397 /*time_t*/); + EXPECT_EQ(lut.toStartOfMinuteInterval(time, 6), 4294343520 /*time_t*/); + 
EXPECT_EQ(lut.toStartOfSecondInterval(time, 7), 4294343872 /*time_t*/); + EXPECT_EQ(lut.toNumYYYYMM(time), 210601 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDD(time), 21060131 /*UInt32*/); + EXPECT_EQ(lut.toNumYYYYMMDDhhmmss(time), 21060131011753 /*UInt64*/); + EXPECT_EQ(lut.addDays(time, 100), 4302983873 /*time_t*/); + EXPECT_EQ(lut.addWeeks(time, 10), 4300391873 /*time_t*/); + EXPECT_EQ(lut.addMonths(time, 10), 4320523073 /*time_t*/); // 2106-11-30 01:17:53 + EXPECT_EQ(lut.addQuarters(time, 10), 4373140673 /*time_t*/); // 2108-07-31 01:17:53 + EXPECT_EQ(lut.addYears(time, 10), 4609876673 /*time_t*/); // 2116-01-31 01:17:53 + + EXPECT_EQ(lut.timeToString(time), "2106-01-31 01:17:53" /*std::string*/); + EXPECT_EQ(lut.dateToString(time), "2106-01-31" /*std::string*/); +} + + +class DateLUTWithTimeZone : public ::testing::TestWithParam +{}; + +TEST_P(DateLUTWithTimeZone, LoadLUT) +{ + // There are some assumptions and assertions about TZ data made in DateLUTImpl which are verified upon loading, + // to make sure that those assertions are true for all timezones we are going to load all of them one by one. + DateLUT::instance(GetParam()); +} + +// Another long running test, shouldn't be run to often +TEST_P(DateLUTWithTimeZone, VaidateTimeComponentsAroundEpoch) +{ + // Converting time around 1970-01-01 to hour-minute-seconds time components + // could be problematic. + const size_t max_failures_per_tz = 3; + const auto * timezone_name = GetParam(); + + const auto * test_info = ::testing::UnitTest::GetInstance()->current_test_info(); + const DateLUTImpl & lut = DateLUT::instance(timezone_name); + + for (time_t i = -856147870; i < 86400 * 10000; i += 11 * 13 * 17 * 19) + { + SCOPED_TRACE(::testing::Message() + << "\n\tTimezone: " << timezone_name + << "\n\ttimestamp: " << i + << "\n\t offset at start of epoch : " << lut.getOffsetAtStartOfEpoch() + << "\n\t offset_at_start_of_lut : " << lut.getTimeOffsetAtStartOfLUT()); + + EXPECT_GE(24, lut.toHour(i)); + EXPECT_GT(60, lut.toMinute(i)); + EXPECT_GT(60, lut.toSecond(i)); + + const auto current_failures = countFailures(*test_info->result()); + if (current_failures.total > 0) + { + if (i < 0) + i = -1; + } + + if (current_failures.total >= max_failures_per_tz) + break; + } +} + +TEST_P(DateLUTWithTimeZone, getTimeZone) +{ + const auto & lut = DateLUT::instance(GetParam()); + + EXPECT_EQ(GetParam(), lut.getTimeZone()); +} + + +// Group of tests for timezones that have or had some time ago an offset which is not multiple of 15 minutes. 
+INSTANTIATE_TEST_SUITE_P(ExoticTimezones,
+    DateLUTWithTimeZone,
+    ::testing::ValuesIn(std::initializer_list<const char *>{
+        "Africa/El_Aaiun",
+        "Pacific/Apia",
+        "Pacific/Enderbury",
+        "Pacific/Fakaofo",
+        "Pacific/Kiritimati",
+    })
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimeZones,
+    DateLUTWithTimeZone,
+    ::testing::ValuesIn(allTimezones())
+);
+
+std::ostream & operator<<(std::ostream & ostr, const DateLUTImpl::Values & v)
+{
+    return ostr << "DateLUTImpl::Values{"
+        << "\n\t date : " << v.date
+        << "\n\t year : " << static_cast<int>(v.year)
+        << "\n\t month : " << static_cast<int>(v.month)
+        << "\n\t day : " << static_cast<int>(v.day_of_month)
+        << "\n\t weekday : " << static_cast<int>(v.day_of_week)
+        << "\n\t days in month : " << static_cast<int>(v.days_in_month)
+        << "\n\t offset change : " << v.amount_of_offset_change()
+        << "\n\t offset change at : " << v.time_at_offset_change()
+        << "\n}";
+}
+
+struct TimeRangeParam
+{
+    const cctz::civil_second begin;
+    const cctz::civil_second end;
+    const int step_in_seconds;
+};
+
+std::ostream & operator<<(std::ostream & ostr, const TimeRangeParam & param)
+{
+    return ostr << param.begin << " : " << param.end << " step: " << param.step_in_seconds << "s";
+}
+
+class DateLUTWithTimeZoneAndTimeRange : public ::testing::TestWithParam<std::tuple<const char *, TimeRangeParam>>
+{};
+
+// Refactored test from tests/date_lut3.cpp
+TEST_P(DateLUTWithTimeZoneAndTimeRange, InRange)
+{
+    // For time_t values in the range [begin, end) to match the reference obtained from cctz:
+    // compare date and time components: year, month, day, hours, minutes, seconds, formatted time string.
+    const auto & [timezone_name, range_data] = GetParam();
+    const auto & [begin, end, step] = range_data;
+
+    const auto * test_info = ::testing::UnitTest::GetInstance()->current_test_info();
+    static const size_t max_failures_per_case = 3;
+    cctz::time_zone tz;
+    ASSERT_TRUE(cctz::load_time_zone(timezone_name, &tz));
+
+    const auto & lut = DateLUT::instance(timezone_name);
+    const auto start = cctz::convert(begin, tz).time_since_epoch().count();
+    const auto stop = cctz::convert(end, tz).time_since_epoch().count();
+
+    for (time_t expected_time_t = start; expected_time_t < stop; expected_time_t += step)
+    {
+        SCOPED_TRACE(expected_time_t);
+
+        const cctz::civil_second tz_time = cctz::convert(std::chrono::system_clock::from_time_t(expected_time_t), tz);
+
+        /// Weird offset, not supported.
+        /// Example: Africa/Monrovia has offset UTC-0:44:30 in year 1970.
+
+        auto timestamp_current_day_pre = std::chrono::system_clock::to_time_t(tz.lookup(cctz::civil_day(tz_time)).pre);
+        auto timestamp_current_day_post = std::chrono::system_clock::to_time_t(tz.lookup(cctz::civil_day(tz_time) + 1).post);
+
+        if (timestamp_current_day_pre % 900 || timestamp_current_day_post % 900)
+            continue;
+
+        /// Unsupported timezone transitions - not at a 15-minute time point or to a different day.
+        /// Example: America/Goose_Bay decided to go back one hour at 00:01:
+        /// $ seq 1289097900 30 1289103600 | TZ=America/Goose_Bay LC_ALL=C xargs -I{} date -d @{}
+        /// Sat Nov 6 23:59:00 ADT 2010
+        /// Sat Nov 6 23:59:30 ADT 2010
+        /// Sun Nov 7 00:00:00 ADT 2010
+        /// Sun Nov 7 00:00:30 ADT 2010
+        /// Sat Nov 6 23:01:00 AST 2010
+        /// Sat Nov 6 23:01:30 AST 2010
+
+        bool has_transition = false;
+        cctz::time_zone::civil_transition transition{};
+        if (tz.next_transition(std::chrono::system_clock::from_time_t(expected_time_t - 1), &transition)
+            && (transition.from.day() == tz_time.day() || transition.to.day() == tz_time.day()))
+        {
+            has_transition = true;
+        }
+
+        if (has_transition && (transition.from.second() != 0 || transition.from.minute() % 15 != 0))
+        {
+            /*std::cerr << "Skipping " << timezone_name << " " << tz_time
+                << " because of unsupported timezone transition from " << transition.from << " to " << transition.to
+                << " (not divisible by 15 minutes)\n";*/
+            continue;
+        }
+
+        /// Transition to the previous day, but not from midnight.
+        if (has_transition && cctz::civil_day(transition.from) == cctz::civil_day(transition.to) + 1
+            && transition.from != cctz::civil_day(transition.from))
+        {
+            /*std::cerr << "Skipping " << timezone_name << " " << tz_time
+                << " because of unsupported timezone transition from " << transition.from << " to " << transition.to
+                << " (to previous day but not at midnight)\n";*/
+            continue;
+        }
+
+        /// Too large a transition.
+        if (has_transition
+            && std::abs(transition.from - transition.to) > 3600 * 3)
+        {
+            /*std::cerr << "Skipping " << timezone_name << " " << tz_time
+                << " because of unsupported timezone transition from " << transition.from << " to " << transition.to
+                << " (it is too large)\n";*/
+            continue;
+        }
+
+        EXPECT_EQ(tz_time.year(), lut.toYear(expected_time_t));
+        EXPECT_EQ(tz_time.month(), lut.toMonth(expected_time_t));
+        EXPECT_EQ(tz_time.day(), lut.toDayOfMonth(expected_time_t));
+        /// tm.tm_wday Sunday is 0, while for DateLUTImpl it is 7
+        EXPECT_EQ(static_cast<int>(cctz::get_weekday(tz_time)) + 1, lut.toDayOfWeek(expected_time_t));
+        EXPECT_EQ(cctz::get_yearday(tz_time), lut.toDayOfYear(expected_time_t));
+        EXPECT_EQ(tz_time.hour(), lut.toHour(expected_time_t));
+        EXPECT_EQ(tz_time.minute(), lut.toMinute(expected_time_t));
+        EXPECT_EQ(tz_time.second(), lut.toSecond(expected_time_t));
+
+        const auto time_string = cctz::format("%E4Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(expected_time_t), tz);
+        EXPECT_EQ(time_string, lut.timeToString(expected_time_t));
+
+        /// It makes sense to let the test execute all checks above to simplify debugging,
+        /// but once we've found a bad apple, no need to dig deeper.
+        if (countFailures(*test_info->result()).total >= max_failures_per_case)
+            break;
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_Year2010,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones()),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            // Values from tests/date_lut3.cpp
+            {YYYYMMDDToDay(20101031), YYYYMMDDToDay(20101101), 10 * 15 * 60},
+            {YYYYMMDDToDay(20100328), YYYYMMDDToDay(20100330), 10 * 15 * 60}
+        }))
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_Year1970_WHOLE,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones(false)),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            // Values from tests/date_lut3.cpp
+            {YYYYMMDDToDay(19700101), YYYYMMDDToDay(19701231), 10 * 3191 /*53m 11s*/},
+        }))
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_Year2010_WHOLE,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones(false)),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            // Values from tests/date_lut3.cpp
+            {YYYYMMDDToDay(20100101), YYYYMMDDToDay(20101231), 10 * 3191 /*53m 11s*/},
+        }))
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_Year2020_WHOLE,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones()),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            // Values from tests/date_lut3.cpp
+            {YYYYMMDDToDay(20200101), YYYYMMDDToDay(20201231), 10 * 3191 /*53m 11s*/},
+        }))
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_PreEpoch,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones(false)),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            {YYYYMMDDToDay(19500101), YYYYMMDDToDay(19600101), 10 * 15 * 60},
+            {YYYYMMDDToDay(19300101), YYYYMMDDToDay(19350101), 10 * 11 * 15 * 60}
+        }))
+);
+
+INSTANTIATE_TEST_SUITE_P(AllTimezones_Year1970,
+    DateLUTWithTimeZoneAndTimeRange,
+    ::testing::Combine(
+        ::testing::ValuesIn(allTimezones(false)),
+        ::testing::ValuesIn(std::initializer_list<TimeRangeParam>{
+            {YYYYMMDDToDay(19700101), YYYYMMDDToDay(19700201), 10 * 15 * 60},
+            {YYYYMMDDToDay(19700101), YYYYMMDDToDay(19701231), 10 * 11 * 13 * 17}
+// //           11 was chosen as a number which can't divide product of 2-combinations of (7, 24, 60),
+// //           to reduce the likelihood of hitting the same hour/minute/second values for different days.
+// //           + 12 is just to make sure that the last day is covered fully.
+// {0, 0 + 11 * 3600 * 24 + 12, 11}, + })) +); + diff --git a/base/common/tests/gtest_find_symbols.cpp b/src/Common/tests/gtest_find_symbols.cpp similarity index 100% rename from base/common/tests/gtest_find_symbols.cpp rename to src/Common/tests/gtest_find_symbols.cpp diff --git a/src/Common/tests/gtest_global_context.h b/src/Common/tests/gtest_global_context.h index b6529f09b46..30ebf1dbca9 100644 --- a/src/Common/tests/gtest_global_context.h +++ b/src/Common/tests/gtest_global_context.h @@ -5,14 +5,14 @@ struct ContextHolder { DB::SharedContextHolder shared_context; - DB::Context context; + DB::ContextPtr context; ContextHolder() : shared_context(DB::Context::createShared()) , context(DB::Context::createGlobal(shared_context.get())) { - context.makeGlobalContext(); - context.setPath("./"); + context->makeGlobalContext(); + context->setPath("./"); } ContextHolder(ContextHolder &&) = default; diff --git a/src/Common/tests/gtest_json_test.cpp b/src/Common/tests/gtest_json_test.cpp new file mode 100644 index 00000000000..726fb836030 --- /dev/null +++ b/src/Common/tests/gtest_json_test.cpp @@ -0,0 +1,656 @@ +#include +#include +#include +#include +#include + +#include +#include + + +enum class ResultType +{ + Return, + Throw +}; + +struct GetStringTestRecord +{ + std::string_view input; + ResultType result_type; + std::string_view result; +}; + +TEST(JSONSuite, SimpleTest) +{ + using namespace std::literals; + + std::vector test_data = + { + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Вафельница Vitek WX-1102 FL")"sv, ResultType::Return, "Вафельница Vitek WX-1102 FL"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("184509")"sv, ResultType::Return, "184509"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("Все для детей/Детская техника/Vitek")"sv, ResultType::Return, "Все для детей/Детская техника/Vitek"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("В наличии")"sv, ResultType::Return, "В наличии"sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("2390.00")"sv, ResultType::Return, "2390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("Карточка")"sv, ResultType::Return, "Карточка"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("detail")"sv, ResultType::Return, "detail"sv }, + { R"("actionField")"sv, ResultType::Return, "actionField"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("http://www.techport.ru/q/?t=вафельница&sort=price&sdim=asc")"sv, ResultType::Return, "http://www.techport.ru/q/?t=вафельница&sort=price&sdim=asc"sv }, + { R"("action")"sv, ResultType::Return, "action"sv }, + { R"("detail")"sv, ResultType::Return, "detail"sv }, + { R"("products")"sv, ResultType::Return, "products"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Вафельница Vitek WX-1102 FL")"sv, ResultType::Return, "Вафельница Vitek WX-1102 FL"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("184509")"sv, ResultType::Return, "184509"sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("2390.00")"sv, ResultType::Return, "2390.00"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("Vitek")"sv, ResultType::Return, "Vitek"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("Все для детей/Детская техника/Vitek")"sv, ResultType::Return, "Все для детей/Детская техника/Vitek"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("В наличии")"sv, ResultType::Return, "В наличии"sv }, + { 
R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("isAuthorized")"sv, ResultType::Return, "isAuthorized"sv }, + { R"("isSubscriber")"sv, ResultType::Return, "isSubscriber"sv }, + { R"("postType")"sv, ResultType::Return, "postType"sv }, + { R"("Новости")"sv, ResultType::Return, "Новости"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("Электроплита GEFEST Брест ЭПНД 5140-01 0001")"sv, ResultType::Return, "Электроплита GEFEST Брест ЭПНД 5140-01 0001"sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("currencyCode")"sv, ResultType::Return, "currencyCode"sv }, + { R"("RUB")"sv, ResultType::Return, "RUB"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("trash_login")"sv, ResultType::Return, "trash_login"sv }, + { R"("novikoff")"sv, ResultType::Return, "novikoff"sv }, + { R"("trash_cat_link")"sv, ResultType::Return, "trash_cat_link"sv }, + { R"("progs")"sv, ResultType::Return, "progs"sv }, + { R"("trash_parent_link")"sv, ResultType::Return, "trash_parent_link"sv }, + { R"("content")"sv, ResultType::Return, "content"sv }, + { R"("trash_posted_parent")"sv, ResultType::Return, "trash_posted_parent"sv }, + { R"("content.01.2016")"sv, ResultType::Return, "content.01.2016"sv }, + { R"("trash_posted_cat")"sv, ResultType::Return, "trash_posted_cat"sv }, + { R"("progs.01.2016")"sv, ResultType::Return, "progs.01.2016"sv }, + { R"("trash_virus_count")"sv, ResultType::Return, "trash_virus_count"sv }, + { R"("trash_is_android")"sv, ResultType::Return, "trash_is_android"sv }, + { R"("trash_is_wp8")"sv, ResultType::Return, "trash_is_wp8"sv }, + { R"("trash_is_ios")"sv, ResultType::Return, "trash_is_ios"sv }, + { R"("trash_posted")"sv, ResultType::Return, "trash_posted"sv }, + { R"("01.2016")"sv, ResultType::Return, "01.2016"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("merchantId")"sv, ResultType::Return, "merchantId"sv }, + { R"("13694_49246")"sv, ResultType::Return, "13694_49246"sv }, + { R"("cps-source")"sv, ResultType::Return, "cps-source"sv }, + { R"("wargaming")"sv, ResultType::Return, "wargaming"sv }, + { R"("cps_provider")"sv, ResultType::Return, "cps_provider"sv }, + { 
R"("default")"sv, ResultType::Return, "default"sv }, + { R"("errorReason")"sv, ResultType::Return, "errorReason"sv }, + { R"("no errors")"sv, ResultType::Return, "no errors"sv }, + { R"("scid")"sv, ResultType::Return, "scid"sv }, + { R"("isAuthPayment")"sv, ResultType::Return, "isAuthPayment"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("rubric")"sv, ResultType::Return, "rubric"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("rubric")"sv, ResultType::Return, "rubric"sv }, + { R"("Мир")"sv, ResultType::Return, "Мир"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("__ym")"sv, ResultType::Return, "__ym"sv }, + { R"("ecommerce")"sv, ResultType::Return, "ecommerce"sv }, + { R"("impressions")"sv, ResultType::Return, "impressions"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("863813")"sv, ResultType::Return, "863813"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Happy, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Happy, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("863839")"sv, ResultType::Return, "863839"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Pretty kitten, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Pretty kitten, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("863847")"sv, 
ResultType::Return, "863847"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Little tiger, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Little tiger, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911480")"sv, ResultType::Return, "911480"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Puppy, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Puppy, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911484")"sv, ResultType::Return, "911484"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Little bears, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Little bears, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911489")"sv, ResultType::Return, "911489"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Dolphin, возраст 2-4 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Dolphin, возраст 2-4 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, 
"category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911496")"sv, ResultType::Return, "911496"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Pretty, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Pretty, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911504")"sv, ResultType::Return, "911504"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Fairytale, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Fairytale, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911508")"sv, ResultType::Return, "911508"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Kittens, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Kittens, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, 
ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911512")"sv, ResultType::Return, "911512"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Sunshine, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Sunshine, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911516")"sv, ResultType::Return, "911516"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Dog in bag, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Dog in bag, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911520")"sv, ResultType::Return, "911520"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Cute puppy, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Cute puppy, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { 
R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911524")"sv, ResultType::Return, "911524"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Rabbit, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Rabbit, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("911528")"sv, ResultType::Return, "911528"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Футболка детская 3D Turtle, возраст 1-2 года, трикотаж")"sv, ResultType::Return, "Футболка детская 3D Turtle, возраст 1-2 года, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("390.00")"sv, ResultType::Return, "390.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("888616")"sv, ResultType::Return, "888616"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { "\"3Д Футболка мужская \\\"Collorista\\\" Светлое завтра р-р XL(52-54), 100% хлопок, трикотаж\""sv, ResultType::Return, "3Д Футболка мужская \"Collorista\" Светлое завтра р-р XL(52-54), 100% хлопок, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Одежда и обувь/Мужская одежда/Футболки/")"sv, ResultType::Return, "/Одежда и обувь/Мужская одежда/Футболки/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("406.60")"sv, ResultType::Return, "406.60"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { 
R"("913361")"sv, ResultType::Return, "913361"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("3Д Футболка детская World р-р 8-10, 100% хлопок, трикотаж")"sv, ResultType::Return, "3Д Футболка детская World р-р 8-10, 100% хлопок, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("470.00")"sv, ResultType::Return, "470.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("913364")"sv, ResultType::Return, "913364"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("3Д Футболка детская Force р-р 8-10, 100% хлопок, трикотаж")"sv, ResultType::Return, "3Д Футболка детская Force р-р 8-10, 100% хлопок, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("470.00")"sv, ResultType::Return, "470.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("913367")"sv, ResultType::Return, "913367"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("3Д Футболка детская Winter tale р-р 8-10, 100% хлопок, трикотаж")"sv, ResultType::Return, "3Д Футболка детская Winter tale р-р 8-10, 100% хлопок, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("470.00")"sv, ResultType::Return, "470.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("913385")"sv, ResultType::Return, "913385"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("3Д Футболка детская Moonshine р-р 8-10, 100% хлопок, трикотаж")"sv, ResultType::Return, "3Д Футболка детская Moonshine р-р 8-10, 100% хлопок, трикотаж"sv }, + { 
R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("470.00")"sv, ResultType::Return, "470.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("913391")"sv, ResultType::Return, "913391"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("3Д Футболка детская Shaman р-р 8-10, 100% хлопок, трикотаж")"sv, ResultType::Return, "3Д Футболка детская Shaman р-р 8-10, 100% хлопок, трикотаж"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("/Летние товары/Летний текстиль/")"sv, ResultType::Return, "/Летние товары/Летний текстиль/"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("")"sv, ResultType::Return, ""sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("470.00")"sv, ResultType::Return, "470.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("/retailrocket/")"sv, ResultType::Return, "/retailrocket/"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/")"sv, ResultType::Return, "/911488/futbolka-detskaya-3d-dolphin-vozrast-1-2-goda-trikotazh/"sv }, + { R"("usertype")"sv, ResultType::Return, "usertype"sv }, + { R"("visitor")"sv, ResultType::Return, "visitor"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("__ym")"sv, ResultType::Return, "__ym"sv }, + { R"("ecommerce")"sv, ResultType::Return, "ecommerce"sv }, + { R"("impressions")"sv, ResultType::Return, "impressions"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, 
+ { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("experiments")"sv, ResultType::Return, "experiments"sv }, + { R"("lang")"sv, ResultType::Return, "lang"sv }, + { R"("ru")"sv, ResultType::Return, "ru"sv }, + { R"("los_portal")"sv, ResultType::Return, "los_portal"sv }, + { R"("los_level")"sv, ResultType::Return, "los_level"sv }, + { R"("none")"sv, ResultType::Return, "none"sv }, + { R"("__ym")"sv, ResultType::Return, "__ym"sv }, + { R"("ecommerce")"sv, ResultType::Return, "ecommerce"sv }, + { R"("currencyCode")"sv, ResultType::Return, "currencyCode"sv }, + { R"("RUR")"sv, ResultType::Return, "RUR"sv }, + { R"("impressions")"sv, ResultType::Return, "impressions"sv }, + { R"("name")"sv, ResultType::Return, "name"sv }, + { R"("Чайник электрический Mystery MEK-1627, белый")"sv, ResultType::Return, "Чайник электрический Mystery MEK-1627, белый"sv }, + { R"("brand")"sv, ResultType::Return, "brand"sv }, + { R"("Mystery")"sv, ResultType::Return, "Mystery"sv }, + { R"("id")"sv, ResultType::Return, "id"sv }, + { R"("187180")"sv, ResultType::Return, "187180"sv }, + { R"("category")"sv, ResultType::Return, "category"sv }, + { R"("Мелкая бытовая техника/Мелкие кухонные приборы/Чайники электрические/Mystery")"sv, ResultType::Return, "Мелкая бытовая техника/Мелкие кухонные приборы/Чайники электрические/Mystery"sv }, + { R"("variant")"sv, ResultType::Return, "variant"sv }, + { R"("В наличии")"sv, ResultType::Return, "В наличии"sv }, + { R"("price")"sv, ResultType::Return, "price"sv }, + { R"("1630.00")"sv, ResultType::Return, "1630.00"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { R"("Карточка")"sv, ResultType::Return, "Карточка"sv }, + { R"("position")"sv, ResultType::Return, "position"sv }, + { R"("detail")"sv, ResultType::Return, "detail"sv }, + { R"("actionField")"sv, ResultType::Return, "actionField"sv }, + { R"("list")"sv, ResultType::Return, "list"sv }, + { "\0\""sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"/igrushki/konstruktory\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/1290414/komplekt-zhenskiy-dzhemper-plusbryuki-m-254-09-malina-plustemno-siniy-\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Творчество/Рисование/Инструменты и кра\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобильных аккумуляторов/Пуско-зарядные устр\xD0\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройств\xD0\0t"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобиль\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\0t"sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"/Хозтовары/Хранение вещей и организа\xD1\0t"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Хозтовары/Товары для стир\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"li\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/734859/samolet-radioupravlyaemyy-istrebitel-rabotaet-o\0k"sv, ResultType::Throw, 
"JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/kosmetika-i-parfyum/parfyumeriya/mu\0t"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/ko\0\x04"sv, ResultType::Throw, "JSON: begin >= end."sv }, + { ""sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"/stroitelstvo-i-remont/stroit\0t"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/av\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/s\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Строительство и ремонт/Строительный инструмент/Изм\0e"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/avto/soputstvuy\0l"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/str\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Отвертка 2 в 1 \\\"TUNDRA basic\\\" 5х75 мм (+,-) \0\xFF"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/avtoinstrumen\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Мелкая бытовая техника/Мелки\xD0\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Пряжа \\\"Бамбук стрейч\\0\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Карандаш чёрнографитны\xD0\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0l"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/1071547/karandash-chernografitnyy-volshebstvo-nv-kruglyy-d-7-2mm-dl-176mm-plast-tuba/\0e"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"ca\0e"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"ca\0e"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/1165424/chipbord-vyrubnoy-dlya-skrapbukinga-malyshi-mikki-maus-disney-bebi\0t"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/posuda/kuhonnye-prinadlezhnosti-i-i\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Канцтовары/Ежедневники и блокн\xD0\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/kanctovary/ezhednevniki-i-blok\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Стакан \xD0\0a"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Набор бумаги для скрапбукинга \\\"Мои первый годик\\\": Микки Маус, Дисней бэби, 12 листов 29.5 х 29.5 см, 160\0\x80"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"c\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + 
{ "\"Органайзер для хранения аксессуаров, \0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"quantity\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Сменный блок для тетрадей на кольцах А5, 160 листов клетка, офсет \xE2\x84\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Сувениры/Ф\xD0\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"\0\""sv, ResultType::Return, "\0"sv }, + { "\"\0\x04"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"va\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"ca\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"В \0\x04"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/letnie-tovary/z\0\x04"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Посудомоечная машина Ha\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Крупная бытов\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Полочная акустическая система Magnat Needl\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"brand\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"pos\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"c\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"var\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Телевизоры и видеотехника/Всё для домашних кинотеатр\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Флеш-диск Transcend JetFlash 620 8GB (TS8GJF62\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Табурет Мег\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"variant\0\x04"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Катал\xD0\0\""sv, ResultType::Return, "Катал\xD0\0"sv }, + { "\"К\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Полочная акустическая система Magnat Needl\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"brand\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"pos\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"c\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"17\0o"sv, ResultType::Throw, "JSON: incorrect syntax 
(expected end of string, found end of JSON)."sv }, + { "\"/igrushki/razvivayusc\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Ключница \\\"\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Игр\xD1\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Игрушки/Игрушки для девочек/Игровые модули дл\xD1\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Крупная бытовая техника/Стиральные машины/С фронт\xD0\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\0 "sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"Светодиодная лента SMD3528, 5 м. IP33, 60LED, зеленый, 4,8W/мет\xD1\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Сантехника/Мебель для ванных комнат/Стол\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\0o"sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"/igrushki/konstruktory\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/posuda/kuhonnye-prinadlezhnosti-i-instrumenty/kuhonnye-pr\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/1290414/komplekt-zhenskiy-dzhemper-plusbryuki-m-254-09-malina-plustemno-siniy-\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Творчество/Рисование/Инструменты и кра\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобильных аккумуляторов/Пуско-зарядные устр\xD0\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройств\xD0\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Строительство и ремонт/Силовая техника/Зарядные устройства для автомобиль\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\0 "sv, ResultType::Throw, "JSON: expected \", got \0"sv }, + { "\"/Хозтовары/Хранение вещей и организа\xD1\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Хозтовары/Товары для стир\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"li\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/igrushki/igrus\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/734859/samolet-radioupravlyaemyy-istrebitel-rabotaet-o\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/kosmetika-i-parfyum/parfyumeriya/mu\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/ko\0\0"sv, ResultType::Throw, "JSON: begin >= end."sv }, + { "\"/avto/avtomobilnyy\0\0"sv, ResultType::Throw, "JSON: begin >= end."sv }, + { "\"/stroitelstvo-i-remont/stroit\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, 
found end of JSON)."sv }, + { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/av\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/s\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Строительство и ремонт/Строительный инструмент/Изм\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/avto/soputstvuy\0\""sv, ResultType::Return, "/avto/soputstvuy\0"sv }, + { "\"/str\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Отвертка 2 в 1 \\\"TUNDRA basic\\\" 5х75 мм (+,-) \0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/stroitelstvo-i-remont/stroitelnyy-instrument/avtoinstrumen\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Чайник электрический Vitesse\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Мелкая бытовая техника/Мелки\xD0\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Пряжа \\\"Бамбук стрейч\\0о"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Карандаш чёрнографитны\xD0\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0\""sv, ResultType::Return, "/Творчество/Рукоделие, аппликации/Пряжа и шерсть для \xD0\0"sv }, + { "\"/1071547/karandash-chernografitnyy-volshebstvo-nv-kruglyy-d-7-2mm-dl-176mm-plast-tuba/\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"ca\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Подаро\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Средство для прочис\xD1\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"i\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/p\0\""sv, ResultType::Return, "/p\0"sv }, + { "\"/Сувениры/Магниты, н\xD0\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Дерев\xD0\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/prazdniki/svadba/svadebnaya-c\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Канцт\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Праздники/То\xD0\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"v\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Косметика \xD0\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Спорт и отдых/Настольные игры/Покер, руле\xD1\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"categ\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { 
"\"/retailr\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/retailrocket\0k"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Ежедневник недат А5 140л кл,ляссе,обл пв\0="sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/432809/ezhednevnik-organayzer-sredniy-s-remeshkom-na-knopke-v-oblozhke-kalkulyator-kalendar-do-\0\xD0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/1165424/chipbord-vyrubnoy-dlya-skrapbukinga-malyshi-mikki-maus-disney-bebi\0d"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/posuda/kuhonnye-prinadlezhnosti-i-i\0 "sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/Канцтовары/Ежедневники и блокн\xD0\0o"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"/kanctovary/ezhednevniki-i-blok\00"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Стакан \xD0\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"Набор бумаги для скрапбукинга \\\"Мои первый годик\\\": Микки Маус, Дисней бэби, 12 листов 29.5 х 29.5 см, 160\0\0"sv, ResultType::Throw, "JSON: incorrect syntax (expected end of string, found end of JSON)."sv }, + { "\"c\0\""sv, ResultType::Return, "c\0"sv }, + }; + + for (auto i : boost::irange(0, 1/*00000*/)) + { + static_cast(i); + + for (auto & r : test_data) + { + try + { + JSON j(r.input.data(), r.input.data() + r.input.size()); + + ASSERT_EQ(j.getString(), r.result); + ASSERT_EQ(r.result_type, ResultType::Return); + } + catch (const JSONException &) + { + ASSERT_EQ(r.result_type, ResultType::Throw); + } + } + } +} diff --git a/base/common/tests/local_date_time_comparison.cpp b/src/Common/tests/gtest_local_date_time_comparison.cpp similarity index 79% rename from base/common/tests/local_date_time_comparison.cpp rename to src/Common/tests/gtest_local_date_time_comparison.cpp index 5492ec31004..f75c2647100 100644 --- a/base/common/tests/local_date_time_comparison.cpp +++ b/src/Common/tests/gtest_local_date_time_comparison.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -16,14 +17,13 @@ void checkComparison() LocalDateTime a("2018-07-18 01:02:03"); LocalDateTime b("2018-07-18 01:02:03"); - if (a != b) - throw std::runtime_error("Test failed"); + EXPECT_EQ(a, b); + EXPECT_FALSE(a != b); } -int main(int, char **) +TEST(LocalDateTime, Comparison) { fillStackWithGarbage(); checkComparison(); - return 0; } diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp index 988a3e649ba..9cc77b88195 100644 --- a/src/Common/tests/gtest_pod_array.cpp +++ b/src/Common/tests/gtest_pod_array.cpp @@ -33,6 +33,19 @@ TEST(Common, PODArrayInsert) EXPECT_EQ(str, std::string(chars.data(), chars.size())); } +TEST(Common, PODArrayInsertFromItself) +{ + { + PaddedPODArray array { 1 }; + + for (size_t i = 0; i < 3; ++i) + array.insertFromItself(array.begin(), array.end()); + + PaddedPODArray expected {1,1,1,1,1,1,1,1}; + ASSERT_EQ(array,expected); + } +} + TEST(Common, PODPushBackRawMany) { PODArray chars; @@ -66,3 +79,83 @@ TEST(Common, PODNoOverallocation) EXPECT_EQ(capacities, (std::vector{4065, 8161, 16353, 32737, 65505, 131041, 262113, 524257, 1048545})); } 
+ +template +struct ItemWithSize +{ + char v[size] {}; +}; + +TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding) +{ + using ItemWith24Size = ItemWithSize<24>; + PaddedPODArray arr1_initially_empty; + + size_t items_to_insert_size = 120000; + + for (size_t test = 0; test < items_to_insert_size; ++test) + arr1_initially_empty.emplace_back(); + + EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size); + + PaddedPODArray arr2_initially_nonempty; + + for (size_t test = 0; test < items_to_insert_size; ++test) + arr2_initially_nonempty.emplace_back(); + + EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size); +} + +TEST(Common, PODErase) +{ + { + PaddedPODArray items {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + expected = {0,1,2,3,4,5,6,7,8,9}; + + items.erase(items.begin(), items.begin()); + EXPECT_EQ(items, expected); + + items.erase(items.end(), items.end()); + EXPECT_EQ(items, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + + for (size_t i = 0; i < 10; ++i) + actual.emplace_back(static_cast(i)); + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + + expected = {1,2,3,4,5,6,7,8,9}; + actual.erase(actual.begin()); + EXPECT_EQ(actual, expected); + } +} diff --git a/base/common/tests/gtest_strong_typedef.cpp b/src/Common/tests/gtest_strong_typedef.cpp similarity index 100% rename from base/common/tests/gtest_strong_typedef.cpp rename to src/Common/tests/gtest_strong_typedef.cpp diff --git a/src/Common/tests/parallel_aggregation.cpp b/src/Common/tests/parallel_aggregation.cpp index e39be163619..045a385671b 100644 --- a/src/Common/tests/parallel_aggregation.cpp +++ b/src/Common/tests/parallel_aggregation.cpp @@ -69,7 +69,7 @@ static void aggregate1(Map & map, Source::const_iterator begin, Source::const_it ++map[*it]; } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -122,7 +122,7 @@ static void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source: } } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/tests/parallel_aggregation2.cpp b/src/Common/tests/parallel_aggregation2.cpp index 1f8e598b122..e2ad36232de 100644 --- a/src/Common/tests/parallel_aggregation2.cpp +++ b/src/Common/tests/parallel_aggregation2.cpp @@ -62,7 +62,7 @@ struct AggregateIndependent } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -115,7 +115,7 @@ struct AggregateIndependentWithSequentialKeysOptimization } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -265,7 +265,7 @@ struct Creator void operator()(Value &) const {} }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" #endif @@ -275,7 +275,7 @@ struct Updater void operator()(Value & x) const { ++x; } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/ya.make b/src/Common/ya.make index 64dd628c457..d1ff04f8f0a 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -14,7 +14,7 @@ PEERDIR( clickhouse/base/common clickhouse/base/pcg-random clickhouse/base/widechar_width - contrib/libs/libcpuid/libcpuid + contrib/libs/libcpuid contrib/libs/openssl contrib/libs/poco/NetSSL_OpenSSL contrib/libs/re2 @@ -39,6 +39,7 @@ SRCS( DNSResolver.cpp Dwarf.cpp Elf.cpp + Epoll.cpp ErrorCodes.cpp Exception.cpp ExternalLoaderStatus.cpp diff --git a/src/Common/ya.make.in b/src/Common/ya.make.in index 210ecab6ef6..420384bb4a9 100644 --- a/src/Common/ya.make.in +++ b/src/Common/ya.make.in @@ -13,7 +13,7 @@ PEERDIR( clickhouse/base/common clickhouse/base/pcg-random clickhouse/base/widechar_width - contrib/libs/libcpuid/libcpuid + contrib/libs/libcpuid contrib/libs/openssl contrib/libs/poco/NetSSL_OpenSSL contrib/libs/re2 diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 4b4d33954a9..d511266d139 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -33,33 +33,27 @@ bool CachedCompressedReadBuffer::nextImpl() /// Let's check for the presence of a decompressed block in the cache, grab the ownership of this block, if it exists. UInt128 key = cache->hash(path, file_pos); - owned_cell = cache->get(key); - if (!owned_cell) + owned_cell = cache->getOrSet(key, [&]() { - /// If not, read it from the file. initInput(); file_in->seek(file_pos, SEEK_SET); - owned_cell = std::make_shared(); + auto cell = std::make_shared(); size_t size_decompressed; size_t size_compressed_without_checksum; - owned_cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false); + cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false); - if (owned_cell->compressed_size) + if (cell->compressed_size) { - owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); - owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes); - decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); - + cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); + cell->data.resize(size_decompressed + cell->additional_bytes); + decompressTo(cell->data.data(), size_decompressed, size_compressed_without_checksum); } - /// Put data into cache. 
- /// NOTE: Even if we don't read anything (compressed_size == 0) - /// because we can reuse this information and don't reopen file in future - cache->set(key, owned_cell); - } + return cell; + }); if (owned_cell->data.size() == 0) return false; diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp index 6a082164231..78241ec1b69 100644 --- a/src/Compression/CompressedReadBuffer.cpp +++ b/src/Compression/CompressedReadBuffer.cpp @@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { - decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); + decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; bytes += size_decompressed; } @@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 8f5b779e4bc..79757d6f151 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, } -void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) +static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs) { ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks); ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed); @@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, s ErrorCodes::CANNOT_DECOMPRESS); } } +} + +void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) +{ + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); codec->decompress(compressed_buffer, size_compressed_without_checksum, to); } +void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum) +{ + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); + + if (codec->isNone()) + { + /// Shortcut for NONE codec to avoid extra memcpy. + /// We doing it by changing the buffer `to` to point to existing uncompressed data. 
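+ /// Note that after this shortcut `to` aliases `compressed_buffer`, so the returned range stays valid
+ /// only until the next compressed block is read over `compressed_buffer`.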
+ + UInt8 header_size = ICompressionCodec::getHeaderSize(); + if (size_compressed_without_checksum < header_size) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", + size_compressed_without_checksum, static_cast(header_size)); + + to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); + } + else + codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); +} + + /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_) : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_) diff --git a/src/Compression/CompressedReadBufferBase.h b/src/Compression/CompressedReadBufferBase.h index 60b8847f639..c1e928039ef 100644 --- a/src/Compression/CompressedReadBufferBase.h +++ b/src/Compression/CompressedReadBufferBase.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -37,7 +38,12 @@ protected: /// Returns number of compressed bytes read. size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy); - void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + /// Decompress into memory pointed by `to` + void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + + /// This method can change location of `to` to avoid unnecessary copy if data is uncompressed. + /// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location. + void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum); public: /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index 54f360f417b..e14a1784b14 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -45,9 +45,15 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr(0) - , p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size)) + , p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, mmap_cache, buf_size)) , file_in(*p_file_in) { compressed_in = &file_in; @@ -108,7 +114,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. 
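/// Here the destination is the caller-provided buffer `to`, so the copying decompressTo() overload
/// is required; the zero-copy decompress(Buffer &, ...) overload is only usable when decompressing
/// into the reader's own working buffer (see nextImpl above).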
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { - decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); + decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; bytes += size_decompressed; } @@ -122,9 +128,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; diff --git a/src/Compression/CompressedReadBufferFromFile.h b/src/Compression/CompressedReadBufferFromFile.h index 166b2595ef9..2ee7021b35a 100644 --- a/src/Compression/CompressedReadBufferFromFile.h +++ b/src/Compression/CompressedReadBufferFromFile.h @@ -9,6 +9,8 @@ namespace DB { +class MMappedFileCache; + /// Unlike CompressedReadBuffer, it can do seek. class CompressedReadBufferFromFile : public CompressedReadBufferBase, public BufferWithOwnMemory @@ -31,7 +33,7 @@ public: CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_ = false); CompressedReadBufferFromFile( - const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, + const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, bool allow_different_codecs_ = false); void seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block); diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index aacf95b1950..0ff9797aeaf 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -107,9 +107,9 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(const ASTPtr if (column_type) { CompressionCodecPtr prev_codec; - IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & substream_type) + IDataType::StreamCallbackWithType callback = [&](const ISerialization::SubstreamPath & substream_path, const IDataType & substream_type) { - if (IDataType::isSpecialCompressionAllowed(substream_path)) + if (ISerialization::isSpecialCompressionAllowed(substream_path)) { result_codec = getImpl(codec_family_name, codec_arguments, &substream_type); @@ -121,8 +121,8 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(const ASTPtr } }; - IDataType::SubstreamPath stream_path; - column_type->enumerateStreams(callback, stream_path); + ISerialization::SubstreamPath stream_path; + column_type->enumerateStreams(column_type->getDefaultSerialization(), callback, stream_path); if (!result_codec) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. 
It's a bug", column_type->getName()); diff --git a/src/Compression/ICompressionCodec.cpp b/src/Compression/ICompressionCodec.cpp index dec2b633046..46a12e50828 100644 --- a/src/Compression/ICompressionCodec.cpp +++ b/src/Compression/ICompressionCodec.cpp @@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt8 header_size = getHeaderSize(); if (source_size < header_size) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size)); + throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast(header_size)); uint8_t our_method = getMethodByte(); uint8_t method = source[0]; diff --git a/src/Compression/LZ4_decompress_faster.h b/src/Compression/LZ4_decompress_faster.h index dd923279ebf..30a0d7acb22 100644 --- a/src/Compression/LZ4_decompress_faster.h +++ b/src/Compression/LZ4_decompress_faster.h @@ -95,7 +95,7 @@ struct PerformanceStatistics /// How to select method to run. /// -1 - automatically, based on statistics (default); - /// 0..3 - always choose specified method (for performance testing); + /// >= 0 - always choose specified method (for performance testing); /// -2 - choose methods in round robin fashion (for performance testing). ssize_t choose_method = -1; diff --git a/src/Compression/tests/cached_compressed_read_buffer.cpp b/src/Compression/tests/cached_compressed_read_buffer.cpp index ed198e36e46..94e8b356bd5 100644 --- a/src/Compression/tests/cached_compressed_read_buffer.cpp +++ b/src/Compression/tests/cached_compressed_read_buffer.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) path, [&]() { - return createReadBufferFromFileBase(path, 0, 0, 0); + return createReadBufferFromFileBase(path, 0, 0, 0, nullptr); }, &cache ); @@ -56,7 +56,7 @@ int main(int argc, char ** argv) path, [&]() { - return createReadBufferFromFileBase(path, 0, 0, 0); + return createReadBufferFromFileBase(path, 0, 0, 0, nullptr); }, &cache ); diff --git a/src/Compression/tests/gtest_compressionCodec.cpp b/src/Compression/tests/gtest_compressionCodec.cpp index e9470536ae8..0acd15e48c3 100644 --- a/src/Compression/tests/gtest_compressionCodec.cpp +++ b/src/Compression/tests/gtest_compressionCodec.cpp @@ -749,7 +749,7 @@ auto RandomishGenerator = [](auto i) { using T = decltype(i); double sin_value = sin(static_cast(i * i)) * i; - if (sin_value < std::numeric_limits::lowest() || sin_value > std::numeric_limits::max()) + if (sin_value < std::numeric_limits::lowest() || sin_value > static_cast(std::numeric_limits::max())) return T{}; return T(sin_value); }; diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp new file mode 100644 index 00000000000..ba1664b23da --- /dev/null +++ b/src/Coordination/Changelog.cpp @@ -0,0 +1,588 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CHECKSUM_DOESNT_MATCH; + extern const int CORRUPTED_DATA; + extern const int UNKNOWN_FORMAT_VERSION; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +constexpr auto DEFAULT_PREFIX = "changelog"; + +std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) +{ + std::filesystem::path path(prefix); + path /= 
std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_index) + "_" + std::to_string(name.to_log_index) + ".bin"); + return path; +} + +ChangelogFileDescription getChangelogFileDescription(const std::string & path_str) +{ + std::filesystem::path path(path_str); + std::string filename = path.stem(); + Strings filename_parts; + boost::split(filename_parts, filename, boost::is_any_of("_")); + if (filename_parts.size() < 3) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); + + ChangelogFileDescription result; + result.prefix = filename_parts[0]; + result.from_log_index = parse(filename_parts[1]); + result.to_log_index = parse(filename_parts[2]); + result.path = path_str; + return result; +} + +LogEntryPtr makeClone(const LogEntryPtr & entry) +{ + return cs_new(entry->get_term(), nuraft::buffer::clone(entry->get_buf()), entry->get_val_type()); +} + +Checksum computeRecordChecksum(const ChangelogRecord & record) +{ + SipHash hash; + hash.update(record.header.version); + hash.update(record.header.index); + hash.update(record.header.term); + hash.update(record.header.value_type); + hash.update(record.header.blob_size); + if (record.header.blob_size != 0) + hash.update(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + return hash.get64(); +} + +} + +class ChangelogWriter +{ +public: + ChangelogWriter(const std::string & filepath_, WriteMode mode, uint64_t start_index_) + : filepath(filepath_) + , plain_buf(filepath, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? -1 : (O_APPEND | O_CREAT | O_WRONLY)) + , start_index(start_index_) + {} + + + off_t appendRecord(ChangelogRecord && record) + { + off_t result = plain_buf.count(); + writeIntBinary(computeRecordChecksum(record), plain_buf); + + writeIntBinary(record.header.version, plain_buf); + writeIntBinary(record.header.index, plain_buf); + writeIntBinary(record.header.term, plain_buf); + writeIntBinary(record.header.value_type, plain_buf); + writeIntBinary(record.header.blob_size, plain_buf); + + if (record.header.blob_size != 0) + plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + + entries_written++; + + return result; + } + + void truncateToLength(off_t new_length) + { + plain_buf.next(); + plain_buf.truncate(new_length); + plain_buf.seek(new_length, SEEK_SET); + } + + void flush(bool force_fsync) + { + plain_buf.next(); + if (force_fsync) + plain_buf.sync(); + } + + uint64_t getEntriesWritten() const + { + return entries_written; + } + + void setEntriesWritten(uint64_t entries_written_) + { + entries_written = entries_written_; + } + + uint64_t getStartIndex() const + { + return start_index; + } + + void setStartIndex(uint64_t start_index_) + { + start_index = start_index_; + } + +private: + std::string filepath; + WriteBufferFromFile plain_buf; + uint64_t entries_written = 0; + uint64_t start_index; +}; + +struct ChangelogReadResult +{ + uint64_t entries_read; + uint64_t first_read_index; + off_t last_position; + bool error; +}; + +class ChangelogReader +{ +public: + explicit ChangelogReader(const std::string & filepath_) + : filepath(filepath_) + , read_buf(filepath) + {} + + ChangelogReadResult readChangelog(IndexToLogEntry & logs, uint64_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log) + { + uint64_t previous_index = 0; + ChangelogReadResult result{}; + try + { + while (!read_buf.eof()) + { + result.last_position = read_buf.count(); + Checksum record_checksum; + readIntBinary(record_checksum, read_buf); + + /// 
Initialization is required, otherwise checksums may fail + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + + if (record.header.version > CURRENT_CHANGELOG_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath); + + if (record.header.blob_size != 0) + { + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + record.blob = buffer; + } + else + record.blob = nullptr; + + if (previous_index != 0 && previous_index + 1 != record.header.index) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}, seems like some entries skipped", previous_index, record.header.index); + + previous_index = record.header.index; + + Checksum checksum = computeRecordChecksum(record); + if (checksum != record_checksum) + { + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, record.header.version, record.header.index, record.header.blob_size); + } + + if (logs.count(record.header.index) != 0) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); + + result.entries_read += 1; + + if (record.header.index < start_log_index) + { + continue; + } + + auto log_entry = nuraft::cs_new(record.header.term, record.blob, record.header.value_type); + if (result.first_read_index == 0) + result.first_read_index = record.header.index; + + logs.emplace(record.header.index, log_entry); + index_to_offset[record.header.index] = result.last_position; + if (result.entries_read % 50000 == 0) + LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.entries_read); + } + } + catch (const Exception & ex) + { + if (ex.code() == ErrorCodes::UNKNOWN_FORMAT_VERSION) + throw ex; + + result.error = true; + LOG_WARNING(log, "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); + } + catch (...) 
+ { + result.error = true; + tryLogCurrentException(log); + } + LOG_TRACE(log, "Totally read from changelog {} {} entries", filepath, result.entries_read); + + return result; + } + +private: + std::string filepath; + ReadBufferFromFile read_buf; +}; + +Changelog::Changelog( + const std::string & changelogs_dir_, + uint64_t rotate_interval_, + bool force_sync_, + Poco::Logger * log_) + : changelogs_dir(changelogs_dir_) + , rotate_interval(rotate_interval_) + , force_sync(force_sync_) + , log(log_) +{ + namespace fs = std::filesystem; + if (!fs::exists(changelogs_dir)) + fs::create_directories(changelogs_dir); + + for (const auto & p : fs::directory_iterator(changelogs_dir)) + { + auto file_description = getChangelogFileDescription(p.path()); + existing_changelogs[file_description.from_log_index] = file_description; + } +} + +void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep) +{ + uint64_t total_read = 0; + uint64_t entries_in_last = 0; + uint64_t incomplete_log_index = 0; + ChangelogReadResult result{}; + uint64_t first_read_index = 0; + + uint64_t start_to_read_from = last_commited_log_index; + if (start_to_read_from > logs_to_keep) + start_to_read_from -= logs_to_keep; + else + start_to_read_from = 1; + + bool started = false; + for (const auto & [changelog_start_index, changelog_description] : existing_changelogs) + { + entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1; + + if (changelog_description.to_log_index >= start_to_read_from) + { + if (!started) + { + if (changelog_description.from_log_index > last_commited_log_index && (changelog_description.from_log_index - last_commited_log_index) > 1) + { + LOG_ERROR(log, "Some records was lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index); + incomplete_log_index = changelog_start_index; + break; + } + else if (changelog_description.from_log_index > start_to_read_from) + LOG_WARNING(log, "Don't have required amount of reserved log records. Need to read from {}, smalled available log index on disk {}.", start_to_read_from, changelog_description.from_log_index); + } + + started = true; + + ChangelogReader reader(changelog_description.path); + result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log); + if (first_read_index == 0) + first_read_index = result.first_read_index; + + total_read += result.entries_read; + + /// May happen after truncate, crash or simply unfinished log + if (result.entries_read < entries_in_last) + { + incomplete_log_index = changelog_start_index; + break; + } + } + } + + if (first_read_index != 0) + start_index = first_read_index; + else + start_index = last_commited_log_index; + + if (incomplete_log_index != 0) + { + auto start_remove_from = existing_changelogs.begin(); + if (started) + start_remove_from = existing_changelogs.upper_bound(incomplete_log_index); + + /// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them. 
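+ /// Recovery strategy: nothing after the first incomplete or broken changelog file can be trusted,
+ /// so those files are removed here and the missing entries will be re-received from the leader;
+ /// if the last kept file itself has a broken tail, it is truncated below to the last record that was read successfully.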
+ for (auto itr = start_remove_from; itr != existing_changelogs.end();) + { + LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path); + std::filesystem::remove(itr->second.path); + itr = existing_changelogs.erase(itr); + } + + /// Continue to write into existing log + if (!existing_changelogs.empty()) + { + auto description = existing_changelogs.rbegin()->second; + LOG_TRACE(log, "Continue to write into {}", description.path); + current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_index); + current_writer->setEntriesWritten(result.entries_read); + + /// Truncate all broken entries from log + if (result.error) + { + LOG_WARNING(log, "Read finished with error, truncating all broken log entries"); + current_writer->truncateToLength(result.last_position); + } + } + } + + /// Start new log if we don't initialize writer from previous log + if (!current_writer) + rotate(start_index + total_read); +} + +void Changelog::rotate(uint64_t new_start_log_index) +{ + /// Flush previous log + flush(); + + ChangelogFileDescription new_description; + new_description.prefix = DEFAULT_PREFIX; + new_description.from_log_index = new_start_log_index; + new_description.to_log_index = new_start_log_index + rotate_interval - 1; + + new_description.path = formatChangelogPath(changelogs_dir, new_description); + + LOG_TRACE(log, "Starting new changelog {}", new_description.path); + existing_changelogs[new_start_log_index] = new_description; + current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_index); +} + +ChangelogRecord Changelog::buildRecord(uint64_t index, const LogEntryPtr & log_entry) +{ + ChangelogRecord record; + record.header.version = ChangelogVersion::V0; + record.header.index = index; + record.header.term = log_entry->get_term(); + record.header.value_type = log_entry->get_val_type(); + auto buffer = log_entry->get_buf_ptr(); + if (buffer) + record.header.blob_size = buffer->size(); + else + record.header.blob_size = 0; + + record.blob = buffer; + + return record; +} + +void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry) +{ + if (!current_writer) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); + + if (logs.empty()) + start_index = index; + + if (current_writer->getEntriesWritten() == rotate_interval) + rotate(index); + + auto offset = current_writer->appendRecord(buildRecord(index, log_entry)); + if (!index_to_start_pos.try_emplace(index, offset).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); + + logs[index] = makeClone(log_entry); +} + +void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) +{ + if (index_to_start_pos.count(index) == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); + + bool go_to_previous_file = index < current_writer->getStartIndex(); + if (go_to_previous_file) + { + auto index_changelog = existing_changelogs.lower_bound(index); + ChangelogFileDescription description; + if (index_changelog->first == index) + description = index_changelog->second; + else + description = std::prev(index_changelog)->second; + + current_writer = std::make_unique(description.path, WriteMode::Append, index_changelog->first); + current_writer->setEntriesWritten(description.to_log_index - description.from_log_index + 1); + } + + auto entries_written = 
current_writer->getEntriesWritten(); + current_writer->truncateToLength(index_to_start_pos[index]); + + if (go_to_previous_file) + { + /// Remove all subsequent files + auto to_remove_itr = existing_changelogs.upper_bound(index); + for (auto itr = to_remove_itr; itr != existing_changelogs.end();) + { + std::filesystem::remove(itr->second.path); + itr = existing_changelogs.erase(itr); + } + } + + /// Remove redundant logs from memory + for (uint64_t i = index; ; ++i) + { + auto log_itr = logs.find(i); + if (log_itr == logs.end()) + break; + logs.erase(log_itr); + index_to_start_pos.erase(i); + entries_written--; + } + + current_writer->setEntriesWritten(entries_written); + + appendEntry(index, log_entry); +} + +void Changelog::compact(uint64_t up_to_log_index) +{ + for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) + { + /// Remove all completely outdated changelog files + if (itr->second.to_log_index <= up_to_log_index) + { + + LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path); + std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; }); + std::filesystem::remove(itr->second.path); + itr = existing_changelogs.erase(itr); + } + else /// Files are ordered, so all subsequent should exist + break; + } + start_index = up_to_log_index + 1; + std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; }); +} + +LogEntryPtr Changelog::getLastEntry() const +{ + static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(uint64_t))); + + uint64_t next_index = getNextEntryIndex() - 1; + auto entry = logs.find(next_index); + if (entry == logs.end()) + return fake_entry; + + return entry->second; +} + +LogEntriesPtr Changelog::getLogEntriesBetween(uint64_t start, uint64_t end) +{ + LogEntriesPtr ret = nuraft::cs_new>>(); + + ret->resize(end - start); + uint64_t result_pos = 0; + for (uint64_t i = start; i < end; ++i) + { + (*ret)[result_pos] = entryAt(i); + result_pos++; + } + return ret; +} + +LogEntryPtr Changelog::entryAt(uint64_t index) +{ + nuraft::ptr src = nullptr; + auto entry = logs.find(index); + if (entry == logs.end()) + return nullptr; + + src = entry->second; + return src; +} + +nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, int32_t count) +{ + std::vector> returned_logs; + + uint64_t size_total = 0; + for (uint64_t i = index; i < index + count; ++i) + { + auto entry = logs.find(i); + if (entry == logs.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Don't have log entry {}", i); + + nuraft::ptr buf = entry->second->serialize(); + size_total += buf->size(); + returned_logs.push_back(buf); + } + + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + count * sizeof(int32_t) + size_total); + buf_out->pos(0); + buf_out->put(static_cast(count)); + + for (auto & entry : returned_logs) + { + nuraft::ptr & bb = entry; + buf_out->put(static_cast(bb->size())); + buf_out->put(*bb); + } + return buf_out; +} + +void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer) +{ + buffer.pos(0); + int num_logs = buffer.get_int(); + + for (int i = 0; i < num_logs; ++i) + { + uint64_t cur_index = index + i; + int buf_size = buffer.get_int(); + + nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); + buffer.get(buf_local); + + LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); + if (i == 0 && logs.count(cur_index)) + writeAt(cur_index, log_entry); + 
else + appendEntry(cur_index, log_entry); + } +} + +void Changelog::flush() +{ + if (current_writer) + current_writer->flush(force_sync); +} + +Changelog::~Changelog() +{ + try + { + flush(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + +} diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h new file mode 100644 index 00000000000..d669f56aded --- /dev/null +++ b/src/Coordination/Changelog.h @@ -0,0 +1,137 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include +#include + +namespace DB +{ + +using Checksum = UInt64; + +using LogEntryPtr = nuraft::ptr; +using LogEntries = std::vector; +using LogEntriesPtr = nuraft::ptr; +using BufferPtr = nuraft::ptr; + +using IndexToOffset = std::unordered_map; +using IndexToLogEntry = std::unordered_map; + +enum class ChangelogVersion : uint8_t +{ + V0 = 0, +}; + +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; + +struct ChangelogRecordHeader +{ + ChangelogVersion version = CURRENT_CHANGELOG_VERSION; + uint64_t index; /// entry log number + uint64_t term; + nuraft::log_val_type value_type; + uint64_t blob_size; +}; + +/// Changelog record on disk +struct ChangelogRecord +{ + ChangelogRecordHeader header; + nuraft::ptr blob; +}; + +/// changelog_fromindex_toindex.bin +/// [fromindex, toindex] <- inclusive +struct ChangelogFileDescription +{ + std::string prefix; + uint64_t from_log_index; + uint64_t to_log_index; + + std::string path; +}; + +class ChangelogWriter; + +/// Simplest changelog with files rotation. +/// No compression, no metadata, just entries with headers one by one +/// Able to read broken files/entries and discard them. +class Changelog +{ + +public: + Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, bool force_sync_, Poco::Logger * log_); + + /// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index + /// Truncate broken entries, remove files after broken entries. + void readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep); + + /// Add entry to log with index. + void appendEntry(uint64_t index, const LogEntryPtr & log_entry); + + /// Write entry at index and truncate all subsequent entries. + void writeAt(uint64_t index, const LogEntryPtr & log_entry); + + /// Remove log files with to_log_index <= up_to_log_index. 
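+ /// Also drops the corresponding in-memory entries and advances start_index to up_to_log_index + 1.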
+ void compact(uint64_t up_to_log_index); + + uint64_t getNextEntryIndex() const + { + return start_index + logs.size(); + } + + uint64_t getStartIndex() const + { + return start_index; + } + + /// Last entry in log, or fake entry with term 0 if log is empty + LogEntryPtr getLastEntry() const; + + /// Return log entries between [start, end) + LogEntriesPtr getLogEntriesBetween(uint64_t start_index, uint64_t end_index); + + /// Return entry at position index + LogEntryPtr entryAt(uint64_t index); + + /// Serialize entries from index into buffer + BufferPtr serializeEntriesToBuffer(uint64_t index, int32_t count); + + /// Apply entries from buffer overriding existing entries + void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer); + + /// Fsync latest log to disk and flush buffer + void flush(); + + uint64_t size() const + { + return logs.size(); + } + + /// Fsync log to disk + ~Changelog(); + +private: + /// Pack log_entry into changelog record + static ChangelogRecord buildRecord(uint64_t index, const LogEntryPtr & log_entry); + + /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] + void rotate(uint64_t new_start_log_index); + +private: + const std::string changelogs_dir; + const uint64_t rotate_interval; + const bool force_sync; + Poco::Logger * log; + + std::map existing_changelogs; + std::unique_ptr current_writer; + IndexToOffset index_to_start_pos; + IndexToLogEntry logs; + uint64_t start_index = 0; +}; + +} diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 441e1a5936f..7a98e3f200d 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -22,13 +22,19 @@ struct Settings; M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \ M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too often leader elections)", 0) \ M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Lower bound of election timer (avoid too often leader elections)", 0) \ - M(UInt64, reserved_log_items, 5000, "How many log items to store (don't remove during compaction)", 0) \ - M(UInt64, snapshot_distance, 5000, "How many log items we have to collect to write new snapshot", 0) \ - M(UInt64, max_stored_snapshots, 3, "How many snapshots we want to store", 0) \ + M(UInt64, reserved_log_items, 100000, "How many log items to store (don't remove during compaction)", 0) \ + M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \ M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ - M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) + M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. 
Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ + M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \ + M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \ + M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \ + M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \ + M(UInt64, max_requests_batch_size, 100, "Max size of batch in requests count before it will be sent to RAFT", 0) \ + M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ + M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/InMemoryLogStore.cpp b/src/Coordination/InMemoryLogStore.cpp index 101458891e7..b7bccdf588e 100644 --- a/src/Coordination/InMemoryLogStore.cpp +++ b/src/Coordination/InMemoryLogStore.cpp @@ -16,16 +16,16 @@ ptr makeClone(const ptr & entry) InMemoryLogStore::InMemoryLogStore() : start_idx(1) { - nuraft::ptr buf = nuraft::buffer::alloc(sizeof(size_t)); + nuraft::ptr buf = nuraft::buffer::alloc(sizeof(uint64_t)); logs[0] = nuraft::cs_new(0, buf); } -size_t InMemoryLogStore::start_index() const +uint64_t InMemoryLogStore::start_index() const { return start_idx; } -size_t InMemoryLogStore::next_slot() const +uint64_t InMemoryLogStore::next_slot() const { std::lock_guard l(logs_lock); // Exclude the dummy entry. @@ -34,7 +34,7 @@ size_t InMemoryLogStore::next_slot() const nuraft::ptr InMemoryLogStore::last_entry() const { - size_t next_idx = next_slot(); + uint64_t next_idx = next_slot(); std::lock_guard lock(logs_lock); auto entry = logs.find(next_idx - 1); if (entry == logs.end()) @@ -43,17 +43,17 @@ nuraft::ptr InMemoryLogStore::last_entry() const return makeClone(entry->second); } -size_t InMemoryLogStore::append(nuraft::ptr & entry) +uint64_t InMemoryLogStore::append(nuraft::ptr & entry) { ptr clone = makeClone(entry); std::lock_guard l(logs_lock); - size_t idx = start_idx + logs.size() - 1; + uint64_t idx = start_idx + logs.size() - 1; logs[idx] = clone; return idx; } -void InMemoryLogStore::write_at(size_t index, nuraft::ptr & entry) +void InMemoryLogStore::write_at(uint64_t index, nuraft::ptr & entry) { nuraft::ptr clone = makeClone(entry); @@ -65,19 +65,19 @@ void InMemoryLogStore::write_at(size_t index, nuraft::ptr & e logs[index] = clone; } -nuraft::ptr>> InMemoryLogStore::log_entries(size_t start, size_t end) +nuraft::ptr>> InMemoryLogStore::log_entries(uint64_t start, uint64_t end) { nuraft::ptr>> ret = nuraft::cs_new>>(); ret->resize(end - start); - size_t cc = 0; - for (size_t ii = start; ii < end; ++ii) + uint64_t cc = 0; + for (uint64_t i = start; i < end; ++i) { nuraft::ptr src = nullptr; { std::lock_guard l(logs_lock); - auto entry = logs.find(ii); + auto entry = logs.find(i); if (entry == logs.end()) { entry = logs.find(0); @@ -90,7 +90,7 @@ nuraft::ptr>> InMemoryLogStore::log_e return ret; } -nuraft::ptr InMemoryLogStore::entry_at(size_t index) +nuraft::ptr InMemoryLogStore::entry_at(uint64_t index) { nuraft::ptr src = nullptr; { @@ -103,9 +103,9 @@ nuraft::ptr InMemoryLogStore::entry_at(size_t index) return makeClone(src); } -size_t InMemoryLogStore::term_at(size_t index) +uint64_t InMemoryLogStore::term_at(uint64_t index) { - size_t term = 0; + uint64_t term = 0; { std::lock_guard l(logs_lock); auto 
entry = logs.find(index); @@ -116,12 +116,12 @@ size_t InMemoryLogStore::term_at(size_t index) return term; } -nuraft::ptr InMemoryLogStore::pack(size_t index, Int32 cnt) +nuraft::ptr InMemoryLogStore::pack(uint64_t index, Int32 cnt) { std::vector> returned_logs; - size_t size_total = 0; - for (size_t ii = index; ii < index + cnt; ++ii) + uint64_t uint64_total = 0; + for (uint64_t ii = index; ii < index + cnt; ++ii) { ptr le = nullptr; { @@ -130,11 +130,11 @@ nuraft::ptr InMemoryLogStore::pack(size_t index, Int32 cnt) } assert(le.get()); nuraft::ptr buf = le->serialize(); - size_total += buf->size(); + uint64_total += buf->size(); returned_logs.push_back(buf); } - nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + size_total); + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + uint64_total); buf_out->pos(0); buf_out->put(static_cast(cnt)); @@ -147,14 +147,14 @@ nuraft::ptr InMemoryLogStore::pack(size_t index, Int32 cnt) return buf_out; } -void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack) +void InMemoryLogStore::apply_pack(uint64_t index, nuraft::buffer & pack) { pack.pos(0); Int32 num_logs = pack.get_int(); - for (Int32 ii = 0; ii < num_logs; ++ii) + for (Int32 i = 0; i < num_logs; ++i) { - size_t cur_idx = index + ii; + uint64_t cur_idx = index + i; Int32 buf_size = pack.get_int(); nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); @@ -177,10 +177,10 @@ void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack) } } -bool InMemoryLogStore::compact(size_t last_log_index) +bool InMemoryLogStore::compact(uint64_t last_log_index) { std::lock_guard l(logs_lock); - for (size_t ii = start_idx; ii <= last_log_index; ++ii) + for (uint64_t ii = start_idx; ii <= last_log_index; ++ii) { auto entry = logs.find(ii); if (entry != logs.end()) diff --git a/src/Coordination/InMemoryLogStore.h b/src/Coordination/InMemoryLogStore.h index 425b056a81d..caa5e31698d 100644 --- a/src/Coordination/InMemoryLogStore.h +++ b/src/Coordination/InMemoryLogStore.h @@ -14,34 +14,34 @@ class InMemoryLogStore : public nuraft::log_store public: InMemoryLogStore(); - size_t start_index() const override; + uint64_t start_index() const override; - size_t next_slot() const override; + uint64_t next_slot() const override; nuraft::ptr last_entry() const override; - size_t append(nuraft::ptr & entry) override; + uint64_t append(nuraft::ptr & entry) override; - void write_at(size_t index, nuraft::ptr & entry) override; + void write_at(uint64_t index, nuraft::ptr & entry) override; - nuraft::ptr>> log_entries(size_t start, size_t end) override; + nuraft::ptr>> log_entries(uint64_t start, uint64_t end) override; - nuraft::ptr entry_at(size_t index) override; + nuraft::ptr entry_at(uint64_t index) override; - size_t term_at(size_t index) override; + uint64_t term_at(uint64_t index) override; - nuraft::ptr pack(size_t index, Int32 cnt) override; + nuraft::ptr pack(uint64_t index, Int32 cnt) override; - void apply_pack(size_t index, nuraft::buffer & pack) override; + void apply_pack(uint64_t index, nuraft::buffer & pack) override; - bool compact(size_t last_log_index) override; + bool compact(uint64_t last_log_index) override; bool flush() override { return true; } private: - std::map> logs; + std::map> logs; mutable std::mutex logs_lock; - std::atomic start_idx; + std::atomic start_idx; }; } diff --git a/src/Coordination/KeeperLogStore.cpp b/src/Coordination/KeeperLogStore.cpp new file mode 100644 index 00000000000..3896cf9b6fd --- 
/dev/null +++ b/src/Coordination/KeeperLogStore.cpp @@ -0,0 +1,114 @@ +#include + +namespace DB +{ + +KeeperLogStore::KeeperLogStore(const std::string & changelogs_path, uint64_t rotate_interval_, bool force_sync_) + : log(&Poco::Logger::get("KeeperLogStore")) + , changelog(changelogs_path, rotate_interval_, force_sync_, log) +{ + if (force_sync_) + LOG_INFO(log, "force_sync enabled"); + else + LOG_INFO(log, "force_sync disabled"); +} + +uint64_t KeeperLogStore::start_index() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getStartIndex(); +} + +void KeeperLogStore::init(uint64_t last_commited_log_index, uint64_t logs_to_keep) +{ + std::lock_guard lock(changelog_lock); + changelog.readChangelogAndInitWriter(last_commited_log_index, logs_to_keep); +} + +uint64_t KeeperLogStore::next_slot() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getNextEntryIndex(); +} + +nuraft::ptr KeeperLogStore::last_entry() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getLastEntry(); +} + +uint64_t KeeperLogStore::append(nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + uint64_t idx = changelog.getNextEntryIndex(); + changelog.appendEntry(idx, entry); + return idx; +} + + +void KeeperLogStore::write_at(uint64_t index, nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + changelog.writeAt(index, entry); +} + +nuraft::ptr>> KeeperLogStore::log_entries(uint64_t start, uint64_t end) +{ + std::lock_guard lock(changelog_lock); + return changelog.getLogEntriesBetween(start, end); +} + +nuraft::ptr KeeperLogStore::entry_at(uint64_t index) +{ + std::lock_guard lock(changelog_lock); + return changelog.entryAt(index); +} + +uint64_t KeeperLogStore::term_at(uint64_t index) +{ + std::lock_guard lock(changelog_lock); + auto entry = changelog.entryAt(index); + if (entry) + return entry->get_term(); + return 0; +} + +nuraft::ptr KeeperLogStore::pack(uint64_t index, int32_t cnt) +{ + std::lock_guard lock(changelog_lock); + return changelog.serializeEntriesToBuffer(index, cnt); +} + +bool KeeperLogStore::compact(uint64_t last_log_index) +{ + std::lock_guard lock(changelog_lock); + changelog.compact(last_log_index); + return true; +} + +bool KeeperLogStore::flush() +{ + std::lock_guard lock(changelog_lock); + changelog.flush(); + return true; +} + +void KeeperLogStore::apply_pack(uint64_t index, nuraft::buffer & pack) +{ + std::lock_guard lock(changelog_lock); + changelog.applyEntriesFromBuffer(index, pack); +} + +uint64_t KeeperLogStore::size() const +{ + std::lock_guard lock(changelog_lock); + return changelog.size(); +} + +void KeeperLogStore::end_of_append_batch(uint64_t /*start_index*/, uint64_t /*count*/) +{ + std::lock_guard lock(changelog_lock); + changelog.flush(); +} + +} diff --git a/src/Coordination/KeeperLogStore.h b/src/Coordination/KeeperLogStore.h new file mode 100644 index 00000000000..01315e6e879 --- /dev/null +++ b/src/Coordination/KeeperLogStore.h @@ -0,0 +1,53 @@ +#pragma once +#include // Y_IGNORE +#include +#include +#include +#include +#include + +namespace DB +{ + +class KeeperLogStore : public nuraft::log_store +{ +public: + KeeperLogStore(const std::string & changelogs_path, uint64_t rotate_interval_, bool force_sync_); + + void init(uint64_t last_commited_log_index, uint64_t logs_to_keep); + + uint64_t start_index() const override; + + uint64_t next_slot() const override; + + nuraft::ptr last_entry() const override; + + uint64_t append(nuraft::ptr & entry) override; + + void write_at(uint64_t index, nuraft::ptr 
& entry) override; + + nuraft::ptr>> log_entries(uint64_t start, uint64_t end) override; + + nuraft::ptr entry_at(uint64_t index) override; + + uint64_t term_at(uint64_t index) override; + + nuraft::ptr pack(uint64_t index, int32_t cnt) override; + + void apply_pack(uint64_t index, nuraft::buffer & pack) override; + + bool compact(uint64_t last_log_index) override; + + bool flush() override; + + uint64_t size() const; + + void end_of_append_batch(uint64_t start_index, uint64_t count) override; + +private: + mutable std::mutex changelog_lock; + Poco::Logger * log; + Changelog changelog; +}; + +} diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp new file mode 100644 index 00000000000..6557ad5504d --- /dev/null +++ b/src/Coordination/KeeperServer.cpp @@ -0,0 +1,330 @@ +#include + +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int RAFT_ERROR; + extern const int NO_ELEMENTS_IN_CONFIG; + extern const int SUPPORT_IS_DISABLED; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +#if USE_SSL +void setSSLParams(nuraft::asio_service::options & asio_opts) +{ + const Poco::Util::LayeredConfiguration & config = Poco::Util::Application::instance().config(); + String certificate_file_property = "openSSL.server.certificateFile"; + String private_key_file_property = "openSSL.server.privateKeyFile"; + String root_ca_file_property = "openSSL.server.caConfig"; + + if (!config.has(certificate_file_property)) + throw Exception("Server certificate file is not set.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + if (!config.has(private_key_file_property)) + throw Exception("Server private key file is not set.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + asio_opts.enable_ssl_ = true; + asio_opts.server_cert_file_ = config.getString(certificate_file_property); + asio_opts.server_key_file_ = config.getString(private_key_file_property); + + if (config.has(root_ca_file_property)) + asio_opts.root_cert_file_ = config.getString(root_ca_file_property); + + if (config.getBool("openSSL.server.loadDefaultCAFile", false)) + asio_opts.load_default_ca_file_ = true; + + if (config.getString("openSSL.server.verificationMode", "none") == "none") + asio_opts.skip_verification_ = true; +} +#endif + +} + +KeeperServer::KeeperServer( + int server_id_, + const CoordinationSettingsPtr & coordination_settings_, + const Poco::Util::AbstractConfiguration & config, + ResponsesQueue & responses_queue_, + SnapshotsQueue & snapshots_queue_) + : server_id(server_id_) + , coordination_settings(coordination_settings_) + , state_machine(nuraft::cs_new( + responses_queue_, snapshots_queue_, + config.getString("keeper_server.snapshot_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/snapshots"), + coordination_settings)) + , state_manager(nuraft::cs_new(server_id, "keeper_server", config, coordination_settings)) + , log(&Poco::Logger::get("KeeperServer")) +{ + if (coordination_settings->quorum_reads) + LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); +} + +void KeeperServer::startup() +{ + state_machine->init(); + + state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); + + bool single_server = state_manager->getTotalServers() == 1; + + nuraft::raft_params params; + if (single_server) + { + /// Don't make sense in single server mode + 
params.heart_beat_interval_ = 0; + params.election_timeout_lower_bound_ = 0; + params.election_timeout_upper_bound_ = 0; + } + else + { + params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); + params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); + params.election_timeout_upper_bound_ = coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(); + } + + params.reserved_log_items_ = coordination_settings->reserved_log_items; + params.snapshot_distance_ = coordination_settings->snapshot_distance; + params.stale_log_gap_ = coordination_settings->stale_log_gap; + params.fresh_log_gap_ = coordination_settings->fresh_log_gap; + params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds(); + params.auto_forwarding_ = coordination_settings->auto_forwarding; + params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; + + params.return_method_ = nuraft::raft_params::async_handler; + + nuraft::asio_service::options asio_opts{}; + if (state_manager->isSecure()) + { +#if USE_SSL + setSSLParams(asio_opts); +#else + throw Exception{"SSL support for NuRaft is disabled because ClickHouse was built without SSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + } + + launchRaftServer(params, asio_opts); + + if (!raft_instance) + throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); +} + +void KeeperServer::launchRaftServer( + const nuraft::raft_params & params, + const nuraft::asio_service::options & asio_opts) +{ + nuraft::raft_server::init_options init_options; + + init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower(); + init_options.start_server_in_constructor_ = false; + init_options.raft_callback_ = [this] (nuraft::cb_func::Type type, nuraft::cb_func::Param * param) + { + return callbackFunc(type, param); + }; + + nuraft::ptr logger = nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level); + asio_service = nuraft::cs_new(asio_opts, logger); + asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger); + + if (!asio_listener) + return; + + nuraft::ptr scheduler = asio_service; + nuraft::ptr rpc_cli_factory = asio_service; + + nuraft::ptr casted_state_manager = state_manager; + nuraft::ptr casted_state_machine = state_machine; + + /// raft_server creates unique_ptr from it + nuraft::context * ctx = new nuraft::context( + casted_state_manager, casted_state_machine, + asio_listener, logger, rpc_cli_factory, scheduler, params); + + raft_instance = nuraft::cs_new(ctx, init_options); + + raft_instance->start_server(init_options.skip_initial_election_timeout_); + asio_listener->listen(raft_instance); +} + +void KeeperServer::shutdownRaftServer() +{ + size_t timeout = coordination_settings->shutdown_timeout.totalSeconds(); + + if (!raft_instance) + { + LOG_INFO(log, "RAFT doesn't start, shutdown not required"); + return; + } + + raft_instance->shutdown(); + raft_instance.reset(); + + if (asio_listener) + { + asio_listener->stop(); + asio_listener->shutdown(); + } + + if (asio_service) + { + asio_service->stop(); + size_t count = 0; + while (asio_service->get_active_workers() != 0 && count < timeout * 100) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + count++; + } + } + + if (asio_service->get_active_workers() != 0) + LOG_WARNING(log, "Failed to shutdown RAFT server in {} seconds", timeout); +} + 
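// --- Illustration (hedged; not part of the change above) -----------------------
// A minimal sketch of how the KeeperServer API introduced in this file is meant
// to be driven by its caller. It only uses members declared in KeeperServer.h;
// the default-constructed settings object, the snapshot-queue capacity and the
// helper function name are assumptions made for this example.
#include <Poco/Util/AbstractConfiguration.h>
#include <Coordination/CoordinationSettings.h>
#include <Coordination/KeeperServer.h>

void keeperServerLifecycleSketch(const Poco::Util::AbstractConfiguration & config)
{
    auto settings = std::make_shared<CoordinationSettings>();   /// assumed default-constructible
    ResponsesQueue responses_queue;                             /// declared in KeeperStateMachine.h
    SnapshotsQueue snapshots_queue{1};                          /// bounded queue; capacity 1 is an assumption

    KeeperServer server(/* server_id */ 1, settings, config, responses_queue, snapshots_queue);

    server.startup();    /// Loads snapshots and the changelog, then launches the NuRaft instance.
    server.waitInit();   /// Blocks until callbackFunc() reports the node as initialized.

    if (server.isLeaderAlive())
    {
        /// Write requests are replicated through RAFT:
        ///     server.putRequestBatch(requests_for_sessions);
        /// Read requests may be served locally (when quorum reads are disabled):
        ///     server.putLocalReadRequest(request_for_session);
    }

    server.shutdown();   /// Flushes the log store and stops the RAFT server.
}
// --------------------------------------------------------------------------------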
+ +void KeeperServer::shutdown() +{ + state_machine->shutdownStorage(); + state_manager->flushLogStore(); + shutdownRaftServer(); +} + +namespace +{ + +nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) +{ + DB::WriteBufferFromNuraftBuffer buf; + DB::writeIntBinary(session_id, buf); + request->write(buf); + return buf.getBuffer(); +} + +} + + +void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & request_for_session) +{ + if (!request_for_session.request->isReadRequest()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot process non-read request locally"); + + state_machine->processReadRequest(request_for_session); +} + +RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions) +{ + + std::vector> entries; + for (const auto & [session_id, request] : requests_for_sessions) + entries.push_back(getZooKeeperLogEntry(session_id, request)); + + { + std::lock_guard lock(append_entries_mutex); + return raft_instance->append_entries(entries); + } +} + +bool KeeperServer::isLeader() const +{ + return raft_instance->is_leader(); +} + +bool KeeperServer::isLeaderAlive() const +{ + return raft_instance->is_leader_alive(); +} + +nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param) +{ + if (initialized_flag) + return nuraft::cb_func::ReturnCode::Ok; + + size_t last_commited = state_machine->last_commit_index(); + size_t next_index = state_manager->getLogStore()->next_slot(); + bool commited_store = false; + if (next_index < last_commited || next_index - last_commited <= 1) + commited_store = true; + + auto set_initialized = [this] () + { + std::unique_lock lock(initialized_mutex); + initialized_flag = true; + initialized_cv.notify_all(); + }; + + switch (type) + { + case nuraft::cb_func::BecomeLeader: + { + /// We become leader and store is empty or we already committed it + if (commited_store || initial_batch_committed) + set_initialized(); + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::BecomeFollower: + case nuraft::cb_func::GotAppendEntryReqFromLeader: + { + if (param->leaderId != -1) + { + auto leader_index = raft_instance->get_leader_committed_log_idx(); + auto our_index = raft_instance->get_committed_log_idx(); + /// This may happen when we start RAFT cluster from scratch. + /// Node first became leader, and after that some other node became leader. + /// BecameFresh for this node will not be called because it was already fresh + /// when it was leader. + if (leader_index < our_index + coordination_settings->fresh_log_gap) + set_initialized(); + } + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::BecomeFresh: + { + set_initialized(); /// We are fresh follower, ready to serve requests. + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::InitialBatchCommited: + { + if (param->myId == param->leaderId) /// We have committed our log store and we are leader, ready to serve requests. 
+ set_initialized(); + initial_batch_committed = true; + return nuraft::cb_func::ReturnCode::Ok; + } + default: /// ignore other events + return nuraft::cb_func::ReturnCode::Ok; + } +} + +void KeeperServer::waitInit() +{ + std::unique_lock lock(initialized_mutex); + int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); + if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); })) + throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); +} + +std::unordered_set KeeperServer::getDeadSessions() +{ + return state_machine->getDeadSessions(); +} + +} diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h new file mode 100644 index 00000000000..11900ebb213 --- /dev/null +++ b/src/Coordination/KeeperServer.h @@ -0,0 +1,79 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +using RaftAppendResult = nuraft::ptr>>; + +class KeeperServer +{ +private: + const int server_id; + + CoordinationSettingsPtr coordination_settings; + + nuraft::ptr state_machine; + + nuraft::ptr state_manager; + + nuraft::ptr raft_instance; + nuraft::ptr asio_service; + nuraft::ptr asio_listener; + + std::mutex append_entries_mutex; + + std::mutex initialized_mutex; + std::atomic initialized_flag = false; + std::condition_variable initialized_cv; + std::atomic initial_batch_committed = false; + + Poco::Logger * log; + + nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); + + /// Almost copy-paste from nuraft::launcher, but with separated server init and start + /// Allows to avoid race conditions. + void launchRaftServer( + const nuraft::raft_params & params, + const nuraft::asio_service::options & asio_opts); + + void shutdownRaftServer(); + + +public: + KeeperServer( + int server_id_, + const CoordinationSettingsPtr & coordination_settings_, + const Poco::Util::AbstractConfiguration & config, + ResponsesQueue & responses_queue_, + SnapshotsQueue & snapshots_queue_); + + void startup(); + + void putLocalReadRequest(const KeeperStorage::RequestForSession & request); + + RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests); + + std::unordered_set getDeadSessions(); + + bool isLeader() const; + + bool isLeaderAlive() const; + + void waitInit(); + + void shutdown(); + + int getServerID() const { return server_id; } +}; + +} diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp new file mode 100644 index 00000000000..3cfa07fb542 --- /dev/null +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -0,0 +1,366 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_FORMAT_VERSION; + extern const int UNKNOWN_SNAPSHOT; + extern const int LOGICAL_ERROR; +} + +namespace +{ + uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path) + { + std::filesystem::path path(snapshot_path); + std::string filename = path.stem(); + Strings name_parts; + splitInto<'_'>(name_parts, filename); + return parse(name_parts[1]); + } + + std::string getSnapshotFileName(uint64_t up_to_log_idx) + { + return std::string{"snapshot_"} + std::to_string(up_to_log_idx) + ".bin"; + } + + std::string getBaseName(const String & path) + { + size_t basename_start = path.rfind('/'); + return 
std::string{&path[basename_start + 1], path.length() - basename_start - 1}; + } + + String parentPath(const String & path) + { + auto rslash_pos = path.rfind('/'); + if (rslash_pos > 0) + return path.substr(0, rslash_pos); + return "/"; + } + + void writeNode(const KeeperStorage::Node & node, WriteBuffer & out) + { + writeBinary(node.data, out); + + /// Serialize ACL + writeBinary(node.acls.size(), out); + for (const auto & acl : node.acls) + { + writeBinary(acl.permissions, out); + writeBinary(acl.scheme, out); + writeBinary(acl.id, out); + } + + writeBinary(node.is_sequental, out); + /// Serialize stat + writeBinary(node.stat.czxid, out); + writeBinary(node.stat.mzxid, out); + writeBinary(node.stat.ctime, out); + writeBinary(node.stat.mtime, out); + writeBinary(node.stat.version, out); + writeBinary(node.stat.cversion, out); + writeBinary(node.stat.aversion, out); + writeBinary(node.stat.ephemeralOwner, out); + writeBinary(node.stat.dataLength, out); + writeBinary(node.stat.numChildren, out); + writeBinary(node.stat.pzxid, out); + + writeBinary(node.seq_num, out); + } + + void readNode(KeeperStorage::Node & node, ReadBuffer & in) + { + readBinary(node.data, in); + + /// Deserialize ACL + size_t acls_size; + readBinary(acls_size, in); + for (size_t i = 0; i < acls_size; ++i) + { + Coordination::ACL acl; + readBinary(acl.permissions, in); + readBinary(acl.scheme, in); + readBinary(acl.id, in); + node.acls.push_back(acl); + } + readBinary(node.is_sequental, in); + + /// Deserialize stat + readBinary(node.stat.czxid, in); + readBinary(node.stat.mzxid, in); + readBinary(node.stat.ctime, in); + readBinary(node.stat.mtime, in); + readBinary(node.stat.version, in); + readBinary(node.stat.cversion, in); + readBinary(node.stat.aversion, in); + readBinary(node.stat.ephemeralOwner, in); + readBinary(node.stat.dataLength, in); + readBinary(node.stat.numChildren, in); + readBinary(node.stat.pzxid, in); + readBinary(node.seq_num, in); + } + + void serializeSnapshotMetadata(const SnapshotMetadataPtr & snapshot_meta, WriteBuffer & out) + { + auto buffer = snapshot_meta->serialize(); + writeVarUInt(buffer->size(), out); + out.write(reinterpret_cast(buffer->data_begin()), buffer->size()); + } + + SnapshotMetadataPtr deserializeSnapshotMetadata(ReadBuffer & in) + { + size_t data_size; + readVarUInt(data_size, in); + auto buffer = nuraft::buffer::alloc(data_size); + in.readStrict(reinterpret_cast(buffer->data_begin()), data_size); + buffer->pos(0); + return SnapshotMetadata::deserialize(*buffer); + } +} + + +void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out) +{ + writeBinary(static_cast(snapshot.version), out); + serializeSnapshotMetadata(snapshot.snapshot_meta, out); + writeBinary(snapshot.session_id, out); + writeBinary(snapshot.snapshot_container_size, out); + size_t counter = 0; + for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++it, ++counter) + { + const auto & path = it->key; + const auto & node = it->value; + if (static_cast(node.stat.mzxid) > snapshot.snapshot_meta->get_last_log_idx()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx()); + + writeBinary(path, out); + writeNode(node, out); + } + + size_t size = snapshot.session_and_timeout.size(); + writeBinary(size, out); + for (const auto & [session_id, timeout] : snapshot.session_and_timeout) + { + writeBinary(session_id, out); + writeBinary(timeout, 
out); + } +} + +SnapshotMetadataPtr KeeperStorageSnapshot::deserialize(KeeperStorage & storage, ReadBuffer & in) +{ + uint8_t version; + readBinary(version, in); + if (static_cast(version) > SnapshotVersion::V0) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported snapshot version {}", version); + + SnapshotMetadataPtr result = deserializeSnapshotMetadata(in); + int64_t session_id; + readBinary(session_id, in); + storage.zxid = result->get_last_log_idx(); + storage.session_id_counter = session_id; + + size_t snapshot_container_size; + readBinary(snapshot_container_size, in); + + size_t current_size = 0; + while (current_size < snapshot_container_size) + { + std::string path; + readBinary(path, in); + KeeperStorage::Node node; + readNode(node, in); + storage.container.insertOrReplace(path, node); + if (node.stat.ephemeralOwner != 0) + storage.ephemerals[node.stat.ephemeralOwner].insert(path); + + current_size++; + } + + for (const auto & itr : storage.container) + { + if (itr.key != "/") + { + auto parent_path = parentPath(itr.key); + storage.container.updateValue(parent_path, [&path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); }); + } + } + + size_t active_sessions_size; + readBinary(active_sessions_size, in); + + size_t current_session_size = 0; + while (current_session_size < active_sessions_size) + { + int64_t active_session_id, timeout; + readBinary(active_session_id, in); + readBinary(timeout, in); + storage.addSessionID(active_session_id, timeout); + current_session_size++; + } + + return result; +} + +KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t up_to_log_idx_) + : storage(storage_) + , snapshot_meta(std::make_shared(up_to_log_idx_, 0, std::make_shared())) + , session_id(storage->session_id_counter) +{ + storage->enableSnapshotMode(); + snapshot_container_size = storage->container.snapshotSize(); + begin = storage->getSnapshotIteratorBegin(); + session_and_timeout = storage->getActiveSessions(); +} + +KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_) + : storage(storage_) + , snapshot_meta(snapshot_meta_) + , session_id(storage->session_id_counter) +{ + storage->enableSnapshotMode(); + snapshot_container_size = storage->container.snapshotSize(); + begin = storage->getSnapshotIteratorBegin(); + session_and_timeout = storage->getActiveSessions(); +} + +KeeperStorageSnapshot::~KeeperStorageSnapshot() +{ + storage->disableSnapshotMode(); +} + +KeeperSnapshotManager::KeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_) + : snapshots_path(snapshots_path_) + , snapshots_to_keep(snapshots_to_keep_) + , storage_tick_time(storage_tick_time_) +{ + namespace fs = std::filesystem; + + if (!fs::exists(snapshots_path)) + fs::create_directories(snapshots_path); + + for (const auto & p : fs::directory_iterator(snapshots_path)) + { + if (startsWith(p.path(), "tmp_")) /// Unfinished tmp files + { + std::filesystem::remove(p); + continue; + } + size_t snapshot_up_to = getSnapshotPathUpToLogIdx(p.path()); + existing_snapshots[snapshot_up_to] = p.path(); + } + + removeOutdatedSnapshotsIfNeeded(); +} + + +std::string KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx) +{ + ReadBufferFromNuraftBuffer reader(buffer); + + auto snapshot_file_name = getSnapshotFileName(up_to_log_idx); + auto tmp_snapshot_file_name = "tmp_" + snapshot_file_name; + 
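+    /// Write-then-rename: the snapshot is first written and synced under a "tmp_"
+    /// name and only renamed to its final name afterwards, so a crash cannot leave
+    /// a half-written file under a valid snapshot name (leftover "tmp_" files are
+    /// removed in the constructor above).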
std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name; + std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name; + + WriteBufferFromFile plain_buf(tmp_snapshot_path); + copyData(reader, plain_buf); + plain_buf.sync(); + + std::filesystem::rename(tmp_snapshot_path, new_snapshot_path); + + existing_snapshots.emplace(up_to_log_idx, new_snapshot_path); + removeOutdatedSnapshotsIfNeeded(); + + return new_snapshot_path; +} + +nuraft::ptr KeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk() +{ + while (!existing_snapshots.empty()) + { + auto latest_itr = existing_snapshots.rbegin(); + try + { + return deserializeSnapshotBufferFromDisk(latest_itr->first); + } + catch (const DB::Exception &) + { + std::filesystem::remove(latest_itr->second); + existing_snapshots.erase(latest_itr->first); + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + + return nullptr; +} + +nuraft::ptr KeeperSnapshotManager::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const +{ + const std::string & snapshot_path = existing_snapshots.at(up_to_log_idx); + WriteBufferFromNuraftBuffer writer; + ReadBufferFromFile reader(snapshot_path); + copyData(reader, writer); + return writer.getBuffer(); +} + +nuraft::ptr KeeperSnapshotManager::serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) +{ + WriteBufferFromNuraftBuffer writer; + CompressedWriteBuffer compressed_writer(writer); + + KeeperStorageSnapshot::serialize(snapshot, compressed_writer); + compressed_writer.finalize(); + return writer.getBuffer(); +} + +SnapshotMetaAndStorage KeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr buffer) const +{ + ReadBufferFromNuraftBuffer reader(buffer); + CompressedReadBuffer compressed_reader(reader); + auto storage = std::make_unique(storage_tick_time); + auto snapshot_metadata = KeeperStorageSnapshot::deserialize(*storage, compressed_reader); + return std::make_pair(snapshot_metadata, std::move(storage)); +} + +SnapshotMetaAndStorage KeeperSnapshotManager::restoreFromLatestSnapshot() +{ + if (existing_snapshots.empty()) + return {}; + + auto buffer = deserializeLatestSnapshotBufferFromDisk(); + if (!buffer) + return {}; + return deserializeSnapshotFromBuffer(buffer); +} + +void KeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded() +{ + while (existing_snapshots.size() > snapshots_to_keep) + removeSnapshot(existing_snapshots.begin()->first); +} + +void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx) +{ + auto itr = existing_snapshots.find(log_idx); + if (itr == existing_snapshots.end()) + throw Exception(ErrorCodes::UNKNOWN_SNAPSHOT, "Unknown snapshot with log index {}", log_idx); + std::filesystem::remove(itr->second); + existing_snapshots.erase(itr); + +} + + +} diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h new file mode 100644 index 00000000000..95d1ce831d4 --- /dev/null +++ b/src/Coordination/KeeperSnapshotManager.h @@ -0,0 +1,89 @@ +#pragma once +#include // Y_IGNORE +#include +#include +#include + +namespace DB +{ + +using SnapshotMetadata = nuraft::snapshot; +using SnapshotMetadataPtr = std::shared_ptr; + +enum SnapshotVersion : uint8_t +{ + V0 = 0, +}; + +struct KeeperStorageSnapshot +{ +public: + KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t up_to_log_idx_); + + KeeperStorageSnapshot(KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_); + ~KeeperStorageSnapshot(); + + static void serialize(const 
KeeperStorageSnapshot & snapshot, WriteBuffer & out); + + static SnapshotMetadataPtr deserialize(KeeperStorage & storage, ReadBuffer & in); + + KeeperStorage * storage; + + SnapshotVersion version = SnapshotVersion::V0; + SnapshotMetadataPtr snapshot_meta; + int64_t session_id; + size_t snapshot_container_size; + KeeperStorage::Container::const_iterator begin; + SessionAndTimeout session_and_timeout; +}; + +using KeeperStorageSnapshotPtr = std::shared_ptr; +using CreateSnapshotCallback = std::function; + + +using SnapshotMetaAndStorage = std::pair; + +class KeeperSnapshotManager +{ +public: + KeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500); + + SnapshotMetaAndStorage restoreFromLatestSnapshot(); + + static nuraft::ptr serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot); + std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx); + + SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr buffer) const; + + nuraft::ptr deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const; + nuraft::ptr deserializeLatestSnapshotBufferFromDisk(); + + void removeSnapshot(uint64_t log_idx); + + size_t totalSnapshots() const + { + return existing_snapshots.size(); + } + + size_t getLatestSnapshotIndex() const + { + if (!existing_snapshots.empty()) + return existing_snapshots.rbegin()->first; + return 0; + } + +private: + void removeOutdatedSnapshotsIfNeeded(); + const std::string snapshots_path; + const size_t snapshots_to_keep; + std::map existing_snapshots; + size_t storage_tick_time; +}; + +struct CreateSnapshotTask +{ + KeeperStorageSnapshotPtr snapshot; + CreateSnapshotCallback create_snapshot; +}; + +} diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp new file mode 100644 index 00000000000..df68b8df266 --- /dev/null +++ b/src/Coordination/KeeperStateMachine.cpp @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data) +{ + ReadBufferFromNuraftBuffer buffer(data); + KeeperStorage::RequestForSession request_for_session; + readIntBinary(request_for_session.session_id, buffer); + + int32_t length; + Coordination::read(length, buffer); + + int32_t xid; + Coordination::read(xid, buffer); + + Coordination::OpNum opnum; + + Coordination::read(opnum, buffer); + + request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum); + request_for_session.request->xid = xid; + request_for_session.request->readImpl(buffer); + return request_for_session; +} + +KeeperStateMachine::KeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_) + : coordination_settings(coordination_settings_) + , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMicroseconds()) + , responses_queue(responses_queue_) + , snapshots_queue(snapshots_queue_) + , last_committed_idx(0) + , log(&Poco::Logger::get("KeeperStateMachine")) +{ +} + +void KeeperStateMachine::init() +{ + /// Do everything without mutexes, no other threads exist. 
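+    /// Recovery walks the snapshots from the newest to the oldest: a snapshot that
+    /// fails to deserialize is removed from disk and the next one is tried. If no
+    /// snapshot can be loaded (or none exist), an empty storage is created below.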
+ LOG_DEBUG(log, "Totally have {} snapshots", snapshot_manager.totalSnapshots()); + bool loaded = false; + bool has_snapshots = snapshot_manager.totalSnapshots() != 0; + while (snapshot_manager.totalSnapshots() != 0) + { + uint64_t latest_log_index = snapshot_manager.getLatestSnapshotIndex(); + LOG_DEBUG(log, "Trying to load state machine from snapshot up to log index {}", latest_log_index); + + try + { + latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf); + last_committed_idx = latest_snapshot_meta->get_last_log_idx(); + loaded = true; + break; + } + catch (const DB::Exception & ex) + { + LOG_WARNING(log, "Failed to load from snapshot with index {}, with error {}, will remove it from disk", latest_log_index, ex.displayText()); + snapshot_manager.removeSnapshot(latest_log_index); + } + } + + if (has_snapshots) + { + if (loaded) + LOG_DEBUG(log, "Loaded snapshot with last committed log index {}", last_committed_idx); + else + LOG_WARNING(log, "All snapshots broken, last committed log index {}", last_committed_idx); + } + else + { + LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx); + } + + if (!storage) + storage = std::make_unique(coordination_settings->dead_session_check_period_ms.totalMilliseconds()); +} + +nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data) +{ + auto request_for_session = parseRequest(data); + if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) + { + const Coordination::ZooKeeperSessionIDRequest & session_id_request = dynamic_cast(*request_for_session.request); + int64_t session_id; + { + std::lock_guard lock(storage_lock); + session_id = storage->getSessionID(session_id_request.session_timeout_ms); + } + LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms); + + std::shared_ptr response = std::make_shared(); + response->internal_id = session_id_request.internal_id; + response->session_id = session_id; + response->server_id = session_id_request.server_id; + + KeeperStorage::ResponseForSession response_for_session; + response_for_session.session_id = -1; + response_for_session.response = response; + responses_queue.push(response_for_session); + } + else + { + KeeperStorage::ResponsesForSessions responses_for_sessions; + { + std::lock_guard lock(storage_lock); + responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx); + for (auto & response_for_session : responses_for_sessions) + responses_queue.push(response_for_session); + } + } + + last_committed_idx = log_idx; + return nullptr; +} + +bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) +{ + LOG_DEBUG(log, "Applying snapshot {}", s.get_last_log_idx()); + nuraft::ptr latest_snapshot_ptr; + { + std::lock_guard lock(snapshots_lock); + if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Required to apply snapshot with last log index {}, but our last log index is {}", + s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx()); + latest_snapshot_ptr = latest_snapshot_buf; + } + + { + std::lock_guard lock(storage_lock); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr); + } + last_committed_idx = s.get_last_log_idx(); + 
return true; +} + +nuraft::ptr KeeperStateMachine::last_snapshot() +{ + /// Just return the latest snapshot. + std::lock_guard lock(snapshots_lock); + return latest_snapshot_meta; +} + +void KeeperStateMachine::create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) +{ + LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx()); + + nuraft::ptr snp_buf = s.serialize(); + auto snapshot_meta_copy = nuraft::snapshot::deserialize(*snp_buf); + CreateSnapshotTask snapshot_task; + { + std::lock_guard lock(storage_lock); + snapshot_task.snapshot = std::make_shared(storage.get(), snapshot_meta_copy); + } + + snapshot_task.create_snapshot = [this, when_done] (KeeperStorageSnapshotPtr && snapshot) + { + nuraft::ptr exception(nullptr); + bool ret = true; + try + { + { + std::lock_guard lock(snapshots_lock); + auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot); + auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx()); + latest_snapshot_buf = snapshot_buf; + latest_snapshot_meta = snapshot->snapshot_meta; + + LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path); + } + + { + /// Must do it with lock (clearing elements from list) + std::lock_guard lock(storage_lock); + storage->clearGarbageAfterSnapshot(); + /// Destroy snapshot with lock + snapshot.reset(); + LOG_TRACE(log, "Cleared garbage after snapshot"); + + } + } + catch (...) + { + LOG_TRACE(log, "Exception happened during snapshot"); + tryLogCurrentException(log); + ret = false; + } + + when_done(ret, exception); + }; + + LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flash to disk", s.get_last_log_idx()); + snapshots_queue.push(std::move(snapshot_task)); +} + +void KeeperStateMachine::save_logical_snp_obj( + nuraft::snapshot & s, + uint64_t & obj_id, + nuraft::buffer & data, + bool /*is_first_obj*/, + bool /*is_last_obj*/) +{ + LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); + + nuraft::ptr cloned_buffer; + nuraft::ptr cloned_meta; + if (obj_id == 0) + { + std::lock_guard lock(storage_lock); + KeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx()); + cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot); + } + else + { + cloned_buffer = nuraft::buffer::clone(data); + } + + nuraft::ptr snp_buf = s.serialize(); + cloned_meta = nuraft::snapshot::deserialize(*snp_buf); + + try + { + std::lock_guard lock(snapshots_lock); + auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx()); + latest_snapshot_buf = cloned_buffer; + latest_snapshot_meta = cloned_meta; + LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path); + obj_id++; + } + catch (...) 
+ { + tryLogCurrentException(log); + } +} + +int KeeperStateMachine::read_logical_snp_obj( + nuraft::snapshot & s, + void* & /*user_snp_ctx*/, + uint64_t obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) +{ + + LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); + if (obj_id == 0) + { + data_out = nuraft::buffer::alloc(sizeof(int32_t)); + nuraft::buffer_serializer bs(data_out); + bs.put_i32(0); + is_last_obj = false; + } + else + { + std::lock_guard lock(snapshots_lock); + if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Required to apply snapshot with last log index {}, but our last log index is {}", + s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx()); + data_out = nuraft::buffer::clone(*latest_snapshot_buf); + is_last_obj = true; + } + return 0; +} + +void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSession & request_for_session) +{ + KeeperStorage::ResponsesForSessions responses; + { + std::lock_guard lock(storage_lock); + responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt); + } + for (const auto & response : responses) + responses_queue.push(response); +} + +std::unordered_set KeeperStateMachine::getDeadSessions() +{ + std::lock_guard lock(storage_lock); + return storage->getDeadSessions(); +} + +void KeeperStateMachine::shutdownStorage() +{ + std::lock_guard lock(storage_lock); + storage->finalize(); +} + +} diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h new file mode 100644 index 00000000000..8712adaf4b1 --- /dev/null +++ b/src/Coordination/KeeperStateMachine.h @@ -0,0 +1,89 @@ +#pragma once + +#include +#include // Y_IGNORE +#include +#include +#include +#include + +namespace DB +{ + +using ResponsesQueue = ThreadSafeQueue; +using SnapshotsQueue = ConcurrentBoundedQueue; + +class KeeperStateMachine : public nuraft::state_machine +{ +public: + KeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_); + + void init(); + + nuraft::ptr pre_commit(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } + + nuraft::ptr commit(const uint64_t log_idx, nuraft::buffer & data) override; + + void rollback(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override {} + + uint64_t last_commit_index() override { return last_committed_idx; } + + bool apply_snapshot(nuraft::snapshot & s) override; + + nuraft::ptr last_snapshot() override; + + void create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) override; + + void save_logical_snp_obj( + nuraft::snapshot & s, + uint64_t & obj_id, + nuraft::buffer & data, + bool is_first_obj, + bool is_last_obj) override; + + int read_logical_snp_obj( + nuraft::snapshot & s, + void* & user_snp_ctx, + uint64_t obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) override; + + KeeperStorage & getStorage() + { + return *storage; + } + + void processReadRequest(const KeeperStorage::RequestForSession & request_for_session); + + std::unordered_set getDeadSessions(); + + void shutdownStorage(); + +private: + + SnapshotMetadataPtr latest_snapshot_meta = nullptr; + nuraft::ptr latest_snapshot_buf = nullptr; + + CoordinationSettingsPtr coordination_settings; + + KeeperStoragePtr storage; + + KeeperSnapshotManager snapshot_manager; + + 
ResponsesQueue & responses_queue; + + SnapshotsQueue & snapshots_queue; + /// Mutex for snapshots + std::mutex snapshots_lock; + + /// Lock for storage + std::mutex storage_lock; + + /// Last committed Raft log number. + std::atomic last_committed_idx; + Poco::Logger * log; +}; + +} diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/KeeperStateManager.cpp similarity index 61% rename from src/Coordination/InMemoryStateManager.cpp rename to src/Coordination/KeeperStateManager.cpp index 69e93578cc1..e57ae7e7c19 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -9,36 +9,47 @@ namespace ErrorCodes extern const int RAFT_ERROR; } -InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port) +KeeperStateManager::KeeperStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) - , log_store(nuraft::cs_new()) + , secure(false) + , log_store(nuraft::cs_new(logs_path, 5000, false)) , cluster_config(nuraft::cs_new()) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); cluster_config->get_servers().push_back(peer_config); } -InMemoryStateManager::InMemoryStateManager( +KeeperStateManager::KeeperStateManager( int my_server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config) + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings) : my_server_id(my_server_id_) - , log_store(nuraft::cs_new()) + , secure(config.getBool(config_prefix + ".raft_configuration.secure", false)) + , log_store(nuraft::cs_new( + config.getString(config_prefix + ".log_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/logs"), + coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync)) , cluster_config(nuraft::cs_new()) { + Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_prefix, keys); + config.keys(config_prefix + ".raft_configuration", keys); + total_servers = keys.size(); for (const auto & server_key : keys) { - std::string full_prefix = config_prefix + "." + server_key; + if (!startsWith(server_key, "server")) + continue; + + std::string full_prefix = config_prefix + ".raft_configuration." 
+ server_key; int server_id = config.getInt(full_prefix + ".id"); std::string hostname = config.getString(full_prefix + ".hostname"); int port = config.getInt(full_prefix + ".port"); bool can_become_leader = config.getBool(full_prefix + ".can_become_leader", true); int32_t priority = config.getInt(full_prefix + ".priority", 1); bool start_as_follower = config.getBool(full_prefix + ".start_as_follower", false); + if (start_as_follower) start_as_follower_servers.insert(server_id); @@ -52,14 +63,25 @@ InMemoryStateManager::InMemoryStateManager( cluster_config->get_servers().push_back(peer_config); } + if (!my_server_config) - throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section"); + throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id); if (start_as_follower_servers.size() == cluster_config->get_servers().size()) throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without )"); } -void InMemoryStateManager::save_config(const nuraft::cluster_config & config) +void KeeperStateManager::loadLogStore(uint64_t last_commited_index, uint64_t logs_to_keep) +{ + log_store->init(last_commited_index, logs_to_keep); +} + +void KeeperStateManager::flushLogStore() +{ + log_store->flush(); +} + +void KeeperStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. @@ -67,7 +89,7 @@ void InMemoryStateManager::save_config(const nuraft::cluster_config & config) cluster_config = nuraft::cluster_config::deserialize(*buf); } -void InMemoryStateManager::save_state(const nuraft::srv_state & state) +void KeeperStateManager::save_state(const nuraft::srv_state & state) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. 
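For orientation, a hedged sketch of the two ways KeeperStateManager is constructed by the code above (illustration only, not part of the diff; the function name, host, port and paths are example values):

#include <Coordination/KeeperStateManager.h>

void stateManagerSketch(const Poco::Util::AbstractConfiguration & config,
                        const CoordinationSettingsPtr & settings)
{
    /// Single-node/test variant: explicit host, port and log path; the changelog is
    /// rotated every 5000 entries and force_sync is disabled (first constructor above).
    auto local = nuraft::cs_new<KeeperStateManager>(1, "localhost", 44444, "./logs");

    /// Config-driven variant: the server list and the `secure` flag are read from
    /// `keeper_server.raft_configuration`, the log path from `keeper_server.log_storage_path`,
    /// and the rotation interval and sync mode from the coordination settings.
    auto from_config = nuraft::cs_new<KeeperStateManager>(1, "keeper_server", config, settings);

    /// Before the RAFT server starts, the on-disk changelog is read back:
    from_config->loadLogStore(/* last_commited_index */ 1, settings->reserved_log_items);
}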
diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/KeeperStateManager.h similarity index 63% rename from src/Coordination/InMemoryStateManager.h rename to src/Coordination/KeeperStateManager.h index 2a5c2f00dba..cb5181760cb 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -2,25 +2,32 @@ #include #include -#include +#include +#include #include // Y_IGNORE #include namespace DB { -class InMemoryStateManager : public nuraft::state_mgr +class KeeperStateManager : public nuraft::state_mgr { public: - InMemoryStateManager( + KeeperStateManager( int server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config); + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings); - InMemoryStateManager( + KeeperStateManager( int server_id_, const std::string & host, - int port); + int port, + const std::string & logs_path); + + void loadLogStore(uint64_t last_commited_index, uint64_t logs_to_keep); + + void flushLogStore(); nuraft::ptr load_config() override { return cluster_config; } @@ -45,11 +52,22 @@ public: return start_as_follower_servers.count(my_server_id); } + bool isSecure() const + { + return secure; + } + + nuraft::ptr getLogStore() const { return log_store; } + + uint64_t getTotalServers() const { return total_servers; } + private: int my_server_id; int my_port; + bool secure; + uint64_t total_servers{0}; std::unordered_set start_as_follower_servers; - nuraft::ptr log_store; + nuraft::ptr log_store; nuraft::ptr my_server_config; nuraft::ptr cluster_config; nuraft::ptr server_state; diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp similarity index 61% rename from src/Coordination/NuKeeperStorage.cpp rename to src/Coordination/KeeperStorage.cpp index 631f975cddc..197cc4323cf 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -25,15 +25,15 @@ static String parentPath(const String & path) return "/"; } -static String baseName(const String & path) +static std::string getBaseName(const String & path) { - auto rslash_pos = path.rfind('/'); - return path.substr(rslash_pos + 1); + size_t basename_start = path.rfind('/'); + return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; } -static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) +static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type) { - NuKeeperStorage::ResponsesForSessions result; + KeeperStorage::ResponsesForSessions result; auto it = watches.find(path); if (it != watches.end()) { @@ -44,7 +44,7 @@ static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & p watch_response->type = event_type; watch_response->state = Coordination::State::CONNECTED; for (auto watcher_session : it->second) - result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_response}); + result.push_back(KeeperStorage::ResponseForSession{watcher_session, watch_response}); watches.erase(it); } @@ -60,47 +60,47 @@ static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & p watch_list_response->type = 
Coordination::Event::CHILD; watch_list_response->state = Coordination::State::CONNECTED; for (auto watcher_session : it->second) - result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_list_response}); + result.push_back(KeeperStorage::ResponseForSession{watcher_session, watch_list_response}); list_watches.erase(it); } return result; } -NuKeeperStorage::NuKeeperStorage(int64_t tick_time_ms) +KeeperStorage::KeeperStorage(int64_t tick_time_ms) : session_expiry_queue(tick_time_ms) { - container.emplace("/", Node()); + container.insert("/", Node()); } using Undo = std::function; -struct NuKeeperStorageRequest +struct KeeperStorageRequest { Coordination::ZooKeeperRequestPtr zk_request; - explicit NuKeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) + explicit KeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) : zk_request(zk_request_) {} - virtual std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const = 0; - virtual NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & /*watches*/, NuKeeperStorage::Watches & /*list_watches*/) const { return {}; } + virtual std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const = 0; + virtual KeeperStorage::ResponsesForSessions processWatches(KeeperStorage::Watches & /*watches*/, KeeperStorage::Watches & /*list_watches*/) const { return {}; } - virtual ~NuKeeperStorageRequest() = default; + virtual ~KeeperStorageRequest() = default; }; -struct NuKeeperStorageHeartbeatRequest final : public NuKeeperStorageRequest +struct KeeperStorageHeartbeatRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & /* container */, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override { return {zk_request->makeResponse(), {}}; } }; -struct NuKeeperStorageSyncRequest final : public NuKeeperStorageRequest +struct KeeperStorageSyncRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & /* container */, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override { auto response = zk_request->makeResponse(); dynamic_cast(response.get())->path = dynamic_cast(zk_request.get())->path; @@ -108,55 +108,56 @@ struct NuKeeperStorageSyncRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest +struct KeeperStorageCreateRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; + using KeeperStorageRequest::KeeperStorageRequest; - NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override + 
KeeperStorage::ResponsesForSessions processWatches(KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CREATED); } - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Undo undo; Coordination::ZooKeeperCreateResponse & response = dynamic_cast(*response_ptr); Coordination::ZooKeeperCreateRequest & request = dynamic_cast(*zk_request); - if (container.count(request.path)) + if (container.contains(request.path)) { response.error = Coordination::Error::ZNODEEXISTS; } else { - auto it = container.find(parentPath(request.path)); + auto parent_path = parentPath(request.path); + auto it = container.find(parent_path); if (it == container.end()) { response.error = Coordination::Error::ZNONODE; } - else if (it->second.is_ephemeral) + else if (it->value.stat.ephemeralOwner != 0) { response.error = Coordination::Error::ZNOCHILDRENFOREPHEMERALS; } else { - NuKeeperStorage::Node created_node; + KeeperStorage::Node created_node; created_node.stat.czxid = zxid; created_node.stat.mzxid = zxid; created_node.stat.ctime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); created_node.stat.mtime = created_node.stat.ctime; created_node.stat.numChildren = 0; created_node.stat.dataLength = request.data.length(); + created_node.stat.ephemeralOwner = request.is_ephemeral ? session_id : 0; created_node.data = request.data; - created_node.is_ephemeral = request.is_ephemeral; created_node.is_sequental = request.is_sequential; std::string path_created = request.path; if (request.is_sequential) { - auto seq_num = it->second.seq_num; + auto seq_num = it->value.seq_num; std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM seq_num_str.exceptions(std::ios::failbit); @@ -165,28 +166,36 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest path_created += seq_num_str.str(); } - /// Increment sequential number even if node is not sequential - ++it->second.seq_num; + auto child_path = getBaseName(path_created); + container.updateValue(parent_path, [child_path] (KeeperStorage::Node & parent) + { + /// Increment sequential number even if node is not sequential + ++parent.seq_num; + parent.children.insert(child_path); + ++parent.stat.cversion; + ++parent.stat.numChildren; + }); response.path_created = path_created; - container.emplace(path_created, std::move(created_node)); + container.insert(path_created, std::move(created_node)); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path, child_path] { container.erase(path_created); if (is_ephemeral) ephemerals[session_id].erase(path_created); - auto & undo_parent = container.at(parent_path); - --undo_parent.stat.cversion; - --undo_parent.stat.numChildren; - --undo_parent.seq_num; - }; - ++it->second.stat.cversion; - ++it->second.stat.numChildren; + container.updateValue(parent_path, [child_path] (KeeperStorage::Node & 
undo_parent) + { + --undo_parent.stat.cversion; + --undo_parent.stat.numChildren; + --undo_parent.seq_num; + undo_parent.children.erase(child_path); + }); + }; response.error = Coordination::Error::ZOK; } @@ -196,10 +205,10 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest +struct KeeperStorageGetRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperGetResponse & response = dynamic_cast(*response_ptr); @@ -212,8 +221,8 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest } else { - response.stat = it->second.stat; - response.data = it->second.data; + response.stat = it->value.stat; + response.data = it->value.data; response.error = Coordination::Error::ZOK; } @@ -221,10 +230,10 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest +struct KeeperStorageRemoveRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperRemoveResponse & response = dynamic_cast(*response_ptr); @@ -236,51 +245,65 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest { response.error = Coordination::Error::ZNONODE; } - else if (request.version != -1 && request.version != it->second.stat.version) + else if (request.version != -1 && request.version != it->value.stat.version) { response.error = Coordination::Error::ZBADVERSION; } - else if (it->second.stat.numChildren) + else if (it->value.stat.numChildren) { response.error = Coordination::Error::ZNOTEMPTY; } else { - auto prev_node = it->second; - if (prev_node.is_ephemeral) - ephemerals[session_id].erase(request.path); + auto prev_node = it->value; + if (prev_node.stat.ephemeralOwner != 0) + { + auto ephemerals_it = ephemerals.find(prev_node.stat.ephemeralOwner); + ephemerals_it->second.erase(request.path); + if (ephemerals_it->second.empty()) + ephemerals.erase(ephemerals_it); + } + + auto child_basename = getBaseName(it->key); + container.updateValue(parentPath(request.path), [&child_basename] (KeeperStorage::Node & parent) + { + --parent.stat.numChildren; + ++parent.stat.cversion; + parent.children.erase(child_basename); + }); - container.erase(it); - auto & parent = container.at(parentPath(request.path)); - --parent.stat.numChildren; - ++parent.stat.cversion; response.error = Coordination::Error::ZOK; - undo = [prev_node, &container, &ephemerals, session_id, path = request.path] - { - if 
(prev_node.is_ephemeral) - ephemerals[session_id].emplace(path); + container.erase(request.path); - container.emplace(path, prev_node); - auto & undo_parent = container.at(parentPath(path)); - ++undo_parent.stat.numChildren; - --undo_parent.stat.cversion; + undo = [prev_node, &container, &ephemerals, path = request.path, child_basename] + { + if (prev_node.stat.ephemeralOwner != 0) + ephemerals[prev_node.stat.ephemeralOwner].emplace(path); + + container.insert(path, prev_node); + container.updateValue(parentPath(path), [&child_basename] (KeeperStorage::Node & parent) + { + ++parent.stat.numChildren; + --parent.stat.cversion; + parent.children.insert(child_basename); + }); }; } return { response_ptr, undo }; } - NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override + KeeperStorage::ResponsesForSessions processWatches(KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::DELETED); } }; -struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest +struct KeeperStorageExistsRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperExistsResponse & response = dynamic_cast(*response_ptr); @@ -289,7 +312,7 @@ struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest auto it = container.find(request.path); if (it != container.end()) { - response.stat = it->second.stat; + response.stat = it->value.stat; response.error = Coordination::Error::ZOK; } else @@ -301,10 +324,10 @@ struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest +struct KeeperStorageSetRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperSetResponse & response = dynamic_cast(*response_ptr); @@ -316,24 +339,35 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest { response.error = Coordination::Error::ZNONODE; } - else if (request.version == -1 || request.version == it->second.stat.version) + else if (request.version == -1 || request.version == it->value.stat.version) { - auto prev_node = it->second; + auto prev_node = it->value; - it->second.data = request.data; - ++it->second.stat.version; - it->second.stat.mzxid = zxid; - it->second.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); - 
it->second.stat.dataLength = request.data.length(); - it->second.data = request.data; - ++container.at(parentPath(request.path)).stat.cversion; - response.stat = it->second.stat; + auto itr = container.updateValue(request.path, [zxid, request] (KeeperStorage::Node & value) + { + value.data = request.data; + value.stat.version++; + value.stat.mzxid = zxid; + value.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); + value.stat.dataLength = request.data.length(); + value.data = request.data; + }); + + container.updateValue(parentPath(request.path), [] (KeeperStorage::Node & parent) + { + parent.stat.cversion++; + }); + + response.stat = itr->value.stat; response.error = Coordination::Error::ZOK; undo = [prev_node, &container, path = request.path] { - container.at(path) = prev_node; - --container.at(parentPath(path)).stat.cversion; + container.updateValue(path, [&prev_node] (KeeperStorage::Node & value) { value = prev_node; }); + container.updateValue(parentPath(path), [] (KeeperStorage::Node & parent) + { + parent.stat.cversion--; + }); }; } else @@ -344,17 +378,16 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest return { response_ptr, undo }; } - NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override + KeeperStorage::ResponsesForSessions processWatches(KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED); } - }; -struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest +struct KeeperStorageListRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperListResponse & response = dynamic_cast(*response_ptr); @@ -370,19 +403,9 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - if (path_prefix.back() != '/') - path_prefix += '/'; + response.names.insert(response.names.end(), it->value.children.begin(), it->value.children.end()); - /// Fairly inefficient. 
- for (auto child_it = container.upper_bound(path_prefix); - child_it != container.end() && startsWith(child_it->first, path_prefix); - ++child_it) - { - if (parentPath(child_it->first) == request.path) - response.names.emplace_back(baseName(child_it->first)); - } - - response.stat = it->second.stat; + response.stat = it->value.stat; response.error = Coordination::Error::ZOK; } @@ -390,10 +413,10 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest +struct KeeperStorageCheckRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperCheckResponse & response = dynamic_cast(*response_ptr); @@ -403,7 +426,7 @@ struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest { response.error = Coordination::Error::ZNONODE; } - else if (request.version != -1 && request.version != it->second.stat.version) + else if (request.version != -1 && request.version != it->value.stat.version) { response.error = Coordination::Error::ZBADVERSION; } @@ -416,11 +439,11 @@ struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest +struct KeeperStorageMultiRequest final : public KeeperStorageRequest { - std::vector concrete_requests; - explicit NuKeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) - : NuKeeperStorageRequest(zk_request_) + std::vector concrete_requests; + explicit KeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) + : KeeperStorageRequest(zk_request_) { Coordination::ZooKeeperMultiRequest & request = dynamic_cast(*zk_request); concrete_requests.reserve(request.requests.size()); @@ -430,26 +453,26 @@ struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest auto sub_zk_request = std::dynamic_pointer_cast(sub_request); if (sub_zk_request->getOpNum() == Coordination::OpNum::Create) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Remove) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Set) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Check) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal command as part of multi ZooKeeper request {}", sub_zk_request->getOpNum()); } } - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override + std::pair 
process(KeeperStorage::Container & container, KeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperMultiResponse & response = dynamic_cast(*response_ptr); @@ -502,9 +525,9 @@ struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest } } - NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override + KeeperStorage::ResponsesForSessions processWatches(KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches) const override { - NuKeeperStorage::ResponsesForSessions result; + KeeperStorage::ResponsesForSessions result; for (const auto & generic_request : concrete_requests) { auto responses = generic_request->processWatches(watches, list_watches); @@ -514,16 +537,16 @@ struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest } }; -struct NuKeeperStorageCloseRequest final : public NuKeeperStorageRequest +struct KeeperStorageCloseRequest final : public KeeperStorageRequest { - using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container &, NuKeeperStorage::Ephemerals &, int64_t, int64_t) const override + using KeeperStorageRequest::KeeperStorageRequest; + std::pair process(KeeperStorage::Container &, KeeperStorage::Ephemerals &, int64_t, int64_t) const override { throw DB::Exception("Called process on close request", ErrorCodes::LOGICAL_ERROR); } }; -void NuKeeperStorage::finalize() +void KeeperStorage::finalize() { if (finalized) throw DB::Exception("Testkeeper storage already finalized", ErrorCodes::LOGICAL_ERROR); @@ -543,20 +566,20 @@ void NuKeeperStorage::finalize() } -class NuKeeperWrapperFactory final : private boost::noncopyable +class KeeperWrapperFactory final : private boost::noncopyable { public: - using Creator = std::function; + using Creator = std::function; using OpNumToRequest = std::unordered_map; - static NuKeeperWrapperFactory & instance() + static KeeperWrapperFactory & instance() { - static NuKeeperWrapperFactory factory; + static KeeperWrapperFactory factory; return factory; } - NuKeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const + KeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const { auto it = op_num_to_request.find(zk_request->getOpNum()); if (it == op_num_to_request.end()) @@ -573,37 +596,45 @@ public: private: OpNumToRequest op_num_to_request; - NuKeeperWrapperFactory(); + KeeperWrapperFactory(); }; template -void registerNuKeeperRequestWrapper(NuKeeperWrapperFactory & factory) +void registerKeeperRequestWrapper(KeeperWrapperFactory & factory) { factory.registerRequest(num, [] (const Coordination::ZooKeeperRequestPtr & zk_request) { return std::make_shared(zk_request); }); } -NuKeeperWrapperFactory::NuKeeperWrapperFactory() +KeeperWrapperFactory::KeeperWrapperFactory() { - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - //registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - registerNuKeeperRequestWrapper(*this); - 
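KeeperWrapperFactory above is an op-num to creator registry: every request type is registered once with a lambda that builds the concrete wrapper, and get() fails fast on an unknown op num. A compact sketch of the same idea (hypothetical Demo* names):

#include <functional>
#include <memory>
#include <stdexcept>
#include <unordered_map>

enum class DemoOpNum { Create, Remove, Exists };

struct DemoRequestWrapper { virtual ~DemoRequestWrapper() = default; };
struct DemoCreateWrapper final : DemoRequestWrapper {};

class DemoFactory
{
    using Creator = std::function<std::shared_ptr<DemoRequestWrapper>()>;
    std::unordered_map<DemoOpNum, Creator> creators;

public:
    void registerRequest(DemoOpNum op_num, Creator creator) { creators.emplace(op_num, std::move(creator)); }

    std::shared_ptr<DemoRequestWrapper> get(DemoOpNum op_num) const
    {
        auto it = creators.find(op_num);
        if (it == creators.end())
            throw std::runtime_error("Unknown operation type");    // unknown op num: fail fast
        return it->second();
    }
};

// Analogous to the registerKeeperRequestWrapper<OpNum, RequestWrapper>(*this) calls.
void registerAll(DemoFactory & factory)
{
    factory.registerRequest(DemoOpNum::Create, [] { return std::make_shared<DemoCreateWrapper>(); });
}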
registerNuKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + //registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); + registerKeeperRequestWrapper(*this); } -NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id) +KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id, std::optional new_last_zxid) { - NuKeeperStorage::ResponsesForSessions results; + KeeperStorage::ResponsesForSessions results; + if (new_last_zxid) + { + if (zxid >= *new_last_zxid) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got new ZXID {} smaller or equal than current {}. It's a bug", *new_last_zxid, zxid); + zxid = *new_last_zxid; + } + + session_expiry_queue.update(session_id, session_and_timeout[session_id]); if (zk_request->getOpNum() == Coordination::OpNum::Close) { auto it = ephemerals.find(session_id); @@ -612,6 +643,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor for (const auto & ephemeral_path : it->second) { container.erase(ephemeral_path); + container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (KeeperStorage::Node & parent) + { + --parent.stat.numChildren; + ++parent.stat.cversion; + parent.children.erase(getBaseName(ephemeral_path)); + }); + auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); results.insert(results.end(), responses.begin(), responses.end()); } @@ -629,8 +667,7 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor } else if (zk_request->getOpNum() == Coordination::OpNum::Heartbeat) { - session_expiry_queue.update(session_id, session_and_timeout[session_id]); - NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request); + KeeperStorageRequestPtr storage_request = KeeperWrapperFactory::instance().get(zk_request); auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id); response->xid = zk_request->xid; response->zxid = getZXID(); @@ -639,8 +676,7 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor } else { - - NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request); + KeeperStorageRequestPtr storage_request = KeeperWrapperFactory::instance().get(zk_request); auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id); if (zk_request->has_watch) @@ -677,7 +713,7 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor } -void NuKeeperStorage::clearDeadWatches(int64_t session_id) +void KeeperStorage::clearDeadWatches(int64_t session_id) { auto watches_it = sessions_and_watchers.find(session_id); if (watches_it != sessions_and_watchers.end()) diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/KeeperStorage.h similarity index 60% rename from src/Coordination/NuKeeperStorage.h rename to src/Coordination/KeeperStorage.h index 20ab1982b4e..585426a7441 100644 --- a/src/Coordination/NuKeeperStorage.h +++ 
b/src/Coordination/KeeperStorage.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -13,23 +14,27 @@ namespace DB { using namespace DB; -struct NuKeeperStorageRequest; -using NuKeeperStorageRequestPtr = std::shared_ptr; +struct KeeperStorageRequest; +using KeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; +using ChildrenSet = std::unordered_set; +using SessionAndTimeout = std::unordered_map; -class NuKeeperStorage +struct KeeperStorageSnapshot; + +class KeeperStorage { public: - int64_t session_id_counter{0}; + int64_t session_id_counter{1}; struct Node { String data; Coordination::ACLs acls{}; - bool is_ephemeral = false; bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; + ChildrenSet children{}; }; struct ResponseForSession @@ -48,10 +53,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; - using SessionAndTimeout = std::unordered_map; + using Container = SnapshotableHashTable; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionIDs = std::vector; using Watches = std::map; @@ -70,13 +74,13 @@ public: void clearDeadWatches(int64_t session_id); - int64_t getZXID() + int64_t getZXID() const { - return zxid++; + return zxid; } public: - NuKeeperStorage(int64_t tick_time_ms); + KeeperStorage(int64_t tick_time_ms); int64_t getSessionID(int64_t session_timeout_ms) { @@ -86,14 +90,47 @@ public: return result; } - ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); + void addSessionID(int64_t session_id, int64_t session_timeout_ms) + { + session_and_timeout.emplace(session_id, session_timeout_ms); + session_expiry_queue.update(session_id, session_timeout_ms); + } + + ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, std::optional new_last_zxid); void finalize(); + void enableSnapshotMode() + { + container.enableSnapshotMode(); + } + + void disableSnapshotMode() + { + container.disableSnapshotMode(); + } + + Container::const_iterator getSnapshotIteratorBegin() const + { + return container.begin(); + } + + void clearGarbageAfterSnapshot() + { + container.clearOutdatedNodes(); + } + + const SessionAndTimeout & getActiveSessions() const + { + return session_and_timeout; + } + std::unordered_set getDeadSessions() { return session_expiry_queue.getExpiredSessions(); } }; +using KeeperStoragePtr = std::unique_ptr; + } diff --git a/src/Coordination/KeeperStorageDispatcher.cpp b/src/Coordination/KeeperStorageDispatcher.cpp new file mode 100644 index 00000000000..fc79f5bee97 --- /dev/null +++ b/src/Coordination/KeeperStorageDispatcher.cpp @@ -0,0 +1,461 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + + extern const int LOGICAL_ERROR; + extern const int TIMEOUT_EXCEEDED; +} + +KeeperStorageDispatcher::KeeperStorageDispatcher() + : coordination_settings(std::make_shared()) + , log(&Poco::Logger::get("KeeperDispatcher")) +{ +} + + +void KeeperStorageDispatcher::requestThread() +{ + setThreadName("KeeperReqT"); + + /// Result of requests batch from previous iteration + RaftAppendResult prev_result = nullptr; + /// Requests from previous iteration. We store them to be able + /// to send errors to the client. 
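The snapshot hooks added to KeeperStorage above (enableSnapshotMode, getSnapshotIteratorBegin, clearGarbageAfterSnapshot) suggest a freeze / stream / clean-up sequence for the snapshot writer. The actual writer lives outside this hunk, so the sketch below is only an assumed illustration of that sequence, with made-up names and a plain std::map standing in for the snapshotable container:

#include <iostream>
#include <map>
#include <string>

struct DemoStorage
{
    std::map<std::string, std::string> container;
    bool snapshot_mode = false;

    void enableSnapshotMode() { snapshot_mode = true; }
    void disableSnapshotMode() { snapshot_mode = false; }
    void clearGarbageAfterSnapshot() { /* drop entries kept alive only for the snapshot */ }
    std::map<std::string, std::string>::const_iterator getSnapshotIteratorBegin() const { return container.begin(); }
};

void writeSnapshot(DemoStorage & storage, std::ostream & out)
{
    storage.enableSnapshotMode();                  // concurrent updates now keep old versions alive
    for (auto it = storage.getSnapshotIteratorBegin(); it != storage.container.cend(); ++it)
        out << it->first << ' ' << it->second << '\n';
    storage.disableSnapshotMode();
    storage.clearGarbageAfterSnapshot();           // reclaim the superseded entries
}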
+ KeeperStorage::RequestsForSessions prev_batch; + + while (!shutdown_called) + { + KeeperStorage::RequestForSession request; + + UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); + uint64_t max_batch_size = coordination_settings->max_requests_batch_size; + + /// The code below do a very simple thing: batch all write (quorum) requests into vector until + /// previous write batch is not finished or max_batch size achieved. The main complexity goes from + /// the ability to process read requests without quorum (from local state). So when we are collecting + /// requests into a batch we must check that the new request is not read request. Otherwise we have to + /// process all already accumulated write requests, wait them synchronously and only after that process + /// read request. So reads are some kind of "separator" for writes. + try + { + if (requests_queue->tryPop(request, max_wait)) + { + if (shutdown_called) + break; + + KeeperStorage::RequestsForSessions current_batch; + + bool has_read_request = false; + + /// If new request is not read request or we must to process it through quorum. + /// Otherwise we will process it locally. + if (coordination_settings->quorum_reads || !request.request->isReadRequest()) + { + current_batch.emplace_back(request); + + /// Waiting until previous append will be successful, or batch is big enough + /// has_result == false && get_result_code == OK means that our request still not processed. + /// Sometimes NuRaft set errorcode without setting result, so we check both here. + while (prev_result && (!prev_result->has_result() && prev_result->get_result_code() == nuraft::cmd_result_code::OK) && current_batch.size() <= max_batch_size) + { + /// Trying to get batch requests as fast as possible + if (requests_queue->tryPop(request, 1)) + { + /// Don't append read request into batch, we have to process them separately + if (!coordination_settings->quorum_reads && request.request->isReadRequest()) + { + has_read_request = true; + break; + } + else + { + + current_batch.emplace_back(request); + } + } + + if (shutdown_called) + break; + } + } + else + has_read_request = true; + + if (shutdown_called) + break; + + /// Forcefully process all previous pending requests + if (prev_result) + forceWaitAndProcessResult(prev_result, prev_batch); + + /// Process collected write requests batch + if (!current_batch.empty()) + { + auto result = server->putRequestBatch(current_batch); + + if (result) + { + if (has_read_request) /// If we will execute read request next, than we have to process result now + forceWaitAndProcessResult(result, current_batch); + } + else + { + addErrorResponses(current_batch, Coordination::Error::ZRUNTIMEINCONSISTENCY); + current_batch.clear(); + } + + prev_batch = current_batch; + prev_result = result; + } + + /// Read request always goes after write batch (last request) + if (has_read_request) + { + if (server->isLeaderAlive()) + server->putLocalReadRequest(request); + else + addErrorResponses({request}, Coordination::Error::ZRUNTIMEINCONSISTENCY); + } + } + } + catch (...) 
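The long comment above describes the batching policy in requestThread: write requests are appended to the current batch while the previous Raft append is still in flight, and a read request ends the batch because it may only be answered after those writes are committed. A condensed, single-threaded sketch of that policy (hypothetical names; the real loop also stops once the previous append completes or the batch size limit is reached):

#include <cstddef>
#include <deque>
#include <vector>

struct DemoRequest { bool is_read = false; };

std::vector<DemoRequest> collectWriteBatch(std::deque<DemoRequest> & queue, size_t max_batch_size, bool & has_read_request)
{
    std::vector<DemoRequest> batch;
    has_read_request = false;
    while (!queue.empty() && batch.size() <= max_batch_size)
    {
        DemoRequest request = queue.front();
        queue.pop_front();
        if (request.is_read)
        {
            has_read_request = true;   // the read is served from local state after the batch is flushed
            break;
        }
        batch.push_back(request);
    }
    return batch;
}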
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } +} + +void KeeperStorageDispatcher::responseThread() +{ + setThreadName("KeeperRspT"); + while (!shutdown_called) + { + KeeperStorage::ResponseForSession response_for_session; + + UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); + + if (responses_queue.tryPop(response_for_session, max_wait)) + { + if (shutdown_called) + break; + + try + { + setResponse(response_for_session.session_id, response_for_session.response); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + } +} + +void KeeperStorageDispatcher::snapshotThread() +{ + setThreadName("KeeperSnpT"); + while (!shutdown_called) + { + CreateSnapshotTask task; + snapshots_queue.pop(task); + + if (shutdown_called) + break; + + try + { + task.create_snapshot(std::move(task.snapshot)); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } +} + +void KeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) +{ + std::lock_guard lock(session_to_response_callback_mutex); + if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::SessionID) + { + const Coordination::ZooKeeperSessionIDResponse & session_id_resp = dynamic_cast(*response); + + /// Nobody waits for this session id + if (session_id_resp.server_id != server->getServerID() || !new_session_id_response_callback.count(session_id_resp.internal_id)) + return; + + auto callback = new_session_id_response_callback[session_id_resp.internal_id]; + callback(response); + new_session_id_response_callback.erase(session_id_resp.internal_id); + } + else + { + auto session_writer = session_to_response_callback.find(session_id); + if (session_writer == session_to_response_callback.end()) + return; + + session_writer->second(response); + + /// Session closed, no more writes + if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) + { + session_to_response_callback.erase(session_writer); + } + } +} + +bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id) +{ + { + std::lock_guard lock(session_to_response_callback_mutex); + if (session_to_response_callback.count(session_id) == 0) + return false; + } + + KeeperStorage::RequestForSession request_info; + request_info.request = request; + request_info.session_id = session_id; + + std::lock_guard lock(push_request_mutex); + /// Put close requests without timeouts + if (request->getOpNum() == Coordination::OpNum::Close) + requests_queue->push(std::move(request_info)); + else if (!requests_queue->tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds())) + throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); + return true; +} + +void KeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration & config) +{ + LOG_DEBUG(log, "Initializing storage dispatcher"); + int myid = config.getInt("keeper_server.server_id"); + + coordination_settings->loadFromConfig("keeper_server.coordination_settings", config); + requests_queue = std::make_unique(coordination_settings->max_requests_batch_size); + + request_thread = ThreadFromGlobalPool([this] { requestThread(); }); + responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); + + server = 
std::make_unique(myid, coordination_settings, config, responses_queue, snapshots_queue); + try + { + LOG_DEBUG(log, "Waiting server to initialize"); + server->startup(); + LOG_DEBUG(log, "Server initialized, waiting for quorum"); + + server->waitInit(); + LOG_DEBUG(log, "Quorum initialized"); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + + + session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); + + LOG_DEBUG(log, "Dispatcher initialized"); +} + +void KeeperStorageDispatcher::shutdown() +{ + try + { + { + std::lock_guard lock(push_request_mutex); + + if (shutdown_called) + return; + + LOG_DEBUG(log, "Shutting down storage dispatcher"); + shutdown_called = true; + + if (session_cleaner_thread.joinable()) + session_cleaner_thread.join(); + + /// FIXME not the best way to notify + requests_queue->push({}); + if (request_thread.joinable()) + request_thread.join(); + + responses_queue.push({}); + if (responses_thread.joinable()) + responses_thread.join(); + + snapshots_queue.push({}); + if (snapshot_thread.joinable()) + snapshot_thread.join(); + } + + if (server) + server->shutdown(); + + KeeperStorage::RequestForSession request_for_session; + while (requests_queue->tryPop(request_for_session)) + { + if (request_for_session.request) + { + auto response = request_for_session.request->makeResponse(); + response->error = Coordination::Error::ZSESSIONEXPIRED; + setResponse(request_for_session.session_id, response); + } + else + { + break; + } + } + session_to_response_callback.clear(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + LOG_DEBUG(log, "Dispatcher shut down"); +} + +KeeperStorageDispatcher::~KeeperStorageDispatcher() +{ + shutdown(); +} + +void KeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback) +{ + std::lock_guard lock(session_to_response_callback_mutex); + if (!session_to_response_callback.try_emplace(session_id, callback).second) + throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id); +} + +void KeeperStorageDispatcher::sessionCleanerTask() +{ + while (true) + { + if (shutdown_called) + return; + + try + { + if (isLeader()) + { + auto dead_sessions = server->getDeadSessions(); + for (int64_t dead_session : dead_sessions) + { + LOG_INFO(log, "Found dead session {}, will try to close it", dead_session); + Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + request->xid = Coordination::CLOSE_XID; + KeeperStorage::RequestForSession request_info; + request_info.request = request; + request_info.session_id = dead_session; + { + std::lock_guard lock(push_request_mutex); + requests_queue->push(std::move(request_info)); + } + finishSession(dead_session); + LOG_INFO(log, "Dead session close request pushed"); + } + } + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(coordination_settings->dead_session_check_period_ms.totalMilliseconds())); + } +} + +void KeeperStorageDispatcher::finishSession(int64_t session_id) +{ + std::lock_guard lock(session_to_response_callback_mutex); + auto session_it = session_to_response_callback.find(session_id); + if (session_it != session_to_response_callback.end()) + session_to_response_callback.erase(session_it); +} + +void KeeperStorageDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error) +{ + for (const auto & [session_id, request] : requests_for_sessions) + { + KeeperStorage::ResponsesForSessions responses; + auto response = request->makeResponse(); + response->xid = request->xid; + response->zxid = 0; + response->error = error; + responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response}); + } +} + +void KeeperStorageDispatcher::forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions) +{ + if (!result->has_result()) + result->get(); + + /// If we get some errors, than send them to clients + if (!result->get_accepted() || result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) + addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT); + else if (result->get_result_code() != nuraft::cmd_result_code::OK) + addErrorResponses(requests_for_sessions, Coordination::Error::ZRUNTIMEINCONSISTENCY); + + result = nullptr; + requests_for_sessions.clear(); +} + +int64_t KeeperStorageDispatcher::getSessionID(int64_t session_timeout_ms) +{ + KeeperStorage::RequestForSession request_info; + std::shared_ptr request = std::make_shared(); + request->internal_id = internal_session_id_counter.fetch_add(1); + request->session_timeout_ms = session_timeout_ms; + request->server_id = server->getServerID(); + + request_info.request = request; + request_info.session_id = -1; + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + { + std::lock_guard lock(session_to_response_callback_mutex); + new_session_id_response_callback[request->internal_id] = [promise, internal_id = request->internal_id] (const Coordination::ZooKeeperResponsePtr & response) + { + if (response->getOpNum() != Coordination::OpNum::SessionID) + promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR, + "Incorrect response of type {} instead of SessionID response", Coordination::toString(response->getOpNum())))); + + auto session_id_response = dynamic_cast(*response); + if (session_id_response.internal_id != internal_id) + { + promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR, + "Incorrect response with internal id {} instead of {}", session_id_response.internal_id, internal_id))); + } + + if (response->error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException("SessionID request failed with error", response->error))); + + promise->set_value(session_id_response.session_id); + }; + } + + { + std::lock_guard lock(push_request_mutex); + if (!requests_queue->tryPush(std::move(request_info), session_timeout_ms)) + throw Exception("Cannot push session id request to queue within session timeout", ErrorCodes::TIMEOUT_EXCEEDED); + } + + if (future.wait_for(std::chrono::milliseconds(session_timeout_ms)) != std::future_status::ready) + throw Exception("Cannot receive 
session id within session timeout", ErrorCodes::TIMEOUT_EXCEEDED); + + return future.get(); +} + +} diff --git a/src/Coordination/KeeperStorageDispatcher.h b/src/Coordination/KeeperStorageDispatcher.h new file mode 100644 index 00000000000..e4cfa620e6c --- /dev/null +++ b/src/Coordination/KeeperStorageDispatcher.h @@ -0,0 +1,116 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +# include +# include "config_core.h" +#endif + +#if USE_NURAFT + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using ZooKeeperResponseCallback = std::function; + +class KeeperStorageDispatcher +{ + +private: + std::mutex push_request_mutex; + + CoordinationSettingsPtr coordination_settings; + using RequestsQueue = ConcurrentBoundedQueue; + using SessionToResponseCallback = std::unordered_map; + + /// Size depends on coordination settings + std::unique_ptr requests_queue; + ResponsesQueue responses_queue; + SnapshotsQueue snapshots_queue{1}; + + std::atomic shutdown_called{false}; + + std::mutex session_to_response_callback_mutex; + /// These two maps looks similar, but serves different purposes. + /// The first map is subscription map for normal responses like + /// (get, set, list, etc.). Dispatcher determines callback for each response + /// using session id from this map. + SessionToResponseCallback session_to_response_callback; + /// But when client connects to the server for the first time it doesn't + /// have session_id. It request it from server. We give temporary + /// internal id for such requests just to much client with its response. + SessionToResponseCallback new_session_id_response_callback; + + /// Reading and batching new requests from client handlers + ThreadFromGlobalPool request_thread; + /// Pushing responses to clients client handlers + /// using session_id. + ThreadFromGlobalPool responses_thread; + /// Cleaning old dead sessions + ThreadFromGlobalPool session_cleaner_thread; + /// Dumping new snapshots to disk + ThreadFromGlobalPool snapshot_thread; + + /// RAFT wrapper. Most important class. + std::unique_ptr server; + + Poco::Logger * log; + + /// Counter for new session_id requests. + std::atomic internal_session_id_counter{0}; + +private: + void requestThread(); + void responseThread(); + void sessionCleanerTask(); + void snapshotThread(); + void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); + + /// Add error responses for requests to responses queue. + /// Clears requests. + void addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error); + + /// Forcefully wait for result and sets errors if something when wrong. 
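getSessionID() above, together with the new_session_id_response_callback map in this header, implements a small handshake: a connecting client has no session yet, so its SessionID request is tagged with a temporary internal_id, a one-shot callback is registered under that id, and the caller blocks on a future until the committed response arrives via the response thread. A self-contained sketch of that pattern (hypothetical Demo* names):

#include <chrono>
#include <cstdint>
#include <functional>
#include <future>
#include <memory>
#include <stdexcept>
#include <unordered_map>

struct DemoSessionIDResponse { int64_t internal_id = 0; int64_t session_id = 0; };
using DemoCallback = std::function<void(const DemoSessionIDResponse &)>;

int64_t waitForSessionID(
    int64_t internal_id,
    int64_t timeout_ms,
    std::unordered_map<int64_t, DemoCallback> & callbacks,      // analogue of new_session_id_response_callback
    const std::function<void()> & push_request)                 // enqueues the SessionID request
{
    auto promise = std::make_shared<std::promise<int64_t>>();
    auto future = promise->get_future();

    callbacks[internal_id] = [promise, internal_id] (const DemoSessionIDResponse & response)
    {
        if (response.internal_id != internal_id)
            promise->set_exception(std::make_exception_ptr(std::logic_error("response belongs to another handshake")));
        else
            promise->set_value(response.session_id);
    };

    push_request();

    if (future.wait_for(std::chrono::milliseconds(timeout_ms)) != std::future_status::ready)
        throw std::runtime_error("Cannot receive session id within session timeout");

    return future.get();
}

In the dispatcher the callback is additionally erased once it has fired, and error responses are turned into exceptions before the promise is fulfilled.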
+ /// Clears both arguments + void forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions); + +public: + KeeperStorageDispatcher(); + + void initialize(const Poco::Util::AbstractConfiguration & config); + + void shutdown(); + + ~KeeperStorageDispatcher(); + + bool putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); + + bool isLeader() const + { + return server->isLeader(); + } + + bool hasLeader() const + { + return server->isLeaderAlive(); + } + + int64_t getSessionID(int64_t session_timeout_ms); + + void registerSession(int64_t session_id, ZooKeeperResponseCallback callback); + /// Call if we don't need any responses for this session no more (session was expired) + void finishSession(int64_t session_id); +}; + +} + +#endif diff --git a/src/Coordination/LoggerWrapper.h b/src/Coordination/LoggerWrapper.h index 755b72c06cc..25a1969d2e9 100644 --- a/src/Coordination/LoggerWrapper.h +++ b/src/Coordination/LoggerWrapper.h @@ -9,12 +9,26 @@ namespace DB class LoggerWrapper : public nuraft::logger { +private: + + static inline const std::unordered_map LEVELS = + { + {LogsLevel::trace, Poco::Message::Priority::PRIO_TRACE}, + {LogsLevel::debug, Poco::Message::Priority::PRIO_DEBUG}, + {LogsLevel::information, Poco::Message::PRIO_INFORMATION}, + {LogsLevel::warning, Poco::Message::PRIO_WARNING}, + {LogsLevel::error, Poco::Message::PRIO_ERROR}, + {LogsLevel::fatal, Poco::Message::PRIO_FATAL} + }; + static inline const int LEVEL_MAX = static_cast(LogsLevel::trace); + static inline const int LEVEL_MIN = static_cast(LogsLevel::none); + public: LoggerWrapper(const std::string & name, LogsLevel level_) : log(&Poco::Logger::get(name)) - , level(static_cast(level_)) + , level(level_) { - log->setLevel(level); + log->setLevel(static_cast(LEVELS.at(level))); } void put_details( @@ -24,24 +38,26 @@ public: size_t /* line_number */, const std::string & msg) override { - LOG_IMPL(log, static_cast(level_), static_cast(level_), msg); + LogsLevel db_level = static_cast(level_); + LOG_IMPL(log, db_level, LEVELS.at(db_level), msg); } void set_level(int level_) override { - level_ = std::min(6, std::max(1, level_)); - log->setLevel(level_); - level = level_; + level_ = std::min(LEVEL_MAX, std::max(LEVEL_MIN, level_)); + level = static_cast(level_); + log->setLevel(static_cast(LEVELS.at(level))); } int get_level() override { - return level; + LogsLevel lvl = level; + return static_cast(lvl); } private: Poco::Logger * log; - std::atomic level; + std::atomic level; }; } diff --git a/src/Coordination/NuKeeperCommon.h b/src/Coordination/NuKeeperCommon.h deleted file mode 100644 index 14fc612093c..00000000000 --- a/src/Coordination/NuKeeperCommon.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -struct NuKeeperRequest -{ - int64_t session_id; - Coordination::ZooKeeperRequestPtr request; -}; - -using NuKeeperRequests = std::vector; - -struct NuKeeperResponse -{ - int64_t session_id; - Coordination::ZooKeeperRequestPtr response; -}; - -using NuKeeperResponses = std::vector; - -} diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp deleted file mode 100644 index 7464a06e86f..00000000000 --- a/src/Coordination/NuKeeperServer.cpp +++ /dev/null @@ -1,182 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int RAFT_ERROR; -} - 
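shutdown() above wakes the worker threads declared in this header by pushing default-constructed "poison pill" elements into each queue (the /// FIXME comment already flags this as a blunt mechanism): a blocked pop returns, the thread observes shutdown_called or the empty element and exits, so the subsequent join() cannot hang. A minimal standalone illustration of the pattern (simplified, assumed queue type):

#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>
#include <thread>

template <typename T>
class DemoQueue
{
    std::mutex mutex;
    std::condition_variable cv;
    std::deque<T> items;

public:
    void push(T item)
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            items.push_back(std::move(item));
        }
        cv.notify_one();
    }

    T pop()
    {
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [this] { return !items.empty(); });
        T item = std::move(items.front());
        items.pop_front();
        return item;
    }
};

int main()
{
    std::atomic<bool> shutdown_called{false};
    DemoQueue<std::optional<int>> queue;

    std::thread worker([&]
    {
        while (!shutdown_called)
        {
            auto item = queue.pop();
            if (!item)
                break;                      // poison pill: leave even if the flag was read as false
        }
    });

    shutdown_called = true;
    queue.push(std::nullopt);               // wake the worker so that join() does not block forever
    worker.join();
}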
-NuKeeperServer::NuKeeperServer( - int server_id_, - const CoordinationSettingsPtr & coordination_settings_, - const Poco::Util::AbstractConfiguration & config, - ResponsesQueue & responses_queue_) - : server_id(server_id_) - , coordination_settings(coordination_settings_) - , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) - , state_manager(nuraft::cs_new(server_id, "test_keeper_server.raft_configuration", config)) - , responses_queue(responses_queue_) -{ -} - -void NuKeeperServer::startup() -{ - nuraft::raft_params params; - params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); - params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); - params.election_timeout_upper_bound_ = coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(); - params.reserved_log_items_ = coordination_settings->reserved_log_items; - params.snapshot_distance_ = coordination_settings->snapshot_distance; - params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds(); - params.auto_forwarding_ = coordination_settings->auto_forwarding; - params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; - - params.return_method_ = nuraft::raft_params::blocking; - - nuraft::asio_service::options asio_opts{}; - nuraft::raft_server::init_options init_options; - init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower(); - init_options.raft_callback_ = [this] (nuraft::cb_func::Type type, nuraft::cb_func::Param * param) - { - return callbackFunc(type, param); - }; - - raft_instance = launcher.init( - state_machine, state_manager, nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level), state_manager->getPort(), - asio_opts, params, init_options); - - if (!raft_instance) - throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); -} - -void NuKeeperServer::shutdown() -{ - state_machine->shutdownStorage(); - if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds())) - LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5); -} - -namespace -{ - -nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) -{ - DB::WriteBufferFromNuraftBuffer buf; - DB::writeIntBinary(session_id, buf); - request->write(buf); - return buf.getBuffer(); -} - -} - -void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session) -{ - auto [session_id, request] = request_for_session; - if (isLeaderAlive() && request->isReadRequest()) - { - state_machine->processReadRequest(request_for_session); - } - else - { - std::vector> entries; - entries.push_back(getZooKeeperLogEntry(session_id, request)); - - std::lock_guard lock(append_entries_mutex); - - auto result = raft_instance->append_entries(entries); - if (!result->get_accepted()) - { - NuKeeperStorage::ResponsesForSessions responses; - auto response = request->makeResponse(); - response->xid = request->xid; - response->zxid = 0; - response->error = Coordination::Error::ZOPERATIONTIMEOUT; - responses_queue.push(DB::NuKeeperStorage::ResponseForSession{session_id, response}); - } - - if (result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) - { - NuKeeperStorage::ResponsesForSessions responses; - auto response = request->makeResponse(); - response->xid = request->xid; - response->zxid = 
0; - response->error = Coordination::Error::ZOPERATIONTIMEOUT; - responses_queue.push(DB::NuKeeperStorage::ResponseForSession{session_id, response}); - } - else if (result->get_result_code() != nuraft::cmd_result_code::OK) - throw Exception(ErrorCodes::RAFT_ERROR, "Requests result failed with code {} and message: '{}'", result->get_result_code(), result->get_result_str()); - } -} - -int64_t NuKeeperServer::getSessionID(int64_t session_timeout_ms) -{ - auto entry = nuraft::buffer::alloc(sizeof(int64_t)); - /// Just special session request - nuraft::buffer_serializer bs(entry); - bs.put_i64(session_timeout_ms); - - std::lock_guard lock(append_entries_mutex); - - auto result = raft_instance->append_entries({entry}); - - if (!result->get_accepted()) - throw Exception(ErrorCodes::RAFT_ERROR, "Cannot send session_id request to RAFT"); - - if (result->get_result_code() != nuraft::cmd_result_code::OK) - throw Exception(ErrorCodes::RAFT_ERROR, "session_id request failed to RAFT"); - - auto resp = result->get(); - if (resp == nullptr) - throw Exception(ErrorCodes::RAFT_ERROR, "Received nullptr as session_id"); - - nuraft::buffer_serializer bs_resp(resp); - return bs_resp.get_i64(); -} - -bool NuKeeperServer::isLeader() const -{ - return raft_instance->is_leader(); -} - -bool NuKeeperServer::isLeaderAlive() const -{ - return raft_instance->is_leader_alive(); -} - -nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */) -{ - if (type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader) - { - std::unique_lock lock(initialized_mutex); - initialized_flag = true; - initialized_cv.notify_all(); - } - return nuraft::cb_func::ReturnCode::Ok; -} - -void NuKeeperServer::waitInit() -{ - std::unique_lock lock(initialized_mutex); - int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); - if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) - throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); -} - -std::unordered_set NuKeeperServer::getDeadSessions() -{ - return state_machine->getDeadSessions(); -} - -} diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h deleted file mode 100644 index a8d269eb9eb..00000000000 --- a/src/Coordination/NuKeeperServer.h +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -#include // Y_IGNORE -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -class NuKeeperServer -{ -private: - int server_id; - - CoordinationSettingsPtr coordination_settings; - - nuraft::ptr state_machine; - - nuraft::ptr state_manager; - - nuraft::raft_launcher launcher; - - nuraft::ptr raft_instance; - - std::mutex append_entries_mutex; - - ResponsesQueue & responses_queue; - - std::mutex initialized_mutex; - bool initialized_flag = false; - std::condition_variable initialized_cv; - - nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); - -public: - NuKeeperServer( - int server_id_, - const CoordinationSettingsPtr & coordination_settings_, - const Poco::Util::AbstractConfiguration & config, - ResponsesQueue & responses_queue_); - - void startup(); - - void putRequest(const NuKeeperStorage::RequestForSession & request); - - int64_t getSessionID(int64_t session_timeout_ms); - - std::unordered_set getDeadSessions(); - - bool isLeader() const; - - bool isLeaderAlive() const; - - void waitInit(); - - void shutdown(); 
-}; - -} diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp deleted file mode 100644 index 0061645c75c..00000000000 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data) -{ - ReadBufferFromNuraftBuffer buffer(data); - NuKeeperStorage::RequestForSession request_for_session; - readIntBinary(request_for_session.session_id, buffer); - - int32_t length; - Coordination::read(length, buffer); - - int32_t xid; - Coordination::read(xid, buffer); - - Coordination::OpNum opnum; - Coordination::read(opnum, buffer); - - request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum); - request_for_session.request->xid = xid; - request_for_session.request->readImpl(buffer); - return request_for_session; -} - -nuraft::ptr writeResponses(NuKeeperStorage::ResponsesForSessions & responses) -{ - WriteBufferFromNuraftBuffer buffer; - for (const auto & response_and_session : responses) - { - writeIntBinary(response_and_session.session_id, buffer); - response_and_session.response->write(buffer); - } - return buffer.getBuffer(); -} - - -NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_) - : coordination_settings(coordination_settings_) - , storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()) - , responses_queue(responses_queue_) - , last_committed_idx(0) - , log(&Poco::Logger::get("NuRaftStateMachine")) -{ - LOG_DEBUG(log, "Created nukeeper state machine"); -} - -nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data) -{ - if (data.size() == sizeof(int64_t)) - { - nuraft::buffer_serializer timeout_data(data); - int64_t session_timeout_ms = timeout_data.get_i64(); - auto response = nuraft::buffer::alloc(sizeof(int64_t)); - int64_t session_id; - nuraft::buffer_serializer bs(response); - { - std::lock_guard lock(storage_lock); - session_id = storage.getSessionID(session_timeout_ms); - bs.put_i64(session_id); - } - LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms); - last_committed_idx = log_idx; - return response; - } - else - { - auto request_for_session = parseRequest(data); - NuKeeperStorage::ResponsesForSessions responses_for_sessions; - { - std::lock_guard lock(storage_lock); - responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id); - for (auto & response_for_session : responses_for_sessions) - responses_queue.push(response_for_session); - } - - last_committed_idx = log_idx; - return nullptr; - } -} - -bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s) -{ - LOG_DEBUG(log, "Applying snapshot {}", s.get_last_log_idx()); - StorageSnapshotPtr snapshot; - { - std::lock_guard lock(snapshots_lock); - auto entry = snapshots.find(s.get_last_log_idx()); - if (entry == snapshots.end()) - return false; - snapshot = entry->second; - } - std::lock_guard lock(storage_lock); - storage = snapshot->storage; - last_committed_idx = s.get_last_log_idx(); - return true; -} - -nuraft::ptr NuKeeperStateMachine::last_snapshot() -{ - // Just return the latest snapshot. 
- std::lock_guard lock(snapshots_lock); - auto entry = snapshots.rbegin(); - if (entry == snapshots.rend()) - return nullptr; - - return entry->second->snapshot; -} - -NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::createSnapshotInternal(nuraft::snapshot & s) -{ - nuraft::ptr snp_buf = s.serialize(); - nuraft::ptr ss = nuraft::snapshot::deserialize(*snp_buf); - std::lock_guard lock(storage_lock); - return std::make_shared(ss, storage); -} - -NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::readSnapshot(nuraft::snapshot & s, nuraft::buffer & in) -{ - nuraft::ptr snp_buf = s.serialize(); - nuraft::ptr ss = nuraft::snapshot::deserialize(*snp_buf); - NuKeeperStorageSerializer serializer; - - ReadBufferFromNuraftBuffer reader(in); - NuKeeperStorage new_storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()); - serializer.deserialize(new_storage, reader); - return std::make_shared(ss, new_storage); -} - - -void NuKeeperStateMachine::writeSnapshot(const NuKeeperStateMachine::StorageSnapshotPtr & snapshot, nuraft::ptr & out) -{ - NuKeeperStorageSerializer serializer; - - WriteBufferFromNuraftBuffer writer; - serializer.serialize(snapshot->storage, writer); - out = writer.getBuffer(); -} - -void NuKeeperStateMachine::create_snapshot( - nuraft::snapshot & s, - nuraft::async_result::handler_type & when_done) -{ - LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx()); - auto snapshot = createSnapshotInternal(s); - { - std::lock_guard lock(snapshots_lock); - snapshots[s.get_last_log_idx()] = snapshot; - size_t num = snapshots.size(); - if (num > coordination_settings->max_stored_snapshots) - { - auto entry = snapshots.begin(); - - for (size_t i = 0; i < num - coordination_settings->max_stored_snapshots; ++i) - { - if (entry == snapshots.end()) - break; - entry = snapshots.erase(entry); - } - } - - } - - LOG_DEBUG(log, "Created snapshot {}", s.get_last_log_idx()); - nuraft::ptr except(nullptr); - bool ret = true; - when_done(ret, except); -} - -void NuKeeperStateMachine::save_logical_snp_obj( - nuraft::snapshot & s, - size_t & obj_id, - nuraft::buffer & data, - bool /*is_first_obj*/, - bool /*is_last_obj*/) -{ - LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); - - if (obj_id == 0) - { - auto new_snapshot = createSnapshotInternal(s); - std::lock_guard lock(snapshots_lock); - snapshots.try_emplace(s.get_last_log_idx(), std::move(new_snapshot)); - } - else - { - auto received_snapshot = readSnapshot(s, data); - - std::lock_guard lock(snapshots_lock); - snapshots[s.get_last_log_idx()] = std::move(received_snapshot); - } - - obj_id++; -} - -int NuKeeperStateMachine::read_logical_snp_obj( - nuraft::snapshot & s, - void* & /*user_snp_ctx*/, - ulong obj_id, - nuraft::ptr & data_out, - bool & is_last_obj) -{ - - LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); - StorageSnapshotPtr required_snapshot; - { - std::lock_guard lock(snapshots_lock); - auto entry = snapshots.find(s.get_last_log_idx()); - if (entry == snapshots.end()) - { - // Snapshot doesn't exist. 
- data_out = nullptr; - is_last_obj = true; - return 0; - } - required_snapshot = entry->second; - } - - if (obj_id == 0) - { - auto new_snapshot = createSnapshotInternal(s); - writeSnapshot(new_snapshot, data_out); - is_last_obj = false; - } - else - { - writeSnapshot(required_snapshot, data_out); - is_last_obj = true; - } - return 0; -} - -void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session) -{ - NuKeeperStorage::ResponsesForSessions responses; - { - std::lock_guard lock(storage_lock); - responses = storage.processRequest(request_for_session.request, request_for_session.session_id); - } - for (const auto & response : responses) - responses_queue.push(response); -} - -std::unordered_set NuKeeperStateMachine::getDeadSessions() -{ - std::lock_guard lock(storage_lock); - return storage.getDeadSessions(); -} - -void NuKeeperStateMachine::shutdownStorage() -{ - std::lock_guard lock(storage_lock); - storage.finalize(); -} - -} diff --git a/src/Coordination/NuKeeperStateMachine.h b/src/Coordination/NuKeeperStateMachine.h deleted file mode 100644 index 87748db20a5..00000000000 --- a/src/Coordination/NuKeeperStateMachine.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include -#include // Y_IGNORE -#include -#include -#include - -namespace DB -{ - -using ResponsesQueue = ThreadSafeQueue; - -class NuKeeperStateMachine : public nuraft::state_machine -{ -public: - NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_); - - nuraft::ptr pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } - - nuraft::ptr commit(const size_t log_idx, nuraft::buffer & data) override; - - void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {} - - size_t last_commit_index() override { return last_committed_idx; } - - bool apply_snapshot(nuraft::snapshot & s) override; - - nuraft::ptr last_snapshot() override; - - void create_snapshot( - nuraft::snapshot & s, - nuraft::async_result::handler_type & when_done) override; - - void save_logical_snp_obj( - nuraft::snapshot & s, - size_t & obj_id, - nuraft::buffer & data, - bool is_first_obj, - bool is_last_obj) override; - - int read_logical_snp_obj( - nuraft::snapshot & s, - void* & user_snp_ctx, - ulong obj_id, - nuraft::ptr & data_out, - bool & is_last_obj) override; - - NuKeeperStorage & getStorage() - { - return storage; - } - - void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session); - - std::unordered_set getDeadSessions(); - - void shutdownStorage(); - -private: - struct StorageSnapshot - { - StorageSnapshot(const nuraft::ptr & s, const NuKeeperStorage & storage_) - : snapshot(s) - , storage(storage_) - {} - - nuraft::ptr snapshot; - NuKeeperStorage storage; - }; - - using StorageSnapshotPtr = std::shared_ptr; - - StorageSnapshotPtr createSnapshotInternal(nuraft::snapshot & s); - - StorageSnapshotPtr readSnapshot(nuraft::snapshot & s, nuraft::buffer & in); - - static void writeSnapshot(const StorageSnapshotPtr & snapshot, nuraft::ptr & out); - - CoordinationSettingsPtr coordination_settings; - - NuKeeperStorage storage; - - ResponsesQueue & responses_queue; - /// Mutex for snapshots - std::mutex snapshots_lock; - - /// Lock for storage - std::mutex storage_lock; - - /// Fake snapshot storage - std::map snapshots; - - /// Last committed Raft log number. 
- std::atomic last_committed_idx; - Poco::Logger * log; -}; - -} diff --git a/src/Coordination/NuKeeperStorageDispatcher.cpp b/src/Coordination/NuKeeperStorageDispatcher.cpp deleted file mode 100644 index 570087757ad..00000000000 --- a/src/Coordination/NuKeeperStorageDispatcher.cpp +++ /dev/null @@ -1,237 +0,0 @@ -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - - extern const int LOGICAL_ERROR; - extern const int TIMEOUT_EXCEEDED; -} - -NuKeeperStorageDispatcher::NuKeeperStorageDispatcher() - : coordination_settings(std::make_shared()) - , log(&Poco::Logger::get("NuKeeperDispatcher")) -{ -} - -void NuKeeperStorageDispatcher::requestThread() -{ - setThreadName("NuKeeperReqT"); - while (!shutdown_called) - { - NuKeeperStorage::RequestForSession request; - - UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); - - if (requests_queue.tryPop(request, max_wait)) - { - if (shutdown_called) - break; - - try - { - server->putRequest(request); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - } -} - -void NuKeeperStorageDispatcher::responseThread() -{ - setThreadName("NuKeeperRspT"); - while (!shutdown_called) - { - NuKeeperStorage::ResponseForSession response_for_session; - - UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); - - if (responses_queue.tryPop(response_for_session, max_wait)) - { - if (shutdown_called) - break; - - try - { - setResponse(response_for_session.session_id, response_for_session.response); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - } -} - -void NuKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) -{ - std::lock_guard lock(session_to_response_callback_mutex); - auto session_writer = session_to_response_callback.find(session_id); - if (session_writer == session_to_response_callback.end()) - return; - - session_writer->second(response); - /// Session closed, no more writes - if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) - session_to_response_callback.erase(session_writer); -} - -bool NuKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id) -{ - { - std::lock_guard lock(session_to_response_callback_mutex); - if (session_to_response_callback.count(session_id) == 0) - return false; - } - - NuKeeperStorage::RequestForSession request_info; - request_info.request = request; - request_info.session_id = session_id; - - std::lock_guard lock(push_request_mutex); - /// Put close requests without timeouts - if (request->getOpNum() == Coordination::OpNum::Close) - requests_queue.push(std::move(request_info)); - else if (!requests_queue.tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds())) - throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); - return true; -} - -void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration & config) -{ - LOG_DEBUG(log, "Initializing storage dispatcher"); - int myid = config.getInt("test_keeper_server.server_id"); - - coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config); - - server = std::make_unique(myid, coordination_settings, config, responses_queue); - try - { - LOG_DEBUG(log, "Waiting server to initialize"); - server->startup(); - LOG_DEBUG(log, "Server initialized, waiting for 
quorum"); - - server->waitInit(); - LOG_DEBUG(log, "Quorum initialized"); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - throw; - } - - request_thread = ThreadFromGlobalPool([this] { requestThread(); }); - responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); - session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); - - LOG_DEBUG(log, "Dispatcher initialized"); -} - -void NuKeeperStorageDispatcher::shutdown() -{ - try - { - { - std::lock_guard lock(push_request_mutex); - - if (shutdown_called) - return; - - LOG_DEBUG(log, "Shutting down storage dispatcher"); - shutdown_called = true; - - if (session_cleaner_thread.joinable()) - session_cleaner_thread.join(); - - if (request_thread.joinable()) - request_thread.join(); - - if (responses_thread.joinable()) - responses_thread.join(); - } - - if (server) - server->shutdown(); - - NuKeeperStorage::RequestForSession request_for_session; - while (requests_queue.tryPop(request_for_session)) - { - auto response = request_for_session.request->makeResponse(); - response->error = Coordination::Error::ZSESSIONEXPIRED; - setResponse(request_for_session.session_id, response); - } - session_to_response_callback.clear(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - LOG_DEBUG(log, "Dispatcher shut down"); -} - -NuKeeperStorageDispatcher::~NuKeeperStorageDispatcher() -{ - shutdown(); -} - -void NuKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback) -{ - std::lock_guard lock(session_to_response_callback_mutex); - if (!session_to_response_callback.try_emplace(session_id, callback).second) - throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id); -} - -void NuKeeperStorageDispatcher::sessionCleanerTask() -{ - while (true) - { - if (shutdown_called) - return; - - try - { - if (isLeader()) - { - auto dead_sessions = server->getDeadSessions(); - for (int64_t dead_session : dead_sessions) - { - LOG_INFO(log, "Found dead session {}, will try to close it", dead_session); - Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); - request->xid = Coordination::CLOSE_XID; - NuKeeperStorage::RequestForSession request_info; - request_info.request = request; - request_info.session_id = dead_session; - { - std::lock_guard lock(push_request_mutex); - requests_queue.push(std::move(request_info)); - } - finishSession(dead_session); - LOG_INFO(log, "Dead session close request pushed"); - } - } - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(coordination_settings->dead_session_check_period_ms.totalMilliseconds())); - } -} - -void NuKeeperStorageDispatcher::finishSession(int64_t session_id) -{ - std::lock_guard lock(session_to_response_callback_mutex); - auto session_it = session_to_response_callback.find(session_id); - if (session_it != session_to_response_callback.end()) - session_to_response_callback.erase(session_it); -} - -} diff --git a/src/Coordination/NuKeeperStorageDispatcher.h b/src/Coordination/NuKeeperStorageDispatcher.h deleted file mode 100644 index 62144b92a7a..00000000000 --- a/src/Coordination/NuKeeperStorageDispatcher.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#if !defined(ARCADIA_BUILD) -# include -# include "config_core.h" -#endif - -#if USE_NURAFT - -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -using ZooKeeperResponseCallback = std::function; - -class NuKeeperStorageDispatcher -{ - -private: - std::mutex push_request_mutex; - - CoordinationSettingsPtr coordination_settings; - using RequestsQueue = ConcurrentBoundedQueue; - RequestsQueue requests_queue{1}; - ResponsesQueue responses_queue; - std::atomic shutdown_called{false}; - using SessionToResponseCallback = std::unordered_map; - - std::mutex session_to_response_callback_mutex; - SessionToResponseCallback session_to_response_callback; - - ThreadFromGlobalPool request_thread; - ThreadFromGlobalPool responses_thread; - - ThreadFromGlobalPool session_cleaner_thread; - - std::unique_ptr server; - - Poco::Logger * log; - -private: - void requestThread(); - void responseThread(); - void sessionCleanerTask(); - void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); - -public: - NuKeeperStorageDispatcher(); - - void initialize(const Poco::Util::AbstractConfiguration & config); - - void shutdown(); - - ~NuKeeperStorageDispatcher(); - - bool putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); - - bool isLeader() const - { - return server->isLeader(); - } - - bool hasLeader() const - { - return server->isLeaderAlive(); - } - - int64_t getSessionID(long session_timeout_ms) - { - return server->getSessionID(session_timeout_ms); - } - - void registerSession(int64_t session_id, ZooKeeperResponseCallback callback); - /// Call if we don't need any responses for this session no more (session was expired) - void finishSession(int64_t session_id); -}; - -} - -#endif diff --git a/src/Coordination/NuKeeperStorageSerializer.cpp b/src/Coordination/NuKeeperStorageSerializer.cpp deleted file mode 100644 index 298df45cde0..00000000000 --- a/src/Coordination/NuKeeperStorageSerializer.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include -#include - -namespace DB -{ - -namespace -{ - void writeNode(const NuKeeperStorage::Node & node, WriteBuffer & out) - { - Coordination::write(node.data, out); - Coordination::write(node.acls, out); - Coordination::write(node.is_ephemeral, out); - Coordination::write(node.is_sequental, out); - Coordination::write(node.stat, out); - Coordination::write(node.seq_num, out); - } - - void readNode(NuKeeperStorage::Node & node, ReadBuffer & in) - { - Coordination::read(node.data, in); - Coordination::read(node.acls, in); - Coordination::read(node.is_ephemeral, in); - Coordination::read(node.is_sequental, in); - Coordination::read(node.stat, in); - Coordination::read(node.seq_num, in); - } -} - -void 
NuKeeperStorageSerializer::serialize(const NuKeeperStorage & storage, WriteBuffer & out) -{ - Coordination::write(storage.zxid, out); - Coordination::write(storage.session_id_counter, out); - Coordination::write(storage.container.size(), out); - for (const auto & [path, node] : storage.container) - { - Coordination::write(path, out); - writeNode(node, out); - } - Coordination::write(storage.ephemerals.size(), out); - for (const auto & [session_id, paths] : storage.ephemerals) - { - Coordination::write(session_id, out); - Coordination::write(paths.size(), out); - for (const auto & path : paths) - Coordination::write(path, out); - } -} - -void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffer & in) -{ - int64_t session_id_counter, zxid; - Coordination::read(zxid, in); - Coordination::read(session_id_counter, in); - storage.zxid = zxid; - storage.session_id_counter = session_id_counter; - - size_t container_size; - Coordination::read(container_size, in); - while (storage.container.size() < container_size) - { - std::string path; - Coordination::read(path, in); - NuKeeperStorage::Node node; - readNode(node, in); - storage.container[path] = node; - } - size_t ephemerals_size; - Coordination::read(ephemerals_size, in); - while (storage.ephemerals.size() < ephemerals_size) - { - int64_t session_id; - size_t ephemerals_for_session; - Coordination::read(session_id, in); - Coordination::read(ephemerals_for_session, in); - while (storage.ephemerals[session_id].size() < ephemerals_for_session) - { - std::string ephemeral_path; - Coordination::read(ephemeral_path, in); - storage.ephemerals[session_id].emplace(ephemeral_path); - } - } -} - -} diff --git a/src/Coordination/NuKeeperStorageSerializer.h b/src/Coordination/NuKeeperStorageSerializer.h deleted file mode 100644 index e54c65a739d..00000000000 --- a/src/Coordination/NuKeeperStorageSerializer.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once -#include -#include -#include - -namespace DB -{ - -class NuKeeperStorageSerializer -{ -public: - static void serialize(const NuKeeperStorage & storage, WriteBuffer & out); - - static void deserialize(NuKeeperStorage & storage, ReadBuffer & in); -}; - -} diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h new file mode 100644 index 00000000000..9987081ba87 --- /dev/null +++ b/src/Coordination/SnapshotableHashTable.h @@ -0,0 +1,197 @@ +#pragma once +#include +#include +#include +#include + +namespace DB +{ + +template +struct ListNode +{ + std::string key; + V value; + bool active_in_map; +}; + +template +class SnapshotableHashTable +{ +private: + + using ListElem = ListNode; + using List = std::list; + using IndexMap = std::unordered_map; + + List list; + IndexMap map; + bool snapshot_mode{false}; + +public: + + using iterator = typename List::iterator; + using const_iterator = typename List::const_iterator; + using reverse_iterator = typename List::reverse_iterator; + using const_reverse_iterator = typename List::const_reverse_iterator; + using ValueUpdater = std::function; + + bool insert(const std::string & key, const V & value) + { + auto it = map.find(key); + if (it == map.end()) + { + ListElem elem{key, value, true}; + auto itr = list.insert(list.end(), elem); + map.emplace(itr->key, itr); + return true; + } + + return false; + } + + + void insertOrReplace(const std::string & key, const V & value) + { + auto it = map.find(key); + if (it == map.end()) + { + ListElem elem{key, value, true}; + auto itr = list.insert(list.end(), elem); + 
map.emplace(itr->key, itr); + } + else + { + auto list_itr = it->second; + if (snapshot_mode) + { + ListElem elem{key, value, true}; + list_itr->active_in_map = false; + auto new_list_itr = list.insert(list.end(), elem); + map.erase(it); + map.emplace(new_list_itr->key, new_list_itr); + } + else + { + list_itr->value = value; + } + } + } + + bool erase(const std::string & key) + { + auto it = map.find(key); + if (it == map.end()) + return false; + + auto list_itr = it->second; + if (snapshot_mode) + { + list_itr->active_in_map = false; + map.erase(it); + } + else + { + map.erase(it); + list.erase(list_itr); + } + + return true; + } + + bool contains(const std::string & key) const + { + return map.find(key) != map.end(); + } + + const_iterator updateValue(const std::string & key, ValueUpdater updater) + { + auto it = map.find(key); + assert(it != map.end()); + if (snapshot_mode) + { + auto list_itr = it->second; + auto elem_copy = *(list_itr); + list_itr->active_in_map = false; + map.erase(it); + updater(elem_copy.value); + auto itr = list.insert(list.end(), elem_copy); + map.emplace(itr->key, itr); + return itr; + } + else + { + auto list_itr = it->second; + updater(list_itr->value); + return list_itr; + } + } + + const_iterator find(const std::string & key) const + { + auto map_it = map.find(key); + if (map_it != map.end()) + return map_it->second; + return list.end(); + } + + const V & getValue(const std::string & key) const + { + auto it = map.find(key); + assert(it != map.end()); + return it->second->value; + } + + void clearOutdatedNodes() + { + auto start = list.begin(); + auto end = list.end(); + for (auto itr = start; itr != end;) + { + if (!itr->active_in_map) + itr = list.erase(itr); + else + itr++; + } + } + + void clear() + { + list.clear(); + map.clear(); + } + + void enableSnapshotMode() + { + snapshot_mode = true; + } + + void disableSnapshotMode() + { + snapshot_mode = false; + } + + size_t size() const + { + return map.size(); + } + + size_t snapshotSize() const + { + return list.size(); + } + + + iterator begin() { return list.begin(); } + const_iterator begin() const { return list.cbegin(); } + iterator end() { return list.end(); } + const_iterator end() const { return list.cend(); } + + reverse_iterator rbegin() { return list.rbegin(); } + const_reverse_iterator rbegin() const { return list.crbegin(); } + reverse_iterator rend() { return list.rend(); } + const_reverse_iterator rend() const { return list.crend(); } +}; + + +} diff --git a/src/Coordination/SummingStateMachine.cpp b/src/Coordination/SummingStateMachine.cpp index 0cb7a7da6c3..ae3d2b06d75 100644 --- a/src/Coordination/SummingStateMachine.cpp +++ b/src/Coordination/SummingStateMachine.cpp @@ -21,7 +21,7 @@ SummingStateMachine::SummingStateMachine() { } -nuraft::ptr SummingStateMachine::commit(const size_t log_idx, nuraft::buffer & data) +nuraft::ptr SummingStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data) { int64_t value_to_add = deserializeValue(data); @@ -84,7 +84,7 @@ void SummingStateMachine::createSnapshotInternal(nuraft::snapshot & s) void SummingStateMachine::save_logical_snp_obj( nuraft::snapshot & s, - size_t & obj_id, + uint64_t & obj_id, nuraft::buffer & data, bool /*is_first_obj*/, bool /*is_last_obj*/) @@ -112,7 +112,7 @@ void SummingStateMachine::save_logical_snp_obj( int SummingStateMachine::read_logical_snp_obj( nuraft::snapshot & s, void* & /*user_snp_ctx*/, - size_t obj_id, + uint64_t obj_id, nuraft::ptr & data_out, bool & is_last_obj) { @@ -142,7 +142,7 @@ int 
SummingStateMachine::read_logical_snp_obj( else { // Object ID > 0: second object, put actual value. - data_out = nuraft::buffer::alloc(sizeof(size_t)); + data_out = nuraft::buffer::alloc(sizeof(uint64_t)); nuraft::buffer_serializer bs(data_out); bs.put_u64(ctx->value); is_last_obj = true; diff --git a/src/Coordination/SummingStateMachine.h b/src/Coordination/SummingStateMachine.h index c8594ba7e8d..03572840739 100644 --- a/src/Coordination/SummingStateMachine.h +++ b/src/Coordination/SummingStateMachine.h @@ -15,13 +15,13 @@ class SummingStateMachine : public nuraft::state_machine public: SummingStateMachine(); - nuraft::ptr pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } + nuraft::ptr pre_commit(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } - nuraft::ptr commit(const size_t log_idx, nuraft::buffer & data) override; + nuraft::ptr commit(const uint64_t log_idx, nuraft::buffer & data) override; - void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {} + void rollback(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override {} - size_t last_commit_index() override { return last_committed_idx; } + uint64_t last_commit_index() override { return last_committed_idx; } bool apply_snapshot(nuraft::snapshot & s) override; @@ -33,7 +33,7 @@ public: void save_logical_snp_obj( nuraft::snapshot & s, - size_t & obj_id, + uint64_t & obj_id, nuraft::buffer & data, bool is_first_obj, bool is_last_obj) override; @@ -41,7 +41,7 @@ public: int read_logical_snp_obj( nuraft::snapshot & s, void* & user_snp_ctx, - size_t obj_id, + uint64_t obj_id, nuraft::ptr & data_out, bool & is_last_obj) override; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index ed9777350c5..515565a3b9f 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -6,12 +6,13 @@ #endif #if USE_NURAFT - +#include +#include #include -#include -#include +#include +#include #include -#include +#include #include #include #include @@ -20,9 +21,37 @@ #include #include #include +#include #include // Y_IGNORE #include +#include +#include +#include +#include + +namespace fs = std::filesystem; +struct ChangelogDirTest +{ + std::string path; + bool drop; + explicit ChangelogDirTest(std::string path_, bool drop_ = true) + : path(path_) + , drop(drop_) + { + if (fs::exists(path)) + { + EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + } + fs::create_directory(path); + } + + ~ChangelogDirTest() + { + if (fs::exists(path) && drop) + fs::remove_all(path); + } +}; TEST(CoordinationTest, BuildTest) { @@ -67,14 +96,15 @@ TEST(CoordinationTest, BufferSerde) template struct SimpliestRaftServer { - SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_) + SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_, const std::string & logs_path) : server_id(server_id_) , hostname(hostname_) , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port)) + , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) { + state_manager->loadLogStore(1, 0); nuraft::raft_params params; params.heart_beat_interval_ = 100; params.election_timeout_lower_bound_ = 200; @@ -90,10 +120,10 @@ struct SimpliestRaftServer if (!raft_instance) { - std::cerr << "Failed to initialize launcher (see 
the message " - "in the log file)." << std::endl; + std::cerr << "Failed to initialize launcher" << std::endl; exit(-1); } + std::cout << "init Raft instance " << server_id; for (size_t ii = 0; ii < 20; ++ii) { @@ -123,7 +153,7 @@ struct SimpliestRaftServer nuraft::ptr state_machine; // State manager. - nuraft::ptr state_manager; + nuraft::ptr state_manager; // Raft launcher. nuraft::raft_launcher launcher; @@ -134,11 +164,10 @@ struct SimpliestRaftServer using SummingRaftServer = SimpliestRaftServer; -nuraft::ptr getLogEntry(int64_t number) +nuraft::ptr getBuffer(int64_t number) { nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); nuraft::buffer_serializer bs(ret); - // WARNING: We don't consider endian-safety in this example. bs.put_raw(&number, sizeof(number)); return ret; } @@ -146,12 +175,13 @@ nuraft::ptr getLogEntry(int64_t number) TEST(CoordinationTest, TestSummingRaft1) { - SummingRaftServer s1(1, "localhost", 44444); + ChangelogDirTest test("./logs"); + SummingRaftServer s1(1, "localhost", 44444, "./logs"); /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); - auto entry1 = getLogEntry(143); + auto entry1 = getBuffer(143); auto ret = s1.raft_instance->append_entries({entry1}); EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); @@ -167,111 +197,916 @@ TEST(CoordinationTest, TestSummingRaft1) s1.launcher.shutdown(5); } -TEST(CoordinationTest, TestSummingRaft3) +DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) { - SummingRaftServer s1(1, "localhost", 44444); - SummingRaftServer s2(2, "localhost", 44445); - SummingRaftServer s3(3, "localhost", 44446); - - nuraft::srv_config first_config(1, "localhost:44444"); - auto ret1 = s2.raft_instance->add_srv(first_config); - if (!ret1->get_accepted()) - { - std::cout << "failed to add server: " - << ret1->get_result_str() << std::endl; - EXPECT_TRUE(false); - } - - while (s1.raft_instance->get_leader() != 2) - { - std::cout << "Waiting s1 to join to s2 quorum\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - nuraft::srv_config third_config(3, "localhost:44446"); - auto ret3 = s2.raft_instance->add_srv(third_config); - if (!ret3->get_accepted()) - { - std::cout << "failed to add server: " - << ret3->get_result_str() << std::endl; - EXPECT_TRUE(false); - } - - while (s3.raft_instance->get_leader() != 2) - { - std::cout << "Waiting s3 to join to s2 quorum\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - /// S2 is leader - EXPECT_EQ(s1.raft_instance->get_leader(), 2); - EXPECT_EQ(s2.raft_instance->get_leader(), 2); - EXPECT_EQ(s3.raft_instance->get_leader(), 2); - - std::cerr << "Starting to add entries\n"; - auto entry = getLogEntry(1); - auto ret = s2.raft_instance->append_entries({entry}); - EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); - EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); - - while (s1.state_machine->getValue() != 1) - { - std::cout << "Waiting s1 to apply entry\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - while (s2.state_machine->getValue() != 1) - { - std::cout << "Waiting s2 to apply entry\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - while (s3.state_machine->getValue() != 1) - { - std::cout << 
"Waiting s3 to apply entry\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - EXPECT_EQ(s1.state_machine->getValue(), 1); - EXPECT_EQ(s2.state_machine->getValue(), 1); - EXPECT_EQ(s3.state_machine->getValue(), 1); - - auto non_leader_entry = getLogEntry(3); - auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry}); - - EXPECT_FALSE(ret_non_leader1->get_accepted()); - - auto ret_non_leader3 = s3.raft_instance->append_entries({non_leader_entry}); - - EXPECT_FALSE(ret_non_leader3->get_accepted()); - - auto leader_entry = getLogEntry(77); - auto ret_leader = s2.raft_instance->append_entries({leader_entry}); - EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code(); - EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code(); - - while (s1.state_machine->getValue() != 78) - { - std::cout << "Waiting s1 to apply entry\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - while (s3.state_machine->getValue() != 78) - { - std::cout << "Waiting s3 to apply entry\n"; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - EXPECT_EQ(s1.state_machine->getValue(), 78); - EXPECT_EQ(s2.state_machine->getValue(), 78); - EXPECT_EQ(s3.state_machine->getValue(), 78); - - s1.launcher.shutdown(5); - s2.launcher.shutdown(5); - s3.launcher.shutdown(5); + DB::WriteBufferFromNuraftBuffer bufwriter; + writeText(s, bufwriter); + return nuraft::cs_new(term, bufwriter.getBuffer()); } -nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) +TEST(CoordinationTest, ChangelogTestSimple) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + auto entry = getLogEntry("hello world", 77); + changelog.append(entry); + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(1)->get_term(), 77); + EXPECT_EQ(changelog.log_entries(1, 2)->size(), 1); +} + +TEST(CoordinationTest, ChangelogTestFile) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + auto entry = getLogEntry("hello world", 77); + changelog.append(entry); + changelog.end_of_append_batch(0, 0); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + for (const auto & p : fs::directory_iterator("./logs")) + EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); + + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); +} + +TEST(CoordinationTest, ChangelogReadWrite) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 1000, true); + changelog.init(1, 0); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 10); + DB::KeeperLogStore changelog_reader("./logs", 1000, true); + changelog_reader.init(1, 0); + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + 
EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); + + for (size_t i = 0; i < 10; ++i) + EXPECT_EQ(changelog_reader.entry_at(i + 1)->get_term(), changelog.entry_at(i + 1)->get_term()); + + auto entries_from_range_read = changelog_reader.log_entries(1, 11); + auto entries_from_range = changelog.log_entries(1, 11); + EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size()); + EXPECT_EQ(10, entries_from_range->size()); +} + +TEST(CoordinationTest, ChangelogWriteAt) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 1000, true); + changelog.init(1, 0); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.append(entry); + } + + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 10); + + auto entry = getLogEntry("writer", 77); + changelog.write_at(7, entry); + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); + EXPECT_EQ(changelog.next_slot(), 8); + + DB::KeeperLogStore changelog_reader("./logs", 1000, true); + changelog_reader.init(1, 0); + + EXPECT_EQ(changelog_reader.size(), changelog.size()); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); +} + + +TEST(CoordinationTest, ChangelogTestAppendAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + for (size_t i = 0; i < 7; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 7); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + DB::KeeperLogStore changelog_reader("./logs", 5, true); + changelog_reader.init(1, 0); + + EXPECT_EQ(changelog_reader.size(), 7); + for (size_t i = 7; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog_reader.append(entry); + } + changelog_reader.end_of_append_batch(0, 0); + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + size_t logs_count = 0; + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 2); + + auto entry = getLogEntry("someentry", 77); + changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + logs_count = 0; + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 3); +} + +TEST(CoordinationTest, ChangelogTestCompaction) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 3; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 3); + + changelog.compact(2); + + EXPECT_EQ(changelog.size(), 1); + 
EXPECT_EQ(changelog.start_index(), 3); + EXPECT_EQ(changelog.next_slot(), 4); + EXPECT_EQ(changelog.last_entry()->get_term(), 20); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + auto e1 = getLogEntry("hello world", 30); + changelog.append(e1); + auto e2 = getLogEntry("hello world", 40); + changelog.append(e2); + auto e3 = getLogEntry("hello world", 50); + changelog.append(e3); + auto e4 = getLogEntry("hello world", 60); + changelog.append(e4); + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + changelog.compact(6); + + EXPECT_FALSE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.start_index(), 7); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 60); + /// And we able to read it + DB::KeeperLogStore changelog_reader("./logs", 5, true); + changelog_reader.init(7, 0); + EXPECT_EQ(changelog_reader.size(), 1); + EXPECT_EQ(changelog_reader.start_index(), 7); + EXPECT_EQ(changelog_reader.next_slot(), 8); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 60); +} + +TEST(CoordinationTest, ChangelogTestBatchOperations) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 100, true); + changelog.init(1, 0); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.pack(1, 5); + + DB::KeeperLogStore apply_changelog("./logs", 100, true); + apply_changelog.init(1, 0); + + for (size_t i = 0; i < 10; ++i) + { + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); + } + EXPECT_EQ(apply_changelog.size(), 10); + + apply_changelog.apply_pack(8, *entries); + apply_changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(apply_changelog.size(), 12); + EXPECT_EQ(apply_changelog.start_index(), 1); + EXPECT_EQ(apply_changelog.next_slot(), 13); + + for (size_t i = 0; i < 7; ++i) + { + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); + } + + EXPECT_EQ(apply_changelog.entry_at(8)->get_term(), 0); + EXPECT_EQ(apply_changelog.entry_at(9)->get_term(), 10); + EXPECT_EQ(apply_changelog.entry_at(10)->get_term(), 20); + EXPECT_EQ(apply_changelog.entry_at(11)->get_term(), 30); + EXPECT_EQ(apply_changelog.entry_at(12)->get_term(), 40); +} + +TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 100, true); + changelog.init(1, 0); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.pack(5, 5); + + ChangelogDirTest test1("./logs1"); + DB::KeeperLogStore changelog_new("./logs1", 100, true); + changelog_new.init(1, 0); + EXPECT_EQ(changelog_new.size(), 0); + + changelog_new.apply_pack(5, *entries); + changelog_new.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog_new.size(), 5); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 10); + + for (size_t i = 4; i < 9; ++i) + EXPECT_EQ(changelog_new.entry_at(i + 1)->get_term(), i * 10); + + auto e = getLogEntry("hello_world", 110); + changelog_new.append(e); + 
changelog_new.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog_new.size(), 6); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 11); + + DB::KeeperLogStore changelog_reader("./logs1", 100, true); + changelog_reader.init(5, 0); +} + + +TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + EXPECT_EQ(changelog.size(), 33); + + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(7, e1); + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::KeeperLogStore changelog_read("./logs", 5, true); + changelog_read.init(1, 0); + EXPECT_EQ(changelog_read.size(), 7); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 8); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); +} + +TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + EXPECT_EQ(changelog.size(), 33); + + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(11, e1); + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 11); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 12); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::KeeperLogStore changelog_read("./logs", 
5, true); + changelog_read.init(1, 0); + EXPECT_EQ(changelog_read.size(), 11); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 12); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); +} + +TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + EXPECT_EQ(changelog.size(), 33); + + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(1, e1); + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); +} + +TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin")); + + + DB::KeeperLogStore changelog_reader("./logs", 5, true); + changelog_reader.init(1, 0); + + auto entry = getLogEntry("36_hello_world", 360); + changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog_reader.size(), 36); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin")); +} + + +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) +{ + ChangelogDirTest test("./logs"); + + DB::KeeperLogStore changelog("./logs", 5, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + 
"_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(0); + + DB::KeeperLogStore changelog_reader("./logs", 5, true); + changelog_reader.init(1, 0); + changelog_reader.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + auto entry = getLogEntry("h", 7777); + changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::KeeperLogStore changelog_reader2("./logs", 5, true); + changelog_reader2.init(1, 0); + EXPECT_EQ(changelog_reader2.size(), 11); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); +} + +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) +{ + ChangelogDirTest test("./logs"); + + DB::KeeperLogStore changelog("./logs", 20, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_1_20.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(140); + + DB::KeeperLogStore changelog_reader("./logs", 20, true); + changelog_reader.init(1, 0); + + EXPECT_EQ(changelog_reader.size(), 2); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450); + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); + auto entry = getLogEntry("hello_world", 7777); + changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); + EXPECT_EQ(changelog_reader.size(), 3); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + + DB::KeeperLogStore changelog_reader2("./logs", 20, true); + changelog_reader2.init(1, 0); + EXPECT_EQ(changelog_reader2.size(), 3); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); +} + 
+TEST(CoordinationTest, ChangelogTestLostFiles) +{ + ChangelogDirTest test("./logs"); + + DB::KeeperLogStore changelog("./logs", 20, true); + changelog.init(1, 0); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); + + fs::remove("./logs/changelog_1_20.bin"); + + DB::KeeperLogStore changelog_reader("./logs", 20, true); + /// It should print error message, but still able to start + changelog_reader.init(5, 0); + EXPECT_FALSE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); +} + +TEST(CoordinationTest, SnapshotableHashMapSimple) +{ + DB::SnapshotableHashTable hello; + EXPECT_TRUE(hello.insert("hello", 5)); + EXPECT_TRUE(hello.contains("hello")); + EXPECT_EQ(hello.getValue("hello"), 5); + EXPECT_FALSE(hello.insert("hello", 145)); + EXPECT_EQ(hello.getValue("hello"), 5); + hello.updateValue("hello", [](int & value) { value = 7; }); + EXPECT_EQ(hello.getValue("hello"), 7); + EXPECT_EQ(hello.size(), 1); + EXPECT_TRUE(hello.erase("hello")); + EXPECT_EQ(hello.size(), 0); +} + +TEST(CoordinationTest, SnapshotableHashMapTrySnapshot) +{ + DB::SnapshotableHashTable map_snp; + EXPECT_TRUE(map_snp.insert("/hello", 7)); + EXPECT_FALSE(map_snp.insert("/hello", 145)); + map_snp.enableSnapshotMode(); + EXPECT_FALSE(map_snp.insert("/hello", 145)); + map_snp.updateValue("/hello", [](int & value) { value = 554; }); + EXPECT_EQ(map_snp.getValue("/hello"), 554); + EXPECT_EQ(map_snp.snapshotSize(), 2); + EXPECT_EQ(map_snp.size(), 1); + + auto itr = map_snp.begin(); + EXPECT_EQ(itr->key, "/hello"); + EXPECT_EQ(itr->value, 7); + EXPECT_EQ(itr->active_in_map, false); + itr = std::next(itr); + EXPECT_EQ(itr->key, "/hello"); + EXPECT_EQ(itr->value, 554); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + EXPECT_EQ(itr, map_snp.end()); + for (size_t i = 0; i < 5; ++i) + { + EXPECT_TRUE(map_snp.insert("/hello" + std::to_string(i), i)); + } + EXPECT_EQ(map_snp.getValue("/hello3"), 3); + + EXPECT_EQ(map_snp.snapshotSize(), 7); + EXPECT_EQ(map_snp.size(), 6); + itr = std::next(map_snp.begin(), 2); + for (size_t i = 0; i < 5; ++i) + { + EXPECT_EQ(itr->key, "/hello" + std::to_string(i)); + EXPECT_EQ(itr->value, i); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + } + + EXPECT_TRUE(map_snp.erase("/hello3")); + EXPECT_TRUE(map_snp.erase("/hello2")); + + EXPECT_EQ(map_snp.snapshotSize(), 7); + EXPECT_EQ(map_snp.size(), 4); + itr = std::next(map_snp.begin(), 2); + for (size_t i = 0; i < 5; ++i) + { + EXPECT_EQ(itr->key, "/hello" + std::to_string(i)); + EXPECT_EQ(itr->value, i); + EXPECT_EQ(itr->active_in_map, i != 3 && i != 2); + itr = std::next(itr); + } + map_snp.clearOutdatedNodes(); + + EXPECT_EQ(map_snp.snapshotSize(), 4); + EXPECT_EQ(map_snp.size(), 4); + itr = map_snp.begin(); + EXPECT_EQ(itr->key, "/hello"); + EXPECT_EQ(itr->value, 554); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + EXPECT_EQ(itr->key, "/hello0"); + EXPECT_EQ(itr->value, 0); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + EXPECT_EQ(itr->key, "/hello1"); + EXPECT_EQ(itr->value, 1); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + EXPECT_EQ(itr->key, "/hello4"); + EXPECT_EQ(itr->value, 4); + EXPECT_EQ(itr->active_in_map, true); + itr = std::next(itr); + EXPECT_EQ(itr, 
map_snp.end()); + map_snp.disableSnapshotMode(); +} + +void addNode(DB::KeeperStorage & storage, const std::string & path, const std::string & data, int64_t ephemeral_owner=0) +{ + using Node = DB::KeeperStorage::Node; + Node node{}; + node.data = data; + node.stat.ephemeralOwner = ephemeral_owner; + storage.container.insertOrReplace(path, node); +} + +TEST(CoordinationTest, TestStorageSnapshotSimple) +{ + ChangelogDirTest test("./snapshots"); + DB::KeeperSnapshotManager manager("./snapshots", 3); + + DB::KeeperStorage storage(500); + addNode(storage, "/hello", "world", 1); + addNode(storage, "/hello/somepath", "somedata", 3); + storage.session_id_counter = 5; + storage.zxid = 2; + storage.ephemerals[3] = {"/hello"}; + storage.ephemerals[1] = {"/hello/somepath"}; + storage.getSessionID(130); + storage.getSessionID(130); + + DB::KeeperStorageSnapshot snapshot(&storage, 2); + + EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 2); + EXPECT_EQ(snapshot.session_id, 7); + EXPECT_EQ(snapshot.snapshot_container_size, 3); + EXPECT_EQ(snapshot.session_and_timeout.size(), 2); + + auto buf = manager.serializeSnapshotToBuffer(snapshot); + manager.serializeSnapshotBufferToDisk(*buf, 2); + EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin")); + + + auto debuf = manager.deserializeSnapshotBufferFromDisk(2); + + auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); + + EXPECT_EQ(restored_storage->container.size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + + EXPECT_EQ(restored_storage->container.getValue("/").data, ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->session_id_counter, 7); + EXPECT_EQ(restored_storage->zxid, 2); + EXPECT_EQ(restored_storage->ephemerals.size(), 2); + EXPECT_EQ(restored_storage->ephemerals[3].size(), 1); + EXPECT_EQ(restored_storage->ephemerals[1].size(), 1); + EXPECT_EQ(restored_storage->session_and_timeout.size(), 2); +} + +TEST(CoordinationTest, TestStorageSnapshotMoreWrites) +{ + ChangelogDirTest test("./snapshots"); + DB::KeeperSnapshotManager manager("./snapshots", 3); + + DB::KeeperStorage storage(500); + storage.getSessionID(130); + + for (size_t i = 0; i < 50; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); + } + + DB::KeeperStorageSnapshot snapshot(&storage, 50); + EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 50); + EXPECT_EQ(snapshot.snapshot_container_size, 51); + + for (size_t i = 50; i < 100; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); + } + + EXPECT_EQ(storage.container.size(), 101); + + auto buf = manager.serializeSnapshotToBuffer(snapshot); + manager.serializeSnapshotBufferToDisk(*buf, 50); + EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); + + + auto debuf = manager.deserializeSnapshotBufferFromDisk(50); + auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); + + EXPECT_EQ(restored_storage->container.size(), 51); + for (size_t i = 0; i < 50; ++i) + { + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + } +} + + +TEST(CoordinationTest, TestStorageSnapshotManySnapshots) +{ + 
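+    /// Takes five snapshots of 50 nodes each. The manager below is constructed with 3 as
+    /// its second argument (apparently the number of snapshots to keep), so the assertions
+    /// expect only snapshot_150.bin, snapshot_200.bin and snapshot_250.bin to survive, and
+    /// restoring from the latest snapshot should return all 251 nodes (250 inserted plus the root "/").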
ChangelogDirTest test("./snapshots"); + DB::KeeperSnapshotManager manager("./snapshots", 3); + + DB::KeeperStorage storage(500); + storage.getSessionID(130); + + for (size_t j = 1; j <= 5; ++j) + { + for (size_t i = (j - 1) * 50; i < j * 50; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); + } + + DB::KeeperStorageSnapshot snapshot(&storage, j * 50); + auto buf = manager.serializeSnapshotToBuffer(snapshot); + manager.serializeSnapshotBufferToDisk(*buf, j * 50); + EXPECT_TRUE(fs::exists(std::string{"./snapshots/snapshot_"} + std::to_string(j * 50) + ".bin")); + } + + EXPECT_FALSE(fs::exists("./snapshots/snapshot_50.bin")); + EXPECT_FALSE(fs::exists("./snapshots/snapshot_100.bin")); + EXPECT_TRUE(fs::exists("./snapshots/snapshot_150.bin")); + EXPECT_TRUE(fs::exists("./snapshots/snapshot_200.bin")); + EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin")); + + + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); + + EXPECT_EQ(restored_storage->container.size(), 251); + + for (size_t i = 0; i < 250; ++i) + { + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + } +} + +TEST(CoordinationTest, TestStorageSnapshotMode) +{ + ChangelogDirTest test("./snapshots"); + DB::KeeperSnapshotManager manager("./snapshots", 3); + DB::KeeperStorage storage(500); + for (size_t i = 0; i < 50; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); + } + + { + DB::KeeperStorageSnapshot snapshot(&storage, 50); + for (size_t i = 0; i < 50; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "wlrd_" + std::to_string(i)); + } + for (size_t i = 0; i < 50; ++i) + { + EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i)); + } + for (size_t i = 0; i < 50; ++i) + { + if (i % 2 == 0) + storage.container.erase("/hello_" + std::to_string(i)); + } + EXPECT_EQ(storage.container.size(), 26); + EXPECT_EQ(storage.container.snapshotSize(), 101); + auto buf = manager.serializeSnapshotToBuffer(snapshot); + manager.serializeSnapshotBufferToDisk(*buf, 50); + } + EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); + EXPECT_EQ(storage.container.size(), 26); + storage.clearGarbageAfterSnapshot(); + EXPECT_EQ(storage.container.snapshotSize(), 26); + for (size_t i = 0; i < 50; ++i) + { + if (i % 2 != 0) + EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i)); + else + EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i))); + } + + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); + + for (size_t i = 0; i < 50; ++i) + { + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + } + +} + +TEST(CoordinationTest, TestStorageSnapshotBroken) +{ + ChangelogDirTest test("./snapshots"); + DB::KeeperSnapshotManager manager("./snapshots", 3); + DB::KeeperStorage storage(500); + for (size_t i = 0; i < 50; ++i) + { + addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); + } + { + DB::KeeperStorageSnapshot snapshot(&storage, 50); + auto buf = manager.serializeSnapshotToBuffer(snapshot); + manager.serializeSnapshotBufferToDisk(*buf, 50); + } + EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); + + /// Let's corrupt file + DB::WriteBufferFromFile plain_buf("./snapshots/snapshot_50.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(34); 
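+    /// (34 bytes is far less than a complete serialized snapshot, so the restore below is expected to throw.)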
+ plain_buf.sync(); + + EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception); +} + +nuraft::ptr getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) { DB::WriteBufferFromNuraftBuffer buf; DB::writeIntBinary(session_id, buf); @@ -279,58 +1114,198 @@ nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coord return buf.getBuffer(); } -DB::NuKeeperStorage::ResponsesForSessions getZooKeeperResponses(nuraft::ptr & buffer, const Coordination::ZooKeeperRequestPtr & request) +nuraft::ptr getLogEntryFromZKRequest(size_t term, int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) { - DB::NuKeeperStorage::ResponsesForSessions results; - DB::ReadBufferFromNuraftBuffer buf(buffer); - while (!buf.eof()) - { - int64_t session_id; - DB::readIntBinary(session_id, buf); - - int32_t length; - Coordination::XID xid; - int64_t zxid; - Coordination::Error err; - - Coordination::read(length, buf); - Coordination::read(xid, buf); - Coordination::read(zxid, buf); - Coordination::read(err, buf); - auto response = request->makeResponse(); - response->readImpl(buf); - results.push_back(DB::NuKeeperStorage::ResponseForSession{session_id, response}); - } - return results; + auto buffer = getBufferFromZKRequest(session_id, request); + return nuraft::cs_new(term, buffer); } -TEST(CoordinationTest, TestStorageSerialization) +void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs) { - DB::NuKeeperStorage storage(500); - storage.container["/hello"] = DB::NuKeeperStorage::Node{.data="world"}; - storage.container["/hello/somepath"] = DB::NuKeeperStorage::Node{.data="somedata"}; - storage.session_id_counter = 5; - storage.zxid = 156; - storage.ephemerals[3] = {"/hello", "/"}; - storage.ephemerals[1] = {"/hello/somepath"}; + using namespace Coordination; + using namespace DB; - DB::WriteBufferFromOwnString buffer; - DB::NuKeeperStorageSerializer serializer; - serializer.serialize(storage, buffer); - std::string serialized = buffer.str(); - EXPECT_NE(serialized.size(), 0); - DB::ReadBufferFromString read(serialized); - DB::NuKeeperStorage new_storage(500); - serializer.deserialize(new_storage, read); + ChangelogDirTest snapshots("./snapshots"); + ChangelogDirTest logs("./logs"); - EXPECT_EQ(new_storage.container.size(), 3); - EXPECT_EQ(new_storage.container["/hello"].data, "world"); - EXPECT_EQ(new_storage.container["/hello/somepath"].data, "somedata"); - EXPECT_EQ(new_storage.session_id_counter, 5); - EXPECT_EQ(new_storage.zxid, 156); - EXPECT_EQ(new_storage.ephemerals.size(), 2); - EXPECT_EQ(new_storage.ephemerals[3].size(), 2); - EXPECT_EQ(new_storage.ephemerals[1].size(), 1); + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + state_machine->init(); + DB::KeeperLogStore changelog("./logs", settings->rotate_log_storage_interval, true); + changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items); + for (size_t i = 1; i < total_logs + 1; ++i) + { + std::shared_ptr request = std::make_shared(); + request->path = "/hello_" + std::to_string(i); + auto entry = getLogEntryFromZKRequest(0, 1, request); + changelog.append(entry); + changelog.end_of_append_batch(0, 0); + + state_machine->commit(i, changelog.entry_at(i)->get_buf()); + bool snapshot_created = false; + if (i % settings->snapshot_distance == 0) + { + nuraft::snapshot s(i, 0, std::make_shared()); + nuraft::async_result::handler_type 
when_done = [&snapshot_created] (bool & ret, nuraft::ptr &/*exception*/) + { + snapshot_created = ret; + std::cerr << "Snapshot finised\n"; + }; + + state_machine->create_snapshot(s, when_done); + CreateSnapshotTask snapshot_task; + snapshots_queue.pop(snapshot_task); + snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); + } + if (snapshot_created) + { + if (changelog.size() > settings->reserved_log_items) + { + changelog.compact(i - settings->reserved_log_items); + } + } + } + + SnapshotsQueue snapshots_queue1{1}; + auto restore_machine = std::make_shared(queue, snapshots_queue1, "./snapshots", settings); + restore_machine->init(); + EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance); + + DB::KeeperLogStore restore_changelog("./logs", settings->rotate_log_storage_interval, true); + restore_changelog.init(restore_machine->last_commit_index() + 1, settings->reserved_log_items); + + EXPECT_EQ(restore_changelog.size(), std::min(settings->reserved_log_items + total_logs % settings->snapshot_distance, total_logs)); + EXPECT_EQ(restore_changelog.next_slot(), total_logs + 1); + if (total_logs > settings->reserved_log_items + 1) + EXPECT_EQ(restore_changelog.start_index(), total_logs - total_logs % settings->snapshot_distance - settings->reserved_log_items + 1); + else + EXPECT_EQ(restore_changelog.start_index(), 1); + + for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i) + { + restore_machine->commit(i, changelog.entry_at(i)->get_buf()); + } + + auto & source_storage = state_machine->getStorage(); + auto & restored_storage = restore_machine->getStorage(); + + EXPECT_EQ(source_storage.container.size(), restored_storage.container.size()); + for (size_t i = 1; i < total_logs + 1; ++i) + { + auto path = "/hello_" + std::to_string(i); + EXPECT_EQ(source_storage.container.getValue(path).data, restored_storage.container.getValue(path).data); + } +} + +TEST(CoordinationTest, TestStateMachineAndLogStore) +{ + using namespace Coordination; + using namespace DB; + + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + settings->reserved_log_items = 10; + settings->rotate_log_storage_interval = 10; + testLogAndStateMachine(settings, 37); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + settings->reserved_log_items = 10; + settings->rotate_log_storage_interval = 10; + testLogAndStateMachine(settings, 11); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + settings->reserved_log_items = 10; + settings->rotate_log_storage_interval = 10; + testLogAndStateMachine(settings, 40); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + settings->reserved_log_items = 20; + settings->rotate_log_storage_interval = 30; + testLogAndStateMachine(settings, 40); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + settings->reserved_log_items = 0; + settings->rotate_log_storage_interval = 10; + testLogAndStateMachine(settings, 40); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 1; + settings->reserved_log_items = 1; + settings->rotate_log_storage_interval = 32; + testLogAndStateMachine(settings, 32); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 10; + 
settings->reserved_log_items = 7; + settings->rotate_log_storage_interval = 1; + testLogAndStateMachine(settings, 33); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 37; + settings->reserved_log_items = 1000; + settings->rotate_log_storage_interval = 5000; + testLogAndStateMachine(settings, 33); + } + { + CoordinationSettingsPtr settings = std::make_shared(); + settings->snapshot_distance = 37; + settings->reserved_log_items = 1000; + settings->rotate_log_storage_interval = 5000; + testLogAndStateMachine(settings, 45); + } +} + +TEST(CoordinationTest, TestEphemeralNodeRemove) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + state_machine->init(); + + std::shared_ptr request_c = std::make_shared(); + request_c->path = "/hello"; + request_c->is_ephemeral = true; + auto entry_c = getLogEntryFromZKRequest(0, 1, request_c); + state_machine->commit(1, entry_c->get_buf()); + const auto & storage = state_machine->getStorage(); + + EXPECT_EQ(storage.ephemerals.size(), 1); + std::shared_ptr request_d = std::make_shared(); + request_d->path = "/hello"; + /// Delete from other session + auto entry_d = getLogEntryFromZKRequest(0, 2, request_d); + state_machine->commit(2, entry_d->get_buf()); + + EXPECT_EQ(storage.ephemerals.size(), 0); +} + + +int main(int argc, char ** argv) +{ + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } #endif diff --git a/src/Core/AccurateComparison.h b/src/Core/AccurateComparison.h index 8b2f20f0af5..1e45dbe23dc 100644 --- a/src/Core/AccurateComparison.h +++ b/src/Core/AccurateComparison.h @@ -177,7 +177,7 @@ inline bool_if_big_int_vs_float equalsOpTmpl(TABigInt, TAFloa return false; } -/* Final realiztions */ +/* Final implementations */ template @@ -274,12 +274,14 @@ inline bool greaterOp(DB::UInt64 u, DB::Float32 f) template <> inline bool greaterOp(DB::Float64 f, DB::UInt128 u) { + /// TODO: This is wrong. return u.low == 0 && greaterOp(f, u.high); } template <> inline bool greaterOp(DB::UInt128 u, DB::Float64 f) { + /// TODO: This is wrong. return u.low != 0 || greaterOp(u.high, f); } @@ -310,56 +312,64 @@ inline bool_if_safe_conversion equalsOp(A a, B b) } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Float64 f, DB::UInt64 u) +inline bool equalsOp(DB::Float64 f, DB::UInt64 u) { - return static_cast(f) == u && f == static_cast(u); + /// Maximum exactly representable integer. 
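+    /// (2^digits, i.e. 2^53 for Float64: unsigned integers above that bound may round when
+    /// converted to double, so the exact equality check below only applies inside the
+    /// exactly-representable range.)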
+ return u <= (1ULL << std::numeric_limits::digits) + && f == static_cast(u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::UInt64 u, DB::Float64 f) +inline bool equalsOp(DB::UInt64 u, DB::Float64 f) { - return u == static_cast(f) && static_cast(u) == f; + return equalsOp(f, u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Float64 f, DB::Int64 u) +inline bool equalsOp(DB::Float64 f, DB::Int64 u) { - return static_cast(f) == u && f == static_cast(u); + return u <= (1LL << std::numeric_limits::digits) + && u >= -(1LL << std::numeric_limits::digits) + && f == static_cast(u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Int64 u, DB::Float64 f) +inline bool equalsOp(DB::Int64 u, DB::Float64 f) { - return u == static_cast(f) && static_cast(u) == f; + return equalsOp(f, u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Float32 f, DB::UInt64 u) +inline bool equalsOp(DB::Float32 f, DB::UInt64 u) { - return static_cast(f) == u && f == static_cast(u); + return u <= (1ULL << std::numeric_limits::digits) + && f == static_cast(u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::UInt64 u, DB::Float32 f) +inline bool equalsOp(DB::UInt64 u, DB::Float32 f) { - return u == static_cast(f) && static_cast(u) == f; + return equalsOp(f, u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Float32 f, DB::Int64 u) +inline bool equalsOp(DB::Float32 f, DB::Int64 u) { - return static_cast(f) == u && f == static_cast(u); + return u <= (1LL << std::numeric_limits::digits) + && u >= -(1LL << std::numeric_limits::digits) + && f == static_cast(u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::Int64 u, DB::Float32 f) +inline bool equalsOp(DB::Int64 u, DB::Float32 f) { - return u == static_cast(f) && static_cast(u) == f; + return equalsOp(f, u); } template <> -inline bool NO_SANITIZE_UNDEFINED equalsOp(DB::UInt128 u, DB::Float64 f) +inline bool equalsOp(DB::UInt128 u, DB::Float64 f) { + /// TODO: This is wrong. return u.low == 0 && equalsOp(static_cast(u.high), f); } diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 0c9a470dc1d..0f470c10b81 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -484,7 +484,7 @@ DataTypes Block::getDataTypes() const template -static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description) +static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_remove_constants) { auto on_error = [](const std::string & message [[maybe_unused]], int code [[maybe_unused]]) { @@ -515,7 +515,16 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons if (!actual.column || !expected.column) continue; - if (actual.column->getName() != expected.column->getName()) + const IColumn * actual_column = actual.column.get(); + + /// If we allow to remove constants, and expected column is not const, then unwrap actual constant column. 
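+    /// (In practice: a ColumnConst from the actual block is compared through its nested data
+    /// column, so a constant column may match a non-constant one; isCompatibleHeader() below
+    /// passes allow_remove_constants = true to get exactly this relaxed check.)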
+ if (allow_remove_constants && !isColumnConst(*expected.column)) + { + if (const auto * column_const = typeid_cast(actual_column)) + actual_column = &column_const->getDataColumn(); + } + + if (actual_column->getName() != expected.column->getName()) return on_error("Block structure mismatch in " + context_description + " stream: different columns:\n" + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::LOGICAL_ERROR); @@ -537,13 +546,25 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs) { - return checkBlockStructure(lhs, rhs, {}); + return checkBlockStructure(lhs, rhs, {}, false); } void assertBlocksHaveEqualStructure(const Block & lhs, const Block & rhs, const std::string & context_description) { - checkBlockStructure(lhs, rhs, context_description); + checkBlockStructure(lhs, rhs, context_description, false); +} + + +bool isCompatibleHeader(const Block & actual, const Block & desired) +{ + return checkBlockStructure(actual, desired, {}, true); +} + + +void assertCompatibleHeader(const Block & actual, const Block & desired, const std::string & context_description) +{ + checkBlockStructure(actual, desired, context_description, true); } diff --git a/src/Core/Block.h b/src/Core/Block.h index 14f4f57caed..6a94034b8fd 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -1,15 +1,15 @@ #pragma once -#include -#include -#include -#include -#include - #include -#include #include #include +#include + +#include +#include +#include +#include +#include namespace DB @@ -22,8 +22,6 @@ namespace DB * Allows to insert, remove columns in arbitrary position, to change order of columns. */ -class Context; - class Block { private: @@ -116,7 +114,7 @@ public: /** Get a list of column names separated by commas. */ std::string dumpNames() const; - /** List of names, types and lengths of columns. Designed for debugging. */ + /** List of names, types and lengths of columns. Designed for debugging. */ std::string dumpStructure() const; /** List of column names and positions from index */ @@ -186,6 +184,12 @@ bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs); /// Throw exception when blocks are different. void assertBlocksHaveEqualStructure(const Block & lhs, const Block & rhs, const std::string & context_description); +/// Actual header is compatible to desired if block have equal structure except constants. +/// It is allowed when column from actual header is constant, but in desired is not. +/// If both columns are constant, it is checked that they have the same value. +bool isCompatibleHeader(const Block & actual, const Block & desired); +void assertCompatibleHeader(const Block & actual, const Block & desired, const std::string & context_description); + /// Calculate difference in structure of blocks and write description into output strings. NOTE It doesn't compare values of constant columns. 
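The `isCompatibleHeader` contract described above (equal structure, except that a column in the actual header may be constant where the desired one is not) can be illustrated with a deliberately simplified model. `MiniColumn` and `compatible` below are invented stand-ins, not ClickHouse types; the real check unwraps `ColumnConst::getDataColumn()` and compares full column names and values rather than a boolean flag:

    #include <iostream>
    #include <string>
    #include <vector>

    /// Toy stand-in for a header column: a name plus a "constant" flag.
    struct MiniColumn
    {
        std::string name;
        bool is_const = false;
    };

    /// Mirrors the relaxed check: if constants may be stripped and the desired
    /// column is not const, treat the actual column as if its const wrapper
    /// were removed before comparing.
    static bool compatible(const std::vector<MiniColumn> & actual,
                           const std::vector<MiniColumn> & desired,
                           bool allow_remove_constants)
    {
        if (actual.size() != desired.size())
            return false;

        for (size_t i = 0; i < actual.size(); ++i)
        {
            bool actual_const = actual[i].is_const;
            if (allow_remove_constants && !desired[i].is_const)
                actual_const = false; /// "unwrap" the constant, keep the nested column

            if (actual[i].name != desired[i].name || actual_const != desired[i].is_const)
                return false;
        }
        return true;
    }

    int main()
    {
        std::vector<MiniColumn> actual  = {{"x", true},  {"y", false}};
        std::vector<MiniColumn> desired = {{"x", false}, {"y", false}};

        std::cout << compatible(actual, desired, false) << '\n'; /// 0: strict structure check fails
        std::cout << compatible(actual, desired, true)  << '\n'; /// 1: constant column is allowed to relax
        return 0;
    }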
void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff); diff --git a/src/Core/BlockInfo.h b/src/Core/BlockInfo.h index c8dd1576b22..125c90b1b23 100644 --- a/src/Core/BlockInfo.h +++ b/src/Core/BlockInfo.h @@ -1,9 +1,10 @@ #pragma once -#include - #include +#include +#include + namespace DB { diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 355cf1d378a..2131a6e3c33 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -50,9 +50,10 @@ inline auto scaleMultiplier(UInt32 scale) * whole - represents whole part of decimal, can be negative or positive. * fractional - for fractional part of decimal, always positive. */ -template +template struct DecimalComponents { + using T = typename DecimalType::NativeType; T whole; T fractional; }; @@ -102,10 +103,22 @@ inline DecimalType decimalFromComponentsWithMultiplier( if (common::mulOverflow(whole, scale_multiplier, whole_scaled)) throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); - const T value = whole_scaled + fractional_sign * (fractional % scale_multiplier); + T value; + if (common::addOverflow(whole_scaled, fractional_sign * (fractional % scale_multiplier), value)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + return DecimalType(value); } +template +inline DecimalType decimalFromComponentsWithMultiplier( + const DecimalComponents & components, + typename DecimalType::NativeType scale_multiplier) +{ + return decimalFromComponentsWithMultiplier(components.whole, components.fractional, scale_multiplier); +} + + /** Make a decimal value from whole and fractional components with given scale. * * @see `decimalFromComponentsWithMultiplier` for details. @@ -126,7 +139,7 @@ inline DecimalType decimalFromComponents( */ template inline DecimalType decimalFromComponents( - const DecimalComponents & components, + const DecimalComponents & components, UInt32 scale) { return decimalFromComponents(components.whole, components.fractional, scale); @@ -136,7 +149,7 @@ inline DecimalType decimalFromComponents( * This is an optimization to reduce number of calls to scaleMultiplier on known scale. */ template -inline DecimalComponents splitWithScaleMultiplier( +inline DecimalComponents splitWithScaleMultiplier( const DecimalType & decimal, typename DecimalType::NativeType scale_multiplier) { @@ -151,7 +164,7 @@ inline DecimalComponents splitWithScaleMultipl /// Split decimal into components: whole and fractional part, @see `DecimalComponents` for details. template -inline DecimalComponents split(const DecimalType & decimal, UInt32 scale) +inline DecimalComponents split(const DecimalType & decimal, UInt32 scale) { if (scale == 0) { diff --git a/src/Core/Defines.h b/src/Core/Defines.h index ff033aa6183..668a60f9be8 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -11,6 +11,9 @@ #define DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_SECURE_MS 100 #define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300 #define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300 +/// Timeouts for hedged requests. +#define DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS 100 +#define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_MS 2000 /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus). 
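To make the `DecimalComponents` split and the new overflow-checked recombination in `decimalFromComponentsWithMultiplier` above concrete, here is a small self-contained example. `mulChecked` and `addChecked` are stand-ins built on the GCC/Clang `__builtin_*_overflow` intrinsics, not ClickHouse's `common::mulOverflow` / `common::addOverflow`:

    #include <cstdint>
    #include <iostream>
    #include <stdexcept>

    static int64_t mulChecked(int64_t a, int64_t b)
    {
        int64_t r;
        if (__builtin_mul_overflow(a, b, &r))
            throw std::overflow_error("Decimal math overflow");
        return r;
    }

    static int64_t addChecked(int64_t a, int64_t b)
    {
        int64_t r;
        if (__builtin_add_overflow(a, b, &r))
            throw std::overflow_error("Decimal math overflow");
        return r;
    }

    int main()
    {
        /// A Decimal64 with scale 3 stores 123.456 as the native integer 123456.
        const int64_t value = 123456;
        const int64_t scale_multiplier = 1000; /// 10^scale

        /// Split into components: whole part keeps the sign, fractional part stays positive.
        int64_t whole = value / scale_multiplier;      /// 123
        int64_t fractional = value % scale_multiplier; /// 456

        /// Recombine, checking both the multiplication and the addition for overflow,
        /// which is what the updated decimalFromComponentsWithMultiplier now does.
        int64_t sign = value < 0 ? -1 : 1;
        int64_t recombined = addChecked(mulChecked(whole, scale_multiplier), sign * fractional);

        std::cout << whole << "." << fractional << " -> " << recombined << '\n'; /// 123.456 -> 123456
        return 0;
    }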
#define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5 #define DBMS_DEFAULT_POLL_INTERVAL 10 @@ -71,6 +74,9 @@ /// Minimum revision supporting OpenTelemetry #define DBMS_MIN_REVISION_WITH_OPENTELEMETRY 54442 + +#define DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION 1 + /// Minimum revision supporting interserver secret. #define DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET 54441 @@ -78,8 +84,9 @@ #define DBMS_MIN_REVISION_WITH_REFERER_IN_CLIENT_INFO 54447 /// Version of ClickHouse TCP protocol. Increment it manually when you change the protocol. -#define DBMS_TCP_PROTOCOL_VERSION 54447 +#define DBMS_TCP_PROTOCOL_VERSION 54448 +#define DBMS_MIN_PROTOCOL_VERSION_WITH_DISTRIBUTED_DEPTH 54448 /// The boundary on which the blocks for asynchronous file operations should be aligned. #define DEFAULT_AIO_FILE_BLOCK_SIZE 4096 diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index afc9fe00ef5..87945dd1ce6 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -31,11 +31,11 @@ namespace ErrorCodes } -ExternalTableDataPtr BaseExternalTable::getData(const Context & context) +ExternalTableDataPtr BaseExternalTable::getData(ContextPtr context) { initReadBuffer(); initSampleBlock(); - auto input = context.getInputFormat(format, *read_buffer, sample_block, DEFAULT_BLOCK_SIZE); + auto input = context->getInputFormat(format, *read_buffer, sample_block, DEFAULT_BLOCK_SIZE); auto stream = std::make_shared(input); auto data = std::make_unique(); @@ -127,7 +127,7 @@ ExternalTable::ExternalTable(const boost::program_options::variables_map & exter void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, ReadBuffer & stream) { - const Settings & settings = context.getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); if (settings.http_max_multipart_form_data_size) read_buffer = std::make_unique( @@ -152,14 +152,14 @@ void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, else throw Exception("Neither structure nor types have not been provided for external table " + name + ". 
Use fields " + name + "_structure or " + name + "_types to do so.", ErrorCodes::BAD_ARGUMENTS); - ExternalTableDataPtr data = getData(context); + ExternalTableDataPtr data = getData(getContext()); /// Create table NamesAndTypesList columns = sample_block.getNamesAndTypesList(); - auto temporary_table = TemporaryTableHolder(context, ColumnsDescription{columns}, {}); + auto temporary_table = TemporaryTableHolder(getContext(), ColumnsDescription{columns}, {}); auto storage = temporary_table.getTable(); - context.addExternalTable(data->table_name, std::move(temporary_table)); - BlockOutputStreamPtr output = storage->write(ASTPtr(), storage->getInMemoryMetadataPtr(), context); + getContext()->addExternalTable(data->table_name, std::move(temporary_table)); + BlockOutputStreamPtr output = storage->write(ASTPtr(), storage->getInMemoryMetadataPtr(), getContext()); /// Write data data->pipe->resize(1); diff --git a/src/Core/ExternalTable.h b/src/Core/ExternalTable.h index aa15846d48a..fcefa3d7fe3 100644 --- a/src/Core/ExternalTable.h +++ b/src/Core/ExternalTable.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -11,30 +12,21 @@ #include -namespace Poco +namespace Poco::Net { - namespace Net - { - class NameValueCollection; - class MessageHeader; - } +class NameValueCollection; +class MessageHeader; } -namespace boost +namespace boost::program_options { - namespace program_options - { - class variables_map; - } +class variables_map; } namespace DB { -class Context; - - /// The base class containing the basic information about external table and /// basic functions for extracting this information from text fields. class BaseExternalTable @@ -56,7 +48,7 @@ public: virtual void initReadBuffer() {} /// Get the table data - a pair (a stream with the contents of the table, the name of the table) - ExternalTableDataPtr getData(const Context & context); + ExternalTableDataPtr getData(ContextPtr context); protected: /// Clear all accumulated information @@ -88,15 +80,14 @@ public: /// Parsing of external table used when sending tables via http /// The `handlePart` function will be called for each table passed, /// so it's also necessary to call `clean` at the end of the `handlePart`. 
-class ExternalTablesHandler : public HTMLForm::PartHandler, BaseExternalTable +class ExternalTablesHandler : public HTMLForm::PartHandler, BaseExternalTable, WithContext { public: - ExternalTablesHandler(Context & context_, const Poco::Net::NameValueCollection & params_) : context(context_), params(params_) {} + ExternalTablesHandler(ContextPtr context_, const Poco::Net::NameValueCollection & params_) : WithContext(context_), params(params_) {} void handlePart(const Poco::Net::MessageHeader & header, ReadBuffer & stream) override; private: - Context & context; const Poco::Net::NameValueCollection & params; }; diff --git a/src/Core/Field.cpp b/src/Core/Field.cpp index 93107d7bb2c..8aa79b0bbe1 100644 --- a/src/Core/Field.cpp +++ b/src/Core/Field.cpp @@ -452,7 +452,7 @@ template <> bool decimalLessOrEqual(DateTime64 x, DateTime64 y, UInt32 x_scale, inline void writeText(const Null &, WriteBuffer & buf) { - writeText(std::string("Null"), buf); + writeText(std::string("NULL"), buf); } String toString(const Field & x) diff --git a/src/Core/Field.h b/src/Core/Field.h index 3a52186167f..5c4c2e165ad 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ template using NearestFieldType = typename NearestFieldTypeImpl::Type; class Field; -using FieldVector = std::vector; +using FieldVector = std::vector>; /// Array and Tuple use the same storage type -- FieldVector, but we declare /// distinct types for them, so that the caller can choose whether it wants to @@ -95,7 +96,7 @@ template bool decimalEqual(T x, T y, UInt32 x_scale, UInt32 y_scale template bool decimalLess(T x, T y, UInt32 x_scale, UInt32 y_scale); template bool decimalLessOrEqual(T x, T y, UInt32 x_scale, UInt32 y_scale); -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -158,7 +159,7 @@ private: T dec; UInt32 scale; }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -399,10 +400,10 @@ public: template - T & get(); + NearestFieldType> & get(); template - const T & get() const + const auto & get() const { auto mutable_this = const_cast *>(this); return mutable_this->get(); @@ -436,21 +437,10 @@ public: return true; } - template T & safeGet() - { - const Types::Which requested = TypeToEnum>::value; - if (which != requested) - throw Exception("Bad get: has " + std::string(getTypeName()) + ", requested " + std::string(Types::toString(requested)), ErrorCodes::BAD_GET); - return get(); - } + template auto & safeGet() const + { return const_cast(this)->safeGet(); } - template const T & safeGet() const - { - const Types::Which requested = TypeToEnum>::value; - if (which != requested) - throw Exception("Bad get: has " + std::string(getTypeName()) + ", requested " + std::string(Types::toString(requested)), ErrorCodes::BAD_GET); - return get(); - } + template auto & safeGet(); bool operator< (const Field & rhs) const { @@ -573,7 +563,7 @@ public: { case Types::Null: return f(field.template get()); // gcc 8.2.1 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -593,7 +583,7 @@ public: case Types::Int128: return f(field.template get()); case Types::UInt256: return f(field.template get()); case Types::Int256: return f(field.template get()); -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif } @@ -778,22 +768,40 @@ inline constexpr bool 
isInt64FieldType(Field::Types::Which t) // Field value getter with type checking in debug builds. template -T & Field::get() +NearestFieldType> & Field::get() { - using ValueType = std::decay_t; + // Before storing the value in the Field, we static_cast it to the field + // storage type, so here we return the value of storage type as well. + // Otherwise, it is easy to make a mistake of reinterpret_casting the stored + // value to a different and incompatible type. + // For example, a Float32 value is stored as Float64, and it is incorrect to + // return a reference to this value as Float32. + using StoredType = NearestFieldType>; #ifndef NDEBUG // Disregard signedness when converting between int64 types. - constexpr Field::Types::Which target = TypeToEnum>::value; + constexpr Field::Types::Which target = TypeToEnum::value; if (target != which && (!isInt64FieldType(target) || !isInt64FieldType(which))) throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Field get from type {} to type {}", Types::toString(which), Types::toString(target)); #endif - ValueType * MAY_ALIAS ptr = reinterpret_cast(&storage); + StoredType * MAY_ALIAS ptr = reinterpret_cast(&storage); + return *ptr; } + +template +auto & Field::safeGet() +{ + const Types::Which requested = TypeToEnum>>::value; + if (which != requested) + throw Exception("Bad get: has " + std::string(getTypeName()) + ", requested " + std::string(Types::toString(requested)), ErrorCodes::BAD_GET); + return get(); +} + + template T & Field::reinterpret() { @@ -946,3 +954,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf); String toString(const Field & x); } + +template <> +struct fmt::formatter +{ + constexpr auto parse(format_parse_context & ctx) + { + auto it = ctx.begin(); + auto end = ctx.end(); + + /// Only support {}. + if (it != end && *it != '}') + throw format_error("invalid format"); + + return it; + } + + template + auto format(const DB::Field & x, FormatContext & ctx) + { + return format_to(ctx.out(), "{}", toString(x)); + } +}; + diff --git a/src/Core/MySQL/Authentication.cpp b/src/Core/MySQL/Authentication.cpp index e685ad0394d..b8dff9972c7 100644 --- a/src/Core/MySQL/Authentication.cpp +++ b/src/Core/MySQL/Authentication.cpp @@ -73,7 +73,7 @@ Native41::Native41(const String & password, const String & auth_plugin_data) } void Native41::authenticate( - const String & user_name, std::optional auth_response, Context & context, + const String & user_name, std::optional auth_response, ContextPtr context, std::shared_ptr packet_endpoint, bool, const Poco::Net::SocketAddress & address) { if (!auth_response) @@ -86,15 +86,17 @@ void Native41::authenticate( if (auth_response->empty()) { - context.setUser(user_name, "", address); + context->setUser(user_name, "", address); return; } if (auth_response->size() != Poco::SHA1Engine::DIGEST_SIZE) - throw Exception("Wrong size of auth response. Expected: " + std::to_string(Poco::SHA1Engine::DIGEST_SIZE) + " bytes, received: " + std::to_string(auth_response->size()) + " bytes.", - ErrorCodes::UNKNOWN_EXCEPTION); + throw Exception( + "Wrong size of auth response. 
Expected: " + std::to_string(Poco::SHA1Engine::DIGEST_SIZE) + + " bytes, received: " + std::to_string(auth_response->size()) + " bytes.", + ErrorCodes::UNKNOWN_EXCEPTION); - auto user = context.getAccessControlManager().read(user_name); + auto user = context->getAccessControlManager().read(user_name); Poco::SHA1Engine::Digest double_sha1_value = user->authentication.getPasswordDoubleSHA1(); assert(double_sha1_value.size() == Poco::SHA1Engine::DIGEST_SIZE); @@ -109,7 +111,7 @@ void Native41::authenticate( { password_sha1[i] = digest[i] ^ static_cast((*auth_response)[i]); } - context.setUser(user_name, password_sha1, address); + context->setUser(user_name, password_sha1, address); } #if USE_SSL @@ -134,7 +136,7 @@ Sha256Password::Sha256Password(RSA & public_key_, RSA & private_key_, Poco::Logg } void Sha256Password::authenticate( - const String & user_name, std::optional auth_response, Context & context, + const String & user_name, std::optional auth_response, ContextPtr context, std::shared_ptr packet_endpoint, bool is_secure_connection, const Poco::Net::SocketAddress & address) { if (!auth_response) @@ -229,7 +231,7 @@ void Sha256Password::authenticate( password.pop_back(); } - context.setUser(user_name, password, address); + context->setUser(user_name, password, address); } #endif diff --git a/src/Core/MySQL/Authentication.h b/src/Core/MySQL/Authentication.h index e1b7c174139..5358e2da737 100644 --- a/src/Core/MySQL/Authentication.h +++ b/src/Core/MySQL/Authentication.h @@ -32,7 +32,7 @@ public: virtual String getAuthPluginData() = 0; virtual void authenticate( - const String & user_name, std::optional auth_response, Context & context, + const String & user_name, std::optional auth_response, ContextPtr context, std::shared_ptr packet_endpoint, bool is_secure_connection, const Poco::Net::SocketAddress & address) = 0; }; @@ -49,7 +49,7 @@ public: String getAuthPluginData() override { return scramble; } void authenticate( - const String & user_name, std::optional auth_response, Context & context, + const String & user_name, std::optional auth_response, ContextPtr context, std::shared_ptr packet_endpoint, bool /* is_secure_connection */, const Poco::Net::SocketAddress & address) override; private: @@ -69,7 +69,7 @@ public: String getAuthPluginData() override { return scramble; } void authenticate( - const String & user_name, std::optional auth_response, Context & context, + const String & user_name, std::optional auth_response, ContextPtr context, std::shared_ptr packet_endpoint, bool is_secure_connection, const Poco::Net::SocketAddress & address) override; private: diff --git a/src/Core/MySQL/MySQLClient.cpp b/src/Core/MySQL/MySQLClient.cpp index e41b4128738..3650818c543 100644 --- a/src/Core/MySQL/MySQLClient.cpp +++ b/src/Core/MySQL/MySQLClient.cpp @@ -68,6 +68,7 @@ void MySQLClient::disconnect() socket->close(); socket = nullptr; connected = false; + seq = 0; } /// https://dev.mysql.com/doc/internals/en/connection-phase-packets.html diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index 1b202c4edb4..7e3f68983be 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -17,6 +17,7 @@ namespace ErrorCodes extern const int UNKNOWN_EXCEPTION; extern const int LOGICAL_ERROR; extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; } namespace MySQLReplication @@ -420,8 +421,8 @@ namespace MySQLReplication UInt32 i24 = 0; payload.readStrict(reinterpret_cast(&i24), 3); - DayNum date_day_number = 
DateLUT::instance().makeDayNum( - static_cast((i24 >> 9) & 0x7fff), static_cast((i24 >> 5) & 0xf), static_cast(i24 & 0x1f)); + const DayNum date_day_number(DateLUT::instance().makeDayNum( + static_cast((i24 >> 9) & 0x7fff), static_cast((i24 >> 5) & 0xf), static_cast(i24 & 0x1f)).toUnderType()); row.push_back(Field(date_day_number.toUnderType())); break; @@ -443,7 +444,7 @@ namespace MySQLReplication row.push_back(Field{UInt32(date_time)}); else { - DB::DecimalUtils::DecimalComponents components{ + DB::DecimalUtils::DecimalComponents components{ static_cast(date_time), 0}; components.fractional = fsp; @@ -462,7 +463,7 @@ namespace MySQLReplication row.push_back(Field{sec}); else { - DB::DecimalUtils::DecimalComponents components{ + DB::DecimalUtils::DecimalComponents components{ static_cast(sec), 0}; components.fractional = fsp; @@ -740,7 +741,7 @@ namespace MySQLReplication switch (header) { case PACKET_EOF: - throw ReplicationError("Master maybe lost", ErrorCodes::UNKNOWN_EXCEPTION); + throw ReplicationError("Master maybe lost", ErrorCodes::CANNOT_READ_ALL_DATA); case PACKET_ERR: ERRPacket err; err.readPayloadWithUnpacked(payload); diff --git a/src/Core/MySQL/PacketsProtocolText.cpp b/src/Core/MySQL/PacketsProtocolText.cpp index 62efe549b33..0494a146c47 100644 --- a/src/Core/MySQL/PacketsProtocolText.cpp +++ b/src/Core/MySQL/PacketsProtocolText.cpp @@ -12,7 +12,7 @@ namespace MySQLProtocol namespace ProtocolText { -ResultSetRow::ResultSetRow(const DataTypes & data_types, const Columns & columns_, int row_num_) +ResultSetRow::ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_) : columns(columns_), row_num(row_num_) { for (size_t i = 0; i < columns.size(); i++) @@ -25,7 +25,7 @@ ResultSetRow::ResultSetRow(const DataTypes & data_types, const Columns & columns else { WriteBufferFromOwnString ostr; - data_types[i]->serializeAsText(*columns[i], row_num, ostr, FormatSettings()); + serializations[i]->serializeText(*columns[i], row_num, ostr, FormatSettings()); payload_size += getLengthEncodedStringSize(ostr.str()); serialized.push_back(std::move(ostr.str())); } diff --git a/src/Core/MySQL/PacketsProtocolText.h b/src/Core/MySQL/PacketsProtocolText.h index b54b1c5ca19..aeeedc4dbf8 100644 --- a/src/Core/MySQL/PacketsProtocolText.h +++ b/src/Core/MySQL/PacketsProtocolText.h @@ -76,7 +76,7 @@ protected: void writePayloadImpl(WriteBuffer & buffer) const override; public: - ResultSetRow(const DataTypes & data_types, const Columns & columns_, int row_num_); + ResultSetRow(const Serializations & serializations, const Columns & columns_, int row_num_); }; class ComFieldList : public LimitedReadPacket diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index e96ce1824d2..57d29c96c53 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB @@ -23,7 +22,9 @@ NameAndTypePair::NameAndTypePair( : name(name_in_storage_ + (subcolumn_name_.empty() ? "" : "." + subcolumn_name_)) , type(subcolumn_type_) , type_in_storage(type_in_storage_) - , subcolumn_delimiter_position(name_in_storage_.size()) {} + , subcolumn_delimiter_position(subcolumn_name_.empty() ? 
std::nullopt : std::make_optional(name_in_storage_.size())) +{ +} String NameAndTypePair::getNameInStorage() const { diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index dad031a543c..fc86c7f6a1d 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -63,7 +63,7 @@ using NamesAndTypes = std::vector; class NamesAndTypesList : public std::list { public: - NamesAndTypesList() {} + NamesAndTypesList() = default; NamesAndTypesList(std::initializer_list init) : std::list(init) {} diff --git a/src/Core/PostgreSQLProtocol.h b/src/Core/PostgreSQLProtocol.h index 471811e969b..4c41ead7a02 100644 --- a/src/Core/PostgreSQLProtocol.h +++ b/src/Core/PostgreSQLProtocol.h @@ -797,15 +797,15 @@ namespace PGAuthentication class AuthenticationMethod { protected: - void setPassword( + static void setPassword( const String & user_name, const String & password, - Context & context, + ContextPtr context, Messaging::MessageTransport & mt, const Poco::Net::SocketAddress & address) { try { - context.setUser(user_name, password, address); + context->setUser(user_name, password, address); } catch (const Exception &) { @@ -819,7 +819,7 @@ protected: public: virtual void authenticate( const String & user_name, - Context & context, + ContextPtr context, Messaging::MessageTransport & mt, const Poco::Net::SocketAddress & address) = 0; @@ -832,10 +832,13 @@ class NoPasswordAuth : public AuthenticationMethod { public: void authenticate( - const String & /* user_name */, - Context & /* context */, - Messaging::MessageTransport & /* mt */, - const Poco::Net::SocketAddress & /* address */) override {} + const String & user_name, + ContextPtr context, + Messaging::MessageTransport & mt, + const Poco::Net::SocketAddress & address) override + { + setPassword(user_name, "", context, mt, address); + } Authentication::Type getType() const override { @@ -848,7 +851,7 @@ class CleartextPasswordAuth : public AuthenticationMethod public: void authenticate( const String & user_name, - Context & context, + ContextPtr context, Messaging::MessageTransport & mt, const Poco::Net::SocketAddress & address) override { @@ -891,11 +894,11 @@ public: void authenticate( const String & user_name, - Context & context, + ContextPtr context, Messaging::MessageTransport & mt, const Poco::Net::SocketAddress & address) { - auto user = context.getAccessControlManager().read(user_name); + auto user = context->getAccessControlManager().read(user_name); Authentication::Type user_auth_type = user->authentication.getType(); if (type_to_method.find(user_auth_type) != type_to_method.end()) diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index df51a0cb61a..92e780104b5 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -76,8 +76,10 @@ namespace Protocol Log = 10, /// System logs of the query execution TableColumns = 11, /// Columns' description for default values calculation PartUUIDs = 12, /// List of unique parts ids. - - MAX = PartUUIDs, + ReadTaskRequest = 13, /// String (UUID) describes a request for which next task is needed + /// This is such an inverted logic, where server sends requests + /// And client returns back response + MAX = ReadTaskRequest, }; /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10 @@ -100,6 +102,7 @@ namespace Protocol "Log", "TableColumns", "PartUUIDs", + "ReadTaskRequest" }; return packet <= MAX ? 
data[packet] @@ -135,8 +138,9 @@ namespace Protocol KeepAlive = 6, /// Keep the connection alive Scalar = 7, /// A block of data (compressed or not). IgnoredPartUUIDs = 8, /// List of unique parts ids to exclude from query processing + ReadTaskResponse = 9, /// TODO: - MAX = IgnoredPartUUIDs, + MAX = ReadTaskResponse, }; inline const char * toString(UInt64 packet) @@ -151,6 +155,7 @@ namespace Protocol "KeepAlive", "Scalar", "IgnoredPartUUIDs", + "ReadTaskResponse", }; return packet <= MAX ? data[packet] diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1de89aa6047..eaa5e2d34f8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,6 +55,10 @@ class IColumn; M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \ M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \ M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ + M(Milliseconds, hedged_connection_timeout_ms, DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS, "Connection timeout for establishing connection with replica for Hedged requests", 0) \ + M(Milliseconds, receive_data_timeout_ms, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_MS, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \ + M(Bool, use_hedged_requests, false, "Use hedged requests for distributed queries", 0) \ + M(Bool, allow_changing_replica_until_first_data_packet, false, "Allow HedgedConnections to change replica until receiving first data packet", 0) \ M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \ M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \ M(Milliseconds, replace_running_query_max_wait_ms, 5000, "The wait time for running query with the same query_id to finish when setting 'replace_running_query' is active.", 0) \ @@ -66,6 +70,7 @@ class IColumn; M(UInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, "The maximum number of attempts to connect to replicas.", 0) \ M(UInt64, s3_min_upload_part_size, 512*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_max_single_part_upload_size, 64*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ @@ -74,7 +79,7 @@ class IColumn; M(UInt64, background_buffer_flush_schedule_pool_size, 16, "Number of threads performing background flush for tables with Buffer engine. Only has meaning at server startup.", 0) \ M(UInt64, background_pool_size, 16, "Number of threads performing background work for tables (for example, merging in merge tree). Only has meaning at server startup.", 0) \ M(UInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \ - M(UInt64, background_fetches_pool_size, 3, "Number of threads performing background fetches for replicated tables. 
Only has meaning at server startup.", 0) \ + M(UInt64, background_fetches_pool_size, 8, "Number of threads performing background fetches for replicated tables. Only has meaning at server startup.", 0) \ M(UInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, dns cache updates. Only has meaning at server startup.", 0) \ M(UInt64, background_message_broker_schedule_pool_size, 16, "Number of threads performing background tasks for message streaming. Only has meaning at server startup.", 0) \ M(UInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \ @@ -112,7 +117,9 @@ class IColumn; M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard, if 1 SELECT is executed on each shard, if 2 SELECT and INSERT is executed on each shard", 0) \ M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed query processing - in case it is for certain that there are different keys on different shards. If 2 - same as 1 but also apply ORDER BY and LIMIT stages", 0) \ M(Bool, optimize_distributed_group_by_sharding_key, false, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \ + M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \ M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \ + M(Bool, optimize_skip_unused_shards_rewrite_in, true, "Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards)", 0) \ M(Bool, allow_nondeterministic_optimize_skip_unused_shards, false, "Allow non-deterministic functions (includes dictGet) in sharding_key for optimize_skip_unused_shards", 0) \ M(UInt64, force_optimize_skip_unused_shards, 0, "Throw an exception if unused shards cannot be skipped (1 - throw only if the table has the sharding key, 2 - always throw.", 0) \ M(UInt64, optimize_skip_unused_shards_nesting, 0, "Same as optimize_skip_unused_shards, but accept nesting level until which it will work.", 0) \ @@ -211,14 +218,20 @@ class IColumn; \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ - M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite.", 0) \ + M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. 
Zero means async mode.", 0) \ M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \ M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ + /** Settings for testing hedged requests */ \ + M(Milliseconds, sleep_in_send_tables_status_ms, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ + M(Milliseconds, sleep_in_send_data_ms, 0, "Time to sleep in sending data in TCPHandler", 0) \ + M(UInt64, unknown_packet_in_send_data, 0, "Send unknown packet instead of data Nth data packet", 0) \ + \ M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \ M(Seconds, http_receive_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP receive timeout", 0) \ + M(UInt64, http_max_uri_size, 1048576, "Maximum URI length of HTTP request", 0) \ M(Bool, optimize_throw_if_noop, false, "If setting is enabled and OPTIMIZE query didn't actually assign a merge then an explanatory exception is thrown", 0) \ M(Bool, use_index_for_in_with_subqueries, true, "Try using an index if there is a subquery or a table expression on the right side of the IN operator.", 0) \ M(Bool, joined_subquery_requires_alias, true, "Force joined subqueries and table functions to have aliases for correct name qualification.", 0) \ @@ -231,6 +244,7 @@ class IColumn; M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ M(String, metrics_perf_events_list, "", "Comma separated list of perf metrics that will be measured throughout queries' execution. Empty means all events. See PerfEventInfo in sources for the available events.", 0) \ M(Float, opentelemetry_start_trace_probability, 0., "Probability to start an OpenTelemetry trace for an incoming query.", 0) \ + M(Bool, prefer_column_name_to_alias, false, "Prefer using column names instead of aliases if possible.", 0) \ \ \ /** Limits during query execution are part of the settings. \ @@ -240,8 +254,6 @@ class IColumn; * Almost all limits apply to each stream individually. \ */ \ \ - M(UInt64, limit, 0, "Limit on read rows from the most 'end' result for select query, default 0 means no limit length", 0) \ - M(UInt64, offset, 0, "Offset on read rows from the most 'end' result for select query", 0) \ M(UInt64, max_rows_to_read, 0, "Limit on read rows from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server.", 0) \ M(UInt64, max_bytes_to_read, 0, "Limit on read bytes (after decompression) from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server.", 0) \ M(OverflowMode, read_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ @@ -347,7 +359,7 @@ class IColumn; M(UInt64, read_in_order_two_level_merge_threshold, 100, "Minimal number of parts to read to run preliminary merge step during multithread reading in order of primary key.", 0) \ M(Bool, low_cardinality_allow_in_native_format, true, "Use LowCardinality type in Native format. 
Otherwise, convert LowCardinality columns to ordinary for select query, and convert ordinary columns to required LowCardinality for insert query.", 0) \ M(Bool, cancel_http_readonly_queries_on_client_close, false, "Cancel HTTP readonly queries when a client closes the connection without waiting for response.", 0) \ - M(Bool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise NULLs will be substituted with default values. Currently supported only by 'mysql' and 'odbc' table functions.", 0) \ + M(Bool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise NULLs will be substituted with default values. Currently supported only by 'mysql', 'postgresql' and 'odbc' table functions.", 0) \ \ M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ @@ -358,6 +370,11 @@ class IColumn; M(Bool, check_query_single_value_result, true, "Return check query result as single 1/0 value", 0) \ M(Bool, allow_drop_detached, false, "Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... queries", 0) \ \ + M(UInt64, postgresql_connection_pool_size, 16, "Connection pool size for PostgreSQL table engine and database engine.", 0) \ + M(Int64, postgresql_connection_pool_wait_timeout, -1, "Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool.", 0) \ + M(UInt64, glob_expansion_max_elements, 1000, "Maximum number of allowed addresses (For external storages, table functions, etc).", 0) \ + M(UInt64, odbc_bridge_connection_pool_size, 16, "Connection pool size for each connection settings string in ODBC bridge.", 0) \ + \ M(Seconds, distributed_replica_error_half_life, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, "Time period reduces replica error counter by 2 times.", 0) \ M(UInt64, distributed_replica_error_cap, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, "Max number of errors per replica, prevents piling up an incredible amount of errors if replica was offline for some time and allows it to be reconsidered in a shorter amount of time.", 0) \ M(UInt64, distributed_replica_max_ignored_errors, 0, "Number of errors that will be ignored while choosing replicas", 0) \ @@ -409,6 +426,7 @@ class IColumn; M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \ M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \ M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \ + M(Bool, optimize_fuse_sum_count_avg, false, "Fuse aggregate functions sum(), avg(), count() with identical arguments into one sumCount() call, if the query has at least two different functions", 0) \ M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \ M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \ M(Bool, asterisk_include_alias_columns, false, 
"Include ALIAS columns for wildcard query", 0) \ @@ -417,7 +435,7 @@ class IColumn; M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \ M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \ M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated experimental parser", 0) \ - M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ + M(Bool, async_socket_for_remote, false, "Asynchronously read from socket executing remote query", 0) \ \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ @@ -426,7 +444,12 @@ class IColumn; M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ - M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \ + M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \ + M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ + M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ + M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ + M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializeMySQL. If equal to 0, this setting is disabled", 0) \ + M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializeMySQL. If equal to 0, this setting is disabled", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ @@ -437,6 +460,15 @@ class IColumn; M(UnionMode, union_default_mode, UnionMode::Unspecified, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \ M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ + \ + M(Bool, query_plan_enable_optimizations, true, "Apply optimizations to query plan", 0) \ + M(UInt64, query_plan_max_optimizations_to_apply, 10000, "Limit the total number of optimizations applied to query plan. If zero, ignored. 
If limit reached, throw exception", 0) \ + M(Bool, query_plan_filter_push_down, true, "Allow to push down filter by predicate query plan step", 0) \ + \ + M(Bool, database_replicated_ddl_output, true, "Obsolete setting, does nothing. Will be removed after 2021-09-08", 0) \ + M(HandleKafkaErrorMode, handle_kafka_error_mode, HandleKafkaErrorMode::DEFAULT, "How to handle errors for Kafka engine. Possible values: default, stream.", 0) \ + M(UInt64, limit, 0, "Limit on read rows from the most 'end' result for select query, default 0 means no limit length", 0) \ + M(UInt64, offset, 0, "Offset on read rows from the most 'end' result for select query", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below. @@ -509,6 +541,7 @@ class IColumn; M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \ M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \ M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ + M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \ // End of FORMAT_FACTORY_SETTINGS @@ -528,7 +561,7 @@ struct Settings : public BaseSettings { /// For initialization from empty initializer-list to be "value initialization", not "aggregate initialization" in C++14. /// http://en.cppreference.com/w/cpp/language/aggregate_initialization - Settings() {} + Settings() = default; /** Set multiple settings from "profile" (in server configuration file (users.xml), profiles contain groups of multiple settings). * The profile can also be set using the `set` functions, like the profile setting. diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 2e1cf025256..26c2bd9b6af 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -102,4 +102,13 @@ IMPLEMENT_SETTING_ENUM(UnionMode, ErrorCodes::UNKNOWN_UNION, {"ALL", UnionMode::ALL}, {"DISTINCT", UnionMode::DISTINCT}}) +IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS, + {{"none", DistributedDDLOutputMode::NONE}, + {"throw", DistributedDDLOutputMode::THROW}, + {"null_status_on_timeout", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT}, + {"never_throw", DistributedDDLOutputMode::NEVER_THROW}}) + +IMPLEMENT_SETTING_ENUM(HandleKafkaErrorMode, ErrorCodes::BAD_ARGUMENTS, + {{"default", HandleKafkaErrorMode::DEFAULT}, + {"stream", HandleKafkaErrorMode::STREAM}}) } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index c2ef08135eb..f0dd10aacfb 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -138,4 +138,23 @@ enum class UnionMode DECLARE_SETTING_ENUM(UnionMode) +enum class DistributedDDLOutputMode +{ + NONE, + THROW, + NULL_STATUS_ON_TIMEOUT, + NEVER_THROW, +}; + +DECLARE_SETTING_ENUM(DistributedDDLOutputMode) + +enum class HandleKafkaErrorMode +{ + DEFAULT = 0, // Ignore errors with a threshold. + STREAM, // Put errors to the stream in the virtual column named `_error`. + /*FIXED_SYSTEM_TABLE, Put errors into a fixed system table like system.kafka_errors. This is not implemented now. */ + /*CUSTOM_SYSTEM_TABLE, Put errors into a custom system table. This is not implemented now.
*/ +}; + +DECLARE_SETTING_ENUM(HandleKafkaErrorMode) } diff --git a/src/Core/SettingsQuirks.cpp b/src/Core/SettingsQuirks.cpp new file mode 100644 index 00000000000..3bf9047af3a --- /dev/null +++ b/src/Core/SettingsQuirks.cpp @@ -0,0 +1,62 @@ +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +#ifdef __linux__ +/// Detect does epoll_wait with nested epoll fds works correctly. +/// Polling nested epoll fds from epoll_wait is required for async_socket_for_remote and use_hedged_requests. +/// +/// It may not be reliable in 5.5+ [1], that has been fixed in 5.7+ [2] or 5.6.13+. +/// +/// [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=339ddb53d373 +/// [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c54a6a44bf3 +bool nestedEpollWorks(Poco::Logger * log) +{ + bool nested_epoll_works = +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(5, 6, 13)) + /// the check is correct since there will be no more 5.5.x releases. + false +#else + true +#endif + ; + + if (!nested_epoll_works) + { + if (log) + LOG_WARNING(log, "Nested epoll_wait has some issues on kernels [5.5.0, 5.6.13). You should upgrade it to avoid possible issues."); + } + return nested_epoll_works; +} +#else +bool nestedEpollWorks(Poco::Logger *) { return true; } +#endif + +namespace DB +{ + +/// Update some settings defaults to avoid some known issues. +void applySettingsQuirks(Settings & settings, Poco::Logger * log) +{ + if (!nestedEpollWorks(log)) + { + if (!settings.async_socket_for_remote.changed && settings.async_socket_for_remote) + { + settings.async_socket_for_remote = false; + if (log) + LOG_WARNING(log, "async_socket_for_remote has been disabled (you can explicitly enable it still)"); + } + if (!settings.use_hedged_requests.changed && settings.use_hedged_requests) + { + settings.use_hedged_requests = false; + if (log) + LOG_WARNING(log, "use_hedged_requests has been disabled (you can explicitly enable it still)"); + } + } +} + +} diff --git a/src/Core/SettingsQuirks.h b/src/Core/SettingsQuirks.h new file mode 100644 index 00000000000..38def8eebf2 --- /dev/null +++ b/src/Core/SettingsQuirks.h @@ -0,0 +1,16 @@ +#pragma once + +namespace Poco +{ +class Logger; +} + +namespace DB +{ + +struct Settings; + +/// Update some settings defaults to avoid some known issues. 
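The quirk above is decided at compile time from the kernel headers via `LINUX_VERSION_CODE` / `KERNEL_VERSION`. A small standalone probe in the same spirit (an illustration only, not the ClickHouse function):

    #include <iostream>

    #ifdef __linux__
    #include <linux/version.h>
    #endif

    int main()
    {
    #ifdef __linux__
        /// Mirrors the compile-time range check in SettingsQuirks.cpp: kernels in
        /// [5.5.0, 5.6.13) are treated as having unreliable nested epoll_wait.
    #if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(5, 6, 13))
        std::cout << "nested epoll_wait may be unreliable; use_hedged_requests and "
                     "async_socket_for_remote would be disabled by default\n";
    #else
        std::cout << "nested epoll_wait assumed to work for these kernel headers\n";
    #endif
    #else
        std::cout << "not Linux, the quirk does not apply\n";
    #endif
        return 0;
    }

Note that when the check trips, `applySettingsQuirks` only downgrades `async_socket_for_remote` and `use_hedged_requests` if the user has not changed them explicitly, which is why it tests the `.changed` flag before overriding either setting.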
+void applySettingsQuirks(Settings & settings, Poco::Logger * log = nullptr); + +} diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h index f383c3ded8e..79730e9697e 100644 --- a/src/Core/SortCursor.h +++ b/src/Core/SortCursor.h @@ -366,4 +366,20 @@ private: } }; +template +bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescription & descr) +{ + for (const auto & elem : descr) + { + size_t ind = elem.column_number; + int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); + if (res < 0) + return true; + else if (res > 0) + return false; + } + + return false; +} + } diff --git a/src/Core/Types.h b/src/Core/Types.h index a463455cbe1..b9ecda4a46d 100644 --- a/src/Core/Types.h +++ b/src/Core/Types.h @@ -15,7 +15,7 @@ namespace DB struct Null {}; /// Ignore strange gcc warning https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55776 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" #endif @@ -59,7 +59,7 @@ enum class TypeIndex LowCardinality, Map, }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 666ef32efdf..e250e013913 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -14,3 +14,4 @@ #cmakedefine01 USE_ROCKSDB #cmakedefine01 USE_LIBPQXX #cmakedefine01 USE_NURAFT +#cmakedefine01 USE_KRB5 diff --git a/src/Core/tests/gtest_DecimalFunctions.cpp b/src/Core/tests/gtest_DecimalFunctions.cpp index be64661176b..1069a810d64 100644 --- a/src/Core/tests/gtest_DecimalFunctions.cpp +++ b/src/Core/tests/gtest_DecimalFunctions.cpp @@ -14,7 +14,7 @@ struct DecimalUtilsSplitAndCombineTestParam Decimal64 decimal_value; uint8_t scale; - DecimalUtils::DecimalComponents components; + DecimalUtils::DecimalComponents components; }; std::ostream & operator << (std::ostream & ostr, const DecimalUtilsSplitAndCombineTestParam & param) diff --git a/src/Core/ya.make b/src/Core/ya.make index 1eae848163b..004653d060e 100644 --- a/src/Core/ya.make +++ b/src/Core/ya.make @@ -36,6 +36,7 @@ SRCS( Settings.cpp SettingsEnums.cpp SettingsFields.cpp + SettingsQuirks.cpp SortDescription.cpp iostream_debug_helpers.cpp diff --git a/src/DataStreams/AddingDefaultBlockOutputStream.cpp b/src/DataStreams/AddingDefaultBlockOutputStream.cpp index db1542801d6..f4d8f6954c1 100644 --- a/src/DataStreams/AddingDefaultBlockOutputStream.cpp +++ b/src/DataStreams/AddingDefaultBlockOutputStream.cpp @@ -10,11 +10,11 @@ AddingDefaultBlockOutputStream::AddingDefaultBlockOutputStream( const BlockOutputStreamPtr & output_, const Block & header_, const ColumnsDescription & columns_, - const Context & context_) + ContextPtr context_) : output(output_), header(header_) { auto dag = addMissingDefaults(header_, output->getHeader().getNamesAndTypesList(), columns_, context_); - adding_defaults_actions = std::make_shared(std::move(dag)); + adding_defaults_actions = std::make_shared(std::move(dag), ExpressionActionsSettings::fromContext(context_)); } void AddingDefaultBlockOutputStream::write(const Block & block) diff --git a/src/DataStreams/AddingDefaultBlockOutputStream.h b/src/DataStreams/AddingDefaultBlockOutputStream.h index 5fbbe2aed60..a8235d6623a 100644 --- a/src/DataStreams/AddingDefaultBlockOutputStream.h +++ b/src/DataStreams/AddingDefaultBlockOutputStream.h @@ -26,7 +26,7 @@ public: const BlockOutputStreamPtr & output_, const Block & header_, const ColumnsDescription & columns_, - const 
Context & context_); + ContextPtr context_); Block getHeader() const override { return header; } void write(const Block & block) override; diff --git a/src/DataStreams/AddingDefaultsBlockInputStream.cpp b/src/DataStreams/AddingDefaultsBlockInputStream.cpp index 4b8dcff1870..e3f0906cb03 100644 --- a/src/DataStreams/AddingDefaultsBlockInputStream.cpp +++ b/src/DataStreams/AddingDefaultsBlockInputStream.cpp @@ -130,7 +130,7 @@ static MutableColumnPtr mixColumns(const ColumnWithTypeAndName & col_read, AddingDefaultsBlockInputStream::AddingDefaultsBlockInputStream( const BlockInputStreamPtr & input, const ColumnsDescription & columns_, - const Context & context_) + ContextPtr context_) : columns(columns_) , column_defaults(columns.getDefaults()) , context(context_) @@ -174,7 +174,7 @@ Block AddingDefaultsBlockInputStream::readImpl() auto dag = evaluateMissingDefaults(evaluate_block, header.getNamesAndTypesList(), columns, context, false); if (dag) { - auto actions = std::make_shared(std::move(dag)); + auto actions = std::make_shared(std::move(dag), ExpressionActionsSettings::fromContext(context)); actions->execute(evaluate_block); } diff --git a/src/DataStreams/AddingDefaultsBlockInputStream.h b/src/DataStreams/AddingDefaultsBlockInputStream.h index b2efa5863ef..957f14caff3 100644 --- a/src/DataStreams/AddingDefaultsBlockInputStream.h +++ b/src/DataStreams/AddingDefaultsBlockInputStream.h @@ -7,8 +7,6 @@ namespace DB { -class Context; - /// Adds defaults to columns using BlockDelayedDefaults bitmask attached to Block by child InputStream. class AddingDefaultsBlockInputStream : public IBlockInputStream { @@ -16,7 +14,7 @@ public: AddingDefaultsBlockInputStream( const BlockInputStreamPtr & input, const ColumnsDescription & columns_, - const Context & context_); + ContextPtr context_); String getName() const override { return "AddingDefaults"; } Block getHeader() const override { return header; } @@ -28,7 +26,7 @@ private: Block header; const ColumnsDescription columns; const ColumnDefaults column_defaults; - const Context & context; + ContextPtr context; }; } diff --git a/src/DataStreams/BlockIO.h b/src/DataStreams/BlockIO.h index 91d7efac8d1..31a0e1020d2 100644 --- a/src/DataStreams/BlockIO.h +++ b/src/DataStreams/BlockIO.h @@ -50,7 +50,7 @@ struct BlockIO } } - void onException() + void onException() const { if (exception_callback) exception_callback(); diff --git a/src/DataStreams/CheckConstraintsBlockOutputStream.cpp b/src/DataStreams/CheckConstraintsBlockOutputStream.cpp index a967ee28502..c4556162323 100644 --- a/src/DataStreams/CheckConstraintsBlockOutputStream.cpp +++ b/src/DataStreams/CheckConstraintsBlockOutputStream.cpp @@ -1,12 +1,15 @@ -#include -#include -#include -#include -#include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace DB @@ -15,7 +18,7 @@ namespace DB namespace ErrorCodes { extern const int VIOLATED_CONSTRAINT; - extern const int LOGICAL_ERROR; + extern const int UNSUPPORTED_METHOD; } @@ -24,7 +27,7 @@ CheckConstraintsBlockOutputStream::CheckConstraintsBlockOutputStream( const BlockOutputStreamPtr & output_, const Block & header_, const ConstraintsDescription & constraints_, - const Context & context_) + ContextPtr context_) : table_id(table_id_), output(output_), header(header_), @@ -48,62 +51,75 @@ void CheckConstraintsBlockOutputStream::write(const Block & block) ColumnWithTypeAndName res_column = block_to_calculate.getByName(constraint_ptr->expr->getColumnName()); - if 
(!isUInt8(res_column.type)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Constraint {} does not return a value of type UInt8", + auto result_type = removeNullable(removeLowCardinality(res_column.type)); + + if (!isUInt8(result_type)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Constraint {} does not return a value of type UInt8", backQuote(constraint_ptr->name)); - if (const ColumnConst * res_const = typeid_cast(res_column.column.get())) - { - UInt8 value = res_const->getValue(); + auto result_column = res_column.column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality(); - /// Is violated. - if (!value) - { - throw Exception(ErrorCodes::VIOLATED_CONSTRAINT, - "Constraint {} for table {} is violated, because it is a constant expression returning 0. " - "It is most likely an error in table definition.", - backQuote(constraint_ptr->name), table_id.getNameForLogs()); - } + if (const auto * column_nullable = checkAndGetColumn(*result_column)) + { + const auto & nested_column = column_nullable->getNestedColumnPtr(); + + /// Check if constraint value is nullable + const auto & null_map = column_nullable->getNullMapColumn(); + const PaddedPODArray & data = null_map.getData(); + bool null_map_contains_null = !memoryIsZero(data.raw_data(), data.size() * sizeof(UInt8)); + + if (null_map_contains_null) + throw Exception( + ErrorCodes::VIOLATED_CONSTRAINT, + "Constraint {} for table {} is violated. Expression: ({})."\ + "Constraint expression returns nullable column that contains null value", + backQuote(constraint_ptr->name), + table_id.getNameForLogs(), + serializeAST(*(constraint_ptr->expr), true)); + + result_column = nested_column; } - else + + const ColumnUInt8 & res_column_uint8 = assert_cast(*result_column); + + const UInt8 * data = res_column_uint8.getData().data(); + size_t size = res_column_uint8.size(); + + /// Is violated. + if (!memoryIsByte(data, size, 1)) { - const ColumnUInt8 & res_column_uint8 = assert_cast(*res_column.column); + size_t row_idx = 0; + for (; row_idx < size; ++row_idx) + if (data[row_idx] != 1) + break; - const UInt8 * data = res_column_uint8.getData().data(); - size_t size = res_column_uint8.size(); + Names related_columns = constraint_expr->getRequiredColumns(); - /// Is violated. - if (!memoryIsByte(data, size, 1)) + bool first = true; + String column_values_msg; + constexpr size_t approx_bytes_for_col = 32; + column_values_msg.reserve(approx_bytes_for_col * related_columns.size()); + for (const auto & name : related_columns) { - size_t row_idx = 0; - for (; row_idx < size; ++row_idx) - if (data[row_idx] != 1) - break; + const IColumn & column = *block.getByName(name).column; + assert(row_idx < column.size()); - Names related_columns = constraint_expr->getRequiredColumns(); - - bool first = true; - String column_values_msg; - constexpr size_t approx_bytes_for_col = 32; - column_values_msg.reserve(approx_bytes_for_col * related_columns.size()); - for (const auto & name : related_columns) - { - const IColumn & column = *block.getByName(name).column; - assert(row_idx < column.size()); - - if (!first) - column_values_msg.append(", "); - column_values_msg.append(backQuoteIfNeed(name)); - column_values_msg.append(" = "); - column_values_msg.append(applyVisitor(FieldVisitorToString(), column[row_idx])); - first = false; - } - - throw Exception(ErrorCodes::VIOLATED_CONSTRAINT, - "Constraint {} for table {} is violated at row {}. Expression: ({}). 
Column values: {}", - backQuote(constraint_ptr->name), table_id.getNameForLogs(), rows_written + row_idx + 1, - serializeAST(*(constraint_ptr->expr), true), column_values_msg); + if (!first) + column_values_msg.append(", "); + column_values_msg.append(backQuoteIfNeed(name)); + column_values_msg.append(" = "); + column_values_msg.append(applyVisitor(FieldVisitorToString(), column[row_idx])); + first = false; } + + throw Exception( + ErrorCodes::VIOLATED_CONSTRAINT, + "Constraint {} for table {} is violated at row {}. Expression: ({}). Column values: {}", + backQuote(constraint_ptr->name), + table_id.getNameForLogs(), + rows_written + row_idx + 1, + serializeAST(*(constraint_ptr->expr), true), + column_values_msg); } } } diff --git a/src/DataStreams/CheckConstraintsBlockOutputStream.h b/src/DataStreams/CheckConstraintsBlockOutputStream.h index a1be720f21e..0f115550eb8 100644 --- a/src/DataStreams/CheckConstraintsBlockOutputStream.h +++ b/src/DataStreams/CheckConstraintsBlockOutputStream.h @@ -20,7 +20,7 @@ public: const BlockOutputStreamPtr & output_, const Block & header_, const ConstraintsDescription & constraints_, - const Context & context_); + ContextPtr context_); Block getHeader() const override { return header; } void write(const Block & block) override; diff --git a/src/DataStreams/IBlockInputStream.h b/src/DataStreams/IBlockInputStream.h index 4e314ef2980..b077f87e8a8 100644 --- a/src/DataStreams/IBlockInputStream.h +++ b/src/DataStreams/IBlockInputStream.h @@ -2,9 +2,8 @@ #include #include -#include -#include #include +#include #include #include #include diff --git a/src/DataStreams/InputStreamFromASTInsertQuery.cpp b/src/DataStreams/InputStreamFromASTInsertQuery.cpp index 70d69227ac0..0848d838276 100644 --- a/src/DataStreams/InputStreamFromASTInsertQuery.cpp +++ b/src/DataStreams/InputStreamFromASTInsertQuery.cpp @@ -24,7 +24,7 @@ InputStreamFromASTInsertQuery::InputStreamFromASTInsertQuery( const ASTPtr & ast, ReadBuffer * input_buffer_tail_part, const Block & header, - const Context & context, + ContextPtr context, const ASTPtr & input_function) { const auto * ast_insert_query = ast->as(); @@ -58,9 +58,9 @@ InputStreamFromASTInsertQuery::InputStreamFromASTInsertQuery( input_buffer_contacenated = std::make_unique(buffers); - res_stream = context.getInputFormat(format, *input_buffer_contacenated, header, context.getSettings().max_insert_block_size); + res_stream = context->getInputFormat(format, *input_buffer_contacenated, header, context->getSettings().max_insert_block_size); - if (context.getSettingsRef().input_format_defaults_for_omitted_fields && ast_insert_query->table_id && !input_function) + if (context->getSettingsRef().input_format_defaults_for_omitted_fields && ast_insert_query->table_id && !input_function) { StoragePtr storage = DatabaseCatalog::instance().getTable(ast_insert_query->table_id, context); auto metadata_snapshot = storage->getInMemoryMetadataPtr(); diff --git a/src/DataStreams/InputStreamFromASTInsertQuery.h b/src/DataStreams/InputStreamFromASTInsertQuery.h index d4c6443c77d..15b698a2d68 100644 --- a/src/DataStreams/InputStreamFromASTInsertQuery.h +++ b/src/DataStreams/InputStreamFromASTInsertQuery.h @@ -25,7 +25,7 @@ public: const ASTPtr & ast, ReadBuffer * input_buffer_tail_part, const Block & header, - const Context & context, + ContextPtr context, const ASTPtr & input_function); Block readImpl() override { return res_stream->read(); } diff --git a/src/DataStreams/InternalTextLogsRowOutputStream.h b/src/DataStreams/InternalTextLogsRowOutputStream.h 
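The rewritten CheckConstraintsBlockOutputStream above no longer special-cases constant results: it materializes the constraint column, rejects NULLs via the null map, and then checks that every byte of the UInt8 result equals 1, scanning for the first offending row only when the fast check fails. A minimal standalone sketch of that scan follows; memoryIsByte/memoryIsZero are ClickHouse helpers, so a std::vector<uint8_t> and a plain loop stand in for them here.

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

// Returns the index of the first row whose constraint result is not 1,
// or std::nullopt if the constraint holds for every row.
std::optional<size_t> firstViolatedRow(const std::vector<uint8_t> & result)
{
    // In ClickHouse the all-ones fast path is memoryIsByte(data, size, 1);
    // a plain loop is used in this sketch.
    for (size_t row = 0; row < result.size(); ++row)
        if (result[row] != 1)
            return row;
    return std::nullopt;
}

int main()
{
    std::vector<uint8_t> constraint_result{1, 1, 0, 1};
    if (auto row = firstViolatedRow(constraint_result))
        std::printf("constraint violated at row %zu\n", *row);   // -> row 2
    else
        std::printf("constraint holds for all rows\n");
}
```

In the real stream the violating row index is then combined with the required columns of the constraint expression to build the error message shown in the diff.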
index 0f333f70d18..8ade76b34a7 100644 --- a/src/DataStreams/InternalTextLogsRowOutputStream.h +++ b/src/DataStreams/InternalTextLogsRowOutputStream.h @@ -8,7 +8,7 @@ namespace DB /// Prints internal server logs /// Input blocks have to have the same structure as SystemLogsQueue::getSampleBlock() -/// NOTE: IRowOutputStream does not suite well for this case +/// NOTE: IRowOutputFormat does not suite well for this case class InternalTextLogsRowOutputStream : public IBlockOutputStream { public: diff --git a/src/DataStreams/MongoDBBlockInputStream.cpp b/src/DataStreams/MongoDBBlockInputStream.cpp index 5463d95151b..e4ddcd09ede 100644 --- a/src/DataStreams/MongoDBBlockInputStream.cpp +++ b/src/DataStreams/MongoDBBlockInputStream.cpp @@ -270,8 +270,8 @@ namespace throw Exception{"Type mismatch, expected Timestamp, got type id = " + toString(value.type()) + " for column " + name, ErrorCodes::TYPE_MISMATCH}; - assert_cast(column).getData().push_back(UInt16{DateLUT::instance().toDayNum( - static_cast &>(value).value().epochTime())}); + assert_cast(column).getData().push_back(static_cast(DateLUT::instance().toDayNum( + static_cast &>(value).value().epochTime()))); break; } diff --git a/src/DataStreams/NativeBlockInputStream.cpp b/src/DataStreams/NativeBlockInputStream.cpp index 377f4451419..2f376f5230d 100644 --- a/src/DataStreams/NativeBlockInputStream.cpp +++ b/src/DataStreams/NativeBlockInputStream.cpp @@ -73,14 +73,16 @@ void NativeBlockInputStream::resetParser() void NativeBlockInputStream::readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint) { - IDataType::DeserializeBinaryBulkSettings settings; - settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; }; + ISerialization::DeserializeBinaryBulkSettings settings; + settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; settings.avg_value_size_hint = avg_value_size_hint; settings.position_independent_encoding = false; - IDataType::DeserializeBinaryBulkStatePtr state; - type.deserializeBinaryBulkStatePrefix(settings, state); - type.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state); + ISerialization::DeserializeBinaryBulkStatePtr state; + auto serialization = type.getDefaultSerialization(); + + serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); if (column->size() != rows) throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column->size()) + ". Rows expected: " + toString(rows) + ".", diff --git a/src/DataStreams/NativeBlockOutputStream.cpp b/src/DataStreams/NativeBlockOutputStream.cpp index c17d0dacc49..2a016c9a0c8 100644 --- a/src/DataStreams/NativeBlockOutputStream.cpp +++ b/src/DataStreams/NativeBlockOutputStream.cpp @@ -41,22 +41,24 @@ void NativeBlockOutputStream::flush() } -void NativeBlockOutputStream::writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) 
*/ ColumnPtr full_column = column->convertToFullColumnIfConst(); - IDataType::SerializeBinaryBulkSettings settings; - settings.getter = [&ostr](IDataType::SubstreamPath) -> WriteBuffer * { return &ostr; }; + ISerialization::SerializeBinaryBulkSettings settings; + settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; settings.position_independent_encoding = false; settings.low_cardinality_max_dictionary_size = 0; - IDataType::SerializeBinaryBulkStatePtr state; - type.serializeBinaryBulkStatePrefix(settings, state); - type.serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); - type.serializeBinaryBulkStateSuffix(settings, state); + auto serialization = type.getDefaultSerialization(); + + ISerialization::SerializeBinaryBulkStatePtr state; + serialization->serializeBinaryBulkStatePrefix(settings, state); + serialization->serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state); + serialization->serializeBinaryBulkStateSuffix(settings, state); } diff --git a/src/DataStreams/NativeBlockOutputStream.h b/src/DataStreams/NativeBlockOutputStream.h index 64ccd267634..c47d7b2f1c3 100644 --- a/src/DataStreams/NativeBlockOutputStream.h +++ b/src/DataStreams/NativeBlockOutputStream.h @@ -30,8 +30,6 @@ public: void write(const Block & block) override; void flush() override; - static void writeData(const IDataType & type, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit); - String getContentType() const override { return "application/octet-stream"; } private: diff --git a/src/DataStreams/PostgreSQLBlockInputStream.cpp b/src/DataStreams/PostgreSQLBlockInputStream.cpp index da6a83fb930..f4af15df011 100644 --- a/src/DataStreams/PostgreSQLBlockInputStream.cpp +++ b/src/DataStreams/PostgreSQLBlockInputStream.cpp @@ -28,13 +28,13 @@ namespace ErrorCodes } PostgreSQLBlockInputStream::PostgreSQLBlockInputStream( - ConnectionPtr connection_, + postgres::ConnectionHolderPtr connection_, const std::string & query_str_, const Block & sample_block, const UInt64 max_block_size_) : query_str(query_str_) , max_block_size(max_block_size_) - , connection(connection_) + , connection(std::move(connection_)) { description.init(sample_block); for (const auto idx : ext::range(0, description.sample_block.columns())) @@ -48,7 +48,7 @@ PostgreSQLBlockInputStream::PostgreSQLBlockInputStream( void PostgreSQLBlockInputStream::readPrefix() { - tx = std::make_unique(*connection); + tx = std::make_unique(connection->conn()); stream = std::make_unique(*tx, pqxx::from_query, std::string_view(query_str)); } @@ -120,8 +120,15 @@ void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view switch (type) { case ValueType::vtUInt8: - assert_cast(column).insertValue(pqxx::from_string(value)); + { + if (value == "t") + assert_cast(column).insertValue(1); + else if (value == "f") + assert_cast(column).insertValue(0); + else + assert_cast(column).insertValue(pqxx::from_string(value)); break; + } case ValueType::vtUInt16: assert_cast(column).insertValue(pqxx::from_string(value)); break; @@ -160,8 +167,15 @@ void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view assert_cast(column).insertValue(UInt16{LocalDate{std::string(value)}.getDayNum()}); break; case ValueType::vtDateTime: - assert_cast(column).insertValue(time_t{LocalDateTime{std::string(value)}}); + { + ReadBufferFromString in(value); + time_t time = 0; + readDateTimeText(time, in); + if (time < 0) + time = 0; + 
assert_cast(column).insertValue(time); break; + } case ValueType::vtDateTime64:[[fallthrough]]; case ValueType::vtDecimal32: [[fallthrough]]; case ValueType::vtDecimal64: [[fallthrough]]; @@ -169,7 +183,7 @@ void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view case ValueType::vtDecimal256: { ReadBufferFromString istr(value); - data_type->deserializeAsWholeText(column, istr, FormatSettings{}); + data_type->getDefaultSerialization()->deserializeWholeText(column, istr, FormatSettings{}); break; } case ValueType::vtArray: @@ -257,7 +271,13 @@ void PostgreSQLBlockInputStream::prepareArrayInfo(size_t column_idx, const DataT else if (which.isDate()) parser = [](std::string & field) -> Field { return UInt16{LocalDate{field}.getDayNum()}; }; else if (which.isDateTime()) - parser = [](std::string & field) -> Field { return time_t{LocalDateTime{field}}; }; + parser = [](std::string & field) -> Field + { + ReadBufferFromString in(field); + time_t time = 0; + readDateTimeText(time, in); + return time; + }; else if (which.isDecimal32()) parser = [nested](std::string & field) -> Field { diff --git a/src/DataStreams/PostgreSQLBlockInputStream.h b/src/DataStreams/PostgreSQLBlockInputStream.h index b88c81cca0a..b172cae5b62 100644 --- a/src/DataStreams/PostgreSQLBlockInputStream.h +++ b/src/DataStreams/PostgreSQLBlockInputStream.h @@ -9,18 +9,17 @@ #include #include #include -#include +#include namespace DB { -using ConnectionPtr = std::shared_ptr; class PostgreSQLBlockInputStream : public IBlockInputStream { public: PostgreSQLBlockInputStream( - ConnectionPtr connection_, + postgres::ConnectionHolderPtr connection_, const std::string & query_str, const Block & sample_block, const UInt64 max_block_size_); @@ -47,7 +46,7 @@ private: const UInt64 max_block_size; ExternalResultDescription description; - ConnectionPtr connection; + postgres::ConnectionHolderPtr connection; std::unique_ptr tx; std::unique_ptr stream; diff --git a/src/DataStreams/PushingToViewsBlockOutputStream.cpp b/src/DataStreams/PushingToViewsBlockOutputStream.cpp index 4d1990ffe18..16baf4377e0 100644 --- a/src/DataStreams/PushingToViewsBlockOutputStream.cpp +++ b/src/DataStreams/PushingToViewsBlockOutputStream.cpp @@ -26,13 +26,13 @@ namespace DB PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( const StoragePtr & storage_, const StorageMetadataPtr & metadata_snapshot_, - const Context & context_, + ContextPtr context_, const ASTPtr & query_ptr_, bool no_destination) - : storage(storage_) + : WithContext(context_) + , storage(storage_) , metadata_snapshot(metadata_snapshot_) , log(&Poco::Logger::get("PushingToViewsBlockOutputStream")) - , context(context_) , query_ptr(query_ptr_) { checkStackSize(); @@ -42,12 +42,12 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( * but it's clear that here is not the best place for this functionality. 
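The PostgreSQLBlockInputStream changes above accept libpq's textual booleans (`t`/`f`) for UInt8 columns and read DateTime values with readDateTimeText, clamping pre-epoch timestamps to zero. A small standalone sketch of the same mapping, assuming plain std::string input rather than ClickHouse's column types (the function names below are illustrative only):

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// PostgreSQL sends booleans as "t"/"f" in text mode; anything else is
// parsed as a plain integer, mirroring the fallback in the diff.
uint8_t parsePostgresBool(const std::string & value)
{
    if (value == "t")
        return 1;
    if (value == "f")
        return 0;
    return static_cast<uint8_t>(std::stoul(value));
}

// Timestamps before the Unix epoch are clamped to 0, since DateTime
// cannot represent negative values.
int64_t clampToEpoch(int64_t time)
{
    return time < 0 ? 0 : time;
}

int main()
{
    std::printf("%u %u %lld\n",
                static_cast<unsigned>(parsePostgresBool("t")),
                static_cast<unsigned>(parsePostgresBool("f")),
                static_cast<long long>(clampToEpoch(-1)));   // -> 1 0 0
}
```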
*/ addTableLock( - storage->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout)); + storage->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout)); /// If the "root" table deduplicates blocks, there are no need to make deduplication for children /// Moreover, deduplication for AggregatingMergeTree children could produce false positives due to low size of inserting blocks bool disable_deduplication_for_children = false; - if (!context.getSettingsRef().deduplicate_blocks_in_dependent_materialized_views) + if (!getContext()->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views) disable_deduplication_for_children = !no_destination && storage->supportsDeduplication(); auto table_id = storage->getStorageID(); @@ -56,8 +56,8 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( /// We need special context for materialized views insertions if (!dependencies.empty()) { - select_context = std::make_unique(context); - insert_context = std::make_unique(context); + select_context = Context::createCopy(context); + insert_context = Context::createCopy(context); const auto & insert_settings = insert_context->getSettingsRef(); @@ -74,7 +74,7 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( for (const auto & database_table : dependencies) { - auto dependent_table = DatabaseCatalog::instance().getTable(database_table, context); + auto dependent_table = DatabaseCatalog::instance().getTable(database_table, getContext()); auto dependent_metadata_snapshot = dependent_table->getInMemoryMetadataPtr(); ASTPtr query; @@ -83,7 +83,7 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( if (auto * materialized_view = dynamic_cast(dependent_table.get())) { addTableLock( - materialized_view->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout)); + materialized_view->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout)); StoragePtr inner_table = materialized_view->getTargetTable(); auto inner_table_id = inner_table->getStorageID(); @@ -94,7 +94,7 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( insert->table_id = inner_table_id; /// Get list of columns we get from select query. - auto header = InterpreterSelectQuery(query, *select_context, SelectQueryOptions().analyze()) + auto header = InterpreterSelectQuery(query, select_context, SelectQueryOptions().analyze()) .getSampleBlock(); /// Insert only columns returned by select. 
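Throughout this patch `const Context &` parameters become a shared `ContextPtr`, per-view contexts are produced with `Context::createCopy`, and streams inherit the `WithContext` mixin instead of storing a reference. A simplified sketch of that ownership pattern with a stand-in `Context` struct; the real class lives in Interpreters/Context.h, and everything below is illustrative only:

```cpp
#include <memory>
#include <string>

struct Context;
using ContextPtr = std::shared_ptr<const Context>;

struct Context
{
    std::string query_id;
    bool deduplicate_blocks = true;

    // Mirrors Context::createCopy: derived contexts are independent copies
    // that the caller can tweak (e.g. disable deduplication for children).
    static std::shared_ptr<Context> createCopy(const ContextPtr & other)
    {
        return std::make_shared<Context>(*other);
    }
};

// Mirrors the WithContext mixin: classes hold the shared pointer instead of
// a reference, so the context outlives any stream that still uses it.
class WithContext
{
public:
    explicit WithContext(ContextPtr context_) : context(std::move(context_)) {}
    ContextPtr getContext() const { return context; }
private:
    ContextPtr context;
};

int main()
{
    auto global = std::make_shared<Context>();
    ContextPtr query_context = global;

    auto insert_context = Context::createCopy(query_context);
    insert_context->deduplicate_blocks = false;   // per-view override

    WithContext stream(query_context);
    return stream.getContext()->deduplicate_blocks ? 0 : 1;
}
```

Holding a shared pointer is what lets the refactored streams drop the `const Context & context` member seen being removed in this diff.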
@@ -110,16 +110,16 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( insert->columns = std::move(list); ASTPtr insert_query_ptr(insert.release()); - InterpreterInsertQuery interpreter(insert_query_ptr, *insert_context); + InterpreterInsertQuery interpreter(insert_query_ptr, insert_context); BlockIO io = interpreter.execute(); out = io.out; } else if (dynamic_cast(dependent_table.get())) out = std::make_shared( - dependent_table, dependent_metadata_snapshot, *insert_context, ASTPtr(), true); + dependent_table, dependent_metadata_snapshot, insert_context, ASTPtr(), true); else out = std::make_shared( - dependent_table, dependent_metadata_snapshot, *insert_context, ASTPtr()); + dependent_table, dependent_metadata_snapshot, insert_context, ASTPtr()); views.emplace_back(ViewInfo{std::move(query), database_table, std::move(out), nullptr, 0 /* elapsed_ms */}); } @@ -127,7 +127,7 @@ PushingToViewsBlockOutputStream::PushingToViewsBlockOutputStream( /// Do not push to destination table if the flag is set if (!no_destination) { - output = storage->write(query_ptr, storage->getInMemoryMetadataPtr(), context); + output = storage->write(query_ptr, storage->getInMemoryMetadataPtr(), getContext()); replicated_output = dynamic_cast(output.get()); } } @@ -155,7 +155,7 @@ void PushingToViewsBlockOutputStream::write(const Block & block) if (auto * live_view = dynamic_cast(storage.get())) { - StorageLiveView::writeIntoLiveView(*live_view, block, context); + StorageLiveView::writeIntoLiveView(*live_view, block, getContext()); } else { @@ -166,11 +166,11 @@ void PushingToViewsBlockOutputStream::write(const Block & block) } /// Don't process materialized views if this block is duplicate - if (!context.getSettingsRef().deduplicate_blocks_in_dependent_materialized_views && replicated_output && replicated_output->lastBlockIsDuplicate()) + if (!getContext()->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views && replicated_output && replicated_output->lastBlockIsDuplicate()) return; // Insert data into materialized views only after successful insert into main table - const Settings & settings = context.getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); if (settings.parallel_view_processing && views.size() > 1) { // Push to views concurrently if enabled and more than one view is attached @@ -228,7 +228,7 @@ void PushingToViewsBlockOutputStream::writeSuffix() std::exception_ptr first_exception; - const Settings & settings = context.getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); bool parallel_processing = false; /// Run writeSuffix() for views in separate thread pool. @@ -311,7 +311,7 @@ void PushingToViewsBlockOutputStream::writeSuffix() UInt64 milliseconds = main_watch.elapsedMilliseconds(); if (views.size() > 1) { - LOG_TRACE(log, "Pushing from {} to {} views took {} ms.", + LOG_DEBUG(log, "Pushing from {} to {} views took {} ms.", storage->getStorageID().getNameForLogs(), views.size(), milliseconds); } @@ -353,10 +353,9 @@ void PushingToViewsBlockOutputStream::process(const Block & block, ViewInfo & vi /// but it will contain single block (that is INSERT-ed into main table). /// InterpreterSelectQuery will do processing of alias columns. 
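The process() hunk that follows runs each view's SELECT on a copied context and squashes the resulting stream (see the SquashingBlockInputStream wrapper a few lines below) so that many tiny blocks, e.g. from two-level aggregation, are merged before being written into the view. A generic sketch of that squashing idea, with a toy Block type and a row threshold standing in for min_insert_block_size_rows:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

using Row = std::vector<int>;
using Block = std::vector<Row>;

// Accumulates incoming blocks and only emits one once it holds at least
// `min_rows` rows -- smaller chunks are merged instead of being forwarded.
class Squasher
{
public:
    explicit Squasher(size_t min_rows_) : min_rows(min_rows_) {}

    // Returns a non-empty block when the accumulated data is big enough.
    Block add(Block chunk)
    {
        for (auto & row : chunk)
            pending.push_back(std::move(row));
        if (pending.size() < min_rows)
            return {};
        Block ready;
        ready.swap(pending);
        return ready;
    }

    Block flush() { Block ready; ready.swap(pending); return ready; }

private:
    size_t min_rows;
    Block pending;
};

int main()
{
    Squasher squasher(/* min_rows = */ 4);
    for (int i = 0; i < 5; ++i)
        if (auto block = squasher.add({{i}}); !block.empty())
            std::printf("emitting block of %zu rows\n", block.size());
    std::printf("final block of %zu rows\n", squasher.flush().size());
}
```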
- Context local_context = *select_context; - local_context.addViewSource( - StorageValues::create( - storage->getStorageID(), metadata_snapshot->getColumns(), block, storage->getVirtuals())); + auto local_context = Context::createCopy(select_context); + local_context->addViewSource( + StorageValues::create(storage->getStorageID(), metadata_snapshot->getColumns(), block, storage->getVirtuals())); select.emplace(view.query, local_context, SelectQueryOptions()); in = std::make_shared(select->execute().getInputStream()); @@ -364,7 +363,7 @@ void PushingToViewsBlockOutputStream::process(const Block & block, ViewInfo & vi /// even when only one block is inserted into the parent table (e.g. if the query is a GROUP BY /// and two-level aggregation is triggered). in = std::make_shared( - in, context.getSettingsRef().min_insert_block_size_rows, context.getSettingsRef().min_insert_block_size_bytes); + in, getContext()->getSettingsRef().min_insert_block_size_rows, getContext()->getSettingsRef().min_insert_block_size_bytes); in = std::make_shared(in, view.out->getHeader(), ConvertingBlockInputStream::MatchColumnsMode::Name); } else diff --git a/src/DataStreams/PushingToViewsBlockOutputStream.h b/src/DataStreams/PushingToViewsBlockOutputStream.h index 6b32607b294..2ae941efc2e 100644 --- a/src/DataStreams/PushingToViewsBlockOutputStream.h +++ b/src/DataStreams/PushingToViewsBlockOutputStream.h @@ -1,9 +1,9 @@ #pragma once #include -#include #include #include +#include namespace Poco { @@ -14,18 +14,16 @@ namespace DB { class ReplicatedMergeTreeBlockOutputStream; -class Context; - /** Writes data to the specified table and to all dependent materialized views. */ -class PushingToViewsBlockOutputStream : public IBlockOutputStream +class PushingToViewsBlockOutputStream : public IBlockOutputStream, WithContext { public: PushingToViewsBlockOutputStream( const StoragePtr & storage_, const StorageMetadataPtr & metadata_snapshot_, - const Context & context_, + ContextPtr context_, const ASTPtr & query_ptr_, bool no_destination = false); @@ -43,7 +41,6 @@ private: ReplicatedMergeTreeBlockOutputStream * replicated_output = nullptr; Poco::Logger * log; - const Context & context; ASTPtr query_ptr; Stopwatch main_watch; @@ -57,8 +54,8 @@ private: }; std::vector views; - std::unique_ptr select_context; - std::unique_ptr insert_context; + ContextPtr select_context; + ContextPtr insert_context; void process(const Block & block, ViewInfo & view); }; diff --git a/src/DataStreams/RemoteBlockInputStream.cpp b/src/DataStreams/RemoteBlockInputStream.cpp index a62f7fca0b7..c633600d37f 100644 --- a/src/DataStreams/RemoteBlockInputStream.cpp +++ b/src/DataStreams/RemoteBlockInputStream.cpp @@ -6,7 +6,7 @@ namespace DB RemoteBlockInputStream::RemoteBlockInputStream( Connection & connection, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) : query_executor(connection, query_, header_, context_, throttler, scalars_, external_tables_, stage_) { @@ -15,7 +15,7 @@ RemoteBlockInputStream::RemoteBlockInputStream( RemoteBlockInputStream::RemoteBlockInputStream( std::vector && connections, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, 
QueryProcessingStage::Enum stage_) : query_executor(std::move(connections), query_, header_, context_, throttler, scalars_, external_tables_, stage_) { @@ -24,7 +24,7 @@ RemoteBlockInputStream::RemoteBlockInputStream( RemoteBlockInputStream::RemoteBlockInputStream( const ConnectionPoolWithFailoverPtr & pool, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) : query_executor(pool, query_, header_, context_, throttler, scalars_, external_tables_, stage_) { diff --git a/src/DataStreams/RemoteBlockInputStream.h b/src/DataStreams/RemoteBlockInputStream.h index 5ef05ee99eb..b0029da91bb 100644 --- a/src/DataStreams/RemoteBlockInputStream.h +++ b/src/DataStreams/RemoteBlockInputStream.h @@ -25,21 +25,21 @@ public: /// Takes already set connection. RemoteBlockInputStream( Connection & connection, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); /// Accepts several connections already taken from pool. RemoteBlockInputStream( std::vector && connections, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); /// Takes a pool and gets one or several connections from it. 
RemoteBlockInputStream( const ConnectionPoolWithFailoverPtr & pool, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index fc3870b3f22..0961dd41458 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include namespace DB @@ -20,61 +22,74 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int UNKNOWN_PACKET_FROM_SERVER; extern const int DUPLICATED_PART_UUIDS; } RemoteQueryExecutor::RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, const Context & context_, - ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) + const String & query_, const Block & header_, ContextPtr context_, + ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, + QueryProcessingStage::Enum stage_, std::shared_ptr task_iterator_) : header(header_), query(query_), context(context_) - , scalars(scalars_), external_tables(external_tables_), stage(stage_) + , scalars(scalars_), external_tables(external_tables_), stage(stage_), task_iterator(task_iterator_) { - create_multiplexed_connections = [this, &connection, throttler]() + create_connections = [this, &connection, throttler]() { - return std::make_unique(connection, context.getSettingsRef(), throttler); + return std::make_unique(connection, context->getSettingsRef(), throttler); }; } RemoteQueryExecutor::RemoteQueryExecutor( - std::vector && connections, - const String & query_, const Block & header_, const Context & context_, - const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) + std::vector && connections_, + const String & query_, const Block & header_, ContextPtr context_, + const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, + QueryProcessingStage::Enum stage_, std::shared_ptr task_iterator_) : header(header_), query(query_), context(context_) - , scalars(scalars_), external_tables(external_tables_), stage(stage_) + , scalars(scalars_), external_tables(external_tables_), stage(stage_), task_iterator(task_iterator_) { - create_multiplexed_connections = [this, connections, throttler]() mutable - { - return std::make_unique( - std::move(connections), context.getSettingsRef(), throttler); + create_connections = [this, connections_, throttler]() mutable { + return std::make_unique(std::move(connections_), context->getSettingsRef(), throttler); }; } RemoteQueryExecutor::RemoteQueryExecutor( const ConnectionPoolWithFailoverPtr & pool, - const String & query_, const Block & header_, const Context & context_, - const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) + const String & query_, const Block & header_, ContextPtr context_, + const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, + QueryProcessingStage::Enum stage_, std::shared_ptr task_iterator_) : header(header_), query(query_), 
context(context_) - , scalars(scalars_), external_tables(external_tables_), stage(stage_) + , scalars(scalars_), external_tables(external_tables_), stage(stage_), task_iterator(task_iterator_) { - create_multiplexed_connections = [this, pool, throttler]() + create_connections = [this, pool, throttler]()->std::unique_ptr { - const Settings & current_settings = context.getSettingsRef(); + const Settings & current_settings = context->getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - std::vector connections; + +#if defined(OS_LINUX) + if (current_settings.use_hedged_requests) + { + std::shared_ptr table_to_check = nullptr; + if (main_table) + table_to_check = std::make_shared(main_table.getQualifiedName()); + + return std::make_unique(pool, current_settings, timeouts, throttler, pool_mode, table_to_check); + } +#endif + + std::vector connection_entries; if (main_table) { auto try_results = pool->getManyChecked(timeouts, ¤t_settings, pool_mode, main_table.getQualifiedName()); - connections.reserve(try_results.size()); + connection_entries.reserve(try_results.size()); for (auto & try_result : try_results) - connections.emplace_back(std::move(try_result.entry)); + connection_entries.emplace_back(std::move(try_result.entry)); } else - connections = pool->getMany(timeouts, ¤t_settings, pool_mode); + connection_entries = pool->getMany(timeouts, ¤t_settings, pool_mode); - return std::make_unique( - std::move(connections), current_settings, throttler); + return std::make_unique(std::move(connection_entries), current_settings, throttler); }; } @@ -85,7 +100,7 @@ RemoteQueryExecutor::~RemoteQueryExecutor() * these connections did not remain hanging in the out-of-sync state. */ if (established || isQueryPending()) - multiplexed_connections->disconnect(); + connections->disconnect(); } /** If we receive a block with slightly different column types, or with excessive columns, @@ -142,10 +157,10 @@ void RemoteQueryExecutor::sendQuery() if (sent_query) return; - multiplexed_connections = create_multiplexed_connections(); + connections = create_connections(); - const auto & settings = context.getSettingsRef(); - if (settings.skip_unavailable_shards && 0 == multiplexed_connections->size()) + const auto & settings = context->getSettingsRef(); + if (settings.skip_unavailable_shards && 0 == connections->size()) return; /// Query cannot be canceled in the middle of the send query, @@ -163,7 +178,7 @@ void RemoteQueryExecutor::sendQuery() was_cancelled = false; auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings); - ClientInfo modified_client_info = context.getClientInfo(); + ClientInfo modified_client_info = context->getClientInfo(); modified_client_info.query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; if (CurrentThread::isInitialized()) { @@ -173,10 +188,10 @@ void RemoteQueryExecutor::sendQuery() { std::lock_guard lock(duplicated_part_uuids_mutex); if (!duplicated_part_uuids.empty()) - multiplexed_connections->sendIgnoredPartUUIDs(duplicated_part_uuids); + connections->sendIgnoredPartUUIDs(duplicated_part_uuids); } - multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); + connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); established = false; sent_query = true; @@ -192,7 +207,7 @@ Block RemoteQueryExecutor::read() { sendQuery(); - if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) + if 
(context->getSettingsRef().skip_unavailable_shards && (0 == connections->size())) return {}; } @@ -201,7 +216,7 @@ Block RemoteQueryExecutor::read() if (was_cancelled) return Block(); - Packet packet = multiplexed_connections->receivePacket(); + Packet packet = connections->receivePacket(); if (auto block = processPacket(std::move(packet))) return *block; @@ -218,7 +233,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr { sendQuery(); - if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) + if (context->getSettingsRef().skip_unavailable_shards && (0 == connections->size())) return Block(); } @@ -228,7 +243,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr if (was_cancelled) return Block(); - read_context = std::make_unique(*multiplexed_connections); + read_context = std::make_unique(*connections); } do @@ -239,7 +254,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr if (read_context->is_read_in_progress.load(std::memory_order_relaxed)) { read_context->setTimer(); - return read_context->epoll_fd; + return read_context->epoll.getFileDescriptor(); } else { @@ -260,7 +275,7 @@ std::variant RemoteQueryExecutor::restartQueryWithoutDuplicatedUUIDs { /// Cancel previous query and disconnect before retry. cancel(read_context); - multiplexed_connections->disconnect(); + connections->disconnect(); /// Only resend once, otherwise throw an exception if (!resent_query) @@ -284,6 +299,9 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) { switch (packet.type) { + case Protocol::Server::ReadTaskRequest: + processReadTaskRequest(); + break; case Protocol::Server::PartUUIDs: if (!setPartUUIDs(packet.part_uuids)) got_duplicated_part_uuids = true; @@ -300,7 +318,7 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) break; case Protocol::Server::EndOfStream: - if (!multiplexed_connections->hasActiveConnections()) + if (!connections->hasActiveConnections()) { finished = true; return Block(); @@ -342,7 +360,7 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) got_unknown_packet_from_replica = true; throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", toString(packet.type), - multiplexed_connections->dumpAddresses()); + connections->dumpAddresses()); } return {}; @@ -350,8 +368,8 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) bool RemoteQueryExecutor::setPartUUIDs(const std::vector & uuids) { - Context & query_context = const_cast(context).getQueryContext(); - auto duplicates = query_context.getPartUUIDs()->add(uuids); + auto query_context = context->getQueryContext(); + auto duplicates = query_context->getPartUUIDs()->add(uuids); if (!duplicates.empty()) { @@ -362,6 +380,14 @@ bool RemoteQueryExecutor::setPartUUIDs(const std::vector & uuids) return true; } +void RemoteQueryExecutor::processReadTaskRequest() +{ + if (!task_iterator) + throw Exception("Distributed task iterator is not initialized", ErrorCodes::LOGICAL_ERROR); + auto response = (*task_iterator)(); + connections->sendReadTaskResponse(response); +} + void RemoteQueryExecutor::finish(std::unique_ptr * read_context) { /** If one of: @@ -382,7 +408,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr * read_context) tryCancel("Cancelling query because enough data has been read", read_context); /// Get the remaining packets so that there is no out of sync in the connections to the replicas. 
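RemoteQueryExecutor now carries an optional task_iterator and answers ReadTaskRequest packets in processReadTaskRequest() by invoking it and sending the result back over the connections. The exact TaskIterator signature is not visible in this excerpt, so the sketch below assumes a callable returning the next task as a string, with an empty string meaning "no more work"; the names makeTaskIterator and onReadTaskRequest are illustrative only.

```cpp
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

// Assumed shape of the task iterator: a callable shared by all replica
// connections that hands out the next unit of work.
using TaskIterator = std::function<std::string()>;

TaskIterator makeTaskIterator(std::vector<std::string> tasks)
{
    auto state = std::make_shared<std::pair<std::mutex, std::vector<std::string>>>();
    state->second = std::move(tasks);
    return [state]() -> std::string
    {
        std::lock_guard<std::mutex> lock(state->first);
        auto & queue = state->second;
        if (queue.empty())
            return {};              // nothing left: the replica should finish
        std::string next = queue.back();
        queue.pop_back();
        return next;
    };
}

// Mirrors processReadTaskRequest(): when a replica asks for work, call the
// iterator and send whatever it returns back over the connection.
void onReadTaskRequest(const TaskIterator & task_iterator)
{
    std::string response = task_iterator();
    std::cout << "sendReadTaskResponse(\"" << response << "\")\n";
}

int main()
{
    auto iterator = makeTaskIterator({"part-1.csv", "part-2.csv"});
    onReadTaskRequest(iterator);
    onReadTaskRequest(iterator);
    onReadTaskRequest(iterator);    // queue exhausted -> empty response
}
```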
- Packet packet = multiplexed_connections->drain(); + Packet packet = connections->drain(); switch (packet.type) { case Protocol::Server::EndOfStream: @@ -404,7 +430,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr * read_context) got_unknown_packet_from_replica = true; throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", toString(packet.type), - multiplexed_connections->dumpAddresses()); + connections->dumpAddresses()); } } @@ -427,14 +453,14 @@ void RemoteQueryExecutor::cancel(std::unique_ptr * read_context) void RemoteQueryExecutor::sendScalars() { - multiplexed_connections->sendScalarsData(scalars); + connections->sendScalarsData(scalars); } void RemoteQueryExecutor::sendExternalTables() { SelectQueryInfo query_info; - size_t count = multiplexed_connections->size(); + size_t count = connections->size(); { std::lock_guard lock(external_tables_mutex); @@ -472,7 +498,7 @@ void RemoteQueryExecutor::sendExternalTables() } } - multiplexed_connections->sendExternalTablesData(external_tables_data); + connections->sendExternalTablesData(external_tables_data); } void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr * read_context) @@ -489,11 +515,11 @@ void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptrcancel(); - multiplexed_connections->sendCancel(); + connections->sendCancel(); } if (log) - LOG_TRACE(log, "({}) {}", multiplexed_connections->dumpAddresses(), reason); + LOG_TRACE(log, "({}) {}", connections->dumpAddresses(), reason); } bool RemoteQueryExecutor::isQueryPending() const diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 6a10627b948..a9cffd9cf97 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -1,7 +1,8 @@ #pragma once #include -#include +#include +#include #include #include #include @@ -25,6 +26,9 @@ using ProfileInfoCallback = std::function; + /// This class allows one to launch queries on remote replicas of one shard and get results class RemoteQueryExecutor { @@ -34,23 +38,23 @@ public: /// Takes already set connection. RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, ThrottlerPtr throttler_ = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::shared_ptr task_iterator_ = {}); /// Accepts several connections already taken from pool. RemoteQueryExecutor( - std::vector && connections, - const String & query_, const Block & header_, const Context & context_, + std::vector && connections_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::shared_ptr task_iterator_ = {}); /// Takes a pool and gets one or several connections from it. 
RemoteQueryExecutor( const ConnectionPoolWithFailoverPtr & pool, - const String & query_, const Block & header_, const Context & context_, + const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::shared_ptr task_iterator_ = {}); ~RemoteQueryExecutor(); @@ -103,12 +107,12 @@ private: Block totals; Block extremes; - std::function()> create_multiplexed_connections; - std::unique_ptr multiplexed_connections; + std::function()> create_connections; + std::unique_ptr connections; const String query; - String query_id = ""; - Context context; + String query_id; + ContextPtr context; ProgressCallback progress_callback; ProfileInfoCallback profile_info_callback; @@ -118,6 +122,8 @@ private: /// Temporary tables needed to be sent to remote servers Tables external_tables; QueryProcessingStage::Enum stage; + /// Initiator identifier for distributed task processing + std::shared_ptr task_iterator; /// Streams for reading from temporary tables and following sending of data /// to remote servers for GLOBAL-subqueries @@ -178,6 +184,8 @@ private: /// Return true if duplicates found. bool setPartUUIDs(const std::vector & uuids); + void processReadTaskRequest(); + /// Cancell query and restart it with info about duplicated UUIDs /// only for `allow_experimental_query_deduplication`. std::variant restartQueryWithoutDuplicatedUUIDs(std::unique_ptr * read_context = nullptr); diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index 3cc24ad5056..46c2b6f10cb 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace DB @@ -11,7 +11,7 @@ namespace DB struct RemoteQueryExecutorRoutine { - MultiplexedConnections & connections; + IConnections & connections; RemoteQueryExecutorReadContext & read_context; struct ReadCallback @@ -19,15 +19,15 @@ struct RemoteQueryExecutorRoutine RemoteQueryExecutorReadContext & read_context; Fiber & fiber; - void operator()(Poco::Net::Socket & socket) + void operator()(int fd, const Poco::Timespan & timeout = 0, const std::string fd_description = "") { try { - read_context.setSocket(socket); + read_context.setConnectionFD(fd, timeout, fd_description); } catch (DB::Exception & e) { - e.addMessage(" while reading from socket ({})", socket.peerAddress().toString()); + e.addMessage(" while reading from {}", fd_description); throw; } @@ -70,67 +70,45 @@ namespace ErrorCodes extern const int SOCKET_TIMEOUT; } -RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) +RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(IConnections & connections_) : connections(connections_) { - epoll_fd = epoll_create(2); - if (-1 == epoll_fd) - throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE); if (-1 == pipe2(pipe_fd, O_NONBLOCK)) throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE); { - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.fd = pipe_fd[0]; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event)) - throwFromErrno("Cannot add pipe descriptor to epoll", 
ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(pipe_fd[0]); } { - epoll_event timer_event; - timer_event.events = EPOLLIN | EPOLLPRI; - timer_event.data.fd = timer.getDescriptor(); - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event)) - throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(timer.getDescriptor()); } auto routine = RemoteQueryExecutorRoutine{connections, *this}; fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine)); } -void RemoteQueryExecutorReadContext::setSocket(Poco::Net::Socket & socket) +void RemoteQueryExecutorReadContext::setConnectionFD(int fd, const Poco::Timespan & timeout, const std::string & fd_description) { - int fd = socket.impl()->sockfd(); - if (fd == socket_fd) + if (fd == connection_fd) return; - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.fd = fd; + if (connection_fd != -1) + epoll.remove(connection_fd); - if (socket_fd != -1) - { - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event)) - throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); - } + connection_fd = fd; + epoll.add(connection_fd); - socket_fd = fd; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event)) - throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); - - receive_timeout = socket.impl()->getReceiveTimeout(); + receive_timeout = timeout; + connection_fd_description = fd_description; } -bool RemoteQueryExecutorReadContext::checkTimeout() const +bool RemoteQueryExecutorReadContext::checkTimeout(bool blocking) { try { - return checkTimeoutImpl(); + return checkTimeoutImpl(blocking); } catch (DB::Exception & e) { @@ -140,30 +118,23 @@ bool RemoteQueryExecutorReadContext::checkTimeout() const } } -bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const +bool RemoteQueryExecutorReadContext::checkTimeoutImpl(bool blocking) { + /// Wait for epoll will not block if it was polled externally. epoll_event events[3]; events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; - /// Wait for epoll_fd will not block if it was polled externally. - int num_events = 0; - while (num_events <= 0) - { - num_events = epoll_wait(epoll_fd, events, 3, -1); - if (num_events == -1 && errno != EINTR) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); - } + int num_events = epoll.getManyReady(3, events, blocking); bool is_socket_ready = false; bool is_pipe_alarmed = false; - bool has_timer_alarm = false; for (int i = 0; i < num_events; ++i) { - if (events[i].data.fd == socket_fd) + if (events[i].data.fd == connection_fd) is_socket_ready = true; if (events[i].data.fd == timer.getDescriptor()) - has_timer_alarm = true; + is_timer_alarmed = true; if (events[i].data.fd == pipe_fd[0]) is_pipe_alarmed = true; } @@ -171,7 +142,7 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const if (is_pipe_alarmed) return false; - if (has_timer_alarm && !is_socket_ready) + if (is_timer_alarmed && !is_socket_ready) { /// Socket receive timeout. Drain it in case or error, or it may be hide by timeout exception. timer.drain(); @@ -212,9 +183,24 @@ bool RemoteQueryExecutorReadContext::resumeRoutine() void RemoteQueryExecutorReadContext::cancel() { std::lock_guard guard(fiber_lock); + /// It is safe to just destroy fiber - we are not in the process of reading from socket. 
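The read-context refactor above replaces the hand-rolled epoll_ctl calls with an Epoll wrapper but keeps the same wake-up trick: cancel() writes one value into pipe_fd[1] so that a fiber blocked on epoll_wait over pipe_fd[0] returns immediately. A standalone Linux sketch of just that mechanism; the real code also registers the connection socket and a timerfd in the same epoll.

```cpp
#include <cstdint>
#include <cstdio>
#include <fcntl.h>
#include <sys/epoll.h>
#include <unistd.h>

int main()
{
    int pipe_fd[2];
    if (pipe2(pipe_fd, O_NONBLOCK) == -1)
        return perror("pipe2"), 1;

    int epoll_fd = epoll_create1(0);
    if (epoll_fd == -1)
        return perror("epoll_create1"), 1;

    // Register the pipe read end: this is the "cancellation" descriptor.
    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = pipe_fd[0];
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &event) == -1)
        return perror("epoll_ctl"), 1;

    // "cancel": in the real code this write happens from another thread.
    uint64_t buf = 0;
    (void)write(pipe_fd[1], &buf, sizeof(buf));

    epoll_event ready{};
    int num = epoll_wait(epoll_fd, &ready, 1, /* timeout ms = */ 1000);
    if (num == 1 && ready.data.fd == pipe_fd[0])
        std::puts("woken up through the cancellation pipe");

    close(epoll_fd);
    close(pipe_fd[0]);
    close(pipe_fd[1]);
}
```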
boost::context::fiber to_destroy = std::move(fiber); + /// One should not try to wait for the current packet here in case of + /// timeout because this will exceed the timeout. + /// Anyway if the timeout is exceeded, then the connection will be shutdown + /// (disconnected), so it will not left in an unsynchronised state. + if (!is_timer_alarmed) + { + /// Wait for current pending packet, to avoid leaving connection in unsynchronised state. + while (is_read_in_progress.load(std::memory_order_relaxed)) + { + checkTimeout(/* blocking= */ true); + to_destroy = std::move(to_destroy).resume(); + } + } + /// Send something to pipe to cancel executor waiting. uint64_t buf = 0; while (-1 == write(pipe_fd[1], &buf, sizeof(buf))) @@ -229,9 +215,7 @@ void RemoteQueryExecutorReadContext::cancel() RemoteQueryExecutorReadContext::~RemoteQueryExecutorReadContext() { - /// socket_fd is closed by Poco::Net::Socket - if (epoll_fd != -1) - close(epoll_fd); + /// connection_fd is closed by Poco::Net::Socket or Epoll if (pipe_fd[0] != -1) close(pipe_fd[0]); if (pipe_fd[1] != -1) diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.h b/src/DataStreams/RemoteQueryExecutorReadContext.h index 89dc2813a9a..4e935bf9c43 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.h +++ b/src/DataStreams/RemoteQueryExecutorReadContext.h @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include namespace Poco::Net @@ -33,26 +35,30 @@ public: std::mutex fiber_lock; Poco::Timespan receive_timeout; - MultiplexedConnections & connections; + IConnections & connections; Poco::Net::Socket * last_used_socket = nullptr; /// Here we have three descriptors we are going to wait: - /// * socket_fd is a descriptor of connection. It may be changed in case of reading from several replicas. + /// * connection_fd is a descriptor of connection. It may be changed in case of reading from several replicas. /// * timer is a timerfd descriptor to manually check socket timeout /// * pipe_fd is a pipe we use to cancel query and socket polling by executor. - /// We put those descriptors into our own epoll_fd which is used by external executor. + /// We put those descriptors into our own epoll which is used by external executor. 
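The timer mentioned in the comment above is a timerfd sharing the epoll with the connection descriptor: it is armed with the receive timeout, and if it becomes readable before the socket does, the read is treated as timed out. A standalone sketch of that half of the pattern, using raw timerfd/epoll calls in place of ClickHouse's TimerDescriptor and Epoll wrappers:

```cpp
#include <cstdio>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>

int main()
{
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
    if (timer_fd == -1)
        return perror("timerfd_create"), 1;

    // Arm the timer for 100 ms -- this plays the role of receive_timeout.
    itimerspec spec{};
    spec.it_value.tv_nsec = 100 * 1000 * 1000;
    if (timerfd_settime(timer_fd, 0, &spec, nullptr) == -1)
        return perror("timerfd_settime"), 1;

    int epoll_fd = epoll_create1(0);
    if (epoll_fd == -1)
        return perror("epoll_create1"), 1;

    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = timer_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_fd, &event);

    // No socket is registered in this sketch, so the timer always "wins".
    epoll_event ready{};
    if (epoll_wait(epoll_fd, &ready, 1, -1) == 1 && ready.data.fd == timer_fd)
        std::puts("receive timeout elapsed before the socket became ready");

    close(timer_fd);
    close(epoll_fd);
}
```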
TimerDescriptor timer{CLOCK_MONOTONIC, 0}; - int socket_fd = -1; - int epoll_fd = -1; + bool is_timer_alarmed = false; + int connection_fd = -1; int pipe_fd[2] = { -1, -1 }; - explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_); + Epoll epoll; + + std::string connection_fd_description; + + explicit RemoteQueryExecutorReadContext(IConnections & connections_); ~RemoteQueryExecutorReadContext(); - bool checkTimeout() const; - bool checkTimeoutImpl() const; + bool checkTimeout(bool blocking = false); + bool checkTimeoutImpl(bool blocking); - void setSocket(Poco::Net::Socket & socket); + void setConnectionFD(int fd, const Poco::Timespan & timeout = 0, const std::string & fd_description = ""); void setTimer() const; bool resumeRoutine(); diff --git a/src/DataStreams/TTLAggregationAlgorithm.cpp b/src/DataStreams/TTLAggregationAlgorithm.cpp index ebe08159c55..9a1cf45772f 100644 --- a/src/DataStreams/TTLAggregationAlgorithm.cpp +++ b/src/DataStreams/TTLAggregationAlgorithm.cpp @@ -28,12 +28,12 @@ TTLAggregationAlgorithm::TTLAggregationAlgorithm( descr.arguments.push_back(header.getPositionByName(name)); columns_for_aggregator.resize(description.aggregate_descriptions.size()); - const Settings & settings = storage_.global_context.getSettingsRef(); + const Settings & settings = storage_.getContext()->getSettingsRef(); Aggregator::Params params(header, keys, aggregates, false, settings.max_rows_to_group_by, settings.group_by_overflow_mode, 0, 0, settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, - storage_.global_context.getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); + storage_.getContext()->getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); aggregator = std::make_unique(params); } diff --git a/src/DataStreams/TTLBlockInputStream.cpp b/src/DataStreams/TTLBlockInputStream.cpp index 4f141a03475..8b31da6d2f1 100644 --- a/src/DataStreams/TTLBlockInputStream.cpp +++ b/src/DataStreams/TTLBlockInputStream.cpp @@ -68,8 +68,9 @@ TTLBlockInputStream::TTLBlockInputStream( auto default_ast = it->second.expression->clone(); default_ast = addTypeConversionToAST(std::move(default_ast), column.type->getName()); - auto syntax_result = TreeRewriter(storage_.global_context).analyze(default_ast, metadata_snapshot_->getColumns().getAllPhysical()); - default_expression = ExpressionAnalyzer{default_ast, syntax_result, storage_.global_context}.getActions(true); + auto syntax_result + = TreeRewriter(storage_.getContext()).analyze(default_ast, metadata_snapshot_->getColumns().getAllPhysical()); + default_expression = ExpressionAnalyzer{default_ast, syntax_result, storage_.getContext()}.getActions(true); default_column_name = default_ast->getColumnName(); } diff --git a/src/DataStreams/copyData.cpp b/src/DataStreams/copyData.cpp index a0651999034..a26052778a8 100644 --- a/src/DataStreams/copyData.cpp +++ b/src/DataStreams/copyData.cpp @@ -49,6 +49,16 @@ void copyDataImpl(IBlockInputStream & from, IBlockOutputStream & to, TCancelCall to.writeSuffix(); } +void copyData(IBlockInputStream & from, IBlockOutputStream & to, const std::function & progress, + std::atomic * is_cancelled) +{ + auto is_cancelled_pred = [is_cancelled] () + { + return isAtomicSet(is_cancelled); + }; + + copyDataImpl(from, to, is_cancelled_pred, progress); +} inline void doNothing(const Block &) {} diff --git a/src/DataStreams/copyData.h b/src/DataStreams/copyData.h index f2bce8f411b..3dc90aed37d 
100644 --- a/src/DataStreams/copyData.h +++ b/src/DataStreams/copyData.h @@ -16,6 +16,9 @@ class Block; */ void copyData(IBlockInputStream & from, IBlockOutputStream & to, std::atomic * is_cancelled = nullptr); +void copyData(IBlockInputStream & from, IBlockOutputStream & to, const std::function & progress, + std::atomic * is_cancelled = nullptr); + void copyData(IBlockInputStream & from, IBlockOutputStream & to, const std::function & is_cancelled); void copyData(IBlockInputStream & from, IBlockOutputStream & to, const std::function & is_cancelled, diff --git a/src/DataTypes/DataTypeAggregateFunction.cpp b/src/DataTypes/DataTypeAggregateFunction.cpp index e92994ae979..7f7b01e031b 100644 --- a/src/DataTypes/DataTypeAggregateFunction.cpp +++ b/src/DataTypes/DataTypeAggregateFunction.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -58,207 +59,6 @@ std::string DataTypeAggregateFunction::doGetName() const return stream.str(); } -void DataTypeAggregateFunction::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const String & s = get(field); - writeVarUInt(s.size(), ostr); - writeString(s, ostr); -} - -void DataTypeAggregateFunction::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - UInt64 size; - readVarUInt(size, istr); - field = String(); - String & s = get(field); - s.resize(size); - istr.readStrict(s.data(), size); -} - -void DataTypeAggregateFunction::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - function->serialize(assert_cast(column).getData()[row_num], ostr); -} - -void DataTypeAggregateFunction::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnAggregateFunction & column_concrete = assert_cast(column); - - Arena & arena = column_concrete.createOrGetArena(); - size_t size_of_state = function->sizeOfData(); - AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); - - function->create(place); - try - { - function->deserialize(place, istr, &arena); - } - catch (...) - { - function->destroy(place); - throw; - } - - column_concrete.getData().push_back(place); -} - -void DataTypeAggregateFunction::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const ColumnAggregateFunction & real_column = typeid_cast(column); - const ColumnAggregateFunction::Container & vec = real_column.getData(); - - ColumnAggregateFunction::Container::const_iterator it = vec.begin() + offset; - ColumnAggregateFunction::Container::const_iterator end = limit ? it + limit : vec.end(); - - if (end > vec.end()) - end = vec.end(); - - for (; it != end; ++it) - function->serialize(*it, ostr); -} - -void DataTypeAggregateFunction::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - ColumnAggregateFunction & real_column = typeid_cast(column); - ColumnAggregateFunction::Container & vec = real_column.getData(); - - Arena & arena = real_column.createOrGetArena(); - real_column.set(function); - vec.reserve(vec.size() + limit); - - size_t size_of_state = function->sizeOfData(); - size_t align_of_state = function->alignOfData(); - - for (size_t i = 0; i < limit; ++i) - { - if (istr.eof()) - break; - - AggregateDataPtr place = arena.alignedAlloc(size_of_state, align_of_state); - - function->create(place); - - try - { - function->deserialize(place, istr, &arena); - } - catch (...) 
- { - function->destroy(place); - throw; - } - - vec.push_back(place); - } -} - -static String serializeToString(const AggregateFunctionPtr & function, const IColumn & column, size_t row_num) -{ - WriteBufferFromOwnString buffer; - function->serialize(assert_cast(column).getData()[row_num], buffer); - return buffer.str(); -} - -static void deserializeFromString(const AggregateFunctionPtr & function, IColumn & column, const String & s) -{ - ColumnAggregateFunction & column_concrete = assert_cast(column); - - Arena & arena = column_concrete.createOrGetArena(); - size_t size_of_state = function->sizeOfData(); - AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); - - function->create(place); - - try - { - ReadBufferFromString istr(s); - function->deserialize(place, istr, &arena); - } - catch (...) - { - function->destroy(place); - throw; - } - - column_concrete.getData().push_back(place); -} - -void DataTypeAggregateFunction::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeEscapedString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readEscapedString(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeQuotedString(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readQuotedStringWithSQLStyle(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readStringUntilEOF(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(serializeToString(function, column, row_num), ostr, settings); -} - - -void DataTypeAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - String s; - readJSONString(s, istr); - deserializeFromString(function, column, s); -} - - -void DataTypeAggregateFunction::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeXMLStringForTextElement(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeCSV(serializeToString(function, column, row_num), ostr); -} - - -void DataTypeAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - deserializeFromString(function, column, s); -} - - MutableColumnPtr DataTypeAggregateFunction::createColumn() const { return ColumnAggregateFunction::create(function); @@ -298,6 +98,11 @@ bool 
DataTypeAggregateFunction::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this) && getName() == rhs.getName(); } +SerializationPtr DataTypeAggregateFunction::doGetDefaultSerialization() const +{ + return std::make_shared(function); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index d07d46fd3ee..c3fea2ba727 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -39,27 +39,6 @@ public: DataTypePtr getReturnTypeToPredict() const { return function->getReturnTypeToPredict(); } DataTypes getArgumentsDataTypes() const { return argument_types; } - /// NOTE These two functions for serializing single values are incompatible with the functions below. - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -69,6 +48,8 @@ public: bool isParametric() const override { return true; } bool haveSubtypes() const override { return false; } bool shouldAlignRightInPrettyFormats() const override { return false; } + + SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 27088ab822c..bcf3a9c1f57 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -9,7 +9,9 @@ #include #include #include -#include +#include +#include +#include #include @@ -24,10 +26,7 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_READ_ALL_DATA; - extern const int CANNOT_READ_ARRAY_FROM_TEXT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int 
LOGICAL_ERROR; } @@ -37,490 +36,6 @@ DataTypeArray::DataTypeArray(const DataTypePtr & nested_) } -void DataTypeArray::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const Array & a = get(field); - writeVarUInt(a.size(), ostr); - for (size_t i = 0; i < a.size(); ++i) - { - nested->serializeBinary(a[i], ostr); - } -} - - -void DataTypeArray::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - size_t size; - readVarUInt(size, istr); - field = Array(size); - Array & arr = get(field); - for (size_t i = 0; i < size; ++i) - nested->deserializeBinary(arr[i], istr); -} - - -void DataTypeArray::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - size_t size = next_offset - offset; - - writeVarUInt(size, ostr); - - const IColumn & nested_column = column_array.getData(); - for (size_t i = offset; i < next_offset; ++i) - nested->serializeBinary(nested_column, i, ostr); -} - - -void DataTypeArray::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnArray & column_array = assert_cast(column); - ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t size; - readVarUInt(size, istr); - - IColumn & nested_column = column_array.getData(); - - size_t i = 0; - try - { - for (; i < size; ++i) - nested->deserializeBinary(nested_column, istr); - } - catch (...) - { - if (i) - nested_column.popBack(i); - throw; - } - - offsets.push_back(offsets.back() + size); -} - - -namespace -{ - void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) - { - const ColumnArray & column_array = typeid_cast(column); - const ColumnArray::Offsets & offset_values = column_array.getOffsets(); - size_t size = offset_values.size(); - - if (!size) - return; - - size_t end = limit && (offset + limit < size) - ? offset + limit - : size; - - ColumnArray::Offset prev_offset = offset_values[offset - 1]; - for (size_t i = offset; i < end; ++i) - { - ColumnArray::Offset current_offset = offset_values[i]; - writeIntBinary(current_offset - prev_offset, ostr); - prev_offset = current_offset; - } - } - - void deserializeArraySizesPositionIndependent(IColumn & column, ReadBuffer & istr, UInt64 limit) - { - ColumnArray & column_array = typeid_cast(column); - ColumnArray::Offsets & offset_values = column_array.getOffsets(); - size_t initial_size = offset_values.size(); - offset_values.resize(initial_size + limit); - - size_t i = initial_size; - ColumnArray::Offset current_offset = initial_size ? 
offset_values[initial_size - 1] : 0; - while (i < initial_size + limit && !istr.eof()) - { - ColumnArray::Offset current_size = 0; - readIntBinary(current_size, istr); - current_offset += current_size; - offset_values[i] = current_offset; - ++i; - } - - offset_values.resize(i); - } - - ColumnPtr arrayOffsetsToSizes(const IColumn & column) - { - const auto & column_offsets = assert_cast(column); - MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); - - if (column_offsets.empty()) - return column_sizes; - - const auto & offsets_data = column_offsets.getData(); - auto & sizes_data = assert_cast(*column_sizes).getData(); - - sizes_data.resize(offsets_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = offsets_data.size(); i < size; ++i) - { - auto current_offset = offsets_data[i]; - sizes_data[i] = current_offset - prev_offset; - prev_offset = current_offset; - } - - return column_sizes; - } - - ColumnPtr arraySizesToOffsets(const IColumn & column) - { - const auto & column_sizes = assert_cast(column); - MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); - - if (column_sizes.empty()) - return column_offsets; - - const auto & sizes_data = column_sizes.getData(); - auto & offsets_data = assert_cast(*column_offsets).getData(); - - offsets_data.resize(sizes_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = sizes_data.size(); i < size; ++i) - { - prev_offset += sizes_data[i]; - offsets_data[i] = prev_offset; - } - - return column_offsets; - } -} - - -void DataTypeArray::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::ArraySizes); - callback(path, *this); - path.back() = Substream::ArrayElements; - nested->enumerateStreams(callback, path); - path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->serializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->serializeBinaryBulkStateSuffix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::ArrayElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - const ColumnArray & column_array = typeid_cast(column); - - /// First serialize array sizes. - settings.path.push_back(Substream::ArraySizes); - if (auto * stream = settings.getter(settings.path)) - { - if (settings.position_independent_encoding) - serializeArraySizesPositionIndependent(column, *stream, offset, limit); - else - DataTypeNumber().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit); - } - - /// Then serialize contents of arrays. 
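The removed `arrayOffsetsToSizes` / `arraySizesToOffsets` helpers above are the two directions of one conversion: the offsets column stores running end positions, while the position-independent encoding stores each array's size (the first differences of the offsets). A self-contained sketch using plain `std::vector` instead of ClickHouse columns:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

/// For arrays [[1,2], [], [3]] the offsets column is {2, 2, 3}; the sizes are {2, 0, 1}.
std::vector<uint64_t> offsetsToSizes(const std::vector<uint64_t> & offsets)
{
    std::vector<uint64_t> sizes(offsets.size());
    uint64_t prev = 0;
    for (size_t i = 0; i < offsets.size(); ++i)
    {
        sizes[i] = offsets[i] - prev;
        prev = offsets[i];
    }
    return sizes;
}

std::vector<uint64_t> sizesToOffsets(const std::vector<uint64_t> & sizes)
{
    std::vector<uint64_t> offsets(sizes.size());
    uint64_t running = 0;
    for (size_t i = 0; i < sizes.size(); ++i)
    {
        running += sizes[i];
        offsets[i] = running;
    }
    return offsets;
}

int main()
{
    std::vector<uint64_t> offsets{2, 2, 3};
    assert(offsetsToSizes(offsets) == (std::vector<uint64_t>{2, 0, 1}));
    assert(sizesToOffsets(offsetsToSizes(offsets)) == offsets);
}
```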
- settings.path.back() = Substream::ArrayElements; - const ColumnArray::Offsets & offset_values = column_array.getOffsets(); - - if (offset > offset_values.size()) - return; - - /** offset - from which array to write. - * limit - how many arrays should be written, or 0, if you write everything that is. - * end - up to which array the recorded piece ends. - * - * nested_offset - from which element of the innards to write. - * nested_limit - how many elements of the innards to write, or 0, if you write everything that is. - */ - - size_t end = std::min(offset + limit, offset_values.size()); - - size_t nested_offset = offset ? offset_values[offset - 1] : 0; - size_t nested_limit = limit - ? offset_values[end - 1] - nested_offset - : 0; - - if (limit == 0 || nested_limit) - nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state); - settings.path.pop_back(); -} - - -void DataTypeArray::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - ColumnArray & column_array = typeid_cast(column); - settings.path.push_back(Substream::ArraySizes); - - if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) - { - column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column); - } - else if (auto * stream = settings.getter(settings.path)) - { - if (settings.position_independent_encoding) - deserializeArraySizesPositionIndependent(column, *stream, limit); - else - DataTypeNumber().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0); - - addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn())); - } - - settings.path.back() = Substream::ArrayElements; - - ColumnArray::Offsets & offset_values = column_array.getOffsets(); - ColumnPtr & nested_column = column_array.getDataPtr(); - - /// Number of values corresponding with `offset_values` must be read. - size_t last_offset = offset_values.back(); - if (last_offset < nested_column->size()) - throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR); - size_t nested_limit = last_offset - nested_column->size(); - - /// Adjust value size hint. Divide it to the average array size. - settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0; - - nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache); - - settings.path.pop_back(); - - /// Check consistency between offsets and elements subcolumns. - /// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER. 
- if (!nested_column->empty() && nested_column->size() != last_offset) - throw ParsingException("Cannot read all array values: read just " + toString(nested_column->size()) + " of " + toString(last_offset), - ErrorCodes::CANNOT_READ_ALL_DATA); -} - - -template -static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeChar('[', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - write_nested(nested_column, i); - } - writeChar(']', ostr); -} - - -template -static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) -{ - ColumnArray & column_array = assert_cast(column); - ColumnArray::Offsets & offsets = column_array.getOffsets(); - - IColumn & nested_column = column_array.getData(); - - size_t size = 0; - - bool has_braces = false; - if (checkChar('[', istr)) - has_braces = true; - else if (!allow_unenclosed) - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); - - try - { - bool first = true; - while (!istr.eof() && *istr.position() != ']') - { - if (!first) - { - if (*istr.position() == ',') - ++istr.position(); - else - throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, - "Cannot read array from text, expected comma or end of array, found '{}'", - *istr.position()); - } - - first = false; - - skipWhitespaceIfAny(istr); - - if (*istr.position() == ']') - break; - - read_nested(nested_column); - ++size; - - skipWhitespaceIfAny(istr); - } - - if (has_braces) - assertChar(']', istr); - else /// If array is not enclosed in braces, we read until EOF. - assertEOF(istr); - } - catch (...) 
- { - if (size) - nested_column.popBack(size); - throw; - } - - offsets.push_back(offsets.back() + size); -} - - -void DataTypeArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const IColumn & nested_column, size_t i) - { - nested->serializeAsTextQuoted(nested_column, i, ostr, settings); - }); -} - - -void DataTypeArray::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextImpl(column, istr, - [&](IColumn & nested_column) - { - nested->deserializeAsTextQuoted(nested_column, istr, settings); - }, false); -} - -void DataTypeArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeChar('[', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - nested->serializeAsTextJSON(nested_column, i, ostr, settings); - } - writeChar(']', ostr); -} - - -void DataTypeArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextImpl(column, istr, - [&](IColumn & nested_column) - { - nested->deserializeAsTextJSON(nested_column, istr, settings); - }, false); -} - - -void DataTypeArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const IColumn & nested_column = column_array.getData(); - - writeCString("", ostr); - for (size_t i = offset; i < next_offset; ++i) - { - writeCString("", ostr); - nested->serializeAsTextXML(nested_column, i, ostr, settings); - writeCString("", ostr); - } - writeCString("", ostr); -} - - -void DataTypeArray::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - /// There is no good way to serialize an array in CSV. Therefore, we serialize it into a string, and then write the resulting string in CSV. 
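The comment above describes the approach used for both the array and aggregate-function types: render the value with its normal text format first, then emit that whole string as a single CSV field. A standalone illustration; the `writeCSVString` here is a simplified stand-in for ClickHouse's CSV string escaping, which quotes the field and doubles embedded double quotes:

```cpp
#include <iostream>
#include <string>

/// Simplified stand-in for CSV string escaping: quote the field, double any inner quotes.
static std::string writeCSVString(const std::string & s)
{
    std::string out = "\"";
    for (char c : s)
        out += (c == '"') ? std::string("\"\"") : std::string(1, c);
    out += '"';
    return out;
}

int main()
{
    /// The array is first rendered in its quoted text form...
    std::string array_text = "['a','b,c']";
    /// ...and that whole string becomes one CSV field, so the inner comma is preserved.
    std::cout << writeCSVString(array_text) << '\n';  /// "['a','b,c']"
}
```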
- WriteBufferFromOwnString wb; - serializeText(column, row_num, wb, settings); - writeCSV(wb.str(), ostr); -} - - -void DataTypeArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - - if (settings.csv.input_format_arrays_as_nested_csv) - { - deserializeTextImpl(column, rb, - [&](IColumn & nested_column) - { - nested->deserializeAsTextCSV(nested_column, rb, settings); - }, true); - } - else - { - deserializeTextImpl(column, rb, - [&](IColumn & nested_column) - { - nested->deserializeAsTextQuoted(nested_column, rb, settings); - }, true); - } -} - - MutableColumnPtr DataTypeArray::createColumn() const { return ColumnArray::create(nested->createColumn(), ColumnArray::ColumnOffsets::create()); @@ -546,7 +61,7 @@ DataTypePtr DataTypeArray::tryGetSubcolumnType(const String & subcolumn_name) co DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const { if (subcolumn_name == "size" + std::to_string(level)) - return createOneElementTuple(std::make_shared(), subcolumn_name, false); + return std::make_shared(); DataTypePtr subcolumn; if (const auto * nested_array = typeid_cast(nested.get())) @@ -554,7 +69,10 @@ DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name else subcolumn = nested->tryGetSubcolumnType(subcolumn_name); - return (subcolumn ? std::make_shared(std::move(subcolumn)) : subcolumn); + if (subcolumn && subcolumn_name != MAIN_SUBCOLUMN_NAME) + subcolumn = std::make_shared(std::move(subcolumn)); + + return subcolumn; } ColumnPtr DataTypeArray::getSubcolumn(const String & subcolumn_name, const IColumn & column) const @@ -577,6 +95,32 @@ ColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name, const I return ColumnArray::create(subcolumn, column_array.getOffsetsPtr()); } +SerializationPtr DataTypeArray::getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +{ + return getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, 0); +} + +SerializationPtr DataTypeArray::getSubcolumnSerializationImpl( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const +{ + if (subcolumn_name == "size" + std::to_string(level)) + return std::make_shared(base_serialization_getter(DataTypeUInt64()), subcolumn_name, false); + + SerializationPtr subcolumn; + if (const auto * nested_array = typeid_cast(nested.get())) + subcolumn = nested_array->getSubcolumnSerializationImpl(subcolumn_name, base_serialization_getter, level + 1); + else + subcolumn = nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); + + return std::make_shared(subcolumn); +} + +SerializationPtr DataTypeArray::doGetDefaultSerialization() const +{ + return std::make_shared(nested->getDefaultSerialization()); +} + size_t DataTypeArray::getNumberOfDimensions() const { const DataTypeArray * nested_array = typeid_cast(nested.get()); diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 4185163e2e7..c720a15d798 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -1,13 +1,14 @@ #pragma once -#include +#include +#include namespace DB { -class DataTypeArray final : public DataTypeWithSimpleSerialization +class DataTypeArray final : public IDataType { private: /// The type of array elements. 
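The `"size" + std::to_string(level)` logic above is what backs the `size0`, `size1`, ... subcolumns of arrays: `size0` is the length of each outermost array, and `size1` holds, per row, the lengths of the arrays one level down. A standalone sketch of what those subcolumns contain for two rows of an `Array(Array(Int32))`-like value (plain `std::vector` here, not ClickHouse columns):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    /// Two rows of an Array(Array(Int32)) column: [[1,2],[]] and [[3]].
    std::vector<std::vector<std::vector<int>>> column = {{{1, 2}, {}}, {{3}}};

    std::vector<uint64_t> size0;               /// subcolumn size0: one UInt64 per row
    std::vector<std::vector<uint64_t>> size1;  /// subcolumn size1: Array(UInt64) per row

    for (const auto & row : column)
    {
        size0.push_back(row.size());
        std::vector<uint64_t> inner_sizes;
        for (const auto & inner : row)
            inner_sizes.push_back(inner.size());
        size1.push_back(inner_sizes);
    }

    for (auto s : size0)
        std::cout << s << ' ';                 /// 2 1
    std::cout << '\n';

    for (const auto & row : size1)
    {
        std::cout << '[';
        for (size_t i = 0; i < row.size(); ++i)
            std::cout << (i ? "," : "") << row[i];
        std::cout << "] ";                     /// [2,0] [1]
    }
    std::cout << '\n';
}
```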
@@ -35,56 +36,6 @@ public: return false; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - /** Streaming serialization of arrays is arranged in a special way: - * - elements placed in a row are written/read without array sizes; - * - the sizes are written/read in a separate stream, - * This is necessary, because when implementing nested structures, several arrays can have common sizes. - */ - - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -105,6 +56,10 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + + SerializationPtr doGetDefaultSerialization() const override; const DataTypePtr & getNestedType() const { return nested; } @@ -114,6 +69,8 @@ public: private: ColumnPtr getSubcolumnImpl(const String & subcolumn_name, const IColumn & column, size_t level) const; DataTypePtr tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const; + SerializationPtr getSubcolumnSerializationImpl( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter, size_t level) const; }; } diff --git a/src/DataTypes/DataTypeCustom.h b/src/DataTypes/DataTypeCustom.h index 0fa2e365990..55796e3cc7a 100644 --- 
a/src/DataTypes/DataTypeCustom.h +++ b/src/DataTypes/DataTypeCustom.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { @@ -24,106 +24,20 @@ public: virtual String getName() const = 0; }; -class IDataTypeCustomTextSerialization -{ -public: - virtual ~IDataTypeCustomTextSerialization() {} - - /** Text serialization for displaying on a terminal or saving into a text file, and the like. - * Without escaping or quoting. - */ - virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - - /** Text deserialization without quoting or escaping. - */ - virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization with escaping but without quoting. - */ - virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization as a literal that may be inserted into a query. - */ - virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization for the CSV format. - */ - virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization intended for using in JSON format. - */ - virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - - /** Text serialization for putting into the XML format. - */ - virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const = 0; -}; - -/** Allows to customize an existing data type by representation with custom substreams. - * Customized data type will be serialized/deserialized to files with different names than base type, - * but binary and text representation will be unchanged. - * E.g it can be used for reading single subcolumns of complex types. 
- */ -class IDataTypeCustomStreams -{ -public: - virtual ~IDataTypeCustomStreams() = default; - - virtual void enumerateStreams( - const IDataType::StreamCallback & callback, - IDataType::SubstreamPath & path) const = 0; - - virtual void serializeBinaryBulkStatePrefix( - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void serializeBinaryBulkStateSuffix( - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void deserializeBinaryBulkStatePrefix( - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state) const = 0; - - virtual void serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const = 0; - - virtual void deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state, - IDataType::SubstreamsCache * cache) const = 0; -}; - using DataTypeCustomNamePtr = std::unique_ptr; -using DataTypeCustomTextSerializationPtr = std::unique_ptr; -using DataTypeCustomStreamsPtr = std::unique_ptr; - /** Describe a data type customization */ struct DataTypeCustomDesc { DataTypeCustomNamePtr name; - DataTypeCustomTextSerializationPtr text_serialization; - DataTypeCustomStreamsPtr streams; + SerializationPtr serialization; DataTypeCustomDesc( DataTypeCustomNamePtr name_, - DataTypeCustomTextSerializationPtr text_serialization_ = nullptr, - DataTypeCustomStreamsPtr streams_ = nullptr) + SerializationPtr serialization_ = nullptr) : name(std::move(name_)) - , text_serialization(std::move(text_serialization_)) - , streams(std::move(streams_)) {} + , serialization(std::move(serialization_)) {} }; using DataTypeCustomDescPtr = std::unique_ptr; diff --git a/src/DataTypes/DataTypeCustomGeo.cpp b/src/DataTypes/DataTypeCustomGeo.cpp index 73d76e7e1e8..f7d05fa3be6 100644 --- a/src/DataTypes/DataTypeCustomGeo.cpp +++ b/src/DataTypes/DataTypeCustomGeo.cpp @@ -1,7 +1,6 @@ -#include +#include #include #include -#include #include #include #include @@ -9,106 +8,20 @@ namespace DB { -namespace -{ - -class DataTypeCustomPointSerialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - nestedDataType()->serializeAsText(column, row_num, ostr, settings); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - nestedDataType()->deserializeAsWholeText(column, istr, settings); - } - - static DataTypePtr nestedDataType() - { - static auto data_type = DataTypePtr(std::make_unique( - DataTypes({std::make_unique(), std::make_unique()}))); - return data_type; - } -}; - -class DataTypeCustomRingSerialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - nestedDataType()->serializeAsText(column, row_num, ostr, settings); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - nestedDataType()->deserializeAsWholeText(column, istr, settings); - } - - static 
DataTypePtr nestedDataType() - { - static auto data_type = DataTypePtr(std::make_unique(DataTypeCustomPointSerialization::nestedDataType())); - return data_type; - } -}; - -class DataTypeCustomPolygonSerialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - nestedDataType()->serializeAsText(column, row_num, ostr, settings); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - nestedDataType()->deserializeAsWholeText(column, istr, settings); - } - - static DataTypePtr nestedDataType() - { - static auto data_type = DataTypePtr(std::make_unique(DataTypeCustomRingSerialization::nestedDataType())); - return data_type; - } -}; - -class DataTypeCustomMultiPolygonSerialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - nestedDataType()->serializeAsText(column, row_num, ostr, settings); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - nestedDataType()->deserializeAsWholeText(column, istr, settings); - } - - static DataTypePtr nestedDataType() - { - static auto data_type = DataTypePtr(std::make_unique(DataTypeCustomPolygonSerialization::nestedDataType())); - return data_type; - } -}; - -} - void registerDataTypeDomainGeo(DataTypeFactory & factory) { // Custom type for point represented as its coordinates stored as Tuple(Float64, Float64) factory.registerSimpleDataTypeCustom("Point", [] { return std::make_pair(DataTypeFactory::instance().get("Tuple(Float64, Float64)"), - std::make_unique(std::make_unique("Point"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for simple polygon without holes stored as Array(Point) factory.registerSimpleDataTypeCustom("Ring", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Point)"), - std::make_unique(std::make_unique("Ring"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for polygon with holes stored as Array(Ring) @@ -116,14 +29,14 @@ void registerDataTypeDomainGeo(DataTypeFactory & factory) factory.registerSimpleDataTypeCustom("Polygon", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Ring)"), - std::make_unique(std::make_unique("Polygon"), std::make_unique())); + std::make_unique(std::make_unique())); }); // Custom type for multiple polygons with holes stored as Array(Polygon) factory.registerSimpleDataTypeCustom("MultiPolygon", [] { return std::make_pair(DataTypeFactory::instance().get("Array(Polygon)"), - std::make_unique(std::make_unique("MultiPolygon"), std::make_unique())); + std::make_unique(std::make_unique())); }); } diff --git a/src/DataTypes/DataTypeCustomGeo.h b/src/DataTypes/DataTypeCustomGeo.h new file mode 100644 index 00000000000..c2a83b3e577 --- /dev/null +++ b/src/DataTypes/DataTypeCustomGeo.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +class DataTypePointName : public DataTypeCustomFixedName +{ +public: + DataTypePointName() : DataTypeCustomFixedName("Point") {} +}; + +class DataTypeRingName : public DataTypeCustomFixedName +{ +public: + DataTypeRingName() : DataTypeCustomFixedName("Ring") {} +}; + +class DataTypePolygonName : public DataTypeCustomFixedName +{ +public: + 
DataTypePolygonName() : DataTypeCustomFixedName("Polygon") {} +}; + +class DataTypeMultiPolygonName : public DataTypeCustomFixedName +{ +public: + DataTypeMultiPolygonName() : DataTypeCustomFixedName("MultiPolygon") {} +}; + +} diff --git a/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp b/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp index 78a1e18679d..808aa43528e 100644 --- a/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp +++ b/src/DataTypes/DataTypeCustomIPv4AndIPv6.cpp @@ -1,115 +1,24 @@ -#include -#include -#include -#include +#include #include #include -#include namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; - extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; -} - -namespace -{ - -class DataTypeCustomIPv4Serialization : public DataTypeCustomSimpleTextSerialization -{ -public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - const auto * col = checkAndGetColumn(&column); - if (!col) - { - throw Exception("IPv4 type can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; - char * ptr = buffer; - formatIPv4(reinterpret_cast(&col->getData()[row_num]), ptr); - - ostr.write(buffer, strlen(buffer)); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - ColumnUInt32 * col = typeid_cast(&column); - if (!col) - { - throw Exception("IPv4 type can only deserialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); - UInt32 ipv4_value = 0; - if (!parseIPv4(buffer, reinterpret_cast(&ipv4_value))) - { - throw Exception("Invalid IPv4 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - } - - col->insert(ipv4_value); - } -}; - -class DataTypeCustomIPv6Serialization : public DataTypeCustomSimpleTextSerialization -{ -public: - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - const auto * col = checkAndGetColumn(&column); - if (!col) - { - throw Exception("IPv6 type domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; - char * ptr = buffer; - formatIPv6(reinterpret_cast(col->getDataAt(row_num).data), ptr); - - ostr.write(buffer, strlen(buffer)); - } - - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - ColumnFixedString * col = typeid_cast(&column); - if (!col) - { - throw Exception("IPv6 type domain can only deserialize columns of type FixedString(16)." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); - } - - char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); - - std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); - if (!parseIPv6(buffer, reinterpret_cast(ipv6_value.data()))) - { - throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); - } - - col->insertString(ipv6_value); - } -}; - -} - void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory) { factory.registerSimpleDataTypeCustom("IPv4", [] { - return std::make_pair(DataTypeFactory::instance().get("UInt32"), - std::make_unique(std::make_unique("IPv4"), std::make_unique())); + auto type = DataTypeFactory::instance().get("UInt32"); + return std::make_pair(type, std::make_unique( + std::make_unique("IPv4"), std::make_unique(type->getDefaultSerialization()))); }); factory.registerSimpleDataTypeCustom("IPv6", [] { - return std::make_pair(DataTypeFactory::instance().get("FixedString(16)"), - std::make_unique(std::make_unique("IPv6"), std::make_unique())); + auto type = DataTypeFactory::instance().get("FixedString(16)"); + return std::make_pair(type, std::make_unique( + std::make_unique("IPv6"), std::make_unique(type->getDefaultSerialization()))); }); /// MySQL, MariaDB diff --git a/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp b/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp deleted file mode 100644 index 5bb963de667..00000000000 --- a/src/DataTypes/DataTypeCustomSimpleTextSerialization.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include - -#include -#include -#include -#include - -namespace -{ -using namespace DB; - -String serializeToString(const DataTypeCustomSimpleTextSerialization & domain, const IColumn & column, size_t row_num, const FormatSettings & settings) -{ - WriteBufferFromOwnString buffer; - domain.serializeText(column, row_num, buffer, settings); - - return buffer.str(); -} - -void deserializeFromString(const DataTypeCustomSimpleTextSerialization & domain, IColumn & column, const String & s, const FormatSettings & settings) -{ - ReadBufferFromString istr(s); - domain.deserializeText(column, istr, settings); -} - -} - -namespace DB -{ - -void DataTypeCustomSimpleTextSerialization::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readEscapedString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readQuotedString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, 
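With `DataTypeCustomDesc` reduced to a name plus a single `SerializationPtr`, registering a custom domain follows the same shape as the IPv4/IPv6 registrations above. This is a hypothetical sketch only: `MyDomain` and `SerializationMyDomain` are invented names, and the constructor arguments simply mirror the registrations shown in this diff.

```cpp
/// Hypothetical registration under the consolidated scheme (not part of this change):
/// one custom name plus one serialization object wrapping the base type's default one.
void registerDataTypeDomainMyDomain(DataTypeFactory & factory)
{
    factory.registerSimpleDataTypeCustom("MyDomain", []
    {
        auto type = DataTypeFactory::instance().get("UInt64");
        return std::make_pair(type, std::make_unique<DataTypeCustomDesc>(
            std::make_unique<DataTypeCustomFixedName>("MyDomain"),
            std::make_unique<SerializationMyDomain>(type->getDefaultSerialization())));
    });
}
```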
const FormatSettings & settings) const -{ - writeCSVString(serializeToString(*this, column, row_num, settings), ostr); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readCSVString(str, istr, settings.csv); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); -} - -void DataTypeCustomSimpleTextSerialization::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String str; - readJSONString(str, istr); - deserializeFromString(*this, column, str, settings); -} - -void DataTypeCustomSimpleTextSerialization::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); -} - -} diff --git a/src/DataTypes/DataTypeCustom_fwd.h b/src/DataTypes/DataTypeCustom_fwd.h deleted file mode 100644 index 99c8eee9748..00000000000 --- a/src/DataTypes/DataTypeCustom_fwd.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class IDataTypeCustomName; -class IDataTypeCustomTextSerialization; -class IDataTypeCustomStreams; -struct DataTypeCustomDesc; - -using DataTypeCustomNamePtr = std::unique_ptr; -using DataTypeCustomTextSerializationPtr = std::unique_ptr; -using DataTypeCustomStreamsPtr = std::unique_ptr; -using DataTypeCustomDescPtr = std::unique_ptr; - -} diff --git a/src/DataTypes/DataTypeDate.cpp b/src/DataTypes/DataTypeDate.cpp index 192a89cc454..0df2e329702 100644 --- a/src/DataTypes/DataTypeDate.cpp +++ b/src/DataTypes/DataTypeDate.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -11,79 +12,15 @@ namespace DB { -void DataTypeDate::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeDateText(DayNum(assert_cast(column).getData()[row_num]), ostr); -} - -void DataTypeDate::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - readDateText(x, istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeDate::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDate::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - assertChar('\'', istr); - readDateText(x, istr); - assertChar('\'', istr); - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
-} - -void DataTypeDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DayNum x; - assertChar('"', istr); - readDateText(x, istr); - assertChar('"', istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - LocalDate value; - readCSV(value, istr); - assert_cast(column).getData().push_back(value.getDayNum()); -} - bool DataTypeDate::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeDate::doGetDefaultSerialization() const +{ + return std::make_shared(); +} void registerDataTypeDate(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 496d7fe0b22..2f17207cc07 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -14,21 +14,13 @@ public: TypeIndex getTypeId() const override { return TypeIndex::Date; } const char * getFamilyName() const override { return family_name; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } bool equals(const IDataType & rhs) const override; + +protected: + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeDateTime.cpp b/src/DataTypes/DataTypeDateTime.cpp index d2bbb4a1efa..820bfd70766 100644 --- a/src/DataTypes/DataTypeDateTime.cpp +++ b/src/DataTypes/DataTypeDateTime.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -12,23 +13,6 @@ #include #include -namespace -{ -using namespace DB; -inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) -{ - switch (settings.date_time_input_format) - { - case FormatSettings::DateTimeInputFormat::Basic: - 
readDateTimeText(x, istr, time_zone); - return; - case FormatSettings::DateTimeInputFormat::BestEffort: - parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); - return; - } -} -} - namespace DB { @@ -36,7 +20,8 @@ TimezoneMixin::TimezoneMixin(const String & time_zone_name) : has_explicit_time_zone(!time_zone_name.empty()), time_zone(DateLUT::instance(time_zone_name)), utc_time_zone(DateLUT::instance("UTC")) -{} +{ +} DataTypeDateTime::DataTypeDateTime(const String & time_zone_name) : TimezoneMixin(time_zone_name) @@ -45,7 +30,8 @@ DataTypeDateTime::DataTypeDateTime(const String & time_zone_name) DataTypeDateTime::DataTypeDateTime(const TimezoneMixin & time_zone_) : TimezoneMixin(time_zone_) -{} +{ +} String DataTypeDateTime::doGetName() const { @@ -57,111 +43,6 @@ String DataTypeDateTime::doGetName() const return out.str(); } -void DataTypeDateTime::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - auto value = assert_cast(column).getData()[row_num]; - switch (settings.date_time_output_format) - { - case FormatSettings::DateTimeOutputFormat::Simple: - writeDateTimeText(value, ostr, time_zone); - return; - case FormatSettings::DateTimeOutputFormat::UnixTimestamp: - writeIntText(value, ostr); - return; - case FormatSettings::DateTimeOutputFormat::ISO: - writeDateTimeTextISO(value, ostr, utc_time_zone); - return; - } -} - -void DataTypeDateTime::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeDateTime::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x; - ::readText(x, istr, settings, time_zone, utc_time_zone); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x; - if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' - { - ::readText(x, istr, settings, time_zone, utc_time_zone); - assertChar('\'', istr); - } - else /// Just 1504193808 or 01504193808 - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
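The removed `deserializeTextQuoted` above spells out the accepted inputs: a quoted value such as `'2017-08-31 18:36:48'` or `'1504193808'`, or a bare integer such as `1504193808`. A tiny standalone illustration of that "optionally quoted" rule for the numeric case only (the real code also parses the quoted date-time text using the column's time zone):

```cpp
#include <cstdint>
#include <iostream>
#include <string>

/// Strip one pair of surrounding single quotes, if present, then parse the number.
static int64_t parseMaybeQuotedTimestamp(const std::string & s)
{
    size_t begin = 0;
    size_t end = s.size();
    if (end >= 2 && s.front() == '\'' && s.back() == '\'')
    {
        ++begin;
        --end;
    }
    return std::stoll(s.substr(begin, end - begin));
}

int main()
{
    std::cout << parseMaybeQuotedTimestamp("'1504193808'") << '\n';  /// 1504193808
    std::cout << parseMaybeQuotedTimestamp("1504193808") << '\n';    /// 1504193808
}
```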
-} - -void DataTypeDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x; - if (checkChar('"', istr)) - { - ::readText(x, istr, settings, time_zone, utc_time_zone); - assertChar('"', istr); - } - else - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - time_t x; - - if (istr.eof()) - throwReadAfterEOF(); - - char maybe_quote = *istr.position(); - - if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - ::readText(x, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, istr); - - assert_cast(column).getData().push_back(x); -} - bool DataTypeDateTime::equals(const IDataType & rhs) const { /// DateTime with different timezones are equal, because: @@ -169,4 +50,9 @@ bool DataTypeDateTime::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeDateTime::doGetDefaultSerialization() const +{ + return std::make_shared(time_zone, utc_time_zone); +} + } diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index edec889309b..926d529a5d8 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -19,9 +19,12 @@ public: TimezoneMixin(const TimezoneMixin &) = default; const DateLUTImpl & getTimeZone() const { return time_zone; } + bool hasExplicitTimeZone() const { return has_explicit_time_zone; } protected: + /// true if time zone name was provided in data type parameters, false if it's using default time zone. 
bool has_explicit_time_zone; + const DateLUTImpl & time_zone; const DateLUTImpl & utc_time_zone; }; @@ -58,21 +61,12 @@ public: String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } bool equals(const IDataType & rhs) const override; + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeDateTime64.cpp b/src/DataTypes/DataTypeDateTime64.cpp index 17b94e871bf..eaec585b6b4 100644 --- a/src/DataTypes/DataTypeDateTime64.cpp +++ b/src/DataTypes/DataTypeDateTime64.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -55,131 +56,6 @@ std::string DataTypeDateTime64::doGetName() const return out.str(); } -void DataTypeDateTime64::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - auto value = assert_cast(column).getData()[row_num]; - switch (settings.date_time_output_format) - { - case FormatSettings::DateTimeOutputFormat::Simple: - writeDateTimeText(value, scale, ostr, time_zone); - return; - case FormatSettings::DateTimeOutputFormat::UnixTimestamp: - writeDateTimeUnixTimestamp(value, scale, ostr); - return; - case FormatSettings::DateTimeOutputFormat::ISO: - writeDateTimeTextISO(value, scale, ostr, utc_time_zone); - return; - } -} - -void DataTypeDateTime64::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - DateTime64 result = 0; - readDateTime64Text(result, this->getScale(), istr, time_zone); - assert_cast(column).getData().push_back(result); -} - -void DataTypeDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings); -} - -void DataTypeDateTime64::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) -{ - switch (settings.date_time_input_format) - { - case FormatSettings::DateTimeInputFormat::Basic: - readDateTime64Text(x, 
scale, istr, time_zone); - return; - case FormatSettings::DateTimeInputFormat::BestEffort: - parseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); - return; - } -} - -void DataTypeDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' - { - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assertChar('\'', istr); - } - else /// Just 1504193808 or 01504193808 - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. -} - -void DataTypeDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - if (checkChar('"', istr)) - { - readText(x, scale, istr, settings, time_zone, utc_time_zone); - assertChar('"', istr); - } - else - { - readIntText(x, istr); - } - assert_cast(column).getData().push_back(x); -} - -void DataTypeDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - DateTime64 x = 0; - - if (istr.eof()) - throwReadAfterEOF(); - - char maybe_quote = *istr.position(); - - if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - readText(x, scale, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, istr); - - assert_cast(column).getData().push_back(x); -} - bool DataTypeDateTime64::equals(const IDataType & rhs) const { if (const auto * ptype = typeid_cast(&rhs)) @@ -187,4 +63,9 @@ bool DataTypeDateTime64::equals(const IDataType & rhs) const return false; } +SerializationPtr DataTypeDateTime64::doGetDefaultSerialization() const +{ + return std::make_shared(time_zone, utc_time_zone, scale); +} + } diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index 198c3739f58..f51e0f5d047 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -31,82 +31,12 @@ public: std::string doGetName() const override; TypeIndex getTypeId() const override { return type_id; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings 
& settings) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool equals(const IDataType & rhs) const override; bool canBePromoted() const override { return false; } -}; -/** Tansform-type wrapper for DateTime64, applies given Transform to DateTime64 value or only to a whole part of it. - * - * Depending on what overloads of Transform::execute() are available, when called with DateTime64 value, - * invokes Transform::execute() with: - * * whole part of DateTime64 value, discarding fractional part. - * * DateTime64 value and scale factor. - * - * Suitable Transfotm-types are commonly used in Date/DateTime manipulation functions, - * and should implement static (or const) function with following signatures: - * R execute(UInt32 whole_value, ... , const TimeZoneImpl &) - * OR - * R execute(DateTime64 value, Int64 scale_factor, ... , const TimeZoneImpl &) - * - * Where R and T could be arbitrary types. -*/ -template -class TransformDateTime64 : public Transform -{ -private: - // Detect if Transform::execute is const or static method - // with signature defined by template args (ignoring result type). - template - struct TransformHasExecuteOverload : std::false_type {}; - - template - struct TransformHasExecuteOverload().execute(std::declval()...))>, Args...> - : std::true_type {}; - - template - static constexpr bool TransformHasExecuteOverload_v = TransformHasExecuteOverload::value; - -public: - static constexpr auto name = Transform::name; - - using Transform::execute; - - // non-explicit constructor to allow creating from scale value (or with no scale at all), indispensable in some contexts. - TransformDateTime64(UInt32 scale_ = 0) - : scale_multiplier(DecimalUtils::scaleMultiplier(scale_)) - {} - - template - inline auto execute(const DateTime64 & t, Args && ... 
args) const - { - const auto transform = static_cast(this); - - if constexpr (TransformHasExecuteOverload_v) - { - return transform->execute(t, scale_multiplier, std::forward(args)...); - } - else - { - const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); - return transform->execute(static_cast(components.whole), std::forward(args)...); - } - } - -private: - DateTime64::NativeType scale_multiplier = 1; +protected: + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeDecimalBase.cpp b/src/DataTypes/DataTypeDecimalBase.cpp index ab17996167c..683710b8880 100644 --- a/src/DataTypes/DataTypeDecimalBase.cpp +++ b/src/DataTypes/DataTypeDecimalBase.cpp @@ -19,9 +19,14 @@ namespace ErrorCodes { } - -bool decimalCheckComparisonOverflow(const Context & context) { return context.getSettingsRef().decimal_check_overflow; } -bool decimalCheckArithmeticOverflow(const Context & context) { return context.getSettingsRef().decimal_check_overflow; } +bool decimalCheckComparisonOverflow(ContextPtr context) +{ + return context->getSettingsRef().decimal_check_overflow; +} +bool decimalCheckArithmeticOverflow(ContextPtr context) +{ + return context->getSettingsRef().decimal_check_overflow; +} template Field DataTypeDecimalBase::getDefault() const @@ -35,59 +40,6 @@ MutableColumnPtr DataTypeDecimalBase::createColumn() const return ColumnType::create(0, scale); } -template -void DataTypeDecimalBase::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - FieldType x = get>(field); - writeBinary(x, ostr); -} - -template -void DataTypeDecimalBase::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const FieldType & x = assert_cast(column).getElement(row_num); - writeBinary(x, ostr); -} - -template -void DataTypeDecimalBase::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const typename ColumnType::Container & x = typeid_cast(column).getData(); - - size_t size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); -} - -template -void DataTypeDecimalBase::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - typename FieldType::NativeType x; - readBinary(x, istr); - field = DecimalField(T(x), this->scale); -} - -template -void DataTypeDecimalBase::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - typename FieldType::NativeType x; - readBinary(x, istr); - assert_cast(column).getData().push_back(FieldType(x)); -} - -template -void DataTypeDecimalBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double) const -{ - typename ColumnType::Container & x = typeid_cast(column).getData(); - size_t initial_size = x.size(); - x.resize(initial_size + limit); - size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); - x.resize(initial_size + size / sizeof(FieldType)); -} - template T DataTypeDecimalBase::getScaleMultiplier(UInt32 scale_) { diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index d9079166fa7..9e44310a0ff 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -1,12 +1,12 @@ #pragma once -#include #include #include #include #include -#include +#include +#include #include @@ -18,9 +18,8 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; } -class Context; -bool 
decimalCheckComparisonOverflow(const Context & context); -bool decimalCheckArithmeticOverflow(const Context & context); +bool decimalCheckComparisonOverflow(ContextPtr context); +bool decimalCheckArithmeticOverflow(ContextPtr context); inline UInt32 leastDecimalPrecisionFor(TypeIndex int_type) { @@ -55,7 +54,7 @@ inline UInt32 leastDecimalPrecisionFor(TypeIndex int_type) /// P is one of (9, 18, 38, 76); equals to the maximum precision for the biggest underlying type of operands. /// S is maximum scale of operands. The allowed valuas are [0, precision] template -class DataTypeDecimalBase : public DataTypeWithSimpleSerialization +class DataTypeDecimalBase : public IDataType { static_assert(IsDecimalNumber); @@ -96,14 +95,6 @@ public: bool canBeUsedInBooleanContext() const override { return true; } bool canBeInsideNullable() const override { return true; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - /// Decimal specific UInt32 getPrecision() const { return precision; } diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index 043c971266c..b8b0b906cc4 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -19,7 +20,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_TYPE_OF_FIELD; - extern const int SYNTAX_ERROR; extern const int EMPTY_DATA_PASSED; extern const int UNEXPECTED_AST_STRUCTURE; extern const int ARGUMENT_OUT_OF_BOUND; @@ -65,203 +65,22 @@ std::string DataTypeEnum::generateName(const Values & values) } template -void DataTypeEnum::fillMaps() +DataTypeEnum::DataTypeEnum(const Values & values_) + : EnumValues(values_) + , type_name(generateName(this->getValues())) { - for (const auto & name_and_value : values) - { - const auto inserted_value = name_to_value_map.insert( - { StringRef{name_and_value.first}, name_and_value.second }); - - if (!inserted_value.second) - throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and " + toString(inserted_value.first->getMapped()), - ErrorCodes::SYNTAX_ERROR}; - - const auto inserted_name = value_to_name_map.insert( - { name_and_value.second, StringRef{name_and_value.first} }); - - if (!inserted_name.second) - throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and '" + toString((*inserted_name.first).first) + "'", - ErrorCodes::SYNTAX_ERROR}; - } -} - -template -DataTypeEnum::DataTypeEnum(const Values & values_) : values{values_} -{ - if (values.empty()) - throw Exception{"DataTypeEnum enumeration cannot be empty", ErrorCodes::EMPTY_DATA_PASSED}; - - std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right) - { - return left.second < right.second; - }); - - fillMaps(); - type_name = generateName(values); -} - -template -void DataTypeEnum::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const FieldType x = 
get>(field); - writeBinary(x, ostr); -} - -template -void DataTypeEnum::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - FieldType x; - readBinary(x, istr); - field = castToNearestFieldType(x); -} - -template -void DataTypeEnum::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - writeBinary(assert_cast(column).getData()[row_num], ostr); -} - -template -void DataTypeEnum::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - typename ColumnType::ValueType x; - readBinary(x, istr); - assert_cast(column).getData().push_back(x); -} - -template -void DataTypeEnum::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeEscapedString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.tsv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. - std::string field_name; - readEscapedString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void DataTypeEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeQuotedString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - std::string field_name; - readQuotedStringWithSQLStyle(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); -} - -template -void DataTypeEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.tsv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void DataTypeEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(getNameForValue(assert_cast(column).getData()[row_num]), ostr, settings); -} - -template -void DataTypeEnum::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeXMLStringForTextElement(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - if (!istr.eof() && *istr.position() != '"') - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readJSONString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); - } -} - -template -void DataTypeEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - 
writeCSVString(getNameForValue(assert_cast(column).getData()[row_num]), ostr); -} - -template -void DataTypeEnum::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.csv.input_format_enum_as_number) - assert_cast(column).getData().push_back(readValue(istr)); - else - { - std::string field_name; - readCSVString(field_name, istr, settings.csv); - assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); - } -} - -template -void DataTypeEnum::serializeBinaryBulk( - const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const -{ - const auto & x = typeid_cast(column).getData(); - const auto size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); -} - -template -void DataTypeEnum::deserializeBinaryBulk( - IColumn & column, ReadBuffer & istr, const size_t limit, const double /*avg_value_size_hint*/) const -{ - auto & x = typeid_cast(column).getData(); - const auto initial_size = x.size(); - x.resize(initial_size + limit); - const auto size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); - x.resize(initial_size + size / sizeof(FieldType)); } template Field DataTypeEnum::getDefault() const { - return values.front().second; + return this->getValues().front().second; } template void DataTypeEnum::insertDefaultInto(IColumn & column) const { - assert_cast(column).getData().push_back(values.front().second); + assert_cast(column).getData().push_back(this->getValues().front().second); } template @@ -274,7 +93,7 @@ bool DataTypeEnum::equals(const IDataType & rhs) const template bool DataTypeEnum::textCanContainOnlyValidUTF8() const { - for (const auto & elem : values) + for (const auto & elem : this->getValues()) { const char * pos = elem.first.data(); const char * end = pos + elem.first.size(); @@ -305,14 +124,14 @@ Field DataTypeEnum::castToName(const Field & value_or_name) const { if (value_or_name.getType() == Field::Types::String) { - getValue(value_or_name.get()); /// Check correctness + this->getValue(value_or_name.get()); /// Check correctness return value_or_name.get(); } else if (value_or_name.getType() == Field::Types::Int64) { Int64 value = value_or_name.get(); checkOverflow(value); - return getNameForValue(static_cast(value)).toString(); + return this->getNameForValue(static_cast(value)).toString(); } else throw Exception(String("DataTypeEnum: Unsupported type of field ") + value_or_name.getTypeName(), ErrorCodes::BAD_TYPE_OF_FIELD); @@ -323,14 +142,14 @@ Field DataTypeEnum::castToValue(const Field & value_or_name) const { if (value_or_name.getType() == Field::Types::String) { - return getValue(value_or_name.get()); + return this->getValue(value_or_name.get()); } else if (value_or_name.getType() == Field::Types::Int64 || value_or_name.getType() == Field::Types::UInt64) { Int64 value = value_or_name.get(); checkOverflow(value); - getNameForValue(static_cast(value)); /// Check correctness + this->getNameForValue(static_cast(value)); /// Check correctness return value; } else @@ -341,25 +160,19 @@ Field DataTypeEnum::castToValue(const Field & value_or_name) const template bool DataTypeEnum::contains(const IDataType & rhs) const { - auto check = [&](const auto & value) - { - auto it = name_to_value_map.find(value.first); - /// If we don't have this name, than we have to be sure, - /// that this value exists in enum - if (it == name_to_value_map.end()) - 
return value_to_name_map.count(value.second) > 0; - - /// If we have this name, than it should have the same value - return it->value.second == value.second; - }; - if (const auto * rhs_enum8 = typeid_cast(&rhs)) - return std::all_of(rhs_enum8->getValues().begin(), rhs_enum8->getValues().end(), check); + return this->containsAll(rhs_enum8->getValues()); if (const auto * rhs_enum16 = typeid_cast(&rhs)) - return std::all_of(rhs_enum16->getValues().begin(), rhs_enum16->getValues().end(), check); + return this->containsAll(rhs_enum16->getValues()); return false; } +template +SerializationPtr DataTypeEnum::doGetDefaultSerialization() const +{ + return std::make_shared>(this->getValues()); +} + /// Explicit instantiations. template class DataTypeEnum; @@ -405,7 +218,7 @@ static DataTypePtr createExact(const ASTPtr & arguments) ErrorCodes::UNEXPECTED_AST_STRUCTURE); const String & field_name = name_literal->value.get(); - const auto value = value_literal->value.get>(); + const auto value = value_literal->value.get(); if (value > std::numeric_limits::max() || value < std::numeric_limits::min()) throw Exception{"Value " + toString(value) + " for element '" + field_name + "' exceeds range of " + EnumName::value, diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 003613edb98..57657d1d110 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -11,12 +12,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - - class IDataTypeEnum : public IDataType { public: @@ -36,102 +31,37 @@ public: template -class DataTypeEnum final : public IDataTypeEnum +class DataTypeEnum final : public IDataTypeEnum, public EnumValues { public: using FieldType = Type; using ColumnType = ColumnVector; - using Value = std::pair; - using Values = std::vector; - using NameToValueMap = HashMap; - using ValueToNameMap = std::unordered_map; + using typename EnumValues::Values; static constexpr bool is_parametric = true; private: - Values values; - NameToValueMap name_to_value_map; - ValueToNameMap value_to_name_map; std::string type_name; - static std::string generateName(const Values & values); - void fillMaps(); public: explicit DataTypeEnum(const Values & values_); - const Values & getValues() const { return values; } std::string doGetName() const override { return type_name; } const char * getFamilyName() const override; TypeIndex getTypeId() const override { return sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; } - auto findByValue(const FieldType & value) const - { - const auto it = value_to_name_map.find(value); - if (it == std::end(value_to_name_map)) - throw Exception{"Unexpected value " + toString(value) + " for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; - - return it; - } - - const StringRef & getNameForValue(const FieldType & value) const - { - return findByValue(value)->second; - } - - FieldType getValue(StringRef field_name, bool try_treat_as_id = false) const - { - const auto it = name_to_value_map.find(field_name); - if (!it) - { - /// It is used in CSV and TSV input formats. If we fail to find given string in - /// enum names, we will try to treat it as enum id. 
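The two lookup maps and the getValue() logic removed around here are being consolidated into the EnumValues base class. A simplified, self-contained model of that two-way mapping and of the CSV/TSV fallback that treats an unrecognized name as a numeric id (hypothetical names, not the real implementation):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Hypothetical simplified model of the enum mapping moved into EnumValues<T>:
// name -> value and value -> name, with duplicate detection on construction.
template <typename T>
class EnumValuesSketch
{
    std::map<std::string, T> name_to_value;
    std::map<T, std::string> value_to_name;

public:
    explicit EnumValuesSketch(const std::vector<std::pair<std::string, T>> & values)
    {
        for (const auto & [name, value] : values)
        {
            if (!name_to_value.emplace(name, value).second || !value_to_name.emplace(value, name).second)
                throw std::runtime_error("duplicate name or value in enum");
        }
    }

    T getValue(const std::string & field, bool try_treat_as_id = false) const
    {
        if (auto it = name_to_value.find(field); it != name_to_value.end())
            return it->second;
        if (try_treat_as_id)
        {
            // If the whole string parses as a number and that number is a known id, accept it.
            try
            {
                size_t pos = 0;
                long long parsed = std::stoll(field, &pos);
                if (pos == field.size() && value_to_name.count(static_cast<T>(parsed)))
                    return static_cast<T>(parsed);
            }
            catch (const std::exception &) {} // not a number: fall through to the error below
        }
        throw std::runtime_error("unknown element '" + field + "'");
    }
};

int main()
{
    EnumValuesSketch<int8_t> values({{"hello", 1}, {"world", 2}});
    std::cout << int(values.getValue("world")) << ' '
              << int(values.getValue("2", /*try_treat_as_id=*/true)) << '\n'; // 2 2
}
```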
- if (try_treat_as_id) - { - FieldType x; - ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); - readText(x, tmp_buf); - /// Check if we reached end of the tmp_buf (otherwise field_name is not a number) - /// and try to find it in enum ids - if (tmp_buf.eof() && value_to_name_map.find(x) != value_to_name_map.end()) - return x; - } - throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; - } - return it->getMapped(); - } - FieldType readValue(ReadBuffer & istr) const { FieldType x; readText(x, istr); - return findByValue(x)->first; + return this->findByValue(x)->first; } Field castToName(const Field & value_or_name) const override; Field castToValue(const Field & value_or_name) const override; - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, const size_t limit, const double avg_value_size_hint) const override; - MutableColumnPtr createColumn() const override { return ColumnType::create(); } Field getDefault() const override; @@ -147,6 +77,8 @@ public: /// Enum('a' = 1, 'b' = 2) -> Enum('c' = 1, 'b' = 2, 'd' = 3) OK /// Enum('a' = 1, 'b' = 2) -> Enum('a' = 2, 'b' = 1) NOT OK bool contains(const IDataType & rhs) const; + + SerializationPtr doGetDefaultSerialization() const override; }; diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index dc3ce039dbd..c28de15565c 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -84,16 +84,7 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr return get("LowCardinality", low_cardinality_params); } - DataTypePtr res = findCreatorByName(family_name)(parameters); - - if (CurrentThread::isInitialized()) - { - const auto * query_context = 
CurrentThread::get().getQueryContext(); - if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); - } - - return res; + return findCreatorByName(family_name)(parameters); } DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const @@ -165,10 +156,18 @@ void DataTypeFactory::registerSimpleDataTypeCustom(const String &name, SimpleCre const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & family_name) const { + ContextPtr query_context; + if (CurrentThread::isInitialized()) + query_context = CurrentThread::get().getQueryContext(); + { DataTypesDictionary::const_iterator it = data_types.find(family_name); if (data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); return it->second; + } } String family_name_lowercase = Poco::toLower(family_name); @@ -176,7 +175,11 @@ const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & { DataTypesDictionary::const_iterator it = case_insensitive_data_types.find(family_name_lowercase); if (case_insensitive_data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase); return it->second; + } } auto hints = this->getHints(family_name); diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index 618c1f51067..9fa3e30297b 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include @@ -86,6 +86,5 @@ void registerDataTypeLowCardinality(DataTypeFactory & factory); void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); -void registerDataTypeOneElementTuple(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeFixedString.cpp b/src/DataTypes/DataTypeFixedString.cpp index 21cfe855169..a40592ba023 100644 --- a/src/DataTypes/DataTypeFixedString.cpp +++ b/src/DataTypes/DataTypeFixedString.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,6 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_READ_ALL_DATA; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNEXPECTED_AST_STRUCTURE; } @@ -33,176 +33,6 @@ std::string DataTypeFixedString::doGetName() const return "FixedString(" + toString(n) + ")"; } - -void DataTypeFixedString::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const String & s = get(field); - ostr.write(s.data(), std::min(s.size(), n)); - if (s.size() < n) - for (size_t i = s.size(); i < n; ++i) - ostr.write(0); -} - - -void DataTypeFixedString::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - field = String(); - String & s = get(field); - s.resize(n); - istr.readStrict(s.data(), n); -} - - -void DataTypeFixedString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - ostr.write(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n); -} - - -void DataTypeFixedString::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnFixedString::Chars & data = assert_cast(column).getChars(); - size_t 
old_size = data.size(); - data.resize(old_size + n); - try - { - istr.readStrict(reinterpret_cast(data.data() + old_size), n); - } - catch (...) - { - data.resize_assume_reserved(old_size); - throw; - } -} - - -void DataTypeFixedString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - - size_t size = data.size() / n; - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - if (limit) - ostr.write(reinterpret_cast(&data[n * offset]), n * limit); -} - - -void DataTypeFixedString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - - size_t initial_size = data.size(); - size_t max_bytes = limit * n; - data.resize(initial_size + max_bytes); - size_t read_bytes = istr.readBig(reinterpret_cast(&data[initial_size]), max_bytes); - - if (read_bytes % n != 0) - throw Exception("Cannot read all data of type FixedString. Bytes read:" + toString(read_bytes) + ". String size:" + toString(n) + ".", - ErrorCodes::CANNOT_READ_ALL_DATA); - - data.resize(initial_size + read_bytes); -} - - -void DataTypeFixedString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n, ostr); -} - - -void DataTypeFixedString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeAnyEscapedString<'\''>(pos, pos + n, ostr); -} - - -static inline void alignStringLength(const DataTypeFixedString & type, - ColumnFixedString::Chars & data, - size_t string_start) -{ - ColumnFixedString::alignStringLength(data, type.getN(), string_start); -} - -template -static inline void read(const DataTypeFixedString & self, IColumn & column, Reader && reader) -{ - ColumnFixedString::Chars & data = typeid_cast(column).getChars(); - size_t prev_size = data.size(); - try - { - reader(data); - alignStringLength(self, data, prev_size); - } - catch (...) 
- { - data.resize_assume_reserved(prev_size); - throw; - } -} - - -void DataTypeFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeAnyQuotedString<'\''>(pos, pos + n, ostr); -} - - -void DataTypeFixedString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); -} - - -void DataTypeFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeJSONString(pos, pos + n, ostr, settings); -} - - -void DataTypeFixedString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); -} - - -void DataTypeFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeXMLStringForTextElement(pos, pos + n, ostr); -} - - -void DataTypeFixedString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - writeCSVString(pos, pos + n, ostr); -} - - -void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); -} - - MutableColumnPtr DataTypeFixedString::createColumn() const { return ColumnFixedString::create(n); @@ -218,6 +48,11 @@ bool DataTypeFixedString::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this) && n == static_cast(rhs).n; } +SerializationPtr DataTypeFixedString::doGetDefaultSerialization() const +{ + return std::make_shared(n); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index af82e4b5d11..d82ea9824f3 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -1,6 +1,7 @@ #pragma once #include +#include #define MAX_FIXEDSTRING_SIZE 0xFFFFFF @@ -40,38 +41,14 @@ public: return n; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void 
deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; bool equals(const IDataType & rhs) const override; + SerializationPtr doGetDefaultSerialization() const override; + bool isParametric() const override { return true; } bool haveSubtypes() const override { return false; } bool isComparable() const override { return true; } @@ -82,6 +59,11 @@ public: bool isCategorial() const override { return true; } bool canBeInsideNullable() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } + + /// Makes sure that the length of a newly inserted string to `chars` is equal to getN(). + /// If the length is less than getN() the function will add zero characters up to getN(). + /// If the length is greater than getN() the function will throw an exception. 
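A small standalone sketch of the contract documented above, assuming a plain std::vector<char> in place of PaddedPODArray (names are illustrative only):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical free-function sketch of the alignStringLength contract: after a string
// has been appended to `chars` starting at `old_size`, pad it with zero bytes up to n,
// or reject it if it is already longer than n.
void alignStringLengthSketch(std::vector<char> & chars, size_t n, size_t old_size)
{
    size_t length = chars.size() - old_size;
    if (length > n)
        throw std::runtime_error("Too large value for FixedString(" + std::to_string(n) + ")");
    chars.resize(old_size + n, '\0'); // pad the tail with zeros up to exactly n bytes
}

int main()
{
    std::vector<char> chars;
    const std::string value = "abc";
    size_t old_size = chars.size();
    chars.insert(chars.end(), value.begin(), value.end());
    alignStringLengthSketch(chars, /*n=*/5, old_size);
    std::cout << chars.size() << '\n'; // 5, i.e. "abc\0\0"
}
```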
+ void alignStringLength(PaddedPODArray & chars, size_t old_size) const; }; } diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 1b21b7de4bc..485083d67ee 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace DB @@ -25,19 +26,6 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -namespace -{ - const ColumnLowCardinality & getColumnLowCardinality(const IColumn & column) - { - return typeid_cast(column); - } - - ColumnLowCardinality & getColumnLowCardinality(IColumn & column) - { - return typeid_cast(column); - } -} - DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) : dictionary_type(std::move(dictionary_type_)) { @@ -50,785 +38,6 @@ DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) + dictionary_type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } -void DataTypeLowCardinality::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::DictionaryKeys); - dictionary_type->enumerateStreams(callback, path); - path.back() = Substream::DictionaryIndexes; - callback(path, *this); - path.pop_back(); -} - -struct KeysSerializationVersion -{ - enum Value - { - /// Version is written at the start of . - /// Dictionary is written as number N and N keys after them. - /// Dictionary can be shared for continuous range of granules, so some marks may point to the same position. - /// Shared dictionary is stored in state and is read once. - SharedDictionariesWithAdditionalKeys = 1, - }; - - Value value; - - static void checkVersion(UInt64 version) - { - if (version != SharedDictionariesWithAdditionalKeys) - throw Exception("Invalid version for DataTypeLowCardinality key column.", ErrorCodes::LOGICAL_ERROR); - } - - explicit KeysSerializationVersion(UInt64 version) : value(static_cast(version)) { checkVersion(version); } -}; - -/// Version is stored at the start of each granule. It's used to store indexes type and flags. -struct IndexesSerializationType -{ - using SerializationType = UInt64; - /// Need to read dictionary if it wasn't. - static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u; - /// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them. - static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u; - /// Need to update dictionary. It means that previous granule has different dictionary. 
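The removed IndexesSerializationType packs the index column width and three flags into a single UInt64 written per granule. A self-contained sketch of that bit layout (the bit constants mirror the code above; everything else is illustrative):

```cpp
#include <cstdint>
#include <iostream>

// Standalone sketch of the index-header encoding: the low byte holds the integer
// width of the indexes column, bits 8..10 carry the three flags.
constexpr uint64_t NeedGlobalDictionaryBit = 1u << 8u;
constexpr uint64_t HasAdditionalKeysBit = 1u << 9u;
constexpr uint64_t NeedUpdateDictionary = 1u << 10u;

enum IndexWidth : uint64_t { TUInt8 = 0, TUInt16, TUInt32, TUInt64 };

uint64_t packHeader(IndexWidth width, bool additional_keys, bool global_dict, bool update_dict)
{
    uint64_t value = width;
    if (additional_keys) value |= HasAdditionalKeysBit;
    if (global_dict)     value |= NeedGlobalDictionaryBit;
    if (update_dict)     value |= NeedUpdateDictionary;
    return value;
}

int main()
{
    uint64_t header = packHeader(TUInt16, /*additional_keys=*/true, /*global_dict=*/true, /*update_dict=*/false);
    // Unpack: read the flags first, then clear them to recover the width.
    bool has_additional_keys = header & HasAdditionalKeysBit;
    bool need_global_dictionary = header & NeedGlobalDictionaryBit;
    uint64_t width = header & ~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary);
    std::cout << has_additional_keys << ' ' << need_global_dictionary << ' ' << width << '\n'; // 1 1 1
}
```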
- static constexpr SerializationType NeedUpdateDictionary = 1u << 10u; - - enum Type - { - TUInt8 = 0, - TUInt16, - TUInt32, - TUInt64, - }; - - Type type; - bool has_additional_keys; - bool need_global_dictionary; - bool need_update_dictionary; - - static constexpr SerializationType resetFlags(SerializationType type) - { - return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary)); - } - - static void checkType(SerializationType type) - { - UInt64 value = resetFlags(type); - if (value <= TUInt64) - return; - - throw Exception("Invalid type for DataTypeLowCardinality index column.", ErrorCodes::LOGICAL_ERROR); - } - - void serialize(WriteBuffer & buffer) const - { - SerializationType val = type; - if (has_additional_keys) - val |= HasAdditionalKeysBit; - if (need_global_dictionary) - val |= NeedGlobalDictionaryBit; - if (need_update_dictionary) - val |= NeedUpdateDictionary; - writeIntBinary(val, buffer); - } - - void deserialize(ReadBuffer & buffer) - { - SerializationType val; - readIntBinary(val, buffer); - checkType(val); - has_additional_keys = (val & HasAdditionalKeysBit) != 0; - need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0; - need_update_dictionary = (val & NeedUpdateDictionary) != 0; - type = static_cast(resetFlags(val)); - } - - IndexesSerializationType(const IColumn & column, - bool has_additional_keys_, - bool need_global_dictionary_, - bool enumerate_dictionaries) - : has_additional_keys(has_additional_keys_) - , need_global_dictionary(need_global_dictionary_) - , need_update_dictionary(enumerate_dictionaries) - { - if (typeid_cast(&column)) - type = TUInt8; - else if (typeid_cast(&column)) - type = TUInt16; - else if (typeid_cast(&column)) - type = TUInt32; - else if (typeid_cast(&column)) - type = TUInt64; - else - throw Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got " - + column.getName(), ErrorCodes::LOGICAL_ERROR); - } - - DataTypePtr getDataType() const - { - if (type == TUInt8) - return std::make_shared(); - if (type == TUInt16) - return std::make_shared(); - if (type == TUInt32) - return std::make_shared(); - if (type == TUInt64) - return std::make_shared(); - - throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR); - } - - IndexesSerializationType() = default; -}; - -struct SerializeStateLowCardinality : public IDataType::SerializeBinaryBulkState -{ - KeysSerializationVersion key_version; - MutableColumnUniquePtr shared_dictionary; - - explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} -}; - -struct DeserializeStateLowCardinality : public IDataType::DeserializeBinaryBulkState -{ - KeysSerializationVersion key_version; - ColumnUniquePtr global_dictionary; - - IndexesSerializationType index_type; - ColumnPtr additional_keys; - ColumnPtr null_map; - UInt64 num_pending_rows = 0; - - /// If dictionary should be updated. - /// Can happen is some granules was skipped while reading from MergeTree. - /// We should store this flag in State because - /// in case of long block of empty arrays we may not need read dictionary at first reading. 
- bool need_update_dictionary = false; - - explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} -}; - -static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState( - IDataType::SerializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid SerializeBinaryBulkState for DataTypeLowCardinality. Expected: " - + demangle(typeid(SerializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - -static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState( - IDataType::DeserializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = typeid_cast(state.get()); - if (!low_cardinality_state) - { - auto & state_ref = *state; - throw Exception("Invalid DeserializeBinaryBulkState for DataTypeLowCardinality. Expected: " - + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return low_cardinality_state; -} - -void DataTypeLowCardinality::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStatePrefix", - ErrorCodes::LOGICAL_ERROR); - - /// Write version and create SerializeBinaryBulkState. 
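A rough, self-contained sketch of the prefix handshake shown here: the serialize path writes the keys-serialization version once into the DictionaryKeys substream, and the state object created from it is what later calls validate against (hypothetical names, std::iostream in place of the real buffers):

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>

// Minimal sketch of the prefix/state handshake: one version number per column,
// remembered in a state object and checked against the only supported value.
constexpr uint64_t SharedDictionariesWithAdditionalKeys = 1;

struct SerializeStateSketch
{
    uint64_t key_version;
    explicit SerializeStateSketch(uint64_t v) : key_version(v) {}
};

std::shared_ptr<SerializeStateSketch> writeStatePrefix(std::ostream & keys_stream)
{
    uint64_t key_version = SharedDictionariesWithAdditionalKeys;
    keys_stream.write(reinterpret_cast<const char *>(&key_version), sizeof(key_version));
    return std::make_shared<SerializeStateSketch>(key_version);
}

std::shared_ptr<SerializeStateSketch> readStatePrefix(std::istream & keys_stream)
{
    uint64_t key_version = 0;
    keys_stream.read(reinterpret_cast<char *>(&key_version), sizeof(key_version));
    if (key_version != SharedDictionariesWithAdditionalKeys)
        throw std::runtime_error("Invalid version for LowCardinality key column.");
    return std::make_shared<SerializeStateSketch>(key_version);
}

int main()
{
    std::stringstream keys_stream;
    writeStatePrefix(keys_stream);
    auto state = readStatePrefix(keys_stream);
    std::cout << state->key_version << '\n'; // 1
}
```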
- UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys; - - writeIntBinary(key_version, *stream); - - state = std::make_shared(key_version); -} - -void DataTypeLowCardinality::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); - KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); - - if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) - { - auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn(); - - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStateSuffix", - ErrorCodes::LOGICAL_ERROR); - - UInt64 num_keys = nested_column->size(); - writeIntBinary(num_keys, *stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *stream, 0, num_keys); - low_cardinality_state->shared_dictionary = nullptr; - } -} - -void DataTypeLowCardinality::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::DictionaryKeys); - auto * stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!stream) - return; - - UInt64 keys_version; - readIntBinary(keys_version, *stream); - - state = std::make_shared(keys_version); -} - -namespace -{ - template - PaddedPODArray * getIndexesData(IColumn & indexes) - { - auto * column = typeid_cast *>(&indexes); - if (column) - return &column->getData(); - - return nullptr; - } - - struct IndexMapsWithAdditionalKeys - { - MutableColumnPtr dictionary_map; - MutableColumnPtr additional_keys_map; - }; - - template - IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray & index, size_t dict_size) - { - PaddedPODArray copy(index.cbegin(), index.cend()); - - HashMap dict_map; - HashMap add_keys_map; - - for (auto val : index) - { - if (val < dict_size) - dict_map.insert({val, dict_map.size()}); - else - add_keys_map.insert({val, add_keys_map.size()}); - } - - auto dictionary_map = ColumnVector::create(dict_map.size()); - auto additional_keys_map = ColumnVector::create(add_keys_map.size()); - auto & dict_data = dictionary_map->getData(); - auto & add_keys_data = additional_keys_map->getData(); - - for (auto val : dict_map) - dict_data[val.second] = val.first; - - for (auto val : add_keys_map) - add_keys_data[val.second] = val.first - dict_size; - - for (auto & val : index) - val = val < dict_size ? dict_map[val] - : add_keys_map[val] + dict_map.size(); - - for (size_t i = 0; i < index.size(); ++i) - { - T expected = index[i] < dict_data.size() ? 
dict_data[index[i]] - : add_keys_data[index[i] - dict_data.size()] + dict_size; - if (expected != copy[i]) - throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR); - - } - - return {std::move(dictionary_map), std::move(additional_keys_map)}; - } - - template - IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray & index, size_t dict_size) - { - T max_less_dict_size = 0; - T max_value = 0; - - auto size = index.size(); - if (size == 0) - return {ColumnVector::create(), ColumnVector::create()}; - - for (size_t i = 0; i < size; ++i) - { - auto val = index[i]; - if (val < dict_size) - max_less_dict_size = std::max(max_less_dict_size, val); - - max_value = std::max(max_value, val); - } - - auto map_size = UInt64(max_less_dict_size) + 1; - auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0; - PaddedPODArray map(map_size, 0); - PaddedPODArray overflow_map(overflow_map_size, 0); - - T zero_pos_value = 0; - T zero_pos_overflowed_value = 0; - UInt64 cur_pos = 0; - UInt64 cur_overflowed_pos = 0; - - for (size_t i = 0; i < size; ++i) - { - T val = index[i]; - if (val < dict_size) - { - if (cur_pos == 0) - { - zero_pos_value = val; - ++cur_pos; - } - else if (map[val] == 0 && val != zero_pos_value) - { - map[val] = cur_pos; - ++cur_pos; - } - } - else - { - T shifted_val = val - dict_size; - if (cur_overflowed_pos == 0) - { - zero_pos_overflowed_value = shifted_val; - ++cur_overflowed_pos; - } - else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value) - { - overflow_map[shifted_val] = cur_overflowed_pos; - ++cur_overflowed_pos; - } - } - } - - auto dictionary_map = ColumnVector::create(cur_pos); - auto additional_keys_map = ColumnVector::create(cur_overflowed_pos); - auto & dict_data = dictionary_map->getData(); - auto & add_keys_data = additional_keys_map->getData(); - - for (size_t i = 0; i < map_size; ++i) - if (map[i]) - dict_data[map[i]] = static_cast(i); - - for (size_t i = 0; i < overflow_map_size; ++i) - if (overflow_map[i]) - add_keys_data[overflow_map[i]] = static_cast(i); - - if (cur_pos) - dict_data[0] = zero_pos_value; - if (cur_overflowed_pos) - add_keys_data[0] = zero_pos_overflowed_value; - - for (size_t i = 0; i < size; ++i) - { - T & val = index[i]; - if (val < dict_size) - val = map[val]; - else - val = overflow_map[val - dict_size] + cur_pos; - } - - return {std::move(dictionary_map), std::move(additional_keys_map)}; - } - - /// Update column and return map with old indexes. 
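A standalone sketch of the index remapping performed here, closest in spirit to the mapIndexWithAdditionalKeysRef reference version above: indexes below dict_size are renumbered densely, the rest become positions into the additional keys (illustrative names and containers):

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

struct IndexMaps
{
    std::vector<uint64_t> dictionary_map;       // new position -> old position in the shared dictionary
    std::vector<uint64_t> additional_keys_map;  // new position -> old position among additional keys
};

// Every index below dict_size is renumbered into a compact dictionary_map, every other
// index into an additional_keys_map, and the index array is rewritten to the new positions.
IndexMaps remapWithAdditionalKeys(std::vector<uint64_t> & index, uint64_t dict_size)
{
    std::unordered_map<uint64_t, uint64_t> dict_map, add_keys_map;
    for (uint64_t val : index)
    {
        if (val < dict_size)
            dict_map.emplace(val, dict_map.size());
        else
            add_keys_map.emplace(val, add_keys_map.size());
    }

    IndexMaps maps;
    maps.dictionary_map.resize(dict_map.size());
    maps.additional_keys_map.resize(add_keys_map.size());
    for (auto [old_pos, new_pos] : dict_map)
        maps.dictionary_map[new_pos] = old_pos;
    for (auto [old_pos, new_pos] : add_keys_map)
        maps.additional_keys_map[new_pos] = old_pos - dict_size;

    for (uint64_t & val : index)
        val = val < dict_size ? dict_map[val] : add_keys_map[val] + dict_map.size();
    return maps;
}

int main()
{
    std::vector<uint64_t> index = {7, 7, 1, 9, 1};
    auto maps = remapWithAdditionalKeys(index, /*dict_size=*/8);      // 9 is an "additional" key
    for (uint64_t v : index) std::cout << v << ' ';                   // 0 0 1 2 1
    std::cout << "| add_keys[0]=" << maps.additional_keys_map[0] << '\n'; // 1 (old 9 minus dict_size)
}
```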
- /// Let N is the number of distinct values which are less than max_size; - /// old_column - column before function call; - /// new_column - column after function call: - /// * if old_column[i] < max_size, than - /// dictionary_map[new_column[i]] = old_column[i] - /// * else - /// additional_keys_map[new_column[i]] = old_column[i] - dict_size + N - IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size) - { - if (auto * data_uint8 = getIndexesData(column)) - return mapIndexWithAdditionalKeys(*data_uint8, dict_size); - else if (auto * data_uint16 = getIndexesData(column)) - return mapIndexWithAdditionalKeys(*data_uint16, dict_size); - else if (auto * data_uint32 = getIndexesData(column)) - return mapIndexWithAdditionalKeys(*data_uint32, dict_size); - else if (auto * data_uint64 = getIndexesData(column)) - return mapIndexWithAdditionalKeys(*data_uint64, dict_size); - else - throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got" + column.getName(), - ErrorCodes::LOGICAL_ERROR); - } -} - -void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::DictionaryKeys); - auto * keys_stream = settings.getter(settings.path); - settings.path.back() = Substream::DictionaryIndexes; - auto * indexes_stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!keys_stream && !indexes_stream) - return; - - if (!keys_stream) - throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); - - if (!indexes_stream) - throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); - - const ColumnLowCardinality & low_cardinality_column = typeid_cast(column); - - auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); - auto & global_dictionary = low_cardinality_state->shared_dictionary; - KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); - - bool need_update_dictionary = global_dictionary == nullptr; - if (need_update_dictionary) - global_dictionary = createColumnUnique(*dictionary_type); - - size_t max_limit = column.size() - offset; - limit = limit ? std::min(limit, max_limit) : max_limit; - - /// Do not write anything for empty column. (May happen while writing empty arrays.) - if (limit == 0) - return; - - auto sub_column = low_cardinality_column.cutAndCompact(offset, limit); - ColumnPtr positions = sub_column->getIndexesPtr(); - ColumnPtr keys = sub_column->getDictionary().getNestedColumn(); - - if (settings.low_cardinality_max_dictionary_size) - { - /// Insert used_keys into global dictionary and update sub_index. 
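To illustrate the write path around this point: keys are pushed into a bounded shared dictionary, and whatever does not fit under low_cardinality_max_dictionary_size is returned as overflowed keys that travel with the granule. A much-simplified sketch with strings instead of columns (the real uniqueInsertRangeWithOverflow is richer than this):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical sketch of a bounded shared dictionary with overflow keys.
struct InsertResultSketch
{
    std::vector<uint64_t> indexes;            // position per inserted key; overflowed ones point past max_size
    std::vector<std::string> overflowed_keys;
};

struct SharedDictionarySketch
{
    std::unordered_map<std::string, uint64_t> positions;
    std::vector<std::string> keys;

    InsertResultSketch insertRangeWithOverflow(const std::vector<std::string> & new_keys, size_t max_size)
    {
        InsertResultSketch result;
        std::unordered_map<std::string, uint64_t> overflow_positions;
        for (const auto & key : new_keys)
        {
            if (auto it = positions.find(key); it != positions.end())
                result.indexes.push_back(it->second);          // already in the shared dictionary
            else if (keys.size() < max_size)
            {
                positions.emplace(key, keys.size());           // still room in the shared dictionary
                result.indexes.push_back(keys.size());
                keys.push_back(key);
            }
            else
            {
                auto [pos, inserted] = overflow_positions.emplace(key, result.overflowed_keys.size());
                if (inserted)
                    result.overflowed_keys.push_back(key);     // ships with the granule as an additional key
                result.indexes.push_back(max_size + pos->second);
            }
        }
        return result;
    }
};

int main()
{
    SharedDictionarySketch dict;
    auto res = dict.insertRangeWithOverflow({"a", "b", "a", "c"}, /*max_size=*/2);
    for (uint64_t idx : res.indexes) std::cout << idx << ' ';            // 0 1 0 2
    std::cout << "| overflowed: " << res.overflowed_keys.size() << '\n'; // 1 ("c")
}
```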
- auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(), - settings.low_cardinality_max_dictionary_size); - size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size(); - ColumnLowCardinality::Index(indexes_with_overflow.indexes->getPtr()).check(max_size); - - if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size) - throw Exception("Got dictionary with size " + toString(global_dictionary->size()) + - " but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size), - ErrorCodes::LOGICAL_ERROR); - - positions = indexes_with_overflow.indexes->index(*positions, 0); - keys = std::move(indexes_with_overflow.overflowed_keys); - - if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty()) - throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) + - " which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")", - ErrorCodes::LOGICAL_ERROR); - } - - if (const auto * nullable_keys = checkAndGetColumn(*keys)) - keys = nullable_keys->getNestedColumnPtr(); - - bool need_additional_keys = !keys->empty(); - bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0; - bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part - && global_dictionary->size() >= settings.low_cardinality_max_dictionary_size; - - IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary); - index_version.serialize(*indexes_stream); - - if (need_write_dictionary) - { - const auto & nested_column = global_dictionary->getNestedNotNullableColumn(); - UInt64 num_keys = nested_column->size(); - writeIntBinary(num_keys, *keys_stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys); - low_cardinality_state->shared_dictionary = nullptr; - } - - if (need_additional_keys) - { - UInt64 num_keys = keys->size(); - writeIntBinary(num_keys, *indexes_stream); - removeNullable(dictionary_type)->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys); - } - - UInt64 num_rows = positions->size(); - writeIntBinary(num_rows, *indexes_stream); - index_version.getDataType()->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows); -} - -void DataTypeLowCardinality::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * /* cache */) const -{ - ColumnLowCardinality & low_cardinality_column = typeid_cast(column); - - settings.path.push_back(Substream::DictionaryKeys); - auto * keys_stream = settings.getter(settings.path); - settings.path.back() = Substream::DictionaryIndexes; - auto * indexes_stream = settings.getter(settings.path); - settings.path.pop_back(); - - if (!keys_stream && !indexes_stream) - return; - - if (!keys_stream) - throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); - - if (!indexes_stream) - throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); - - auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state); - KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); - - auto read_dictionary = [this, low_cardinality_state, keys_stream]() - { - UInt64 
num_keys; - readIntBinary(num_keys, *keys_stream); - - auto keys_type = removeNullable(dictionary_type); - auto global_dict_keys = keys_type->createColumn(); - keys_type->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0); - - auto column_unique = createColumnUnique(*dictionary_type, std::move(global_dict_keys)); - low_cardinality_state->global_dictionary = std::move(column_unique); - }; - - auto read_additional_keys = [this, low_cardinality_state, indexes_stream]() - { - UInt64 num_keys; - readIntBinary(num_keys, *indexes_stream); - auto keys_type = removeNullable(dictionary_type); - auto additional_keys = keys_type->createColumn(); - keys_type->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0); - low_cardinality_state->additional_keys = std::move(additional_keys); - - if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable()) - { - auto null_map = ColumnUInt8::create(num_keys, 0); - if (num_keys) - null_map->getElement(0) = 1; - - low_cardinality_state->null_map = std::move(null_map); - } - }; - - auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows) - { - auto indexes_type = low_cardinality_state->index_type.getDataType(); - MutableColumnPtr indexes_column = indexes_type->createColumn(); - indexes_type->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0); - - auto & global_dictionary = low_cardinality_state->global_dictionary; - const auto & additional_keys = low_cardinality_state->additional_keys; - - bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys; - bool column_is_empty = low_cardinality_column.empty(); - - if (!low_cardinality_state->index_type.need_global_dictionary) - { - ColumnPtr keys_column = additional_keys; - if (low_cardinality_state->null_map) - keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); - low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column); - } - else if (!has_additional_keys) - { - if (column_is_empty) - low_cardinality_column.setSharedDictionary(global_dictionary); - - auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); - low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); - } - else - { - auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size()); - - ColumnLowCardinality::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size()); - - ColumnLowCardinality::Index(indexes_column->getPtr()).check( - maps.dictionary_map->size() + maps.additional_keys_map->size()); - - auto used_keys = IColumn::mutate(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0)); - - if (!maps.additional_keys_map->empty()) - { - auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); - - if (dictionary_type->isNullable()) - { - ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0); - used_add_keys = ColumnNullable::create(used_add_keys, null_map); - } - - used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size()); - } - - low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column); - } - }; - - if (!settings.continuous_reading) - { - low_cardinality_state->num_pending_rows = 0; - - /// Remember in state that some granules were skipped and we need to update dictionary. 
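// Standalone illustration (plain STL, illustrative names) of the index remapping performed by
// mapIndexWithAdditionalKeys and consumed by read_indexes above: global indexes below dict_size
// are gathered into a compact dictionary_map, the rest into additional_keys_map, and the index
// column is rewritten so that it addresses the concatenation "used dictionary keys, then used
// additional keys".
#include <cstddef>
#include <map>
#include <vector>

struct IndexMapsSketch
{
    std::vector<size_t> dictionary_map;       // positions used in the global dictionary
    std::vector<size_t> additional_keys_map;  // positions used in the additional keys
};

IndexMapsSketch mapIndexWithAdditionalKeysSketch(std::vector<size_t> & indexes, size_t dict_size)
{
    IndexMapsSketch maps;
    std::map<size_t, size_t> dict_remap;  // old dictionary index -> position in dictionary_map
    std::map<size_t, size_t> add_remap;   // old additional-key index -> position in additional_keys_map

    for (size_t old_index : indexes)
    {
        if (old_index < dict_size)
        {
            if (dict_remap.emplace(old_index, maps.dictionary_map.size()).second)
                maps.dictionary_map.push_back(old_index);
        }
        else
        {
            if (add_remap.emplace(old_index, maps.additional_keys_map.size()).second)
                maps.additional_keys_map.push_back(old_index - dict_size);
        }
    }

    // Rewrite indexes: used dictionary keys get [0, N), used additional keys get [N, ...).
    const size_t num_dict_used = maps.dictionary_map.size();
    for (size_t & index : indexes)
        index = index < dict_size ? dict_remap[index] : num_dict_used + add_remap[index];

    return maps;
}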
- low_cardinality_state->need_update_dictionary = true; - } - - while (limit) - { - if (low_cardinality_state->num_pending_rows == 0) - { - if (indexes_stream->eof()) - break; - - auto & index_type = low_cardinality_state->index_type; - auto & global_dictionary = low_cardinality_state->global_dictionary; - - index_type.deserialize(*indexes_stream); - - bool need_update_dictionary = - !global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary; - if (index_type.need_global_dictionary && need_update_dictionary) - { - read_dictionary(); - low_cardinality_state->need_update_dictionary = false; - } - - if (low_cardinality_state->index_type.has_additional_keys) - read_additional_keys(); - else - low_cardinality_state->additional_keys = nullptr; - - readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream); - } - - size_t num_rows_to_read = std::min(limit, low_cardinality_state->num_pending_rows); - read_indexes(num_rows_to_read); - limit -= num_rows_to_read; - low_cardinality_state->num_pending_rows -= num_rows_to_read; - } -} - -void DataTypeLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - dictionary_type->serializeBinary(field, ostr); -} -void DataTypeLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - dictionary_type->deserializeBinary(field, istr); -} - -void DataTypeLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - serializeImpl(column, row_num, &IDataType::serializeBinary, ostr); -} -void DataTypeLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - deserializeImpl(column, &IDataType::deserializeBinary, istr); -} - -void DataTypeLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextEscaped, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); -} - -void DataTypeLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextQuoted, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextQuoted, istr, settings); -} - -void DataTypeLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsWholeText, istr, settings); -} - -void DataTypeLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextCSV, ostr, settings); -} - -void DataTypeLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextCSV, istr, settings); -} - -void DataTypeLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsText, ostr, settings); -} - 
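// Standalone sketch (plain STL, illustrative names) of the per-row delegation used by the
// serializeImpl/deserializeImpl helpers below: a LowCardinality column is an index column plus
// a dictionary of unique values, so serializing row N in any text format just means serializing
// dictionary[indexes[N]] with the nested type's serialization.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct LowCardinalityStringSketch
{
    std::vector<std::string> dictionary;  // unique values
    std::vector<size_t> indexes;          // one entry per row

    void serializeTextRow(size_t row, std::ostream & out) const
    {
        // Look up the unique row and emit it as the dictionary (nested) type would.
        out << dictionary[indexes[row]];
    }
};

int main()
{
    LowCardinalityStringSketch column{{"red", "green"}, {0, 1, 0, 0, 1}};
    for (size_t row = 0; row < column.indexes.size(); ++row)
    {
        column.serializeTextRow(row, std::cout);
        std::cout << '\n';
    }
}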
-void DataTypeLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextJSON, ostr, settings); -} -void DataTypeLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeImpl(column, &IDataType::deserializeAsTextJSON, istr, settings); -} - -void DataTypeLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings); -} - -template -void DataTypeLowCardinality::serializeImpl( - const IColumn & column, size_t row_num, DataTypeLowCardinality::SerializeFunctionPtr func, Args &&... args) const -{ - const auto & low_cardinality_column = getColumnLowCardinality(column); - size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num); - (dictionary_type.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward(args)...); -} - -template -void DataTypeLowCardinality::deserializeImpl( - IColumn & column, DataTypeLowCardinality::DeserializeFunctionPtr func, Args &&... args) const -{ - auto & low_cardinality_column= getColumnLowCardinality(column); - auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); - - (dictionary_type.get()->*func)(*temp_column, std::forward(args)...); - - low_cardinality_column.insertFromFullColumn(*temp_column, 0); -} - namespace { template @@ -927,6 +136,11 @@ bool DataTypeLowCardinality::equals(const IDataType & rhs) const return dictionary_type->equals(*low_cardinality_rhs.dictionary_type); } +SerializationPtr DataTypeLowCardinality::doGetDefaultSerialization() const +{ + return std::make_shared(dictionary_type); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index f5b6b571187..1266174c6d6 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -24,50 +24,6 @@ public: const char * getFamilyName() const override { return "LowCardinality"; } TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, 
WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -100,6 +56,7 @@ public: static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); private: + SerializationPtr doGetDefaultSerialization() const override; template using SerializeFunctionPtr = void (IDataType::*)(const IColumn &, size_t, Params ...) 
const; diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 9972452862f..1d580761362 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int CANNOT_READ_MAP_FROM_TEXT; + extern const int BAD_ARGUMENTS; } @@ -38,6 +39,8 @@ DataTypeMap::DataTypeMap(const DataTypes & elems_) key_type = elems_[0]; value_type = elems_[1]; + assertKeyType(); + nested = std::make_shared( std::make_shared(DataTypes{key_type, value_type}, Names{"keys", "values"})); } @@ -45,7 +48,19 @@ DataTypeMap::DataTypeMap(const DataTypes & elems_) DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & value_type_) : key_type(key_type_), value_type(value_type_) , nested(std::make_shared( - std::make_shared(DataTypes{key_type_, value_type_}, Names{"keys", "values"}))) {} + std::make_shared(DataTypes{key_type_, value_type_}, Names{"keys", "values"}))) +{ + assertKeyType(); +} + +void DataTypeMap::assertKeyType() const +{ + if (!key_type->isValueRepresentedByInteger() && !isStringOrFixedString(*key_type) && !WhichDataType(key_type).isNothing()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Type of Map key must be a type, that can be represented by integer or string," + " but {} given", key_type->getName()); +} + std::string DataTypeMap::doGetName() const { @@ -60,11 +75,6 @@ static const IColumn & extractNestedColumn(const IColumn & column) return assert_cast(column).getNestedColumn(); } -static IColumn & extractNestedColumn(IColumn & column) -{ - return assert_cast(column).getNestedColumn(); -} - DataTypePtr DataTypeMap::tryGetSubcolumnType(const String & subcolumn_name) const { return nested->tryGetSubcolumnType(subcolumn_name); @@ -75,265 +85,10 @@ ColumnPtr DataTypeMap::getSubcolumn(const String & subcolumn_name, const IColumn return nested->getSubcolumn(subcolumn_name, extractNestedColumn(column)); } -void DataTypeMap::serializeBinary(const Field & field, WriteBuffer & ostr) const +SerializationPtr DataTypeMap::getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const { - const auto & map = get(field); - writeVarUInt(map.size(), ostr); - for (const auto & elem : map) - { - const auto & tuple = elem.safeGet(); - assert(tuple.size() == 2); - key_type->serializeBinary(tuple[0], ostr); - value_type->serializeBinary(tuple[1], ostr); - } -} - -void DataTypeMap::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - size_t size; - readVarUInt(size, istr); - field = Map(size); - for (auto & elem : field.get()) - { - Tuple tuple(2); - key_type->deserializeBinary(tuple[0], istr); - value_type->deserializeBinary(tuple[1], istr); - elem = std::move(tuple); - } -} - -void DataTypeMap::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - nested->serializeBinary(extractNestedColumn(column), row_num, ostr); -} - -void DataTypeMap::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - nested->deserializeBinary(extractNestedColumn(column), istr); -} - - -template -void DataTypeMap::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const -{ - const auto & column_map = assert_cast(column); - - const auto & nested_array = column_map.getNestedColumn(); - const auto & nested_tuple = column_map.getNestedData(); - 
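// Standalone sketch (plain STL, illustrative names) of the column layout that serializeTextImpl
// works over: a Map(K, V) column is stored as Array(Tuple(keys, values)), i.e. flat key and
// value columns plus per-row offsets, and each row is printed as {key:value, ...} over its
// slice [offsets[row - 1], offsets[row]). The real code additionally quotes keys and values via
// the element types' own text serializations; this sketch only shows the slicing.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct MapColumnSketch
{
    std::vector<std::string> keys;  // flattened keys of all rows
    std::vector<int> values;        // flattened values of all rows
    std::vector<size_t> offsets;    // offsets[row] = end of the row's slice in keys/values

    void serializeTextRow(size_t row, std::ostream & out) const
    {
        size_t begin = row == 0 ? 0 : offsets[row - 1];  // ClickHouse offsets allow [-1] == 0
        size_t end = offsets[row];

        out << '{';
        for (size_t i = begin; i < end; ++i)
        {
            if (i != begin)
                out << ',';
            out << keys[i] << ':' << values[i];
        }
        out << '}';
    }
};

int main()
{
    // Two rows: {a:1,b:2} and {c:3}
    MapColumnSketch column{{"a", "b", "c"}, {1, 2, 3}, {2, 3}};
    for (size_t row = 0; row < column.offsets.size(); ++row)
    {
        column.serializeTextRow(row, std::cout);
        std::cout << '\n';
    }
}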
const auto & offsets = nested_array.getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - writeChar('{', ostr); - for (size_t i = offset; i < next_offset; ++i) - { - if (i != offset) - writeChar(',', ostr); - writer(key_type, nested_tuple.getColumn(0), i); - writeChar(':', ostr); - writer(value_type, nested_tuple.getColumn(1), i); - } - writeChar('}', ostr); -} - -template -void DataTypeMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, bool need_safe_get_int_key, Reader && reader) const -{ - auto & column_map = assert_cast(column); - - auto & nested_array = column_map.getNestedColumn(); - auto & nested_tuple = column_map.getNestedData(); - auto & offsets = nested_array.getOffsets(); - - auto & key_column = nested_tuple.getColumn(0); - auto & value_column = nested_tuple.getColumn(1); - - size_t size = 0; - assertChar('{', istr); - - try - { - bool first = true; - while (!istr.eof() && *istr.position() != '}') - { - if (!first) - { - if (*istr.position() == ',') - ++istr.position(); - else - throw Exception("Cannot read Map from text", ErrorCodes::CANNOT_READ_MAP_FROM_TEXT); - } - - first = false; - - skipWhitespaceIfAny(istr); - - if (*istr.position() == '}') - break; - - if (need_safe_get_int_key) - { - ReadBuffer::Position tmp = istr.position(); - while (*tmp != ':' && *tmp != '}') - ++tmp; - *tmp = ' '; - reader(key_type, key_column); - } - else - { - reader(key_type, key_column); - skipWhitespaceIfAny(istr); - assertChar(':', istr); - } - - ++size; - skipWhitespaceIfAny(istr); - reader(value_type, value_column); - - skipWhitespaceIfAny(istr); - } - - offsets.push_back(offsets.back() + size); - assertChar('}', istr); - } - catch (...) - { - throw; - } -} - -void DataTypeMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const DataTypePtr & subcolumn_type, const IColumn & subcolumn, size_t pos) - { - subcolumn_type->serializeAsTextQuoted(subcolumn, pos, ostr, settings); - }); -} - -void DataTypeMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - // need_safe_get_int_key is set for Integer to prevent to readIntTextUnsafe - bool need_safe_get_int_key = isInteger(key_type); - - deserializeTextImpl(column, istr, need_safe_get_int_key, - [&](const DataTypePtr & subcolumn_type, IColumn & subcolumn) - { - subcolumn_type->deserializeAsTextQuoted(subcolumn, istr, settings); - }); -} - - -void DataTypeMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeTextImpl(column, row_num, ostr, - [&](const DataTypePtr & subcolumn_type, const IColumn & subcolumn, size_t pos) - { - subcolumn_type->serializeAsTextJSON(subcolumn, pos, ostr, settings); - }); -} - -void DataTypeMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - // need_safe_get_int_key is set for Integer to prevent to readIntTextUnsafe - bool need_safe_get_int_key = isInteger(key_type); - - deserializeTextImpl(column, istr, need_safe_get_int_key, - [&](const DataTypePtr & subcolumn_type, IColumn & subcolumn) - { - subcolumn_type->deserializeAsTextJSON(subcolumn, istr, settings); - }); -} - -void DataTypeMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const auto & column_map = assert_cast(column); - const auto & 
offsets = column_map.getNestedColumn().getOffsets(); - - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - - const auto & nested_data = column_map.getNestedData(); - - writeCString("", ostr); - for (size_t i = offset; i < next_offset; ++i) - { - writeCString("", ostr); - writeCString("", ostr); - key_type->serializeAsTextXML(nested_data.getColumn(0), i, ostr, settings); - writeCString("", ostr); - - writeCString("", ostr); - value_type->serializeAsTextXML(nested_data.getColumn(1), i, ostr, settings); - writeCString("", ostr); - writeCString("", ostr); - } - writeCString("", ostr); -} - -void DataTypeMap::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - WriteBufferFromOwnString wb; - serializeText(column, row_num, wb, settings); - writeCSV(wb.str(), ostr); -} - -void DataTypeMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - deserializeText(column, rb, settings); -} - - -void DataTypeMap::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - nested->enumerateStreams(callback, path); -} - -void DataTypeMap::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - nested->serializeBinaryBulkStatePrefix(settings, state); -} - -void DataTypeMap::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - nested->serializeBinaryBulkStateSuffix(settings, state); -} - -void DataTypeMap::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - nested->deserializeBinaryBulkStatePrefix(settings, state); -} - - -void DataTypeMap::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state); -} - -void DataTypeMap::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - auto & column_map = assert_cast(column); - nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache); + return nested->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); } MutableColumnPtr DataTypeMap::createColumn() const @@ -346,6 +101,14 @@ Field DataTypeMap::getDefault() const return Map(); } +SerializationPtr DataTypeMap::doGetDefaultSerialization() const +{ + return std::make_shared( + key_type->getDefaultSerialization(), + value_type->getDefaultSerialization(), + nested->getDefaultSerialization()); +} + bool DataTypeMap::equals(const IDataType & rhs) const { if (typeid(rhs) != typeid(*this)) diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 88ea44a0d5a..09b8448885a 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -11,7 +11,7 @@ namespace DB * Serialization of type 'Map(K, V)' is similar to serialization. 
* of 'Array(Tuple(keys K, values V))' or in other words of 'Nested(keys K, valuev V)'. */ -class DataTypeMap final : public DataTypeWithSimpleSerialization +class DataTypeMap final : public IDataType { private: DataTypePtr key_type; @@ -34,47 +34,8 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; - - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; MutableColumnPtr createColumn() const override; @@ -88,15 +49,12 @@ public: const DataTypePtr & getKeyType() const { return key_type; } const DataTypePtr & getValueType() const { return value_type; } DataTypes getKeyValueTypes() const { return {key_type, value_type}; } - const DataTypePtr & getNestedType() const { return nested; } -private: - template - void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const; + SerializationPtr doGetDefaultSerialization() const override; - template - void deserializeTextImpl(IColumn & column, ReadBuffer & istr, bool need_safe_get_int_key, Reader && reader) const; +private: + void assertKeyType() const; }; } diff --git a/src/DataTypes/DataTypeNested.cpp b/src/DataTypes/DataTypeNested.cpp index cfbfb4c1750..eba1bba5dfe 100644 --- a/src/DataTypes/DataTypeNested.cpp +++ 
b/src/DataTypes/DataTypeNested.cpp @@ -57,7 +57,7 @@ static std::pair create(const ASTPtr & argum auto data_type = std::make_shared(std::make_shared(nested_types, nested_names)); auto custom_name = std::make_unique(nested_types, nested_names); - return std::make_pair(std::move(data_type), std::make_unique(std::move(custom_name), nullptr)); + return std::make_pair(std::move(data_type), std::make_unique(std::move(custom_name))); } void registerDataTypeNested(DataTypeFactory & factory) diff --git a/src/DataTypes/DataTypeNested.h b/src/DataTypes/DataTypeNested.h index 9fb12ad4924..1ad06477a6e 100644 --- a/src/DataTypes/DataTypeNested.h +++ b/src/DataTypes/DataTypeNested.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include namespace DB diff --git a/src/DataTypes/DataTypeNothing.cpp b/src/DataTypes/DataTypeNothing.cpp index 94a7fd75071..388a65754b5 100644 --- a/src/DataTypes/DataTypeNothing.cpp +++ b/src/DataTypes/DataTypeNothing.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -14,27 +15,16 @@ MutableColumnPtr DataTypeNothing::createColumn() const return ColumnNothing::create(0); } -void DataTypeNothing::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - size_t size = column.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - for (size_t i = 0; i < limit; ++i) - ostr.write('0'); -} - -void DataTypeNothing::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - typeid_cast(column).addSize(istr.tryIgnore(limit)); -} - bool DataTypeNothing::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeNothing::doGetDefaultSerialization() const +{ + return std::make_shared(); +} + void registerDataTypeNothing(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypeNothing.h b/src/DataTypes/DataTypeNothing.h index e9421fb15e8..c7d12388de9 100644 --- a/src/DataTypes/DataTypeNothing.h +++ b/src/DataTypes/DataTypeNothing.h @@ -20,10 +20,6 @@ public: MutableColumnPtr createColumn() const override; - /// These methods read and write zero bytes just to allow to figure out size of column. 
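// Standalone sketch (plain iostreams, illustrative names) of the removed DataTypeNothing bulk
// format shown above: the column carries no data, so serialization writes one placeholder byte
// per row and deserialization simply skips up to `limit` bytes, growing the column by however
// many bytes it could skip.
#include <cstddef>
#include <istream>
#include <ostream>
#include <string>

void serializeNothingBulkSketch(size_t rows, std::ostream & out)
{
    for (size_t i = 0; i < rows; ++i)
        out.put('0');  // placeholder byte; the value itself carries no information
}

size_t deserializeNothingBulkSketch(std::istream & in, size_t limit)
{
    std::string ignored(limit, '\0');
    in.read(ignored.data(), static_cast<std::streamsize>(limit));
    return static_cast<size_t>(in.gcount());  // number of rows to add to the Nothing column
}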
- void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - bool equals(const IDataType & rhs) const override; bool isParametric() const override { return false; } @@ -31,6 +27,8 @@ public: bool haveMaximumSizeOfValue() const override { return true; } size_t getSizeOfValueInMemory() const override { return 0; } bool canBeInsideNullable() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 903ebeb3ddc..3820a320c6d 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -2,7 +2,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -22,7 +23,6 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int CANNOT_READ_ALL_DATA; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -42,450 +42,6 @@ bool DataTypeNullable::onlyNull() const } -void DataTypeNullable::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::NullMap); - callback(path, *this); - path.back() = Substream::NullableElements; - nested_data_type->enumerateStreams(callback, path); - path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->serializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->serializeBinaryBulkStateSuffix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - settings.path.push_back(Substream::NullableElements); - nested_data_type->deserializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - const ColumnNullable & col = assert_cast(column); - col.checkConsistency(); - - /// First serialize null map. - settings.path.push_back(Substream::NullMap); - if (auto * stream = settings.getter(settings.path)) - DataTypeUInt8().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit); - - /// Then serialize contents of arrays. 
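// Standalone sketch (plain STL, illustrative names) of the two-part layout handled here: a
// Nullable column is a UInt8 null map (1 = NULL) plus the nested column, serialized to separate
// substreams. NULL rows still occupy a default-valued slot in the nested column, so both parts
// always have the same number of rows.
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

struct NullableInt64Sketch
{
    std::vector<uint8_t> null_map;  // 1 for NULL, 0 otherwise; one entry per row
    std::vector<int64_t> nested;    // default value (0) in rows that are NULL

    void insert(std::optional<int64_t> value)
    {
        null_map.push_back(value.has_value() ? 0 : 1);
        nested.push_back(value.value_or(0));
    }

    std::optional<int64_t> operator[](size_t row) const
    {
        if (null_map[row])
            return std::nullopt;
        return nested[row];
    }
};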
- settings.path.back() = Substream::NullableElements; - nested_data_type->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state); - settings.path.pop_back(); -} - - -void DataTypeNullable::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - ColumnNullable & col = assert_cast(column); - - settings.path.push_back(Substream::NullMap); - if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) - { - col.getNullMapColumnPtr() = cached_column; - } - else if (auto * stream = settings.getter(settings.path)) - { - DataTypeUInt8().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0); - addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr()); - } - - settings.path.back() = Substream::NullableElements; - nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache); - settings.path.pop_back(); -} - - -void DataTypeNullable::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - if (field.isNull()) - { - writeBinary(true, ostr); - } - else - { - writeBinary(false, ostr); - nested_data_type->serializeBinary(field, ostr); - } -} - -void DataTypeNullable::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - bool is_null = false; - readBinary(is_null, istr); - if (!is_null) - { - nested_data_type->deserializeBinary(field, istr); - } - else - { - field = Null(); - } -} - -void DataTypeNullable::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const ColumnNullable & col = assert_cast(column); - - bool is_null = col.isNullAt(row_num); - writeBinary(is_null, ostr); - if (!is_null) - nested_data_type->serializeBinary(col.getNestedColumn(), row_num, ostr); -} - -/// Deserialize value into ColumnNullable. -/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. -template , ReturnType>* = nullptr> -static ReturnType safeDeserialize( - IColumn & column, const IDataType & /*nested_data_type*/, - CheckForNull && check_for_null, DeserializeNested && deserialize_nested) -{ - ColumnNullable & col = assert_cast(column); - - if (check_for_null()) - { - col.insertDefault(); - } - else - { - deserialize_nested(col.getNestedColumn()); - - try - { - col.getNullMapData().push_back(0); - } - catch (...) - { - col.getNestedColumn().popBack(1); - throw; - } - } -} - -/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. 
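// Standalone sketch (plain STL, illustrative names) of the exception-safety pattern used by the
// safeDeserialize overload above: the nested value is inserted first and the null-map entry
// second, and if appending to the null map throws, the nested insertion is rolled back so the
// two parts keep the same number of rows.
#include <cstdint>
#include <functional>
#include <vector>

struct NullableSketch
{
    std::vector<uint8_t> null_map;
    std::vector<int64_t> nested;
};

void safeDeserializeSketch(
    NullableSketch & column,
    const std::function<bool()> & check_for_null,
    const std::function<void(std::vector<int64_t> &)> & deserialize_nested)
{
    if (check_for_null())
    {
        // NULL: default value in the nested column, 1 in the null map.
        column.nested.push_back(0);
        column.null_map.push_back(1);
        return;
    }

    deserialize_nested(column.nested);
    try
    {
        column.null_map.push_back(0);
    }
    catch (...)
    {
        column.nested.pop_back();  // keep null map and nested column consistent
        throw;
    }
}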
-template , ReturnType>* = nullptr> -static ReturnType safeDeserialize( - IColumn & column, const IDataType & nested_data_type, - CheckForNull && check_for_null, DeserializeNested && deserialize_nested) -{ - assert(!dynamic_cast(&column)); - assert(!dynamic_cast(&nested_data_type)); - bool insert_default = check_for_null(); - if (insert_default) - nested_data_type.insertDefaultInto(column); - else - deserialize_nested(column); - return !insert_default; -} - - -void DataTypeNullable::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - safeDeserialize(column, *nested_data_type, - [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, - [this, &istr] (IColumn & nested) { nested_data_type->deserializeBinary(nested, istr); }); -} - - -void DataTypeNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); - else - nested_data_type->serializeAsTextEscaped(col.getNestedColumn(), row_num, ostr, settings); -} - - -void DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextEscaped(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - /// Little tricky, because we cannot discriminate null from first character. - - if (istr.eof()) - throw ParsingException("Unexpected end of stream, while parsing value of Nullable type", ErrorCodes::CANNOT_READ_ALL_DATA); - - /// This is not null, surely. - if (*istr.position() != '\\') - { - return safeDeserialize(column, *nested_data_type, - [] { return false; }, - [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextEscaped(nested, istr, settings); }); - } - else - { - /// Now we know, that data in buffer starts with backslash. - ++istr.position(); - - if (istr.eof()) - throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA); - - return safeDeserialize(column, *nested_data_type, - [&istr] - { - if (*istr.position() == 'N') - { - ++istr.position(); - return true; - } - return false; - }, - [&nested_data_type, &istr, &settings] (IColumn & nested) - { - if (istr.position() != istr.buffer().begin()) - { - /// We could step back to consume backslash again. - --istr.position(); - nested_data_type->deserializeAsTextEscaped(nested, istr, settings); - } - else - { - /// Otherwise, we need to place backslash back in front of istr. - ReadBufferFromMemory prefix("\\", 1); - ConcatReadBuffer prepended_istr(prefix, istr); - - nested_data_type->deserializeAsTextEscaped(nested, prepended_istr, settings); - - /// Synchronise cursor position in original buffer. 
- - if (prepended_istr.count() > 1) - istr.position() = prepended_istr.position(); - } - }); - } -} - -void DataTypeNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeCString("NULL", ostr); - else - nested_data_type->serializeAsTextQuoted(col.getNestedColumn(), row_num, ostr, settings); -} - - -void DataTypeNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextQuoted(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - return safeDeserialize(column, *nested_data_type, - [&istr] - { - return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr); - }, - [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextQuoted(nested, istr, settings); }); -} - - -void DataTypeNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeWholeText(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - return safeDeserialize(column, *nested_data_type, - [&istr] - { - return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr) - || checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr); - }, - [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsWholeText(nested, istr, settings); }); -} - - -void DataTypeNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeCString("\\N", ostr); - else - nested_data_type->serializeAsTextCSV(col.getNestedColumn(), row_num, ostr, settings); -} - -void DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextCSV(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - constexpr char const * null_literal = "NULL"; - constexpr size_t len = 4; - size_t null_prefix_len = 0; - - auto check_for_null = [&istr, &settings, &null_prefix_len] - { - if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr)) - return true; - if (!settings.csv.unquoted_null_literal_as_null) - return false; - - /// Check for unquoted NULL - while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position()) - { - ++null_prefix_len; - ++istr.position(); - } - if (null_prefix_len == len) - return true; - - /// Value and "NULL" have common prefix, but value is not "NULL". - /// Restore previous buffer position if possible. 
- if (null_prefix_len <= istr.offset()) - { - istr.position() -= null_prefix_len; - null_prefix_len = 0; - } - return false; - }; - - auto deserialize_nested = [&nested_data_type, &settings, &istr, &null_prefix_len] (IColumn & nested) - { - if (likely(!null_prefix_len)) - nested_data_type->deserializeAsTextCSV(nested, istr, settings); - else - { - /// Previous buffer position was not restored, - /// so we need to prepend extracted characters (rare case) - ReadBufferFromMemory prepend(null_literal, null_prefix_len); - ConcatReadBuffer buf(prepend, istr); - nested_data_type->deserializeAsTextCSV(nested, buf, settings); - - /// Check if all extracted characters were read by nested parser and update buffer position - if (null_prefix_len < buf.count()) - istr.position() = buf.position(); - else if (null_prefix_len > buf.count()) - { - /// It can happen only if there is an unquoted string instead of a number - /// or if someone uses 'U' or 'L' as delimiter in CSV. - /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. - if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L') - throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly " - "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); - WriteBufferFromOwnString parsed_value; - nested_data_type->serializeAsTextCSV(nested, nested.size() - 1, parsed_value, settings); - throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len) - + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable(" + nested_data_type->getName() - + ") at position " + std::to_string(istr.count()) + ": expected \"NULL\" or " + nested_data_type->getName() - + ", got \"" + std::string(null_literal, buf.count()) + "\", which was deserialized as \"" - + parsed_value.str() + "\". It seems that input data is ill-formatted.", - ErrorCodes::CANNOT_READ_ALL_DATA); - } - } - }; - - return safeDeserialize(column, *nested_data_type, check_for_null, deserialize_nested); -} - -void DataTypeNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - /// In simple text format (like 'Pretty' format) (these formats are suitable only for output and cannot be parsed back), - /// data is printed without escaping. - /// It makes theoretically impossible to distinguish between NULL and some string value, regardless on how do we print NULL. - /// For this reason, we output NULL in a bit strange way. - /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange. 
- - if (col.isNullAt(row_num)) - { - if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) - writeCString("ᴺᵁᴸᴸ", ostr); - else - writeCString("NULL", ostr); - } - else - nested_data_type->serializeAsText(col.getNestedColumn(), row_num, ostr, settings); -} - -void DataTypeNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeCString("null", ostr); - else - nested_data_type->serializeAsTextJSON(col.getNestedColumn(), row_num, ostr, settings); -} - -void DataTypeNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeTextJSON(column, istr, settings, nested_data_type); -} - -template -ReturnType DataTypeNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const DataTypePtr & nested_data_type) -{ - return safeDeserialize(column, *nested_data_type, - [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); }, - [&nested_data_type, &istr, &settings] (IColumn & nested) { nested_data_type->deserializeAsTextJSON(nested, istr, settings); }); -} - -void DataTypeNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - const ColumnNullable & col = assert_cast(column); - - if (col.isNullAt(row_num)) - writeCString("\\N", ostr); - else - nested_data_type->serializeAsTextXML(col.getNestedColumn(), row_num, ostr, settings); -} - MutableColumnPtr DataTypeNullable::createColumn() const { return ColumnNullable::create(nested_data_type->createColumn(), ColumnUInt8::create()); @@ -510,7 +66,7 @@ bool DataTypeNullable::equals(const IDataType & rhs) const DataTypePtr DataTypeNullable::tryGetSubcolumnType(const String & subcolumn_name) const { if (subcolumn_name == "null") - return createOneElementTuple(std::make_shared(), subcolumn_name, false); + return std::make_shared(); return nested_data_type->tryGetSubcolumnType(subcolumn_name); } @@ -524,6 +80,20 @@ ColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, const IC return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn()); } +SerializationPtr DataTypeNullable::getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +{ + if (subcolumn_name == "null") + return std::make_shared(base_serialization_getter(DataTypeUInt8()), subcolumn_name, false); + + return nested_data_type->getSubcolumnSerialization(subcolumn_name, base_serialization_getter); +} + +SerializationPtr DataTypeNullable::doGetDefaultSerialization() const +{ + return std::make_shared(nested_data_type->getDefaultSerialization()); +} + static DataTypePtr create(const ASTPtr & arguments) { @@ -556,11 +126,4 @@ DataTypePtr removeNullable(const DataTypePtr & type) return type; } - -template bool DataTypeNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested); -template bool DataTypeNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested); -template bool DataTypeNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested); -template bool DataTypeNullable::deserializeTextCSV(IColumn & column, ReadBuffer & 
istr, const FormatSettings & settings, const DataTypePtr & nested); -template bool DataTypeNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested); - } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 5e71a1bee4d..1557179d072 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -18,61 +18,6 @@ public: const char * getFamilyName() const override { return "Nullable"; } TypeIndex getTypeId() const override { return TypeIndex::Nullable; } - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - /** It is questionable, how NULL values could be represented in CSV. There are three variants: - * 1. \N - * 2. empty string (without quotes) - * 3. NULL - * We support all of them (however, second variant is supported by CSVRowInputStream, not by deserializeTextCSV). - * (see also input_format_defaults_for_omitted_fields and input_format_csv_unquoted_null_literal_as_null settings) - * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
- */ - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -95,25 +40,16 @@ public: size_t getSizeOfValueInMemory() const override; bool onlyNull() const override; bool canBeInsideLowCardinality() const override { return nested_data_type->canBeInsideLowCardinality(); } + DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; const DataTypePtr & getNestedType() const { return nested_data_type; } - - /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) - /// If ReturnType is void, deserialize Nullable(T) - template - static ReturnType deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested); - template - static ReturnType deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested); - template - static ReturnType deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested); - template - static ReturnType deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const DataTypePtr & nested); - template - static ReturnType deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &, const DataTypePtr & nested); - private: + SerializationPtr doGetDefaultSerialization() const override; + DataTypePtr nested_data_type; }; diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index ae3e6762d27..a9df7db7334 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -13,196 +13,12 @@ namespace DB { -template -void DataTypeNumberBase::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeText(assert_cast &>(column).getData()[row_num], ostr); -} - -template -void DataTypeNumberBase::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - T x; - - if constexpr (is_integer_v && is_arithmetic_v) - readIntTextUnsafe(x, istr); - else - readText(x, istr); - - assert_cast &>(column).getData().push_back(x); -} - -template -static inline void writeDenormalNumber(T x, WriteBuffer & ostr) -{ - if constexpr (std::is_floating_point_v) - { - if (std::signbit(x)) - { - if (isNaN(x)) - writeCString("-nan", ostr); - else - writeCString("-inf", ostr); - } - else - { - if (isNaN(x)) - writeCString("nan", ostr); - else - writeCString("inf", ostr); - } - } - else - { - /// This function is not called for non floating point numbers. 
- (void)x; - } -} - - -template -void DataTypeNumberBase::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - auto x = assert_cast &>(column).getData()[row_num]; - bool is_finite = isFinite(x); - - const bool need_quote = (is_integer_v && (sizeof(T) >= 8) && settings.json.quote_64bit_integers) - || (settings.json.quote_denormals && !is_finite); - - if (need_quote) - writeChar('"', ostr); - - if (is_finite) - writeText(x, ostr); - else if (!settings.json.quote_denormals) - writeCString("null", ostr); - else - writeDenormalNumber(x, ostr); - - if (need_quote) - writeChar('"', ostr); -} - -template -void DataTypeNumberBase::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - bool has_quote = false; - if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. - { - has_quote = true; - ++istr.position(); - } - - FieldType x; - - /// null - if (!has_quote && !istr.eof() && *istr.position() == 'n') - { - ++istr.position(); - assertString("ull", istr); - - x = NaNOrZero(); - } - else - { - static constexpr bool is_uint8 = std::is_same_v; - static constexpr bool is_int8 = std::is_same_v; - - if (is_uint8 || is_int8) - { - // extra conditions to parse true/false strings into 1/0 - if (istr.eof()) - throwReadAfterEOF(); - if (*istr.position() == 't' || *istr.position() == 'f') - { - bool tmp = false; - readBoolTextWord(tmp, istr); - x = tmp; - } - else - readText(x, istr); - } - else - { - readText(x, istr); - } - - if (has_quote) - assertChar('"', istr); - } - - assert_cast &>(column).getData().push_back(x); -} - -template -void DataTypeNumberBase::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - FieldType x; - readCSV(x, istr); - assert_cast &>(column).getData().push_back(x); -} - template Field DataTypeNumberBase::getDefault() const { return NearestFieldType(); } -template -void DataTypeNumberBase::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - /// ColumnVector::ValueType is a narrower type. 
For example, UInt8, when the Field type is UInt64 - typename ColumnVector::ValueType x = get>(field); - writeBinary(x, ostr); -} - -template -void DataTypeNumberBase::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - typename ColumnVector::ValueType x; - readBinary(x, istr); - field = NearestFieldType(x); -} - -template -void DataTypeNumberBase::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - writeBinary(assert_cast &>(column).getData()[row_num], ostr); -} - -template -void DataTypeNumberBase::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - typename ColumnVector::ValueType x; - readBinary(x, istr); - assert_cast &>(column).getData().push_back(x); -} - -template -void DataTypeNumberBase::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); - - size_t size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - if (limit) - ostr.write(reinterpret_cast(&x[offset]), sizeof(typename ColumnVector::ValueType) * limit); -} - -template -void DataTypeNumberBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const -{ - typename ColumnVector::Container & x = typeid_cast &>(column).getData(); - size_t initial_size = x.size(); - x.resize(initial_size + limit); - size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(typename ColumnVector::ValueType) * limit); - x.resize(initial_size + size / sizeof(typename ColumnVector::ValueType)); -} - - template MutableColumnPtr DataTypeNumberBase::createColumn() const { @@ -212,13 +28,13 @@ MutableColumnPtr DataTypeNumberBase::createColumn() const template bool DataTypeNumberBase::isValueRepresentedByInteger() const { - return std::is_integral_v; + return is_integer_v; } template bool DataTypeNumberBase::isValueRepresentedByUnsignedInteger() const { - return std::is_integral_v && is_unsigned_v; + return is_integer_v && is_unsigned_v; } diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index 22a70ac7277..97c3563b032 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB @@ -14,7 +14,7 @@ class ColumnVector; /** Implements part of the IDataType interface, common to all numbers and for Date and DateTime. */ template -class DataTypeNumberBase : public DataTypeWithSimpleSerialization +class DataTypeNumberBase : public IDataType { static_assert(IsNumber); @@ -30,21 +30,8 @@ public: const char * getFamilyName() const override { return family_name; } TypeIndex getTypeId() const override { return type_id; } - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; Field getDefault() const override; - /** Format is platform-dependent. 
*/ - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - MutableColumnPtr createColumn() const override; bool isParametric() const override { return false; } @@ -53,7 +40,7 @@ public: bool shouldAlignRightInPrettyFormats() const override { /// Just a number, without customizations. Counterexample: IPv4. - return !custom_text_serialization; + return !custom_serialization; } bool textCanContainOnlyValidUTF8() const override { return true; } @@ -66,6 +53,8 @@ public: size_t getSizeOfValueInMemory() const override { return sizeof(T); } bool isCategorial() const override { return isValueRepresentedByInteger(); } bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { return std::make_shared>(); } }; /// Prevent implicit template instantiation of DataTypeNumberBase for common numeric types diff --git a/src/DataTypes/DataTypeOneElementTuple.cpp b/src/DataTypes/DataTypeOneElementTuple.cpp deleted file mode 100644 index a4169220362..00000000000 --- a/src/DataTypes/DataTypeOneElementTuple.cpp +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace -{ - -/** Custom substreams representation for single subcolumn. - * It serializes/deserializes column as a nested type, but in that way - * if it was a named tuple with one element and a given name. 
- */ -class DataTypeOneElementTupleStreams : public IDataTypeCustomStreams -{ -private: - DataTypePtr nested; - String name; - bool escape_delimiter; - -public: - DataTypeOneElementTupleStreams(const DataTypePtr & nested_, const String & name_, bool escape_delimiter_) - : nested(nested_), name(name_), escape_delimiter(escape_delimiter_) {} - - void enumerateStreams( - const IDataType::StreamCallback & callback, - IDataType::SubstreamPath & path) const override - { - addToPath(path); - nested->enumerateStreams(callback, path); - path.pop_back(); - } - - void serializeBinaryBulkStatePrefix( - IDataType:: SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const override - { - addToPath(settings.path); - nested->serializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); - } - - void serializeBinaryBulkStateSuffix( - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const override - { - addToPath(settings.path); - nested->serializeBinaryBulkStateSuffix(settings, state); - settings.path.pop_back(); - } - - void deserializeBinaryBulkStatePrefix( - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state) const override - { - addToPath(settings.path); - nested->deserializeBinaryBulkStatePrefix(settings, state); - settings.path.pop_back(); - } - - void serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - IDataType::SerializeBinaryBulkSettings & settings, - IDataType::SerializeBinaryBulkStatePtr & state) const override - { - addToPath(settings.path); - nested->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); - settings.path.pop_back(); - } - - void deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - IDataType::DeserializeBinaryBulkSettings & settings, - IDataType::DeserializeBinaryBulkStatePtr & state, - IDataType::SubstreamsCache * cache) const override - { - addToPath(settings.path); - nested->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); - settings.path.pop_back(); - } - -private: - void addToPath(IDataType::SubstreamPath & path) const - { - path.push_back(IDataType::Substream::TupleElement); - path.back().tuple_element_name = name; - path.back().escape_tuple_delimiter = escape_delimiter; - } -}; - -} - -DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter) -{ - auto custom_desc = std::make_unique( - std::make_unique(type->getName()),nullptr, - std::make_unique(type, name, escape_delimiter)); - - return DataTypeFactory::instance().getCustom(std::move(custom_desc)); -} - -} diff --git a/src/DataTypes/DataTypeOneElementTuple.h b/src/DataTypes/DataTypeOneElementTuple.h deleted file mode 100644 index 03b0511ef4a..00000000000 --- a/src/DataTypes/DataTypeOneElementTuple.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter = true); - -} diff --git a/src/DataTypes/DataTypeString.cpp b/src/DataTypes/DataTypeString.cpp index d760df5075d..41ae578a70f 100644 --- a/src/DataTypes/DataTypeString.cpp +++ b/src/DataTypes/DataTypeString.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -34,281 +35,6 @@ namespace ErrorCodes extern const int UNEXPECTED_AST_STRUCTURE; } - -void DataTypeString::serializeBinary(const 
Field & field, WriteBuffer & ostr) const -{ - const String & s = get(field); - writeVarUInt(s.size(), ostr); - writeString(s, ostr); -} - - -void DataTypeString::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - UInt64 size; - readVarUInt(size, istr); - field = String(); - String & s = get(field); - s.resize(size); - istr.readStrict(s.data(), size); -} - - -void DataTypeString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - const StringRef & s = assert_cast(column).getDataAt(row_num); - writeVarUInt(s.size, ostr); - writeString(s, ostr); -} - - -void DataTypeString::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - ColumnString & column_string = assert_cast(column); - ColumnString::Chars & data = column_string.getChars(); - ColumnString::Offsets & offsets = column_string.getOffsets(); - - UInt64 size; - readVarUInt(size, istr); - - size_t old_chars_size = data.size(); - size_t offset = old_chars_size + size + 1; - offsets.push_back(offset); - - try - { - data.resize(offset); - istr.readStrict(reinterpret_cast(&data[offset - size - 1]), size); - data.back() = 0; - } - catch (...) - { - offsets.pop_back(); - data.resize_assume_reserved(old_chars_size); - throw; - } -} - - -void DataTypeString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const -{ - const ColumnString & column_string = typeid_cast(column); - const ColumnString::Chars & data = column_string.getChars(); - const ColumnString::Offsets & offsets = column_string.getOffsets(); - - size_t size = column.size(); - if (!size) - return; - - size_t end = limit && offset + limit < size - ? offset + limit - : size; - - if (offset == 0) - { - UInt64 str_size = offsets[0] - 1; - writeVarUInt(str_size, ostr); - ostr.write(reinterpret_cast(data.data()), str_size); - - ++offset; - } - - for (size_t i = offset; i < end; ++i) - { - UInt64 str_size = offsets[i] - offsets[i - 1] - 1; - writeVarUInt(str_size, ostr); - ostr.write(reinterpret_cast(&data[offsets[i - 1]]), str_size); - } -} - - -template -static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit) -{ - size_t offset = data.size(); - for (size_t i = 0; i < limit; ++i) - { - if (istr.eof()) - break; - - UInt64 size; - readVarUInt(size, istr); - - offset += size + 1; - offsets.push_back(offset); - - data.resize(offset); - - if (size) - { -#ifdef __SSE2__ - /// An optimistic branch in which more efficient copying is possible. 
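A minimal sketch (not part of this patch) of the wire format used by the removed DataTypeString binary serialization above: each value is a variable-length unsigned size followed by the raw bytes. The writeVarUInt/readVarUInt helpers here are simplified LEB128 stand-ins for ClickHouse's own IO helpers; all names are illustrative.

// Illustrative only: length-prefixed string values, one varint size + raw bytes per value.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

static void writeVarUInt(uint64_t x, std::vector<uint8_t> & out)
{
    while (x >= 0x80)
    {
        out.push_back(static_cast<uint8_t>(x) | 0x80);
        x >>= 7;
    }
    out.push_back(static_cast<uint8_t>(x));
}

static uint64_t readVarUInt(const std::vector<uint8_t> & in, size_t & pos)
{
    uint64_t x = 0;
    for (int shift = 0; shift < 64; shift += 7)
    {
        uint8_t byte = in.at(pos++);
        x |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if (!(byte & 0x80))
            return x;
    }
    throw std::runtime_error("malformed varint");
}

/// One value = varint(size) followed by `size` bytes of the string.
static void serializeString(const std::string & s, std::vector<uint8_t> & out)
{
    writeVarUInt(s.size(), out);
    out.insert(out.end(), s.begin(), s.end());
}

static std::string deserializeString(const std::vector<uint8_t> & in, size_t & pos)
{
    uint64_t size = readVarUInt(in, pos);
    std::string s(in.begin() + pos, in.begin() + pos + size);
    pos += size;
    return s;
}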
- if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end()) - { - const __m128i * sse_src_pos = reinterpret_cast(istr.position()); - const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES; - __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]); - - while (sse_src_pos < sse_src_end) - { - for (size_t j = 0; j < UNROLL_TIMES; ++j) - _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j)); - - sse_src_pos += UNROLL_TIMES; - sse_dst_pos += UNROLL_TIMES; - } - - istr.position() += size; - } - else -#endif - { - istr.readStrict(reinterpret_cast(&data[offset - size - 1]), size); - } - } - - data[offset - 1] = 0; - } -} - - -void DataTypeString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const -{ - ColumnString & column_string = typeid_cast(column); - ColumnString::Chars & data = column_string.getChars(); - ColumnString::Offsets & offsets = column_string.getOffsets(); - - double avg_chars_size = 1; /// By default reserve only for empty strings. - - if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0])) - { - /// Randomly selected. - constexpr auto avg_value_size_hint_reserve_multiplier = 1.2; - - avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier; - } - - size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size); - - /// Never reserve for too big size. - if (size_to_reserve < 256 * 1024 * 1024) - { - try - { - data.reserve(size_to_reserve); - } - catch (Exception & e) - { - e.addMessage( - "(avg_value_size_hint = " + toString(avg_value_size_hint) - + ", avg_chars_size = " + toString(avg_chars_size) - + ", limit = " + toString(limit) + ")"); - throw; - } - } - - offsets.reserve(offsets.size() + limit); - - if (avg_chars_size >= 64) - deserializeBinarySSE2<4>(data, offsets, istr, limit); - else if (avg_chars_size >= 48) - deserializeBinarySSE2<3>(data, offsets, istr, limit); - else if (avg_chars_size >= 32) - deserializeBinarySSE2<2>(data, offsets, istr, limit); - else - deserializeBinarySSE2<1>(data, offsets, istr, limit); -} - - -void DataTypeString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeString(assert_cast(column).getDataAt(row_num), ostr); -} - - -void DataTypeString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeEscapedString(assert_cast(column).getDataAt(row_num), ostr); -} - - -template -static inline void read(IColumn & column, Reader && reader) -{ - ColumnString & column_string = assert_cast(column); - ColumnString::Chars & data = column_string.getChars(); - ColumnString::Offsets & offsets = column_string.getOffsets(); - size_t old_chars_size = data.size(); - size_t old_offsets_size = offsets.size(); - try - { - reader(data); - data.push_back(0); - offsets.push_back(data.size()); - } - catch (...) 
- { - offsets.resize_assume_reserved(old_offsets_size); - data.resize_assume_reserved(old_chars_size); - throw; - } -} - - -void DataTypeString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); }); -} - - -void DataTypeString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); -} - - -void DataTypeString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); -} - - -void DataTypeString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); -} - - -void DataTypeString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeJSONString(assert_cast(column).getDataAt(row_num), ostr, settings); -} - - -void DataTypeString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); -} - - -void DataTypeString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeXMLStringForTextElement(assert_cast(column).getDataAt(row_num), ostr); -} - - -void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeCSVString<>(assert_cast(column).getDataAt(row_num), ostr); -} - - -void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); -} - - Field DataTypeString::getDefault() const { return String(); @@ -325,6 +51,11 @@ bool DataTypeString::equals(const IDataType & rhs) const return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeString::doGetDefaultSerialization() const +{ + return std::make_shared(); +} + static DataTypePtr create(const ASTPtr & arguments) { if (arguments && !arguments->children.empty()) diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index 7f8aa1fd0cf..0fc38e9c6f0 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -22,31 +22,6 @@ public: TypeIndex getTypeId() const override { return type_id; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextEscaped(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -61,6 +36,8 @@ public: bool isCategorial() const override { return true; } bool canBeInsideNullable() const override { return true; } bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 2261e776ea2..b30efb163ab 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -5,7 +5,9 @@ #include #include #include -#include +#include +#include +#include #include #include #include @@ -29,7 +31,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int DUPLICATE_COLUMN; extern const int EMPTY_DATA_PASSED; - extern const int LOGICAL_ERROR; extern const int NOT_FOUND_COLUMN_IN_BLOCK; extern const int ILLEGAL_COLUMN; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; @@ -113,32 +114,6 @@ static inline const IColumn & extractElementColumn(const IColumn & column, size_ return assert_cast(column).getColumn(idx); } - -void DataTypeTuple::serializeBinary(const Field & field, WriteBuffer & ostr) const -{ - const auto & tuple = get(field); - for (const auto idx_elem : ext::enumerate(elems)) - idx_elem.second->serializeBinary(tuple[idx_elem.first], ostr); -} - -void DataTypeTuple::deserializeBinary(Field & field, ReadBuffer & istr) const -{ - const size_t size = elems.size(); - - Tuple tuple(size); - for (const auto i : ext::range(0, size)) - elems[i]->deserializeBinary(tuple[i], istr); - - field = tuple; -} - -void DataTypeTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const -{ - for (const auto idx_elem : ext::enumerate(elems)) - idx_elem.second->serializeBinary(extractElementColumn(column, idx_elem.first), row_num, ostr); -} - - template static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl) { @@ -151,7 +126,8 @@ static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl) // Check that all columns now have the same size. 
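The addElementSafe helper that begins just above (and continues below) keeps a tuple's element columns consistent when inserting one row fails partway through. A minimal sketch of that rollback pattern, not part of this patch, with plain std::vector columns standing in for IColumn:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

struct TupleColumns
{
    std::vector<int> a;
    std::vector<std::string> b;
};

template <typename F>
void addElementSafe(TupleColumns & columns, F && insert_row)
{
    const size_t old_size = columns.a.size();
    try
    {
        insert_row();

        /// Check that all element columns now have the same size.
        if (columns.a.size() != old_size + 1 || columns.b.size() != old_size + 1)
            throw std::runtime_error("Cannot read a tuple because not all elements are present");
    }
    catch (...)
    {
        /// Roll back any element column that already grew, so sizes stay equal.
        if (columns.a.size() > old_size)
            columns.a.pop_back();
        if (columns.b.size() > old_size)
            columns.b.pop_back();
        throw;
    }
}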
size_t new_size = column.size(); - for (auto i : ext::range(1, ext::size(elems))) + + for (auto i : ext::range(0, ext::size(elems))) { const auto & element_column = extractElementColumn(column, i); if (element_column.size() != new_size) @@ -168,6 +144,7 @@ static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl) for (const auto & i : ext::range(0, ext::size(elems))) { auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) element_column.popBack(1); } @@ -176,334 +153,6 @@ static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl) } } - -void DataTypeTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) const -{ - addElementSafe(elems, column, [&] - { - for (const auto & i : ext::range(0, ext::size(elems))) - elems[i]->deserializeBinary(extractElementColumn(column, i), istr); - }); -} - -void DataTypeTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('(', ostr); - for (const auto i : ext::range(0, ext::size(elems))) - { - if (i != 0) - writeChar(',', ostr); - elems[i]->serializeAsTextQuoted(extractElementColumn(column, i), row_num, ostr, settings); - } - writeChar(')', ostr); -} - -void DataTypeTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - const size_t size = elems.size(); - assertChar('(', istr); - - addElementSafe(elems, column, [&] - { - for (const auto i : ext::range(0, size)) - { - skipWhitespaceIfAny(istr); - if (i != 0) - { - assertChar(',', istr); - skipWhitespaceIfAny(istr); - } - elems[i]->deserializeAsTextQuoted(extractElementColumn(column, i), istr, settings); - } - }); - - // Special format for one element tuple (1,) - if (1 == elems.size()) - { - skipWhitespaceIfAny(istr); - // Allow both (1) and (1,) - checkChar(',', istr); - } - skipWhitespaceIfAny(istr); - assertChar(')', istr); -} - -void DataTypeTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (settings.json.named_tuples_as_objects - && have_explicit_names) - { - writeChar('{', ostr); - for (const auto i : ext::range(0, ext::size(elems))) - { - if (i != 0) - { - writeChar(',', ostr); - } - writeJSONString(names[i], ostr, settings); - writeChar(':', ostr); - elems[i]->serializeAsTextJSON(extractElementColumn(column, i), row_num, ostr, settings); - } - writeChar('}', ostr); - } - else - { - writeChar('[', ostr); - for (const auto i : ext::range(0, ext::size(elems))) - { - if (i != 0) - writeChar(',', ostr); - elems[i]->serializeAsTextJSON(extractElementColumn(column, i), row_num, ostr, settings); - } - writeChar(']', ostr); - } -} - -void DataTypeTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (settings.json.named_tuples_as_objects - && have_explicit_names) - { - skipWhitespaceIfAny(istr); - assertChar('{', istr); - skipWhitespaceIfAny(istr); - - addElementSafe(elems, column, [&] - { - // Require all elements but in arbitrary order. 
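The tuple JSON code above and below writes a named tuple either as a JSON object or as a JSON array, depending on settings.json.named_tuples_as_objects and have_explicit_names. A small self-contained sketch of that branching, not part of this patch; it uses already-rendered scalar strings and omits name escaping for brevity:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct TupleRow
{
    std::vector<std::string> names;
    std::vector<std::string> values;  /// element values already rendered as JSON scalars
    bool have_explicit_names;
};

std::string serializeTupleRowJSON(const TupleRow & row, bool named_tuples_as_objects)
{
    std::ostringstream out;
    const bool as_object = named_tuples_as_objects && row.have_explicit_names;

    out << (as_object ? '{' : '[');
    for (size_t i = 0; i < row.values.size(); ++i)
    {
        if (i != 0)
            out << ',';
        if (as_object)
            out << '"' << row.names[i] << "\":";
        out << row.values[i];
    }
    out << (as_object ? '}' : ']');
    return out.str();
}

int main()
{
    TupleRow row{{"id", "name"}, {"1", "\"abc\""}, true};
    std::cout << serializeTupleRowJSON(row, true) << '\n';   /// {"id":1,"name":"abc"}
    std::cout << serializeTupleRowJSON(row, false) << '\n';  /// [1,"abc"]
}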
- for (auto i : ext::range(0, ext::size(elems))) - { - if (i > 0) - { - skipWhitespaceIfAny(istr); - assertChar(',', istr); - skipWhitespaceIfAny(istr); - } - - std::string name; - readDoubleQuotedString(name, istr); - skipWhitespaceIfAny(istr); - assertChar(':', istr); - skipWhitespaceIfAny(istr); - - const size_t element_pos = getPositionByName(name); - auto & element_column = extractElementColumn(column, element_pos); - elems[element_pos]->deserializeAsTextJSON(element_column, istr, settings); - } - }); - - skipWhitespaceIfAny(istr); - assertChar('}', istr); - } - else - { - const size_t size = elems.size(); - assertChar('[', istr); - - addElementSafe(elems, column, [&] - { - for (const auto i : ext::range(0, size)) - { - skipWhitespaceIfAny(istr); - if (i != 0) - { - assertChar(',', istr); - skipWhitespaceIfAny(istr); - } - elems[i]->deserializeAsTextJSON(extractElementColumn(column, i), istr, settings); - } - }); - - skipWhitespaceIfAny(istr); - assertChar(']', istr); - } -} - -void DataTypeTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeCString("", ostr); - for (const auto i : ext::range(0, ext::size(elems))) - { - writeCString("", ostr); - elems[i]->serializeAsTextXML(extractElementColumn(column, i), row_num, ostr, settings); - writeCString("", ostr); - } - writeCString("", ostr); -} - -void DataTypeTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - for (const auto i : ext::range(0, ext::size(elems))) - { - if (i != 0) - writeChar(',', ostr); - elems[i]->serializeAsTextCSV(extractElementColumn(column, i), row_num, ostr, settings); - } -} - -void DataTypeTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - addElementSafe(elems, column, [&] - { - const size_t size = elems.size(); - for (const auto i : ext::range(0, size)) - { - if (i != 0) - { - skipWhitespaceIfAny(istr); - assertChar(settings.csv.delimiter, istr); - skipWhitespaceIfAny(istr); - } - elems[i]->deserializeAsTextCSV(extractElementColumn(column, i), istr, settings); - } - }); -} - -void DataTypeTuple::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const -{ - path.push_back(Substream::TupleElement); - for (const auto i : ext::range(0, ext::size(elems))) - { - path.back().tuple_element_name = names[i]; - elems[i]->enumerateStreams(callback, path); - } - path.pop_back(); -} - -struct SerializeBinaryBulkStateTuple : public IDataType::SerializeBinaryBulkState -{ - std::vector states; -}; - -struct DeserializeBinaryBulkStateTuple : public IDataType::DeserializeBinaryBulkState -{ - std::vector states; -}; - -static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(IDataType::SerializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); - - auto * tuple_state = typeid_cast(state.get()); - if (!tuple_state) - { - auto & state_ref = *state; - throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. 
Expected: " - + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return tuple_state; -} - -static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(IDataType::DeserializeBinaryBulkStatePtr & state) -{ - if (!state) - throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); - - auto * tuple_state = typeid_cast(state.get()); - if (!tuple_state) - { - auto & state_ref = *state; - throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: " - + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got " - + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); - } - - return tuple_state; -} - -void DataTypeTuple::serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - auto tuple_state = std::make_shared(); - tuple_state->states.resize(elems.size()); - - settings.path.push_back(Substream::TupleElement); - for (size_t i = 0; i < elems.size(); ++i) - { - settings.path.back().tuple_element_name = names[i]; - elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); - } - settings.path.pop_back(); - - state = std::move(tuple_state); -} - -void DataTypeTuple::serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - auto * tuple_state = checkAndGetTupleSerializeState(state); - - settings.path.push_back(Substream::TupleElement); - for (size_t i = 0; i < elems.size(); ++i) - { - settings.path.back().tuple_element_name = names[i]; - elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]); - } - settings.path.pop_back(); -} - -void DataTypeTuple::deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - auto tuple_state = std::make_shared(); - tuple_state->states.resize(elems.size()); - - settings.path.push_back(Substream::TupleElement); - for (size_t i = 0; i < elems.size(); ++i) - { - settings.path.back().tuple_element_name = names[i]; - elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); - } - settings.path.pop_back(); - - state = std::move(tuple_state); -} - -void DataTypeTuple::serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - auto * tuple_state = checkAndGetTupleSerializeState(state); - - settings.path.push_back(Substream::TupleElement); - for (const auto i : ext::range(0, ext::size(elems))) - { - settings.path.back().tuple_element_name = names[i]; - const auto & element_col = extractElementColumn(column, i); - elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]); - } - settings.path.pop_back(); -} - -void DataTypeTuple::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - auto * tuple_state = checkAndGetTupleDeserializeState(state); - auto & column_tuple = assert_cast(column); - - settings.path.push_back(Substream::TupleElement); - settings.avg_value_size_hint = 0; - for (const auto i : ext::range(0, ext::size(elems))) - { - settings.path.back().tuple_element_name = names[i]; - 
elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache); - } - settings.path.pop_back(); -} - MutableColumnPtr DataTypeTuple::createColumn() const { size_t size = elems.size(); @@ -587,47 +236,98 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const return res; } -DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const +template +auto DataTypeTuple::getSubcolumnEntity(const String & subcolumn_name, + const OnSuccess & on_success, const OnContinue & on_continue) const { + using ReturnType = decltype(on_success(0)); for (size_t i = 0; i < names.size(); ++i) { if (startsWith(subcolumn_name, names[i])) { size_t name_length = names[i].size(); - DataTypePtr subcolumn_type; - if (subcolumn_name.size() == name_length) - subcolumn_type = elems[i]; - else if (subcolumn_name[name_length] == '.') - subcolumn_type = elems[i]->tryGetSubcolumnType(subcolumn_name.substr(name_length + 1)); - if (subcolumn_type) - return createOneElementTuple(std::move(subcolumn_type), names[i]); + if (subcolumn_name.size() == name_length) + return on_success(i); + + if (subcolumn_name[name_length] == '.') + return on_continue(i, subcolumn_name.substr(name_length + 1)); } } - return nullptr; + return ReturnType{}; +} + +DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const +{ + if (subcolumn_name == MAIN_SUBCOLUMN_NAME) + return shared_from_this(); + + auto on_success = [&](size_t pos) { return elems[pos]; }; + auto on_continue = [&](size_t pos, const String & next_subcolumn) { return elems[pos]->tryGetSubcolumnType(next_subcolumn); }; + + return getSubcolumnEntity(subcolumn_name, on_success, on_continue); } ColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, const IColumn & column) const { - for (size_t i = 0; i < names.size(); ++i) + auto on_success = [&](size_t pos) { return extractElementColumn(column, pos).getPtr(); }; + auto on_continue = [&](size_t pos, const String & next_subcolumn) { - if (startsWith(subcolumn_name, names[i])) - { - size_t name_length = names[i].size(); - const auto & subcolumn = extractElementColumn(column, i); + return elems[pos]->getSubcolumn(next_subcolumn, extractElementColumn(column, pos)); + }; - if (subcolumn_name.size() == name_length) - return subcolumn.assumeMutable(); - - if (subcolumn_name[name_length] == '.') - return elems[i]->getSubcolumn(subcolumn_name.substr(name_length + 1), subcolumn); - } - } + if (auto subcolumn = getSubcolumnEntity(subcolumn_name, on_success, on_continue)) + return subcolumn; throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); } +SerializationPtr DataTypeTuple::getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const +{ + auto on_success = [&](size_t pos) + { + return std::make_shared(base_serialization_getter(*elems[pos]), names[pos]); + }; + + auto on_continue = [&](size_t pos, const String & next_subcolumn) + { + auto next_serialization = elems[pos]->getSubcolumnSerialization(next_subcolumn, base_serialization_getter); + return std::make_shared(next_serialization, names[pos]); + }; + + if (auto serialization = getSubcolumnEntity(subcolumn_name, on_success, on_continue)) + return serialization; + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +} + + +SerializationPtr DataTypeTuple::doGetDefaultSerialization() const +{ + 
SerializationTuple::ElementSerializations serializations(elems.size()); + for (size_t i = 0; i < elems.size(); ++i) + { + auto serialization = elems[i]->getDefaultSerialization(); + serializations[i] = std::make_shared(serialization, names[i]); + } + + return std::make_shared(std::move(serializations), have_explicit_names); +} + +SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const StreamExistenceCallback & callback) const +{ + SerializationTuple::ElementSerializations serializations(elems.size()); + for (size_t i = 0; i < elems.size(); ++i) + { + auto subcolumn_name = Nested::concatenateName(column_name, names[i]); + auto serializaion = elems[i]->getSerialization(subcolumn_name, callback); + serializations[i] = std::make_shared(serializaion, names[i]); + } + + return std::make_shared(std::move(serializations), have_explicit_names); +} static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 12ccf574c0e..e572b23f987 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -16,7 +16,7 @@ namespace DB * * All tuples with same size and types of elements are equivalent for expressions, regardless to names of elements. */ -class DataTypeTuple final : public DataTypeWithSimpleSerialization +class DataTypeTuple final : public IDataType { private: DataTypes elems; @@ -37,50 +37,6 @@ public: bool canBeInsideNullable() const override { return false; } - void serializeBinary(const Field & field, WriteBuffer & ostr) const override; - void deserializeBinary(Field & field, ReadBuffer & istr) const override; - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - - /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - - /** Each sub-column in a tuple is serialized in separate stream. 
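The new DataTypeTuple::getSerialization above asks each element for a serialization under a per-element name built with Nested::concatenateName, which is how nested tuples expand into dotted subcolumn names. A sketch of that expansion, not part of this patch; the assumption that concatenateName simply joins parent and element with a dot is mine, and all types here are toy stand-ins:

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/// A tuple type modelled only by its (element name, element type) pairs; scalars have none.
struct ToyType
{
    std::vector<std::pair<std::string, std::shared_ptr<ToyType>>> elements;
};

/// Assumed behaviour of Nested::concatenateName: join parent and element with a dot.
static std::string concatenateName(const std::string & column, const std::string & element)
{
    return column.empty() ? element : column + "." + element;
}

static void enumerateSubcolumns(const std::string & column_name, const ToyType & type,
                                std::vector<std::string> & out)
{
    for (const auto & [element_name, element_type] : type.elements)
    {
        auto full_name = concatenateName(column_name, element_name);
        out.push_back(full_name);
        enumerateSubcolumns(full_name, *element_type, out);
    }
}

int main()
{
    auto scalar = std::make_shared<ToyType>();   /// e.g. UInt32 or String
    auto inner = std::make_shared<ToyType>();    /// Tuple(c String, d UInt32)
    inner->elements = {{"c", scalar}, {"d", scalar}};

    ToyType t;                                   /// t Tuple(a UInt32, b Tuple(c String, d UInt32))
    t.elements = {{"a", scalar}, {"b", inner}};

    std::vector<std::string> names;
    enumerateSubcolumns("t", t, names);
    for (const auto & name : names)
        std::cout << name << '\n';               /// t.a, t.b, t.b.c, t.b.d
}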
- */ - void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override; - - void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; - - void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const override; - - void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -99,6 +55,13 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override; ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override; + SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override; + + SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const override; + + SerializationPtr doGetDefaultSerialization() const override; + const DataTypes & getElements() const { return elems; } const Strings & getElementNames() const { return names; } @@ -106,6 +69,11 @@ public: bool haveExplicitNames() const { return have_explicit_names; } bool serializeNames() const { return serialize_names; } + +private: + template + auto getSubcolumnEntity(const String & subcolumn_name, + const OnSuccess & on_success, const OnContinue & on_continue) const; }; } diff --git a/src/DataTypes/DataTypeUUID.cpp b/src/DataTypes/DataTypeUUID.cpp index b66cbadaef0..387ccc56a71 100644 --- a/src/DataTypes/DataTypeUUID.cpp +++ b/src/DataTypes/DataTypeUUID.cpp @@ -1,87 +1,20 @@ #include #include -#include -#include -#include -#include +#include namespace DB { -void DataTypeUUID::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - writeText(UUID(assert_cast(column).getData()[row_num]), ostr); -} - -void DataTypeUUID::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID x; - readText(x, istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - deserializeText(column, istr, settings); -} - -void DataTypeUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} - -void DataTypeUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); -} - -void DataTypeUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID x; - assertChar('\'', istr); - readText(x, istr); - assertChar('\'', istr); - assert_cast(column).getData().push_back(x); /// It's 
important to do this at the end - for exception safety. -} - -void DataTypeUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID x; - assertChar('"', istr); - readText(x, istr); - assertChar('"', istr); - assert_cast(column).getData().push_back(x); -} - -void DataTypeUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); -} - -void DataTypeUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - UUID value; - readCSV(value, istr); - assert_cast(column).getData().push_back(value); -} - bool DataTypeUUID::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); } +SerializationPtr DataTypeUUID::doGetDefaultSerialization() const +{ + return std::make_shared(); +} void registerDataTypeUUID(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index de0c7c7d8cf..1546ca385a4 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -16,22 +16,13 @@ public: bool equals(const IDataType & rhs) const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - bool canBeUsedInBitOperations() const override { return true; } bool canBeInsideNullable() const override { return true; } bool canBeInsideLowCardinality() const override { return false; } bool canBePromoted() const override { return false; } + + SerializationPtr doGetDefaultSerialization() const override; }; } diff --git a/src/DataTypes/DataTypesDecimal.cpp b/src/DataTypes/DataTypesDecimal.cpp index 160e09d92d8..cecfcea8dac 100644 --- a/src/DataTypes/DataTypesDecimal.cpp +++ b/src/DataTypes/DataTypesDecimal.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -46,55 +47,6 @@ DataTypePtr DataTypeDecimal::promoteNumericType() const return std::make_shared(PromotedType::maxPrecision(), this->scale); } -template -void DataTypeDecimal::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const -{ - T value = 
assert_cast(column).getData()[row_num]; - writeText(value, this->scale, ostr); -} - -template -bool DataTypeDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) -{ - UInt32 unread_scale = scale; - if (!tryReadDecimalText(istr, x, precision, unread_scale)) - return false; - - if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) - return false; - - return true; -} - -template -void DataTypeDecimal::readText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) -{ - UInt32 unread_scale = scale; - if (csv) - readCSVDecimalText(istr, x, precision, unread_scale); - else - readDecimalText(istr, x, precision, unread_scale); - - if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) - throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); -} - -template -void DataTypeDecimal::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - T x; - readText(x, istr); - assert_cast(column).getData().push_back(x); -} - -template -void DataTypeDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const -{ - T x; - readText(x, istr, true); - assert_cast(column).getData().push_back(x); -} - template T DataTypeDecimal::parseFromString(const String & str) const { @@ -109,6 +61,12 @@ T DataTypeDecimal::parseFromString(const String & str) const return x; } +template +SerializationPtr DataTypeDecimal::doGetDefaultSerialization() const +{ + return std::make_shared>(this->precision, this->scale); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h index 2b708b53be0..5aeac78b2ef 100644 --- a/src/DataTypes/DataTypesDecimal.h +++ b/src/DataTypes/DataTypesDecimal.h @@ -42,17 +42,9 @@ public: bool canBePromoted() const override { return true; } DataTypePtr promoteNumericType() const override; - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - bool equals(const IDataType & rhs) const override; - T parseFromString(const String & str) const; - void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } - - static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); - static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); + SerializationPtr doGetDefaultSerialization() const override; }; template diff --git a/src/DataTypes/DataTypesNumber.h b/src/DataTypes/DataTypesNumber.h index 0ec655bde50..79272756465 100644 --- a/src/DataTypes/DataTypesNumber.h +++ b/src/DataTypes/DataTypesNumber.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -25,6 +26,11 @@ class DataTypeNumber final : public DataTypeNumberBase using PromotedType = DataTypeNumber>; return std::make_shared(); } + + SerializationPtr doGetDefaultSerialization() const override + { + return std::make_shared>(); + } }; using DataTypeUInt8 = DataTypeNumber; diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp new file mode 100644 index 00000000000..d4ca7e4dfdd --- /dev/null +++ b/src/DataTypes/EnumValues.cpp @@ -0,0 +1,77 @@ +#include + +namespace DB +{ + +namespace 
ErrorCodes +{ + extern const int SYNTAX_ERROR; + extern const int EMPTY_DATA_PASSED; + extern const int BAD_ARGUMENTS; +} + +template +EnumValues::EnumValues(const Values & values_) + : values(values_) +{ + if (values.empty()) + throw Exception{"DataTypeEnum enumeration cannot be empty", ErrorCodes::EMPTY_DATA_PASSED}; + + std::sort(std::begin(values), std::end(values), [] (auto & left, auto & right) + { + return left.second < right.second; + }); + + fillMaps(); +} + +template +void EnumValues::fillMaps() +{ + for (const auto & name_and_value : values) + { + const auto inserted_value = name_to_value_map.insert( + { StringRef{name_and_value.first}, name_and_value.second }); + + if (!inserted_value.second) + throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) + + " and " + toString(inserted_value.first->getMapped()), + ErrorCodes::SYNTAX_ERROR}; + + const auto inserted_name = value_to_name_map.insert( + { name_and_value.second, StringRef{name_and_value.first} }); + + if (!inserted_name.second) + throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) + + " and '" + toString((*inserted_name.first).first) + "'", + ErrorCodes::SYNTAX_ERROR}; + } +} + +template +T EnumValues::getValue(StringRef field_name, bool try_treat_as_id) const +{ + const auto it = name_to_value_map.find(field_name); + if (!it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names, we will try to treat it as enum id. + if (try_treat_as_id) + { + T x; + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + readText(x, tmp_buf); + /// Check if we reached end of the tmp_buf (otherwise field_name is not a number) + /// and try to find it in enum ids + if (tmp_buf.eof() && value_to_name_map.find(x) != value_to_name_map.end()) + return x; + } + throw Exception{"Unknown element '" + field_name.toString() + "' for enum", ErrorCodes::BAD_ARGUMENTS}; + } + return it->getMapped(); +} + +template class EnumValues; +template class EnumValues; + +} diff --git a/src/DataTypes/EnumValues.h b/src/DataTypes/EnumValues.h new file mode 100644 index 00000000000..45ac30f9cd7 --- /dev/null +++ b/src/DataTypes/EnumValues.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +template +class EnumValues +{ +public: + using Value = std::pair; + using Values = std::vector; + using NameToValueMap = HashMap; + using ValueToNameMap = std::unordered_map; + +private: + Values values; + NameToValueMap name_to_value_map; + ValueToNameMap value_to_name_map; + + void fillMaps(); + +public: + EnumValues(const Values & values_); + + const Values & getValues() const { return values; } + + auto findByValue(const T & value) const + { + const auto it = value_to_name_map.find(value); + if (it == std::end(value_to_name_map)) + throw Exception{"Unexpected value " + toString(value) + " in enum", ErrorCodes::BAD_ARGUMENTS}; + + return it; + } + + const StringRef & getNameForValue(const T & value) const + { + return findByValue(value)->second; + } + + T getValue(StringRef field_name, bool try_treat_as_id = false) const; + + template + bool containsAll(const TValues & rhs_values) const + { + auto check = [&](const auto & value) + { + auto it = name_to_value_map.find(value.first); + /// If we don't have this name, than we have to be sure, + /// that this value exists in enum + if (it == name_to_value_map.end()) + 
return value_to_name_map.count(value.second) > 0; + + /// If we have this name, than it should have the same value + return it->value.second == value.second; + }; + + return std::all_of(rhs_values.begin(), rhs_values.end(), check); + } +}; + +} + diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 5582a8698e0..c0679557ec9 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -18,54 +19,11 @@ namespace DB namespace ErrorCodes { - extern const int MULTIPLE_STREAMS_REQUIRED; extern const int LOGICAL_ERROR; extern const int DATA_TYPE_CANNOT_BE_PROMOTED; extern const int ILLEGAL_COLUMN; } -String IDataType::Substream::toString() const -{ - switch (type) - { - case ArrayElements: - return "ArrayElements"; - case ArraySizes: - return "ArraySizes"; - case NullableElements: - return "NullableElements"; - case NullMap: - return "NullMap"; - case TupleElement: - return "TupleElement(" + tuple_element_name + ", " - + std::to_string(escape_tuple_delimiter) + ")"; - case DictionaryKeys: - return "DictionaryKeys"; - case DictionaryIndexes: - return "DictionaryIndexes"; - } - - __builtin_unreachable(); -} - -String IDataType::SubstreamPath::toString() const -{ - WriteBufferFromOwnString wb; - wb << "{"; - for (size_t i = 0; i < size(); ++i) - { - if (i != 0) - wb << ", "; - wb << at(i).toString(); - } - wb << "}"; - return wb.str(); -} - -IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr), custom_streams(nullptr) -{ -} - IDataType::~IDataType() = default; String IDataType::getName() const @@ -119,21 +77,19 @@ DataTypePtr IDataType::promoteNumericType() const throw Exception("Data type " + getName() + " can't be promoted.", ErrorCodes::DATA_TYPE_CANNOT_BE_PROMOTED); } -void IDataType::serializeBinaryBulk(const IColumn &, WriteBuffer &, size_t, size_t) const -{ - throw Exception("Data type " + getName() + " must be serialized with multiple streams", ErrorCodes::MULTIPLE_STREAMS_REQUIRED); -} - -void IDataType::deserializeBinaryBulk(IColumn &, ReadBuffer &, size_t, double) const -{ - throw Exception("Data type " + getName() + " must be deserialized with multiple streams", ErrorCodes::MULTIPLE_STREAMS_REQUIRED); -} - size_t IDataType::getSizeOfValueInMemory() const { throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR); } +DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const +{ + if (subcolumn_name == MAIN_SUBCOLUMN_NAME) + return shared_from_this(); + + return nullptr; +} + DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { auto subcolumn_type = tryGetSubcolumnType(subcolumn_name); @@ -151,14 +107,14 @@ ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const IColumn & Names IDataType::getSubcolumnNames() const { NameSet res; - enumerateStreams([&res, this](const SubstreamPath & substream_path, const IDataType & /* substream_type */) + getDefaultSerialization()->enumerateStreams([&res, this](const ISerialization::SubstreamPath & substream_path) { - SubstreamPath new_path; + ISerialization::SubstreamPath new_path; /// Iterate over path to try to get intermediate subcolumns for complex nested types. 
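The EnumValues class introduced above maintains two maps (name to value and value to name), rejects duplicates, and can fall back to treating an unknown name as a numeric id for CSV/TSV input. A compact sketch of the same idea, not part of this patch; std::map replaces ClickHouse's HashMap and Int8 stands in for the template parameter:

#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

class ToyEnumValues
{
public:
    explicit ToyEnumValues(const std::vector<std::pair<std::string, int8_t>> & values)
    {
        if (values.empty())
            throw std::runtime_error("enumeration cannot be empty");

        for (const auto & [name, value] : values)
        {
            if (!name_to_value.emplace(name, value).second)
                throw std::runtime_error("duplicate name in enum: " + name);
            if (!value_to_name.emplace(value, name).second)
                throw std::runtime_error("duplicate value in enum: " + std::to_string(value));
        }
    }

    int8_t getValue(const std::string & field, bool try_treat_as_id = false) const
    {
        auto it = name_to_value.find(field);
        if (it != name_to_value.end())
            return it->second;

        if (try_treat_as_id)
        {
            try
            {
                size_t consumed = 0;
                int parsed = std::stoi(field, &consumed);
                /// Accept only if the whole string is a number and it is a known id.
                if (consumed == field.size() && parsed >= -128 && parsed <= 127
                    && value_to_name.count(static_cast<int8_t>(parsed)))
                    return static_cast<int8_t>(parsed);
            }
            catch (const std::exception &) {}  /// not a number, fall through
        }
        throw std::runtime_error("unknown element '" + field + "' for enum");
    }

private:
    std::map<std::string, int8_t> name_to_value;
    std::map<int8_t, std::string> value_to_name;
};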
for (const auto & elem : substream_path) { new_path.push_back(elem); - auto subcolumn_name = getSubcolumnNameForStream(new_path); + auto subcolumn_name = ISerialization::getSubcolumnNameForStream(new_path); if (!subcolumn_name.empty() && tryGetSubcolumnType(subcolumn_name)) res.insert(subcolumn_name); } @@ -167,287 +123,72 @@ Names IDataType::getSubcolumnNames() const return Names(std::make_move_iterator(res.begin()), std::make_move_iterator(res.end())); } -static String getNameForSubstreamPath( - String stream_name, - const IDataType::SubstreamPath & path, - bool escape_tuple_delimiter) -{ - size_t array_level = 0; - for (const auto & elem : path) - { - if (elem.type == IDataType::Substream::NullMap) - stream_name += ".null"; - else if (elem.type == IDataType::Substream::ArraySizes) - stream_name += ".size" + toString(array_level); - else if (elem.type == IDataType::Substream::ArrayElements) - ++array_level; - else if (elem.type == IDataType::Substream::DictionaryKeys) - stream_name += ".dict"; - else if (elem.type == IDataType::Substream::TupleElement) - { - /// For compatibility reasons, we use %2E (escaped dot) instead of dot. - /// Because nested data may be represented not by Array of Tuple, - /// but by separate Array columns with names in a form of a.b, - /// and name is encoded as a whole. - stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ? - escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name); - } - } - - return stream_name; -} - -String IDataType::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path) -{ - auto name_in_storage = column.getNameInStorage(); - auto nested_storage_name = Nested::extractTableName(name_in_storage); - - if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes)) - name_in_storage = nested_storage_name; - - auto stream_name = escapeForFileName(name_in_storage); - return getNameForSubstreamPath(std::move(stream_name), path, true); -} - -String IDataType::getSubcolumnNameForStream(const SubstreamPath & path) -{ - auto subcolumn_name = getNameForSubstreamPath("", path, false); - if (!subcolumn_name.empty()) - subcolumn_name = subcolumn_name.substr(1); // It starts with a dot. 
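The removed getNameForSubstreamPath above maps a substream path to a stream name: null maps get ".null", array sizes get ".size<level>", dictionary keys get ".dict", and tuple elements append "." plus the element name. A simplified sketch of that mapping, not part of this patch; file-name escaping of dots is omitted:

#include <iostream>
#include <string>
#include <vector>

enum class Substream { ArrayElements, ArraySizes, NullMap, TupleElement, DictionaryKeys };

struct PathElem
{
    Substream type;
    std::string tuple_element_name;  /// only used for TupleElement
};

std::string streamNameForPath(std::string stream_name, const std::vector<PathElem> & path)
{
    size_t array_level = 0;
    for (const auto & elem : path)
    {
        if (elem.type == Substream::NullMap)
            stream_name += ".null";
        else if (elem.type == Substream::ArraySizes)
            stream_name += ".size" + std::to_string(array_level);
        else if (elem.type == Substream::ArrayElements)
            ++array_level;                      /// elements themselves add no suffix
        else if (elem.type == Substream::DictionaryKeys)
            stream_name += ".dict";
        else if (elem.type == Substream::TupleElement)
            stream_name += "." + elem.tuple_element_name;
    }
    return stream_name;
}

int main()
{
    /// For an Array(Array(UInt32)) column "arr", sizes of the outer and inner
    /// arrays live in separate streams and the elements in a third one.
    std::cout << streamNameForPath("arr", {{Substream::ArraySizes, ""}}) << '\n';      /// arr.size0
    std::cout << streamNameForPath("arr", {{Substream::ArrayElements, ""},
                                           {Substream::ArraySizes, ""}}) << '\n';      /// arr.size1
    std::cout << streamNameForPath("arr", {{Substream::ArrayElements, ""},
                                           {Substream::ArrayElements, ""}}) << '\n';   /// arr
}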
- - return subcolumn_name; -} - -bool IDataType::isSpecialCompressionAllowed(const SubstreamPath & path) -{ - for (const Substream & elem : path) - { - if (elem.type == Substream::NullMap - || elem.type == Substream::ArraySizes - || elem.type == Substream::DictionaryIndexes) - return false; - } - return true; -} - void IDataType::insertDefaultInto(IColumn & column) const { column.insertDefault(); } -void IDataType::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const -{ - if (custom_streams) - custom_streams->enumerateStreams(callback, path); - else - enumerateStreamsImpl(callback, path); -} - -void IDataType::serializeBinaryBulkStatePrefix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->serializeBinaryBulkStatePrefix(settings, state); - else - serializeBinaryBulkStatePrefixImpl(settings, state); -} - -void IDataType::serializeBinaryBulkStateSuffix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->serializeBinaryBulkStateSuffix(settings, state); - else - serializeBinaryBulkStateSuffixImpl(settings, state); -} - -void IDataType::deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->deserializeBinaryBulkStatePrefix(settings, state); - else - deserializeBinaryBulkStatePrefixImpl(settings, state); -} - -void IDataType::serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const -{ - if (custom_streams) - custom_streams->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); - else - serializeBinaryBulkWithMultipleStreamsImpl(column, offset, limit, settings, state); -} - -void IDataType::deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & /* state */, - SubstreamsCache * /* cache */) const -{ - if (ReadBuffer * stream = settings.getter(settings.path)) - deserializeBinaryBulk(column, *stream, limit, settings.avg_value_size_hint); -} - - -void IDataType::deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const -{ - if (custom_streams) - { - custom_streams->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); - return; - } - - /// Do not cache complex type, because they can be constructed - /// from their subcolumns, which are in cache. 
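The substreams cache mentioned in the comment above (and used by addToSubstreamsCache/getFromSubstreamsCache further down) memoizes columns read for leaf streams by stream name, so reading a parent column and one of its subcolumns in the same query does not decode the same data twice. A minimal sketch of that idea, not part of this patch; the key and column types are simplified stand-ins:

#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using ColumnPtr = std::shared_ptr<const std::vector<int>>;
using SubstreamsCache = std::unordered_map<std::string, ColumnPtr>;

void addToSubstreamsCache(SubstreamsCache * cache, const std::string & stream_name, ColumnPtr column)
{
    if (cache && !stream_name.empty())
        cache->emplace(stream_name, std::move(column));
}

ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const std::string & stream_name)
{
    if (!cache || stream_name.empty())
        return nullptr;
    auto it = cache->find(stream_name);
    return it == cache->end() ? nullptr : it->second;
}

/// Read a stream through the cache: reuse a previously decoded column if present.
ColumnPtr readColumnCached(SubstreamsCache * cache, const std::string & stream_name,
                           const std::function<ColumnPtr()> & read_from_disk)
{
    if (auto cached = getFromSubstreamsCache(cache, stream_name))
        return cached;

    auto column = read_from_disk();
    addToSubstreamsCache(cache, stream_name, column);
    return column;
}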
- if (!haveSubtypes()) - { - auto cached_column = getFromSubstreamsCache(cache, settings.path); - if (cached_column) - { - column = cached_column; - return; - } - } - - auto mutable_column = column->assumeMutable(); - deserializeBinaryBulkWithMultipleStreamsImpl(*mutable_column, limit, settings, state, cache); - column = std::move(mutable_column); - - if (!haveSubtypes()) - addToSubstreamsCache(cache, settings.path, column); -} - -void IDataType::serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeTextEscaped(column, row_num, ostr, settings); - else - serializeTextEscaped(column, row_num, ostr, settings); -} - -void IDataType::deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->deserializeTextEscaped(column, istr, settings); - else - deserializeTextEscaped(column, istr, settings); -} - -void IDataType::serializeAsTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeTextQuoted(column, row_num, ostr, settings); - else - serializeTextQuoted(column, row_num, ostr, settings); -} - -void IDataType::deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->deserializeTextQuoted(column, istr, settings); - else - deserializeTextQuoted(column, istr, settings); -} - -void IDataType::serializeAsTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeTextCSV(column, row_num, ostr, settings); - else - serializeTextCSV(column, row_num, ostr, settings); -} - -void IDataType::deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->deserializeTextCSV(column, istr, settings); - else - deserializeTextCSV(column, istr, settings); -} - -void IDataType::serializeAsText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeText(column, row_num, ostr, settings); - else - serializeText(column, row_num, ostr, settings); -} - -void IDataType::deserializeAsWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->deserializeWholeText(column, istr, settings); - else - deserializeWholeText(column, istr, settings); -} - -void IDataType::serializeAsTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeTextJSON(column, row_num, ostr, settings); - else - serializeTextJSON(column, row_num, ostr, settings); -} - -void IDataType::deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->deserializeTextJSON(column, istr, settings); - else - deserializeTextJSON(column, istr, settings); -} - -void IDataType::serializeAsTextXML(const IColumn & column, size_t row_num, WriteBuffer & 
ostr, const FormatSettings & settings) const -{ - if (custom_text_serialization) - custom_text_serialization->serializeTextXML(column, row_num, ostr, settings); - else - serializeTextXML(column, row_num, ostr, settings); -} - void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const { /// replace only if not null if (custom_desc_->name) custom_name = std::move(custom_desc_->name); - if (custom_desc_->text_serialization) - custom_text_serialization = std::move(custom_desc_->text_serialization); - - if (custom_desc_->streams) - custom_streams = std::move(custom_desc_->streams); + if (custom_desc_->serialization) + custom_serialization = std::move(custom_desc_->serialization); } -void IDataType::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column) +SerializationPtr IDataType::getDefaultSerialization() const { - if (cache && !path.empty()) - cache->emplace(getSubcolumnNameForStream(path), column); + if (custom_serialization) + return custom_serialization; + + return doGetDefaultSerialization(); } -ColumnPtr IDataType::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path) +SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const BaseSerializationGetter &) const { - if (!cache || path.empty()) - return nullptr; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +} - auto it = cache->find(getSubcolumnNameForStream(path)); - if (it == cache->end()) - return nullptr; +// static +SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const IDataType::StreamExistenceCallback & callback) +{ + if (column.isSubcolumn()) + { + /// Wrap to custom serialization deepest subcolumn, which is represented in non-complex type. + auto base_serialization_getter = [&](const IDataType & subcolumn_type) + { + return subcolumn_type.getSerialization(column.name, callback); + }; - return it->second; + auto type_in_storage = column.getTypeInStorage(); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), base_serialization_getter); + } + + return column.type->getSerialization(column.name, callback); +} + +SerializationPtr IDataType::getSerialization(const String &, const StreamExistenceCallback &) const +{ + return getDefaultSerialization(); +} + +DataTypePtr IDataType::getTypeForSubstream(const ISerialization::SubstreamPath & substream_path) const +{ + auto type = tryGetSubcolumnType(ISerialization::getSubcolumnNameForStream(substream_path)); + if (type) + return type->getSubcolumnType(MAIN_SUBCOLUMN_NAME); + + return getSubcolumnType(MAIN_SUBCOLUMN_NAME); +} + +void IDataType::enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath & path) const +{ + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + { + callback(substream_path, *getTypeForSubstream(substream_path)); + }, path); } } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index c9c848a8037..9b762cfa4c5 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace DB @@ -27,19 +27,25 @@ using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; struct NameAndTypePair; +class SerializationInfo; /** Properties of data type. - * Contains methods for serialization/deserialization. 
+ * + * Contains methods for getting serialization instances. + * One data type may have different serializations, which can be chosen + * dynamically before reading or writing, according to information about + * column content (see `getSerialization` methods). + * * Implementations of this interface represent a data type (example: UInt8) * or parametric family of data types (example: Array(...)). * * DataType is totally immutable object. You can always share them. */ -class IDataType : private boost::noncopyable +class IDataType : private boost::noncopyable, public std::enable_shared_from_this { public: - IDataType(); + IDataType() = default; virtual ~IDataType(); /// Compile time flag. If false, then if C++ types are the same, then SQL types are also the same. @@ -57,275 +63,47 @@ public: /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; - /** Binary serialization for range of values in column - for writing to disk/network, etc. - * - * Some data types are represented in multiple streams while being serialized. - * Example: - * - Arrays are represented as stream of all elements and stream of array sizes. - * - Nullable types are represented as stream of values (with unspecified values in place of NULLs) and stream of NULL flags. - * - * Different streams are identified by "path". - * If the data type require single stream (it's true for most of data types), the stream will have empty path. - * Otherwise, the path can have components like "array elements", "array sizes", etc. - * - * For multidimensional arrays, path can have arbitrary length. - * As an example, for 2-dimensional arrays of numbers we have at least three streams: - * - array sizes; (sizes of top level arrays) - * - array elements / array sizes; (sizes of second level (nested) arrays) - * - array elements / array elements; (the most deep elements, placed contiguously) - * - * Descendants must override either serializeBinaryBulk, deserializeBinaryBulk methods (for simple cases with single stream) - * or serializeBinaryBulkWithMultipleStreams, deserializeBinaryBulkWithMultipleStreams, enumerateStreams methods (for cases with multiple streams). - * - * Default implementations of ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for single stream. - */ - - struct Substream - { - enum Type - { - ArrayElements, - ArraySizes, - - NullableElements, - NullMap, - - TupleElement, - - DictionaryKeys, - DictionaryIndexes, - }; - Type type; - - /// Index of tuple element, starting at 1 or name. - String tuple_element_name; - - /// Do we need to escape a dot in filenames for tuple elements. - bool escape_tuple_delimiter = true; - - Substream(Type type_) : type(type_) {} - - String toString() const; - }; - - struct SubstreamPath : public std::vector - { - String toString() const; - }; - - /// Cache for common substreams of one type, but possible different its subcolumns. - /// E.g. sizes of arrays of Nested data type. 
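
/// Illustration only (not part of the patch): the caller-side view of the API change above.
/// Code that previously went through the IDataType::serializeAsText* helpers now asks the
/// type for a serialization object first. A minimal sketch, assuming `type`, `column` and
/// `row` are supplied by the caller and default FormatSettings are acceptable:

#include <string>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <Formats/FormatSettings.h>
#include <IO/WriteBufferFromString.h>

static std::string formatValue(const DB::DataTypePtr & type, const DB::IColumn & column, size_t row)
{
    DB::WriteBufferFromOwnString out;
    DB::FormatSettings settings;

    /// Before this patch: type->serializeAsText(column, row, out, settings);
    DB::SerializationPtr serialization = type->getDefaultSerialization();
    serialization->serializeText(column, row, out, settings);

    return out.str();
}
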
- using SubstreamsCache = std::unordered_map; - - using StreamCallback = std::function; - - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } - void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } - - virtual DataTypePtr tryGetSubcolumnType(const String & /* subcolumn_name */) const { return nullptr; } + static constexpr auto MAIN_SUBCOLUMN_NAME = "__main"; + virtual DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const; DataTypePtr getSubcolumnType(const String & subcolumn_name) const; virtual ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const; Names getSubcolumnNames() const; - using OutputStreamGetter = std::function; - using InputStreamGetter = std::function; + /// Returns default serialization of data type. + SerializationPtr getDefaultSerialization() const; - struct SerializeBinaryBulkState - { - virtual ~SerializeBinaryBulkState() = default; - }; - struct DeserializeBinaryBulkState - { - virtual ~DeserializeBinaryBulkState() = default; - }; + /// Asks whether the stream with given name exists in table. + /// If callback returned true for all streams, which are required for + /// one of serialization types, that serialization will be chosen for reading. + /// If callback always returned false, the default serialization will be chosen. + using StreamExistenceCallback = std::function; + using BaseSerializationGetter = std::function; - using SerializeBinaryBulkStatePtr = std::shared_ptr; - using DeserializeBinaryBulkStatePtr = std::shared_ptr; + /// Chooses serialization for reading of one column or subcolumns by + /// checking existence of substreams using callback. + static SerializationPtr getSerialization( + const NameAndTypePair & column, + const StreamExistenceCallback & callback = [](const String &) { return false; }); - struct SerializeBinaryBulkSettings - { - OutputStreamGetter getter; - SubstreamPath path; + virtual SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const; - size_t low_cardinality_max_dictionary_size = 0; - bool low_cardinality_use_single_dictionary_for_part = true; + /// Returns serialization wrapper for reading one particular subcolumn of data type. + virtual SerializationPtr getSubcolumnSerialization( + const String & subcolumn_name, const BaseSerializationGetter & base_serialization_getter) const; - bool position_independent_encoding = true; - }; + using StreamCallbackWithType = std::function; - struct DeserializeBinaryBulkSettings - { - InputStreamGetter getter; - SubstreamPath path; - - /// True if continue reading from previous positions in file. False if made fseek to the start of new granule. - bool continuous_reading = true; - - bool position_independent_encoding = true; - /// If not zero, may be used to avoid reallocations while reading column of String type. - double avg_value_size_hint = 0; - }; - - /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. - void serializeBinaryBulkStatePrefix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const; - - /// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization. 
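
/// Illustration of how the StreamExistenceCallback declared above is meant to be used:
/// the storage layer reports which substream files are physically present, and
/// getSerialization() picks a serialization whose required streams all exist, falling back
/// to the default serialization otherwise. A rough sketch; the file-name set and the
/// ".bin" suffix are assumptions here, the real callback lives in the MergeTree reading code.

#include <string>
#include <unordered_set>
#include <Core/NamesAndTypes.h>
#include <DataTypes/IDataType.h>

static DB::SerializationPtr chooseSerializationForReading(
    const DB::NameAndTypePair & column,
    const std::unordered_set<std::string> & files_in_part)
{
    auto stream_exists = [&](const DB::String & stream_name)
    {
        /// Report whether the substream's data file is present in the data part.
        return files_in_part.count(stream_name + ".bin") != 0;
    };

    return DB::IDataType::getSerialization(column, stream_exists);
}
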
- void serializeBinaryBulkStateSuffix( - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const; - - /// Call before before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. - void deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const; - - /** 'offset' and 'limit' are used to specify range. - * limit = 0 - means no limit. - * offset must be not greater than size of column. - * offset + limit could be greater than size of column - * - in that case, column is serialized till the end. - */ - void serializeBinaryBulkWithMultipleStreams( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const; - - /// Read no more than limit values and append them into column. - void deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache = nullptr) const; - - /** Override these methods for data types that require just single stream (most of data types). - */ - virtual void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const; - virtual void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const; - - /** Serialization/deserialization of individual values. - * - * These are helper methods for implementation of various formats to input/output for user (like CSV, JSON, etc.). - * There is no one-to-one correspondence between formats and these methods. - * For example, TabSeparated and Pretty formats could use same helper method serializeTextEscaped. - * - * For complex data types (like arrays) binary serde for individual values may differ from bulk serde. - * For example, if you serialize single array, it will be represented as its size and elements in single contiguous stream, - * but if you bulk serialize column with arrays, then sizes and elements will be written to separate streams. - */ - - /// There is two variants for binary serde. First variant work with Field. - virtual void serializeBinary(const Field & field, WriteBuffer & ostr) const = 0; - virtual void deserializeBinary(Field & field, ReadBuffer & istr) const = 0; - - /// Other variants takes a column, to avoid creating temporary Field object. - /// Column must be non-constant. - - /// Serialize one value of a column at specified row number. - virtual void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const = 0; - /// Deserialize one value and insert into a column. - /// If method will throw an exception, then column will be in same state as before call to method. - virtual void deserializeBinary(IColumn & column, ReadBuffer & istr) const = 0; - - /** Text serialization with escaping but without quoting. - */ - void serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - void deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization as a literal that may be inserted into a query. - */ - void serializeAsTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - void deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for the CSV format. 
- */ - void serializeAsTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - void deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for displaying on a terminal or saving into a text file, and the like. - * Without escaping or quoting. - */ - void serializeAsText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - - /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. - */ - void deserializeAsWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization intended for using in JSON format. - */ - void serializeAsTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; - void deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; - - /** Text serialization for putting into the XML format. - */ - void serializeAsTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath & path) const; + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback, ISerialization::SubstreamPath && path) const { enumerateStreams(serialization, callback, path); } + void enumerateStreams(const SerializationPtr & serialization, const StreamCallbackWithType & callback) const { enumerateStreams(serialization, callback, {}); } protected: virtual String doGetName() const; + virtual SerializationPtr doGetDefaultSerialization() const = 0; - virtual void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const - { - callback(path, *this); - } - - virtual void serializeBinaryBulkStatePrefixImpl( - SerializeBinaryBulkSettings & /*settings*/, - SerializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void serializeBinaryBulkStateSuffixImpl( - SerializeBinaryBulkSettings & /*settings*/, - SerializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void deserializeBinaryBulkStatePrefixImpl( - DeserializeBinaryBulkSettings & /*settings*/, - DeserializeBinaryBulkStatePtr & /*state*/) const {} - - virtual void serializeBinaryBulkWithMultipleStreamsImpl( - const IColumn & column, - size_t offset, - size_t limit, - SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & /*state*/) const - { - if (WriteBuffer * stream = settings.getter(settings.path)) - serializeBinaryBulk(column, *stream, offset, limit); - } - - virtual void deserializeBinaryBulkWithMultipleStreamsImpl( - IColumn & column, - size_t limit, - DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * cache) const; - - /// Default implementations of text serialization in case of 'custom_text_serialization' is not set. 
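
/// Illustration of the new protected contract: every concrete data type overrides
/// doGetDefaultSerialization(), and container types compose the serializations of their
/// nested types. A sketch only; `makeArraySerialization` is a hypothetical helper, and the
/// real overrides are added in the individual DataType*.cpp files of this patch.

#include <memory>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/SerializationArray.h>

static DB::SerializationPtr makeArraySerialization(const DB::DataTypePtr & nested_type)
{
    /// A DataTypeArray-like type is expected to do essentially this in its override.
    return std::make_shared<DB::SerializationArray>(nested_type->getDefaultSerialization());
}
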
- - virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; - virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; - virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const - { - serializeText(column, row_num, ostr, settings); - } + DataTypePtr getTypeForSubstream(const ISerialization::SubstreamPath & substream_path) const; public: - static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); - static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); - /** Create empty column for corresponding type. */ virtual MutableColumnPtr createColumn() const = 0; @@ -357,7 +135,6 @@ public: /// Checks that two instances belong to the same type virtual bool equals(const IDataType & rhs) const = 0; - /// Various properties on behaviour of data type. /** The data type is dependent on parameters and types with different parameters are different. @@ -483,27 +260,20 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); - static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); - static String getSubcolumnNameForStream(const SubstreamPath & path); - - /// Substream path supports special compression methods like codec Delta. - /// For all other substreams (like ArraySizes, NullMasks, etc.) we use only - /// generic compression codecs like LZ4. - static bool isSpecialCompressionAllowed(const SubstreamPath & path); protected: friend class DataTypeFactory; friend class AggregateFunctionSimpleState; + /// Customize this DataType void setCustomization(DataTypeCustomDescPtr custom_desc_) const; /// This is mutable to allow setting custom name and serialization on `const IDataType` post construction. 
mutable DataTypeCustomNamePtr custom_name; - mutable DataTypeCustomTextSerializationPtr custom_text_serialization; - mutable DataTypeCustomStreamsPtr custom_streams; + mutable SerializationPtr custom_serialization; public: const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } - const IDataTypeCustomStreams * getCustomStreams() const { return custom_streams.get(); } + const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } }; diff --git a/src/DataTypes/IDataTypeDummy.h b/src/DataTypes/IDataTypeDummy.h index 08cc0778a6e..ca522d1c9b4 100644 --- a/src/DataTypes/IDataTypeDummy.h +++ b/src/DataTypes/IDataTypeDummy.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -17,7 +17,7 @@ namespace ErrorCodes * * That is, this class is used just to distinguish the corresponding data type from the others. */ -class IDataTypeDummy : public DataTypeWithSimpleSerialization +class IDataTypeDummy : public IDataType { private: [[noreturn]] void throwNoSerialization() const @@ -26,15 +26,6 @@ private: } public: - void serializeBinary(const Field &, WriteBuffer &) const override { throwNoSerialization(); } - void deserializeBinary(Field &, ReadBuffer &) const override { throwNoSerialization(); } - void serializeBinary(const IColumn &, size_t, WriteBuffer &) const override { throwNoSerialization(); } - void deserializeBinary(IColumn &, ReadBuffer &) const override { throwNoSerialization(); } - void serializeBinaryBulk(const IColumn &, WriteBuffer &, size_t, size_t) const override { throwNoSerialization(); } - void deserializeBinaryBulk(IColumn &, ReadBuffer &, size_t, double) const override { throwNoSerialization(); } - void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } - void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } - MutableColumnPtr createColumn() const override { throw Exception("Method createColumn() is not implemented for data type " + getName(), ErrorCodes::NOT_IMPLEMENTED); @@ -52,6 +43,8 @@ public: bool haveSubtypes() const override { return false; } bool cannotBeStoredInTables() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { throwNoSerialization(); } }; } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp new file mode 100644 index 00000000000..ab2e8e1958b --- /dev/null +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int MULTIPLE_STREAMS_REQUIRED; +} + +String ISerialization::Substream::toString() const +{ + switch (type) + { + case ArrayElements: + return "ArrayElements"; + case ArraySizes: + return "ArraySizes"; + case NullableElements: + return "NullableElements"; + case NullMap: + return "NullMap"; + case TupleElement: + return "TupleElement(" + tuple_element_name + ", " + + std::to_string(escape_tuple_delimiter) + ")"; + case DictionaryKeys: + return "DictionaryKeys"; + case DictionaryIndexes: + return "DictionaryIndexes"; + case SparseElements: + return "SparseElements"; + case SparseOffsets: + return "SparseOffsets"; + } + + __builtin_unreachable(); +} + +String ISerialization::SubstreamPath::toString() const +{ + WriteBufferFromOwnString wb; + wb << "{"; + for (size_t i = 0; i < size(); ++i) + { + if (i != 0) + wb << ", "; + wb << 
at(i).toString(); + } + wb << "}"; + return wb.str(); +} + +void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + callback(path); +} + +void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const +{ + throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be serialized with multiple streams", column.getName()); +} + +void ISerialization::deserializeBinaryBulk(IColumn & column, ReadBuffer &, size_t, double) const +{ + throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be deserialized with multiple streams", column.getName()); +} + +void ISerialization::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & /* state */) const +{ + if (WriteBuffer * stream = settings.getter(settings.path)) + serializeBinaryBulk(column, *stream, offset, limit); +} + +void ISerialization::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & /* state */, + SubstreamsCache * cache) const +{ + auto cached_column = getFromSubstreamsCache(cache, settings.path); + if (cached_column) + { + column = cached_column; + } + else if (ReadBuffer * stream = settings.getter(settings.path)) + { + auto mutable_column = column->assumeMutable(); + deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); + column = std::move(mutable_column); + addToSubstreamsCache(cache, settings.path, column); + } +} + +static String getNameForSubstreamPath( + String stream_name, + const ISerialization::SubstreamPath & path, + bool escape_tuple_delimiter) +{ + using Substream = ISerialization::Substream; + + size_t array_level = 0; + for (const auto & elem : path) + { + if (elem.type == Substream::NullMap) + stream_name += ".null"; + else if (elem.type == Substream::ArraySizes) + stream_name += ".size" + toString(array_level); + else if (elem.type == Substream::ArrayElements) + ++array_level; + else if (elem.type == Substream::DictionaryKeys) + stream_name += ".dict"; + else if (elem.type == Substream::SparseOffsets) + stream_name += ".sparse.idx"; + else if (elem.type == Substream::TupleElement) + { + /// For compatibility reasons, we use %2E (escaped dot) instead of dot. + /// Because nested data may be represented not by Array of Tuple, + /// but by separate Array columns with names in a form of a.b, + /// and name is encoded as a whole. + stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ? 
+ escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name); + } + } + + return stream_name; +} + +String ISerialization::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path) +{ + return getFileNameForStream(column.getNameInStorage(), path); +} + +String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path) +{ + String stream_name; + auto nested_storage_name = Nested::extractTableName(name_in_storage); + if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == ISerialization::Substream::ArraySizes)) + stream_name = escapeForFileName(nested_storage_name); + else + stream_name = escapeForFileName(name_in_storage); + + return getNameForSubstreamPath(std::move(stream_name), path, true); +} + +String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path) +{ + auto subcolumn_name = getNameForSubstreamPath("", path, false); + if (!subcolumn_name.empty()) + subcolumn_name = subcolumn_name.substr(1); // It starts with a dot. + + return subcolumn_name; +} + +void ISerialization::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column) +{ + if (cache && !path.empty()) + cache->emplace(getSubcolumnNameForStream(path), column); +} + +ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path) +{ + if (!cache || path.empty()) + return nullptr; + + auto it = cache->find(getSubcolumnNameForStream(path)); + if (it == cache->end()) + return nullptr; + + return it->second; +} + +bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) +{ + for (const auto & elem : path) + { + if (elem.type == Substream::NullMap + || elem.type == Substream::ArraySizes + || elem.type == Substream::DictionaryIndexes + || elem.type == Substream::SparseOffsets) + return false; + } + return true; +} + +} diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h new file mode 100644 index 00000000000..03785fc07f4 --- /dev/null +++ b/src/DataTypes/Serializations/ISerialization.h @@ -0,0 +1,259 @@ +#pragma once + +#include +#include + +#include +#include + +namespace DB +{ + +class IDataType; + +class ReadBuffer; +class WriteBuffer; +class ProtobufReader; +class ProtobufWriter; + +class IColumn; +using ColumnPtr = COW::Ptr; +using MutableColumnPtr = COW::MutablePtr; + +class Field; + +struct FormatSettings; +struct NameAndTypePair; + +class ISerialization +{ +public: + ISerialization() = default; + virtual ~ISerialization() = default; + + /** Binary serialization for range of values in column - for writing to disk/network, etc. + * + * Some data types are represented in multiple streams while being serialized. + * Example: + * - Arrays are represented as stream of all elements and stream of array sizes. + * - Nullable types are represented as stream of values (with unspecified values in place of NULLs) and stream of NULL flags. + * + * Different streams are identified by "path". + * If the data type require single stream (it's true for most of data types), the stream will have empty path. + * Otherwise, the path can have components like "array elements", "array sizes", etc. + * + * For multidimensional arrays, path can have arbitrary length. 
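
/// A small sketch (not in the patch) that makes the substream paths above concrete:
/// enumerate the streams of a two-level array column named "arr" and print the
/// stream names used for its on-disk files.

#include <iostream>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/ISerialization.h>

static void printArrayStreams()
{
    auto type = DB::DataTypeFactory::instance().get("Array(Array(UInt32))");
    auto serialization = type->getDefaultSerialization();

    serialization->enumerateStreams([&](const DB::ISerialization::SubstreamPath & path)
    {
        /// Expected output: "arr.size0" (top-level sizes), "arr.size1" (nested sizes)
        /// and "arr" (the innermost elements); MergeTree appends ".bin" / ".mrk" itself.
        std::cout << DB::ISerialization::getFileNameForStream("arr", path) << '\n';
    });
}
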
+ * As an example, for 2-dimensional arrays of numbers we have at least three streams: + * - array sizes; (sizes of top level arrays) + * - array elements / array sizes; (sizes of second level (nested) arrays) + * - array elements / array elements; (the most deep elements, placed contiguously) + * + * Descendants must override either serializeBinaryBulk, deserializeBinaryBulk methods (for simple cases with single stream) + * or serializeBinaryBulkWithMultipleStreams, deserializeBinaryBulkWithMultipleStreams, enumerateStreams methods (for cases with multiple streams). + * + * Default implementations of ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for single stream. + */ + + struct Substream + { + enum Type + { + ArrayElements, + ArraySizes, + + NullableElements, + NullMap, + + TupleElement, + + DictionaryKeys, + DictionaryIndexes, + + SparseElements, + SparseOffsets, + }; + Type type; + + /// Index of tuple element, starting at 1 or name. + String tuple_element_name; + + /// Do we need to escape a dot in filenames for tuple elements. + bool escape_tuple_delimiter = true; + + Substream(Type type_) : type(type_) {} + + String toString() const; + }; + + struct SubstreamPath : public std::vector + { + String toString() const; + }; + + /// Cache for common substreams of one type, but possible different its subcolumns. + /// E.g. sizes of arrays of Nested data type. + using SubstreamsCache = std::unordered_map; + + using StreamCallback = std::function; + + virtual void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; + void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } + void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } + + using OutputStreamGetter = std::function; + using InputStreamGetter = std::function; + + struct SerializeBinaryBulkState + { + virtual ~SerializeBinaryBulkState() = default; + }; + + struct DeserializeBinaryBulkState + { + virtual ~DeserializeBinaryBulkState() = default; + }; + + using SerializeBinaryBulkStatePtr = std::shared_ptr; + using DeserializeBinaryBulkStatePtr = std::shared_ptr; + + struct SerializeBinaryBulkSettings + { + OutputStreamGetter getter; + SubstreamPath path; + + size_t low_cardinality_max_dictionary_size = 0; + bool low_cardinality_use_single_dictionary_for_part = true; + + bool position_independent_encoding = true; + }; + + struct DeserializeBinaryBulkSettings + { + InputStreamGetter getter; + SubstreamPath path; + + /// True if continue reading from previous positions in file. False if made fseek to the start of new granule. + bool continuous_reading = true; + + bool position_independent_encoding = true; + /// If not zero, may be used to avoid reallocations while reading column of String type. + double avg_value_size_hint = 0; + }; + + /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. + virtual void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & /*settings*/, + SerializeBinaryBulkStatePtr & /*state*/) const {} + + /// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization. + virtual void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & /*settings*/, + SerializeBinaryBulkStatePtr & /*state*/) const {} + + /// Call before before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. 
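
/// Sketch of the call sequence that these settings/state structures support when writing a
/// whole column. Everything is routed to a single buffer here for brevity; real writers
/// map each SubstreamPath to its own stream via the getter.

#include <DataTypes/Serializations/ISerialization.h>
#include <IO/WriteBuffer.h>

static void writeWholeColumn(const DB::SerializationPtr & serialization, const DB::IColumn & column, DB::WriteBuffer & out)
{
    DB::ISerialization::SerializeBinaryBulkSettings settings;
    settings.getter = [&](const DB::ISerialization::SubstreamPath &) { return &out; };

    DB::ISerialization::SerializeBinaryBulkStatePtr state;
    serialization->serializeBinaryBulkStatePrefix(settings, state);
    /// offset = 0, limit = 0: serialize the column up to its end.
    serialization->serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, state);
    serialization->serializeBinaryBulkStateSuffix(settings, state);
}
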
+ virtual void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & /*settings*/, + DeserializeBinaryBulkStatePtr & /*state*/) const {} + + /** 'offset' and 'limit' are used to specify range. + * limit = 0 - means no limit. + * offset must be not greater than size of column. + * offset + limit could be greater than size of column + * - in that case, column is serialized till the end. + */ + virtual void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const; + + /// Read no more than limit values and append them into column. + virtual void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const; + + /** Override these methods for data types that require just single stream (most of data types). + */ + virtual void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const; + virtual void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const; + + /** Serialization/deserialization of individual values. + * + * These are helper methods for implementation of various formats to input/output for user (like CSV, JSON, etc.). + * There is no one-to-one correspondence between formats and these methods. + * For example, TabSeparated and Pretty formats could use same helper method serializeTextEscaped. + * + * For complex data types (like arrays) binary serde for individual values may differ from bulk serde. + * For example, if you serialize single array, it will be represented as its size and elements in single contiguous stream, + * but if you bulk serialize column with arrays, then sizes and elements will be written to separate streams. + */ + + /// There is two variants for binary serde. First variant work with Field. + virtual void serializeBinary(const Field & field, WriteBuffer & ostr) const = 0; + virtual void deserializeBinary(Field & field, ReadBuffer & istr) const = 0; + + /// Other variants takes a column, to avoid creating temporary Field object. + /// Column must be non-constant. + + /// Serialize one value of a column at specified row number. + virtual void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const = 0; + /// Deserialize one value and insert into a column. + /// If method will throw an exception, then column will be in same state as before call to method. + virtual void deserializeBinary(IColumn & column, ReadBuffer & istr) const = 0; + + /** Text serialization with escaping but without quoting. + */ + virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + + virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization as a literal that may be inserted into a query. + */ + virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + + virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization for the CSV format. 
+ */ + virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization for displaying on a terminal or saving into a text file, and the like. + * Without escaping or quoting. + */ + virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + + /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. + */ + virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization intended for using in JSON format. + */ + virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; + virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + + /** Text serialization for putting into the XML format. + */ + virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const + { + serializeText(column, row_num, ostr, settings); + } + + static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); + static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path); + static String getSubcolumnNameForStream(const SubstreamPath & path); + + static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); + static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); + + static bool isSpecialCompressionAllowed(const SubstreamPath & path); +}; + +using SerializationPtr = std::shared_ptr; +using Serializations = std::vector; + +} diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp new file mode 100644 index 00000000000..e0bcb65d895 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -0,0 +1,221 @@ +#include + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +void SerializationAggregateFunction::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const String & s = get(field); + writeVarUInt(s.size(), ostr); + writeString(s, ostr); +} + +void SerializationAggregateFunction::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + UInt64 size; + readVarUInt(size, istr); + field = String(); + String & s = get(field); + s.resize(size); + istr.readStrict(s.data(), size); +} + +void SerializationAggregateFunction::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + function->serialize(assert_cast(column).getData()[row_num], ostr); +} + +void SerializationAggregateFunction::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnAggregateFunction & column_concrete = assert_cast(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + try + { + function->deserialize(place, istr, &arena); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnAggregateFunction & real_column = typeid_cast(column); + const ColumnAggregateFunction::Container & vec = real_column.getData(); + + ColumnAggregateFunction::Container::const_iterator it = vec.begin() + offset; + ColumnAggregateFunction::Container::const_iterator end = limit ? it + limit : vec.end(); + + if (end > vec.end()) + end = vec.end(); + + for (; it != end; ++it) + function->serialize(*it, ostr); +} + +void SerializationAggregateFunction::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnAggregateFunction & real_column = typeid_cast(column); + ColumnAggregateFunction::Container & vec = real_column.getData(); + + Arena & arena = real_column.createOrGetArena(); + real_column.set(function); + vec.reserve(vec.size() + limit); + + size_t size_of_state = function->sizeOfData(); + size_t align_of_state = function->alignOfData(); + + for (size_t i = 0; i < limit; ++i) + { + if (istr.eof()) + break; + + AggregateDataPtr place = arena.alignedAlloc(size_of_state, align_of_state); + + function->create(place); + + try + { + function->deserialize(place, istr, &arena); + } + catch (...) + { + function->destroy(place); + throw; + } + + vec.push_back(place); + } +} + +static String serializeToString(const AggregateFunctionPtr & function, const IColumn & column, size_t row_num) +{ + WriteBufferFromOwnString buffer; + function->serialize(assert_cast(column).getData()[row_num], buffer); + return buffer.str(); +} + +static void deserializeFromString(const AggregateFunctionPtr & function, IColumn & column, const String & s) +{ + ColumnAggregateFunction & column_concrete = assert_cast(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + + try + { + ReadBufferFromString istr(s); + function->deserialize(place, istr, &arena); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readEscapedString(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readQuotedStringWithSQLStyle(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readStringUntilEOF(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(function, column, row_num), ostr, settings); +} + + +void SerializationAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readJSONString(s, istr); + deserializeFromString(function, column, s); +} + + +void SerializationAggregateFunction::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSV(serializeToString(function, column, row_num), ostr); +} + + +void SerializationAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + deserializeFromString(function, column, s); +} + +} diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.h b/src/DataTypes/Serializations/SerializationAggregateFunction.h new file mode 100644 index 00000000000..58a7d52ffe7 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.h @@ -0,0 +1,43 @@ +#pragma once + +#include + +#include + + +namespace DB +{ + +class SerializationAggregateFunction final : public ISerialization +{ +private: + AggregateFunctionPtr function; + +public: + static constexpr bool is_parametric = true; + + SerializationAggregateFunction(const AggregateFunctionPtr & function_): function(function_) {} + + /// NOTE These two functions for serializing single values are incompatible with the functions below. 
+ void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp new file mode 100644 index 00000000000..70a72c51e78 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -0,0 +1,507 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_READ_ARRAY_FROM_TEXT; + extern const int LOGICAL_ERROR; +} + +void SerializationArray::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const Array & a = get(field); + writeVarUInt(a.size(), ostr); + for (size_t i = 0; i < a.size(); ++i) + { + nested->serializeBinary(a[i], ostr); + } +} + + +void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + size_t size; + readVarUInt(size, istr); + field = Array(size); + Array & arr = get(field); + for (size_t i = 0; i < size; ++i) + nested->deserializeBinary(arr[i], istr); +} + + +void SerializationArray::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + size_t size = next_offset - offset; + + writeVarUInt(size, ostr); + + const IColumn & nested_column = column_array.getData(); + for (size_t i = offset; i < next_offset; ++i) + nested->serializeBinary(nested_column, i, ostr); +} + + +void SerializationArray::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ 
+ ColumnArray & column_array = assert_cast(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t size; + readVarUInt(size, istr); + + IColumn & nested_column = column_array.getData(); + + size_t i = 0; + try + { + for (; i < size; ++i) + nested->deserializeBinary(nested_column, istr); + } + catch (...) + { + if (i) + nested_column.popBack(i); + throw; + } + + offsets.push_back(offsets.back() + size); +} + + +namespace +{ + void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) + { + const ColumnArray & column_array = typeid_cast(column); + const ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t size = offset_values.size(); + + if (!size) + return; + + size_t end = limit && (offset + limit < size) + ? offset + limit + : size; + + ColumnArray::Offset prev_offset = offset_values[offset - 1]; + for (size_t i = offset; i < end; ++i) + { + ColumnArray::Offset current_offset = offset_values[i]; + writeIntBinary(current_offset - prev_offset, ostr); + prev_offset = current_offset; + } + } + + void deserializeArraySizesPositionIndependent(ColumnArray & column_array, ReadBuffer & istr, UInt64 limit) + { + ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t initial_size = offset_values.size(); + offset_values.resize(initial_size + limit); + + size_t i = initial_size; + ColumnArray::Offset current_offset = initial_size ? offset_values[initial_size - 1] : 0; + while (i < initial_size + limit && !istr.eof()) + { + ColumnArray::Offset current_size = 0; + readIntBinary(current_size, istr); + current_offset += current_size; + offset_values[i] = current_offset; + ++i; + } + + offset_values.resize(i); + } + + ColumnPtr arraySizesToOffsets(const IColumn & column) + { + const auto & column_sizes = assert_cast(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + + if (column_sizes.empty()) + return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; + } + + return column_offsets; + } +} + +ColumnPtr arrayOffsetsToSizes(const IColumn & column) +{ + const auto & column_offsets = assert_cast(column); + MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); + + if (column_offsets.empty()) + return column_sizes; + + const auto & offsets_data = column_offsets.getData(); + auto & sizes_data = assert_cast(*column_sizes).getData(); + + sizes_data.resize(offsets_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + { + auto current_offset = offsets_data[i]; + sizes_data[i] = current_offset - prev_offset; + prev_offset = current_offset; + } + + return column_sizes; +} + + +void SerializationArray::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + path.push_back(Substream::ArraySizes); + callback(path); + path.back() = Substream::ArrayElements; + nested->enumerateStreams(callback, path); + path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + nested->serializeBinaryBulkStatePrefix(settings, state); + 
settings.path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + nested->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnArray & column_array = typeid_cast(column); + + /// First serialize array sizes. + settings.path.push_back(Substream::ArraySizes); + if (auto * stream = settings.getter(settings.path)) + { + if (settings.position_independent_encoding) + serializeArraySizesPositionIndependent(column, *stream, offset, limit); + else + SerializationNumber().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit); + } + + /// Then serialize contents of arrays. + settings.path.back() = Substream::ArrayElements; + const ColumnArray::Offsets & offset_values = column_array.getOffsets(); + + if (offset > offset_values.size()) + return; + + /** offset - from which array to write. + * limit - how many arrays should be written, or 0, if you write everything that is. + * end - up to which array the recorded piece ends. + * + * nested_offset - from which element of the innards to write. + * nested_limit - how many elements of the innards to write, or 0, if you write everything that is. + */ + + size_t end = std::min(offset + limit, offset_values.size()); + + size_t nested_offset = offset ? offset_values[offset - 1] : 0; + size_t nested_limit = limit + ? offset_values[end - 1] - nested_offset + : 0; + + if (limit == 0 || nested_limit) + nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnArray & column_array = typeid_cast(*mutable_column); + settings.path.push_back(Substream::ArraySizes); + + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column); + } + else if (auto * stream = settings.getter(settings.path)) + { + if (settings.position_independent_encoding) + deserializeArraySizesPositionIndependent(column_array, *stream, limit); + else + SerializationNumber().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0); + + addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn())); + } + + settings.path.back() = Substream::ArrayElements; + + ColumnArray::Offsets & offset_values = column_array.getOffsets(); + ColumnPtr & nested_column = column_array.getDataPtr(); + + /// Number of values corresponding with `offset_values` must be read. 
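
/// The substreams cache used above stores array *sizes* while ColumnArray works with
/// *offsets*; arraySizesToOffsets / arrayOffsetsToSizes are just a prefix sum and its
/// inverse. A tiny self-contained illustration on plain vectors (the real code operates
/// on ColumnUInt64 columns):

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<uint64_t> sizes{2, 0, 3};    /// per-row array lengths
    std::vector<uint64_t> offsets;           /// cumulative end positions

    uint64_t acc = 0;
    for (auto s : sizes)
        offsets.push_back(acc += s);         /// -> {2, 2, 5}

    std::vector<uint64_t> round_trip;
    uint64_t prev = 0;
    for (auto o : offsets)
    {
        round_trip.push_back(o - prev);      /// -> {2, 0, 3}
        prev = o;
    }

    assert((offsets == std::vector<uint64_t>{2, 2, 5}));
    assert(round_trip == sizes);
    return 0;
}
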
+ size_t last_offset = offset_values.back(); + if (last_offset < nested_column->size()) + throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR); + size_t nested_limit = last_offset - nested_column->size(); + + /// Adjust value size hint. Divide it to the average array size. + settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0; + + nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache); + + settings.path.pop_back(); + + /// Check consistency between offsets and elements subcolumns. + /// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER. + if (!nested_column->empty() && nested_column->size() != last_offset) + throw ParsingException("Cannot read all array values: read just " + toString(nested_column->size()) + " of " + toString(last_offset), + ErrorCodes::CANNOT_READ_ALL_DATA); + + column = std::move(mutable_column); +} + + +template +static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeChar('[', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + write_nested(nested_column, i); + } + writeChar(']', ostr); +} + + +template +static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +{ + ColumnArray & column_array = assert_cast(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + IColumn & nested_column = column_array.getData(); + + size_t size = 0; + + bool has_braces = false; + if (checkChar('[', istr)) + has_braces = true; + else if (!allow_unenclosed) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != ']') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + } + + first = false; + + skipWhitespaceIfAny(istr); + + if (*istr.position() == ']') + break; + + read_nested(nested_column); + ++size; + + skipWhitespaceIfAny(istr); + } + + if (has_braces) + assertChar(']', istr); + else /// If array is not enclosed in braces, we read until EOF. + assertEOF(istr); + } + catch (...) 
+ { + if (size) + nested_column.popBack(size); + throw; + } + + offsets.push_back(offsets.back() + size); +} + + +void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, + [&](const IColumn & nested_column, size_t i) + { + nested->serializeTextQuoted(nested_column, i, ostr, settings); + }); +} + + +void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, istr, + [&](IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, istr, settings); + }, false); +} + +void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeChar('[', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + nested->serializeTextJSON(nested_column, i, ostr, settings); + } + writeChar(']', ostr); +} + + +void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, istr, + [&](IColumn & nested_column) + { + nested->deserializeTextJSON(nested_column, istr, settings); + }, false); +} + + +void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeCString("", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + writeCString("", ostr); + nested->serializeTextXML(nested_column, i, ostr, settings); + writeCString("", ostr); + } + writeCString("", ostr); +} + + +void SerializationArray::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + /// There is no good way to serialize an array in CSV. Therefore, we serialize it into a string, and then write the resulting string in CSV. 
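+    /// For example, an Array(String) row ['a','b c'] is rendered by serializeText() as
+    /// ['a','b c'] and then written as a single CSV field: "['a','b c']".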
+ WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); +} + + +void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + + if (settings.csv.input_format_arrays_as_nested_csv) + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextCSV(nested_column, rb, settings); + }, true); + } + else + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, rb, settings); + }, true); + } +} + +} diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h new file mode 100644 index 00000000000..71037090a48 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -0,0 +1,69 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationArray final : public SimpleTextSerialization +{ +private: + SerializationPtr nested; + +public: + SerializationArray(const SerializationPtr & nested_) : nested(nested_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Streaming serialization of arrays is arranged in a special way: + * - elements placed in a row are written/read without array sizes; + * - the sizes are written/read in a separate stream, + * This is necessary, because when implementing nested structures, several arrays can have common sizes. 
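+      * For example, for a Nested column n(key UInt64, value String) the generated arrays
+      * n.key and n.value can share one common sizes stream.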
+ */ + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; +}; + +ColumnPtr arrayOffsetsToSizes(const IColumn & column); + +} diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp new file mode 100644 index 00000000000..9347c4f60f3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -0,0 +1,97 @@ +#include + +#include +#include +#include +#include + +namespace +{ + +using namespace DB; + +String serializeToString(const SerializationCustomSimpleText & domain, const IColumn & column, size_t row_num, const FormatSettings & settings) +{ + WriteBufferFromOwnString buffer; + domain.serializeText(column, row_num, buffer, settings); + + return buffer.str(); +} + +void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + domain.deserializeText(column, istr, settings); +} + +} + +namespace DB +{ + +SerializationCustomSimpleText::SerializationCustomSimpleText(const SerializationPtr & nested_) + : SerializationWrapper(nested_) +{ +} + +void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readQuotedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeCSVString(serializeToString(*this, column, row_num, settings), ostr); +} + +void 
SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVString(str, istr, settings.csv); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); +} + +void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readJSONString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); +} + +} diff --git a/src/DataTypes/DataTypeCustomSimpleTextSerialization.h b/src/DataTypes/Serializations/SerializationCustomSimpleText.h similarity index 92% rename from src/DataTypes/DataTypeCustomSimpleTextSerialization.h rename to src/DataTypes/Serializations/SerializationCustomSimpleText.h index d983b66eecc..ae938b1104b 100644 --- a/src/DataTypes/DataTypeCustomSimpleTextSerialization.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB { @@ -12,9 +12,11 @@ class IColumn; /** Simple IDataTypeCustomTextSerialization that uses serializeText/deserializeText * for all serialization and deserialization. */ -class DataTypeCustomSimpleTextSerialization : public IDataTypeCustomTextSerialization +class SerializationCustomSimpleText : public SerializationWrapper { public: + SerializationCustomSimpleText(const SerializationPtr & nested_); + // Methods that subclasses must override in order to get full serialization/deserialization support. 
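+    // For example, SerializationIPv4/SerializationIPv6 (see SerializationIP.h) override only these
+    // two methods; the escaped, quoted, CSV, JSON and XML variants are then derived from them via
+    // serializeToString()/deserializeFromString() in SerializationCustomSimpleText.cpp.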
virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override = 0; virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp new file mode 100644 index 00000000000..ee9110d360d --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -0,0 +1,83 @@ +#include + +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +void SerializationDate::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeDateText(DayNum(assert_cast(column).getData()[row_num]), ostr); +} + +void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + readDateText(x, istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDate::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + assertChar('\'', istr); + readDateText(x, istr); + assertChar('\'', istr); + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + assertChar('"', istr); + readDateText(x, istr); + assertChar('"', istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + readCSV(value, istr); + assert_cast(column).getData().push_back(value.getDayNum()); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h new file mode 100644 index 00000000000..099d7444c3d --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationDate final : public SerializationNumber +{ +public: + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp new file mode 100644 index 00000000000..16e47601eeb --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -0,0 +1,155 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + readDateTimeText(x, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + return; + } +} + +} + +SerializationDateTime::SerializationDateTime( + const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_) + : time_zone(time_zone_), 
utc_time_zone(utc_time_zone_) +{ +} + +void SerializationDateTime::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeIntText(value, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + readText(x, istr, settings, time_zone, utc_time_zone); + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
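+    /// (If readText() or assertChar() throws above, nothing has been appended to the column.)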
+} + +void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++istr.position(); + + readText(x, istr, settings, time_zone, utc_time_zone); + + if (maybe_quote == '\'' || maybe_quote == '\"') + assertChar(maybe_quote, istr); + + if (x < 0) + x = 0; + + assert_cast(column).getData().push_back(x); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h new file mode 100644 index 00000000000..8cf57ddef89 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime final : public SerializationNumber +{ +private: + const DateLUTImpl & time_zone; + const DateLUTImpl & utc_time_zone; + +public: + SerializationDateTime(const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} + diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp new file mode 100644 index 00000000000..9f3958faa4d --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -0,0 +1,151 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include + +namespace DB +{ + +SerializationDateTime64::SerializationDateTime64( + const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_, UInt32 scale_) + : SerializationDecimalBase(DecimalUtils::max_precision, scale_) + , time_zone(time_zone_), utc_time_zone(utc_time_zone_) +{ +} + +void SerializationDateTime64::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, scale, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeDateTimeUnixTimestamp(value, scale, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, scale, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DateTime64 result = 0; + readDateTime64Text(result, scale, istr, time_zone); + assert_cast(column).getData().push_back(result); +} + +void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); +} + +void SerializationDateTime64::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + readDateTime64Text(x, scale, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('"', istr)) + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + assert_cast(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + + if (istr.eof()) + throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++istr.position(); + + readText(x, scale, istr, settings, time_zone, utc_time_zone); + + if (maybe_quote == '\'' || maybe_quote == '\"') + assertChar(maybe_quote, istr); + + assert_cast(column).getData().push_back(x); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h new file mode 100644 index 00000000000..c36649daef1 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime64 final : public SerializationDecimalBase +{ +private: + const DateLUTImpl & time_zone; + const DateLUTImpl & utc_time_zone; + +public: + SerializationDateTime64(const DateLUTImpl & time_zone_, const DateLUTImpl & utc_time_zone_, UInt32 scale_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimal.cpp b/src/DataTypes/Serializations/SerializationDecimal.cpp new file mode 100644 index 00000000000..e0073c80aca --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -0,0 +1,74 @@ 
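+/// Text (de)serialization for Decimal columns. If the text form has fewer fractional digits than
+/// the column scale, readText() re-scales the parsed value by the remaining power of ten and
+/// throws DECIMAL_OVERFLOW if that multiplication overflows.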
+#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int DECIMAL_OVERFLOW; +} + +template +bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +{ + UInt32 unread_scale = scale; + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) + return false; + + return true; +} + +template +void SerializationDecimal::readText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) +{ + UInt32 unread_scale = scale; + if (csv) + readCSVDecimalText(istr, x, precision, unread_scale); + else + readDecimalText(istr, x, precision, unread_scale); + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); +} + +template +void SerializationDecimal::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + T value = assert_cast(column).getData()[row_num]; + writeText(value, this->scale, ostr); +} + +template +void SerializationDecimal::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + readText(x, istr); + assert_cast(column).getData().push_back(x); +} + +template +void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + readText(x, istr, true); + assert_cast(column).getData().push_back(x); +} + +template class SerializationDecimal; +template class SerializationDecimal; +template class SerializationDecimal; +template class SerializationDecimal; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimal.h b/src/DataTypes/Serializations/SerializationDecimal.h new file mode 100644 index 00000000000..dc193cdf0d3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimal.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace DB +{ + +template +class SerializationDecimal final : public SerializationDecimalBase +{ +public: + using typename SerializationDecimalBase::ColumnType; + + SerializationDecimal(UInt32 precision_, UInt32 scale_) + : SerializationDecimalBase(precision_, scale_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + + static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimalBase.cpp b/src/DataTypes/Serializations/SerializationDecimalBase.cpp new file mode 100644 index 00000000000..8557c9ff719 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimalBase.cpp @@ -0,0 +1,73 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +template +void SerializationDecimalBase::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + FieldType x = get>(field); + writeBinary(x, ostr); +} + +template 
+void SerializationDecimalBase::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const FieldType & x = assert_cast(column).getElement(row_num); + writeBinary(x, ostr); +} + +template +void SerializationDecimalBase::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnType::Container & x = typeid_cast(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + ostr.write(reinterpret_cast(&x[offset]), sizeof(FieldType) * limit); +} + +template +void SerializationDecimalBase::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + typename FieldType::NativeType x; + readBinary(x, istr); + field = DecimalField(T(x), this->scale); +} + +template +void SerializationDecimalBase::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + typename FieldType::NativeType x; + readBinary(x, istr); + assert_cast(column).getData().push_back(FieldType(x)); +} + +template +void SerializationDecimalBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnType::Container & x = typeid_cast(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(FieldType) * limit); + x.resize(initial_size + size / sizeof(FieldType)); +} + +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; +template class SerializationDecimalBase; + +} diff --git a/src/DataTypes/Serializations/SerializationDecimalBase.h b/src/DataTypes/Serializations/SerializationDecimalBase.h new file mode 100644 index 00000000000..fd3dcb17e35 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDecimalBase.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +namespace DB +{ + +template +class SerializationDecimalBase : public SimpleTextSerialization +{ +protected: + const UInt32 precision; + const UInt32 scale; + +public: + using FieldType = T; + using ColumnType = ColumnDecimal; + + SerializationDecimalBase(UInt32 precision_, UInt32 scale_) + : precision(precision_), scale(scale_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp new file mode 100644 index 00000000000..a1550e70608 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -0,0 +1,112 @@ +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +template +void SerializationEnum::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::serializeTextEscaped(const IColumn & 
column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. + std::string field_name; + readEscapedString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template +void SerializationEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + readQuotedStringWithSQLStyle(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name))); +} + +template +void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template +void SerializationEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr, settings); +} + +template +void SerializationEnum::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + if (!istr.eof() && *istr.position() != '"') + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readJSONString(field_name, istr); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name))); + } +} + +template +void SerializationEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSVString(this->getNameForValue(assert_cast(column).getData()[row_num]), ostr); +} + +template +void SerializationEnum::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.csv.input_format_enum_as_number) + assert_cast(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + assert_cast(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template class SerializationEnum; +template class SerializationEnum; + +} diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h new file mode 100644 index 00000000000..dfa9e74c7a1 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -0,0 +1,40 @@ +#pragma once + 
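+/// Serialization of Enum columns: binary formats reuse SerializationNumber and store the underlying
+/// integer, while the text formats write the element name. On input they accept the name and, for
+/// TSV/CSV when the input_format_*_enum_as_number settings are enabled, also the raw numeric value.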
+#include +#include + +namespace DB +{ + +template +class SerializationEnum : public SerializationNumber, public EnumValues +{ +public: + using typename SerializationNumber::FieldType; + using typename SerializationNumber::ColumnType; + using typename EnumValues::Values; + + SerializationEnum(const Values & values_) : EnumValues(values_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + FieldType readValue(ReadBuffer & istr) const + { + FieldType x; + readText(x, istr); + return this->findByValue(x)->first; + } +}; + +} diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp new file mode 100644 index 00000000000..5c63631e2a3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -0,0 +1,203 @@ +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int TOO_LARGE_STRING_SIZE; +} + +void SerializationFixedString::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const String & s = get(field); + ostr.write(s.data(), std::min(s.size(), n)); + if (s.size() < n) + for (size_t i = s.size(); i < n; ++i) + ostr.write(0); +} + + +void SerializationFixedString::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + field = String(); + String & s = get(field); + s.resize(n); + istr.readStrict(s.data(), n); +} + + +void SerializationFixedString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + ostr.write(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n); +} + + +void SerializationFixedString::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnFixedString::Chars & data = assert_cast(column).getChars(); + size_t old_size = data.size(); + data.resize(old_size + n); + try + { + istr.readStrict(reinterpret_cast(data.data() + old_size), n); + } + catch (...) 
+ { + data.resize_assume_reserved(old_size); + throw; + } +} + + +void SerializationFixedString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + + size_t size = data.size() / n; + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&data[n * offset]), n * limit); +} + + +void SerializationFixedString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + + size_t initial_size = data.size(); + size_t max_bytes = limit * n; + data.resize(initial_size + max_bytes); + size_t read_bytes = istr.readBig(reinterpret_cast(&data[initial_size]), max_bytes); + + if (read_bytes % n != 0) + throw Exception("Cannot read all data of type FixedString. Bytes read:" + toString(read_bytes) + ". String size:" + toString(n) + ".", + ErrorCodes::CANNOT_READ_ALL_DATA); + + data.resize(initial_size + read_bytes); +} + + +void SerializationFixedString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(reinterpret_cast(&assert_cast(column).getChars()[n * row_num]), n, ostr); +} + + +void SerializationFixedString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeAnyEscapedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::alignStringLength(size_t n, PaddedPODArray & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large value for FixedString({})", n); + } +} + +template +static inline void read(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + size_t prev_size = data.size(); + try + { + reader(data); + SerializationFixedString::alignStringLength(self.getN(), data, prev_size); + } + catch (...) 
+ { + data.resize_assume_reserved(prev_size); + throw; + } +} + + +void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeAnyQuotedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); +} + + +void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeJSONString(pos, pos + n, ostr, settings); +} + + +void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeXMLStringForTextElement(pos, pos + n, ostr); +} + + +void SerializationFixedString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); + writeCSVString(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); +} + + +} diff --git a/src/DataTypes/Serializations/SerializationFixedString.h b/src/DataTypes/Serializations/SerializationFixedString.h new file mode 100644 index 00000000000..82559d10800 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationFixedString.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationFixedString : public ISerialization +{ +private: + size_t n; + +public: + SerializationFixedString(size_t n_) : n(n_) {} + size_t getN() const { return n; } + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer 
& ostr, const FormatSettings &) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /// Makes sure that the length of a newly inserted string to `chars` is equal to getN(). + /// If the length is less than getN() the function will add zero characters up to getN(). + /// If the length is greater than getN() the function will throw an exception. + static void alignStringLength(size_t n, PaddedPODArray & data, size_t string_start); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationIP.cpp b/src/DataTypes/Serializations/SerializationIP.cpp new file mode 100644 index 00000000000..ec49f960c77 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIP.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; + extern const int ILLEGAL_COLUMN; +} + +SerializationIPv4::SerializationIPv4(const SerializationPtr & nested_) + : SerializationCustomSimpleText(nested_) +{ +} + +void SerializationIPv4::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const auto * col = checkAndGetColumn(&column); + if (!col) + { + throw Exception("IPv4 type can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; + char * ptr = buffer; + formatIPv4(reinterpret_cast(&col->getData()[row_num]), ptr); + + ostr.write(buffer, strlen(buffer)); +} + +void SerializationIPv4::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnUInt32 * col = typeid_cast(&column); + if (!col) + { + throw Exception("IPv4 type can only deserialize columns of type UInt32." 
+ column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; + istr.read(buffer, sizeof(buffer) - 1); + UInt32 ipv4_value = 0; + if (!parseIPv4(buffer, reinterpret_cast(&ipv4_value))) + { + throw Exception("Invalid IPv4 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + } + + col->insert(ipv4_value); +} + +SerializationIPv6::SerializationIPv6(const SerializationPtr & nested_) + : SerializationCustomSimpleText(nested_) +{ +} +void SerializationIPv6::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const auto * col = checkAndGetColumn(&column); + if (!col) + { + throw Exception("IPv6 type domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; + char * ptr = buffer; + formatIPv6(reinterpret_cast(col->getDataAt(row_num).data), ptr); + + ostr.write(buffer, strlen(buffer)); +} + +void SerializationIPv6::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnFixedString * col = typeid_cast(&column); + if (!col) + { + throw Exception("IPv6 type domain can only deserialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN); + } + + char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; + istr.read(buffer, sizeof(buffer) - 1); + + std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); + if (!parseIPv6(buffer, reinterpret_cast(ipv6_value.data()))) + { + throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + } + + col->insertString(ipv6_value); +} + +} diff --git a/src/DataTypes/Serializations/SerializationIP.h b/src/DataTypes/Serializations/SerializationIP.h new file mode 100644 index 00000000000..f1f4d90aba5 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIP.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationIPv4 final : public SerializationCustomSimpleText +{ +public: + SerializationIPv4(const SerializationPtr & nested_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +class SerializationIPv6 : public SerializationCustomSimpleText +{ +public: + SerializationIPv6(const SerializationPtr & nested_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp new file mode 100644 index 00000000000..41d9a4100e0 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -0,0 +1,827 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + const ColumnLowCardinality & getColumnLowCardinality(const IColumn & column) + { + return typeid_cast(column); + } + + ColumnLowCardinality & getColumnLowCardinality(IColumn & column) + { + return typeid_cast(column); + } +} + +SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dictionary_type_) + : 
dictionary_type(dictionary_type_) + , dict_inner_serialization(removeNullable(dictionary_type_)->getDefaultSerialization()) +{ +} + +void SerializationLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + path.push_back(Substream::DictionaryKeys); + dict_inner_serialization->enumerateStreams(callback, path); + path.back() = Substream::DictionaryIndexes; + callback(path); + path.pop_back(); +} + +struct KeysSerializationVersion +{ + enum Value + { + /// Version is written at the start of . + /// Dictionary is written as number N and N keys after them. + /// Dictionary can be shared for continuous range of granules, so some marks may point to the same position. + /// Shared dictionary is stored in state and is read once. + SharedDictionariesWithAdditionalKeys = 1, + }; + + Value value; + + static void checkVersion(UInt64 version) + { + if (version != SharedDictionariesWithAdditionalKeys) + throw Exception("Invalid version for SerializationLowCardinality key column.", ErrorCodes::LOGICAL_ERROR); + } + + explicit KeysSerializationVersion(UInt64 version) : value(static_cast(version)) { checkVersion(version); } +}; + +/// Version is stored at the start of each granule. It's used to store indexes type and flags. +struct IndexesSerializationType +{ + using SerializationType = UInt64; + /// Need to read dictionary if it wasn't. + static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u; + /// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them. + static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u; + /// Need to update dictionary. It means that previous granule has different dictionary. + static constexpr SerializationType NeedUpdateDictionary = 1u << 10u; + + enum Type + { + TUInt8 = 0, + TUInt16, + TUInt32, + TUInt64, + }; + + Type type; + bool has_additional_keys; + bool need_global_dictionary; + bool need_update_dictionary; + + static constexpr SerializationType resetFlags(SerializationType type) + { + return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary)); + } + + static void checkType(SerializationType type) + { + UInt64 value = resetFlags(type); + if (value <= TUInt64) + return; + + throw Exception("Invalid type for SerializationLowCardinality index column.", ErrorCodes::LOGICAL_ERROR); + } + + void serialize(WriteBuffer & buffer) const + { + SerializationType val = type; + if (has_additional_keys) + val |= HasAdditionalKeysBit; + if (need_global_dictionary) + val |= NeedGlobalDictionaryBit; + if (need_update_dictionary) + val |= NeedUpdateDictionary; + writeIntBinary(val, buffer); + } + + void deserialize(ReadBuffer & buffer) + { + SerializationType val; + readIntBinary(val, buffer); + checkType(val); + has_additional_keys = (val & HasAdditionalKeysBit) != 0; + need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0; + need_update_dictionary = (val & NeedUpdateDictionary) != 0; + type = static_cast(resetFlags(val)); + } + + IndexesSerializationType(const IColumn & column, + bool has_additional_keys_, + bool need_global_dictionary_, + bool enumerate_dictionaries) + : has_additional_keys(has_additional_keys_) + , need_global_dictionary(need_global_dictionary_) + , need_update_dictionary(enumerate_dictionaries) + { + if (typeid_cast(&column)) + type = TUInt8; + else if (typeid_cast(&column)) + type = TUInt16; + else if (typeid_cast(&column)) + type = TUInt32; + else if (typeid_cast(&column)) + type = TUInt64; + else + throw 
Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got " + + column.getName(), ErrorCodes::LOGICAL_ERROR); + } + + DataTypePtr getDataType() const + { + if (type == TUInt8) + return std::make_shared(); + if (type == TUInt16) + return std::make_shared(); + if (type == TUInt32) + return std::make_shared(); + if (type == TUInt64) + return std::make_shared(); + + throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR); + } + + IndexesSerializationType() = default; +}; + +struct SerializeStateLowCardinality : public ISerialization::SerializeBinaryBulkState +{ + KeysSerializationVersion key_version; + MutableColumnUniquePtr shared_dictionary; + + explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +struct DeserializeStateLowCardinality : public ISerialization::DeserializeBinaryBulkState +{ + KeysSerializationVersion key_version; + ColumnUniquePtr global_dictionary; + + IndexesSerializationType index_type; + ColumnPtr additional_keys; + ColumnPtr null_map; + UInt64 num_pending_rows = 0; + + /// If dictionary should be updated. + /// Can happen is some granules was skipped while reading from MergeTree. + /// We should store this flag in State because + /// in case of long block of empty arrays we may not need read dictionary at first reading. + bool need_update_dictionary = false; + + explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState( + ISerialization::SerializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = typeid_cast(state.get()); + if (!low_cardinality_state) + { + auto & state_ref = *state; + throw Exception("Invalid SerializeBinaryBulkState for SerializationLowCardinality. Expected: " + + demangle(typeid(SerializeStateLowCardinality).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return low_cardinality_state; +} + +static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState( + ISerialization::DeserializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = typeid_cast(state.get()); + if (!low_cardinality_state) + { + auto & state_ref = *state; + throw Exception("Invalid DeserializeBinaryBulkState for SerializationLowCardinality. Expected: " + + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return low_cardinality_state; +} + +void SerializationLowCardinality::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception("Got empty stream in SerializationLowCardinality::serializeBinaryBulkStatePrefix", + ErrorCodes::LOGICAL_ERROR); + + /// Write version and create SerializeBinaryBulkState. 
+ UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys; + + writeIntBinary(key_version, *stream); + + state = std::make_shared(key_version); +} + +void SerializationLowCardinality::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) + { + auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn(); + + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception("Got empty stream in SerializationLowCardinality::serializeBinaryBulkStateSuffix", + ErrorCodes::LOGICAL_ERROR); + + UInt64 num_keys = nested_column->size(); + writeIntBinary(num_keys, *stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } +} + +void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + return; + + UInt64 keys_version; + readIntBinary(keys_version, *stream); + + state = std::make_shared(keys_version); +} + +namespace +{ + template + PaddedPODArray * getIndexesData(IColumn & indexes) + { + auto * column = typeid_cast *>(&indexes); + if (column) + return &column->getData(); + + return nullptr; + } + + struct IndexMapsWithAdditionalKeys + { + MutableColumnPtr dictionary_map; + MutableColumnPtr additional_keys_map; + }; + + template + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray & index, size_t dict_size) + { + PaddedPODArray copy(index.cbegin(), index.cend()); + + HashMap dict_map; + HashMap add_keys_map; + + for (auto val : index) + { + if (val < dict_size) + dict_map.insert({val, dict_map.size()}); + else + add_keys_map.insert({val, add_keys_map.size()}); + } + + auto dictionary_map = ColumnVector::create(dict_map.size()); + auto additional_keys_map = ColumnVector::create(add_keys_map.size()); + auto & dict_data = dictionary_map->getData(); + auto & add_keys_data = additional_keys_map->getData(); + + for (auto val : dict_map) + dict_data[val.second] = val.first; + + for (auto val : add_keys_map) + add_keys_data[val.second] = val.first - dict_size; + + for (auto & val : index) + val = val < dict_size ? dict_map[val] + : add_keys_map[val] + dict_map.size(); + + for (size_t i = 0; i < index.size(); ++i) + { + T expected = index[i] < dict_data.size() ? 
dict_data[index[i]] + : add_keys_data[index[i] - dict_data.size()] + dict_size; + if (expected != copy[i]) + throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR); + + } + + return {std::move(dictionary_map), std::move(additional_keys_map)}; + } + + template + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray & index, size_t dict_size) + { + T max_less_dict_size = 0; + T max_value = 0; + + auto size = index.size(); + if (size == 0) + return {ColumnVector::create(), ColumnVector::create()}; + + for (size_t i = 0; i < size; ++i) + { + auto val = index[i]; + if (val < dict_size) + max_less_dict_size = std::max(max_less_dict_size, val); + + max_value = std::max(max_value, val); + } + + auto map_size = UInt64(max_less_dict_size) + 1; + auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0; + PaddedPODArray map(map_size, 0); + PaddedPODArray overflow_map(overflow_map_size, 0); + + T zero_pos_value = 0; + T zero_pos_overflowed_value = 0; + UInt64 cur_pos = 0; + UInt64 cur_overflowed_pos = 0; + + for (size_t i = 0; i < size; ++i) + { + T val = index[i]; + if (val < dict_size) + { + if (cur_pos == 0) + { + zero_pos_value = val; + ++cur_pos; + } + else if (map[val] == 0 && val != zero_pos_value) + { + map[val] = cur_pos; + ++cur_pos; + } + } + else + { + T shifted_val = val - dict_size; + if (cur_overflowed_pos == 0) + { + zero_pos_overflowed_value = shifted_val; + ++cur_overflowed_pos; + } + else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value) + { + overflow_map[shifted_val] = cur_overflowed_pos; + ++cur_overflowed_pos; + } + } + } + + auto dictionary_map = ColumnVector::create(cur_pos); + auto additional_keys_map = ColumnVector::create(cur_overflowed_pos); + auto & dict_data = dictionary_map->getData(); + auto & add_keys_data = additional_keys_map->getData(); + + for (size_t i = 0; i < map_size; ++i) + if (map[i]) + dict_data[map[i]] = static_cast(i); + + for (size_t i = 0; i < overflow_map_size; ++i) + if (overflow_map[i]) + add_keys_data[overflow_map[i]] = static_cast(i); + + if (cur_pos) + dict_data[0] = zero_pos_value; + if (cur_overflowed_pos) + add_keys_data[0] = zero_pos_overflowed_value; + + for (size_t i = 0; i < size; ++i) + { + T & val = index[i]; + if (val < dict_size) + val = map[val]; + else + val = overflow_map[val - dict_size] + cur_pos; + } + + return {std::move(dictionary_map), std::move(additional_keys_map)}; + } + + /// Update column and return map with old indexes. 
+ /// Let N is the number of distinct values which are less than max_size; + /// old_column - column before function call; + /// new_column - column after function call: + /// * if old_column[i] < max_size, than + /// dictionary_map[new_column[i]] = old_column[i] + /// * else + /// additional_keys_map[new_column[i]] = old_column[i] - dict_size + N + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size) + { + if (auto * data_uint8 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint8, dict_size); + else if (auto * data_uint16 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint16, dict_size); + else if (auto * data_uint32 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint32, dict_size); + else if (auto * data_uint64 = getIndexesData(column)) + return mapIndexWithAdditionalKeys(*data_uint64, dict_size); + else + throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got " + column.getName(), + ErrorCodes::LOGICAL_ERROR); + } +} + +void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw Exception("Got empty stream for SerializationLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); + + if (!indexes_stream) + throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); + + const ColumnLowCardinality & low_cardinality_column = typeid_cast(column); + + auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state); + auto & global_dictionary = low_cardinality_state->shared_dictionary; + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + bool need_update_dictionary = global_dictionary == nullptr; + if (need_update_dictionary) + global_dictionary = DataTypeLowCardinality::createColumnUnique(*dictionary_type); + + size_t max_limit = column.size() - offset; + limit = limit ? std::min(limit, max_limit) : max_limit; + + /// Do not write anything for empty column. (May happen while writing empty arrays.) + if (limit == 0) + return; + + auto sub_column = low_cardinality_column.cutAndCompact(offset, limit); + ColumnPtr positions = sub_column->getIndexesPtr(); + ColumnPtr keys = sub_column->getDictionary().getNestedColumn(); + + if (settings.low_cardinality_max_dictionary_size) + { + /// Insert used_keys into global dictionary and update sub_index. 
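Before the write path below continues, here is the remapping contract documented above in miniature: a standalone re-implementation that mirrors mapIndexWithAdditionalKeysRef, using std::unordered_map in place of HashMap, with a worked example for dict_size = 4 (remap and main are illustrative names):

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Rewrites `index` to local positions and returns {dictionary_map, additional_keys_map}.
static std::pair<std::vector<uint64_t>, std::vector<uint64_t>>
remap(std::vector<uint64_t> & index, size_t dict_size)
{
    std::unordered_map<uint64_t, uint64_t> dict_map, add_keys_map;
    for (auto val : index)
    {
        if (val < dict_size)
            dict_map.emplace(val, dict_map.size());       // distinct values already in the shared dictionary
        else
            add_keys_map.emplace(val, add_keys_map.size()); // distinct overflow (additional) keys
    }

    std::vector<uint64_t> dictionary_map(dict_map.size());
    std::vector<uint64_t> additional_keys_map(add_keys_map.size());
    for (auto [old_value, new_pos] : dict_map)
        dictionary_map[new_pos] = old_value;
    for (auto [old_value, new_pos] : add_keys_map)
        additional_keys_map[new_pos] = old_value - dict_size;

    for (auto & val : index)
        val = val < dict_size ? dict_map[val] : add_keys_map[val] + dict_map.size();

    return {dictionary_map, additional_keys_map};
}

int main()
{
    std::vector<uint64_t> index = {1, 5, 1, 0, 6, 5};
    auto [dictionary_map, additional_keys_map] = remap(index, /*dict_size=*/4);

    assert((dictionary_map == std::vector<uint64_t>{1, 0}));      // positions in the shared dictionary
    assert((additional_keys_map == std::vector<uint64_t>{1, 2})); // overflow keys, shifted by dict_size
    assert((index == std::vector<uint64_t>{0, 2, 0, 1, 3, 2}));   // local indexes: [dict part | overflow part]
}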
+ auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(), + settings.low_cardinality_max_dictionary_size); + size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size(); + ColumnLowCardinality::Index(indexes_with_overflow.indexes->getPtr()).check(max_size); + + if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size) + throw Exception("Got dictionary with size " + toString(global_dictionary->size()) + + " but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size), + ErrorCodes::LOGICAL_ERROR); + + positions = indexes_with_overflow.indexes->index(*positions, 0); + keys = std::move(indexes_with_overflow.overflowed_keys); + + if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty()) + throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) + + " which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")", + ErrorCodes::LOGICAL_ERROR); + } + + if (const auto * nullable_keys = checkAndGetColumn(*keys)) + keys = nullable_keys->getNestedColumnPtr(); + + bool need_additional_keys = !keys->empty(); + bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0; + bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part + && global_dictionary->size() >= settings.low_cardinality_max_dictionary_size; + + IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary); + index_version.serialize(*indexes_stream); + + if (need_write_dictionary) + { + const auto & nested_column = global_dictionary->getNestedNotNullableColumn(); + UInt64 num_keys = nested_column->size(); + writeIntBinary(num_keys, *keys_stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } + + if (need_additional_keys) + { + UInt64 num_keys = keys->size(); + writeIntBinary(num_keys, *indexes_stream); + dict_inner_serialization->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys); + } + + UInt64 num_rows = positions->size(); + writeIntBinary(num_rows, *indexes_stream); + auto index_serialization = index_version.getDataType()->getDefaultSerialization(); + index_serialization->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows); +} + +void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * /* cache */) const +{ + auto mutable_column = column->assumeMutable(); + ColumnLowCardinality & low_cardinality_column = typeid_cast(*mutable_column); + + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw Exception("Got empty stream for SerializationLowCardinality keys.", ErrorCodes::LOGICAL_ERROR); + + if (!indexes_stream) + throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR); + + auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state); + 
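The reader below consumes exactly what the write path above produced. As a rough sketch of the per-granule layout of the DictionaryIndexes substream, assuming native-endian 64-bit integers stand in for writeIntBinary (writeU64 and writeIndexesGranule are illustrative names); the DictionaryKeys substream itself starts with the UInt64 version and then carries zero or more [num_keys][keys] blocks whenever the shared dictionary is flushed:

#include <cstdint>
#include <cstring>
#include <vector>

// Append a 64-bit integer in native byte order, as writeIntBinary does for UInt64.
static void writeU64(std::vector<char> & out, uint64_t x)
{
    char buf[sizeof(x)];
    std::memcpy(buf, &x, sizeof(x));
    out.insert(out.end(), buf, buf + sizeof(x));
}

// One granule of the indexes substream, fields in the order written above.
static void writeIndexesGranule(
    std::vector<char> & out,
    uint64_t index_header,                     // packed IndexesSerializationType
    const std::vector<char> * additional_keys, // pre-serialized overflow keys, or nullptr
    uint64_t additional_keys_count,
    const std::vector<char> & positions,       // pre-serialized index column
    uint64_t num_rows)
{
    writeU64(out, index_header);
    if (additional_keys)
    {
        writeU64(out, additional_keys_count);
        out.insert(out.end(), additional_keys->begin(), additional_keys->end());
    }
    writeU64(out, num_rows);
    out.insert(out.end(), positions.begin(), positions.end());
}

int main()
{
    std::vector<char> indexes_substream;
    std::vector<char> positions = {0, 1, 0};   // pretend these are serialized UInt8 positions
    writeIndexesGranule(indexes_substream, /*index_header=*/0, nullptr, 0, positions, 3);
    // 8 bytes of header + 8 bytes of row count + 3 position bytes.
    return indexes_substream.size() == 19 ? 0 : 1;
}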
KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + auto read_dictionary = [this, low_cardinality_state, keys_stream]() + { + UInt64 num_keys; + readIntBinary(num_keys, *keys_stream); + + auto keys_type = removeNullable(dictionary_type); + auto global_dict_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0); + + auto column_unique = DataTypeLowCardinality::createColumnUnique(*dictionary_type, std::move(global_dict_keys)); + low_cardinality_state->global_dictionary = std::move(column_unique); + }; + + auto read_additional_keys = [this, low_cardinality_state, indexes_stream]() + { + UInt64 num_keys; + readIntBinary(num_keys, *indexes_stream); + auto keys_type = removeNullable(dictionary_type); + auto additional_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0); + low_cardinality_state->additional_keys = std::move(additional_keys); + + if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable()) + { + auto null_map = ColumnUInt8::create(num_keys, 0); + if (num_keys) + null_map->getElement(0) = 1; + + low_cardinality_state->null_map = std::move(null_map); + } + }; + + auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows) + { + auto indexes_type = low_cardinality_state->index_type.getDataType(); + MutableColumnPtr indexes_column = indexes_type->createColumn(); + indexes_type->getDefaultSerialization()->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0); + + auto & global_dictionary = low_cardinality_state->global_dictionary; + const auto & additional_keys = low_cardinality_state->additional_keys; + + bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys; + bool column_is_empty = low_cardinality_column.empty(); + + if (!low_cardinality_state->index_type.need_global_dictionary) + { + ColumnPtr keys_column = additional_keys; + if (low_cardinality_state->null_map) + keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column); + } + else if (!has_additional_keys) + { + if (column_is_empty) + low_cardinality_column.setSharedDictionary(global_dictionary); + + auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); + low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); + } + else + { + auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size()); + + ColumnLowCardinality::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size()); + + ColumnLowCardinality::Index(indexes_column->getPtr()).check( + maps.dictionary_map->size() + maps.additional_keys_map->size()); + + auto used_keys = IColumn::mutate(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0)); + + if (!maps.additional_keys_map->empty()) + { + auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); + + if (dictionary_type->isNullable()) + { + ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0); + used_add_keys = ColumnNullable::create(used_add_keys, null_map); + } + + used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size()); + } + + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column); + } + }; + + if 
(!settings.continuous_reading) + { + low_cardinality_state->num_pending_rows = 0; + + /// Remember in state that some granules were skipped and we need to update dictionary. + low_cardinality_state->need_update_dictionary = true; + } + + while (limit) + { + if (low_cardinality_state->num_pending_rows == 0) + { + if (indexes_stream->eof()) + break; + + auto & index_type = low_cardinality_state->index_type; + auto & global_dictionary = low_cardinality_state->global_dictionary; + + index_type.deserialize(*indexes_stream); + + bool need_update_dictionary = + !global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary; + if (index_type.need_global_dictionary && need_update_dictionary) + { + read_dictionary(); + low_cardinality_state->need_update_dictionary = false; + } + + if (low_cardinality_state->index_type.has_additional_keys) + read_additional_keys(); + else + low_cardinality_state->additional_keys = nullptr; + + readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream); + } + + size_t num_rows_to_read = std::min(limit, low_cardinality_state->num_pending_rows); + read_indexes(num_rows_to_read); + limit -= num_rows_to_read; + low_cardinality_state->num_pending_rows -= num_rows_to_read; + } + + column = std::move(mutable_column); +} + +void SerializationLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + dictionary_type->getDefaultSerialization()->serializeBinary(field, ostr); +} +void SerializationLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + dictionary_type->getDefaultSerialization()->deserializeBinary(field, istr); +} + +void SerializationLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + serializeImpl(column, row_num, &ISerialization::serializeBinary, ostr); +} +void SerializationLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + deserializeImpl(column, &ISerialization::deserializeBinary, istr); +} + +void SerializationLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextEscaped, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings); +} + +void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings); +} + +void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings); +} + +void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings); +} + +void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings); +} + +void SerializationLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextJSON, ostr, settings); +} +void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); +} + +void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); +} + +template +void SerializationLowCardinality::serializeImpl( + const IColumn & column, size_t row_num, SerializationLowCardinality::SerializeFunctionPtr func, Args &&... args) const +{ + const auto & low_cardinality_column = getColumnLowCardinality(column); + size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num); + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward(args)...); +} + +template +void SerializationLowCardinality::deserializeImpl( + IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr func, Args &&... args) const +{ + auto & low_cardinality_column= getColumnLowCardinality(column); + auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*temp_column, std::forward(args)...); + + low_cardinality_column.insertFromFullColumn(*temp_column, 0); +} + +} diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h new file mode 100644 index 00000000000..e9ca0349e38 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -0,0 +1,81 @@ +#pragma once + +#include + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr; + +class SerializationLowCardinality : public ISerialization +{ +private: + DataTypePtr dictionary_type; + SerializationPtr dict_inner_serialization; + +public: + SerializationLowCardinality(const DataTypePtr & dictionary_type); + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & 
column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + template + using SerializeFunctionPtr = void (ISerialization::*)(const IColumn &, size_t, Params ...) const; + + template + void serializeImpl(const IColumn & column, size_t row_num, SerializeFunctionPtr func, Args &&... args) const; + + template + using DeserializeFunctionPtr = void (ISerialization::*)(IColumn &, Params ...) const; + + template + void deserializeImpl(IColumn & column, DeserializeFunctionPtr func, Args &&... 
args) const; + + // template + // static MutableColumnUniquePtr createColumnUniqueImpl(const IDataType & keys_type, const Creator & creator); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp new file mode 100644 index 00000000000..26b473c9d0a --- /dev/null +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -0,0 +1,291 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_MAP_FROM_TEXT; +} + +SerializationMap::SerializationMap(const SerializationPtr & key_, const SerializationPtr & value_, const SerializationPtr & nested_) + : key(key_), value(value_), nested(nested_) +{ +} + +static const IColumn & extractNestedColumn(const IColumn & column) +{ + return assert_cast(column).getNestedColumn(); +} + +static IColumn & extractNestedColumn(IColumn & column) +{ + return assert_cast(column).getNestedColumn(); +} + +void SerializationMap::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const auto & map = get(field); + writeVarUInt(map.size(), ostr); + for (const auto & elem : map) + { + const auto & tuple = elem.safeGet(); + assert(tuple.size() == 2); + key->serializeBinary(tuple[0], ostr); + value->serializeBinary(tuple[1], ostr); + } +} + +void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + size_t size; + readVarUInt(size, istr); + field = Map(size); + for (auto & elem : field.get()) + { + Tuple tuple(2); + key->deserializeBinary(tuple[0], istr); + value->deserializeBinary(tuple[1], istr); + elem = std::move(tuple); + } +} + +void SerializationMap::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + nested->serializeBinary(extractNestedColumn(column), row_num, ostr); +} + +void SerializationMap::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + nested->deserializeBinary(extractNestedColumn(column), istr); +} + + +template +void SerializationMap::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const +{ + const auto & column_map = assert_cast(column); + + const auto & nested_array = column_map.getNestedColumn(); + const auto & nested_tuple = column_map.getNestedData(); + const auto & offsets = nested_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + writeChar('{', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + writer(key, nested_tuple.getColumn(0), i); + writeChar(':', ostr); + writer(value, nested_tuple.getColumn(1), i); + } + writeChar('}', ostr); +} + +template +void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const +{ + auto & column_map = assert_cast(column); + + auto & nested_array = column_map.getNestedColumn(); + auto & nested_tuple = column_map.getNestedData(); + auto & offsets = nested_array.getOffsets(); + + auto & key_column = nested_tuple.getColumn(0); + auto & value_column = nested_tuple.getColumn(1); + + size_t size = 0; + assertChar('{', istr); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != '}') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw Exception("Cannot read Map from text", 
ErrorCodes::CANNOT_READ_MAP_FROM_TEXT); + } + + first = false; + + skipWhitespaceIfAny(istr); + + if (*istr.position() == '}') + break; + + reader(key, key_column); + skipWhitespaceIfAny(istr); + assertChar(':', istr); + + ++size; + skipWhitespaceIfAny(istr); + reader(value, value_column); + + skipWhitespaceIfAny(istr); + } + + offsets.push_back(offsets.back() + size); + assertChar('}', istr); + } + catch (...) + { + throw; + } +} + +void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, + [&](const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) + { + subcolumn_serialization->serializeTextQuoted(subcolumn, pos, ostr, settings); + }); +} + +void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + + deserializeTextImpl(column, istr, + [&](const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + subcolumn_serialization->deserializeTextQuoted(subcolumn, istr, settings); + }); +} + + +void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, + [&](const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) + { + subcolumn_serialization->serializeTextJSON(subcolumn, pos, ostr, settings); + }); +} + +void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, istr, + [&](const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + subcolumn_serialization->deserializeTextJSON(subcolumn, istr, settings); + }); +} + +void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_map = assert_cast(column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const auto & nested_data = column_map.getNestedData(); + + writeCString("", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + writeCString("", ostr); + writeCString("", ostr); + key->serializeTextXML(nested_data.getColumn(0), i, ostr, settings); + writeCString("", ostr); + + writeCString("", ostr); + value->serializeTextXML(nested_data.getColumn(1), i, ostr, settings); + writeCString("", ostr); + writeCString("", ostr); + } + writeCString("", ostr); +} + +void SerializationMap::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); +} + +void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + deserializeText(column, rb, settings); +} + + +void SerializationMap::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + nested->enumerateStreams(callback, path); +} + +void SerializationMap::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkStatePrefix(settings, state); +} + +void 
SerializationMap::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkStateSuffix(settings, state); +} + +void SerializationMap::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + nested->deserializeBinaryBulkStatePrefix(settings, state); +} + + +void SerializationMap::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state); +} + +void SerializationMap::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto & column_map = assert_cast(*column->assumeMutable()); + nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache); +} + +} diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h new file mode 100644 index 00000000000..6f72d5c2594 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -0,0 +1,71 @@ +#pragma once + +#include + + +namespace DB +{ + +class SerializationMap final : public SimpleTextSerialization +{ +private: + SerializationPtr key; + SerializationPtr value; + + /// 'nested' is an Array(Tuple(key_type, value_type)) + SerializationPtr nested; + +public: + SerializationMap(const SerializationPtr & key_type_, const SerializationPtr & value_type_, const SerializationPtr & nested_); + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t 
offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + template + void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const; + + template + void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; +}; + +} + diff --git a/src/DataTypes/Serializations/SerializationNothing.cpp b/src/DataTypes/Serializations/SerializationNothing.cpp new file mode 100644 index 00000000000..6b11ea6d252 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNothing.cpp @@ -0,0 +1,25 @@ +#include +#include +#include +#include + +namespace DB +{ + +void SerializationNothing::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + size_t size = column.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + for (size_t i = 0; i < limit; ++i) + ostr.write('0'); +} + +void SerializationNothing::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + typeid_cast(column).addSize(istr.tryIgnore(limit)); +} + +} diff --git a/src/DataTypes/Serializations/SerializationNothing.h b/src/DataTypes/Serializations/SerializationNothing.h new file mode 100644 index 00000000000..a7b26c117bc --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNothing.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class SerializationNothing : public SimpleTextSerialization +{ +private: + [[noreturn]] void throwNoSerialization() const + { + throw Exception("Serialization is not implemented", ErrorCodes::NOT_IMPLEMENTED); + } +public: + void serializeBinary(const Field &, WriteBuffer &) const override { throwNoSerialization(); } + void deserializeBinary(Field &, ReadBuffer &) const override { throwNoSerialization(); } + void serializeBinary(const IColumn &, size_t, WriteBuffer &) const override { throwNoSerialization(); } + void deserializeBinary(IColumn &, ReadBuffer &) const override { throwNoSerialization(); } + void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + + /// These methods read and write zero bytes just to allow to figure out size of column. 
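Looping back to SerializationMap::serializeTextImpl above: a row of Map(String, UInt64) is rendered as a brace-enclosed list of quoted keys and unquoted numeric values. A small sketch of that formatting, ignoring quote escaping inside keys (formatMapRow is an illustrative name):

#include <cassert>
#include <string>
#include <vector>

// Format one Map(String, UInt64) row the way serializeTextImpl does:
// '{', then key:value pairs separated by ',', then '}'.
static std::string formatMapRow(const std::vector<std::pair<std::string, uint64_t>> & row)
{
    std::string out = "{";
    for (size_t i = 0; i < row.size(); ++i)
    {
        if (i != 0)
            out += ',';
        out += '\'' + row[i].first + '\'';     // serializeTextQuoted for String
        out += ':';
        out += std::to_string(row[i].second);  // serializeTextQuoted for UInt64 is just the number
    }
    out += '}';
    return out;
}

int main()
{
    assert(formatMapRow({{"a", 1}, {"b", 2}}) == "{'a':1,'b':2}");
}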
+ void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp new file mode 100644 index 00000000000..4de2b08c043 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -0,0 +1,473 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; +} + +void SerializationNullable::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + path.push_back(Substream::NullMap); + callback(path); + path.back() = Substream::NullableElements; + nested->enumerateStreams(callback, path); + path.pop_back(); +} + + +void SerializationNullable::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::NullableElements); + nested->serializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::NullableElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::NullableElements); + nested->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnNullable & col = assert_cast(column); + col.checkConsistency(); + + /// First serialize null map. + settings.path.push_back(Substream::NullMap); + if (auto * stream = settings.getter(settings.path)) + SerializationNumber().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit); + + /// Then serialize contents of arrays. 
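The bulk path above splits a Nullable(T) column into two substreams: a UInt8 null map and the nested column, which keeps a default value in the NULL slots. A toy model with std::vector, assuming Nullable(String) (NullableStreams and split are illustrative names):

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct NullableStreams
{
    std::vector<uint8_t> null_map;    // Substream::NullMap: 1 = NULL, 0 = value present
    std::vector<std::string> nested;  // Substream::NullableElements: same length as null_map
};

static NullableStreams split(const std::vector<const char *> & rows)
{
    NullableStreams s;
    for (const char * row : rows)
    {
        s.null_map.push_back(row == nullptr);
        s.nested.push_back(row ? row : "");  // default value stands in for NULL rows
    }
    return s;
}

int main()
{
    auto streams = split({"a", nullptr, "b"});
    assert((streams.null_map == std::vector<uint8_t>{0, 1, 0}));
    assert((streams.nested == std::vector<std::string>{"a", "", "b"}));
}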
+ settings.path.back() = Substream::NullableElements; + nested->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnNullable & col = assert_cast(*mutable_column); + + settings.path.push_back(Substream::NullMap); + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + col.getNullMapColumnPtr() = cached_column; + } + else if (auto * stream = settings.getter(settings.path)) + { + SerializationNumber().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0); + addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr()); + } + + settings.path.back() = Substream::NullableElements; + nested->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + if (field.isNull()) + { + writeBinary(true, ostr); + } + else + { + writeBinary(false, ostr); + nested->serializeBinary(field, ostr); + } +} + +void SerializationNullable::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + bool is_null = false; + readBinary(is_null, istr); + if (!is_null) + { + nested->deserializeBinary(field, istr); + } + else + { + field = Null(); + } +} + +void SerializationNullable::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const ColumnNullable & col = assert_cast(column); + + bool is_null = col.isNullAt(row_num); + writeBinary(is_null, ostr); + if (!is_null) + nested->serializeBinary(col.getNestedColumn(), row_num, ostr); +} + +/// Deserialize value into ColumnNullable. +/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. +template , ReturnType>* = nullptr> +static ReturnType safeDeserialize( + IColumn & column, const ISerialization &, + CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +{ + ColumnNullable & col = assert_cast(column); + + if (check_for_null()) + { + col.insertDefault(); + } + else + { + deserialize_nested(col.getNestedColumn()); + + try + { + col.getNullMapData().push_back(0); + } + catch (...) + { + col.getNestedColumn().popBack(1); + throw; + } + } +} + +/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. 
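The overload above must leave the nested column and the null map the same length even if one of the two inserts throws. The same insert-both-or-roll-back pattern on plain vectors (insertParsedValue is an illustrative name):

#include <cstdint>
#include <string>
#include <vector>

static void insertParsedValue(std::vector<std::string> & nested, std::vector<uint8_t> & null_map,
                              bool is_null, const std::string & parsed)
{
    if (is_null)
    {
        nested.emplace_back();   // default value keeps the two columns aligned
        null_map.push_back(1);
        return;
    }

    nested.push_back(parsed);
    try
    {
        null_map.push_back(0);
    }
    catch (...)
    {
        nested.pop_back();       // roll back so both columns stay the same size
        throw;
    }
}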
+template , ReturnType>* = nullptr> +static ReturnType safeDeserialize( + IColumn & column, const ISerialization & nested, + CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +{ + assert(!dynamic_cast(&column)); + assert(!dynamic_cast(&nested)); + UNUSED(nested); + + bool insert_default = check_for_null(); + if (insert_default) + column.insertDefault(); + else + deserialize_nested(column); + return !insert_default; +} + + +void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + safeDeserialize(column, *nested, + [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, + [this, &istr] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr); }); +} + + +void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeString(settings.tsv.null_representation, ostr); + else + nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings); +} + + +void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscapedImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + /// Little tricky, because we cannot discriminate null from first character. + + if (istr.eof() || *istr.position() != '\\') /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. + { + /// This is not null, surely. + return safeDeserialize(column, *nested, + [] { return false; }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); + } + else + { + /// Now we know, that data in buffer starts with backslash. + ++istr.position(); + + if (istr.eof()) + throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA); + + return safeDeserialize(column, *nested, + [&istr] + { + if (*istr.position() == 'N') + { + ++istr.position(); + return true; + } + return false; + }, + [&nested, &istr, &settings] (IColumn & nested_column) + { + if (istr.position() != istr.buffer().begin()) + { + /// We could step back to consume backslash again. + --istr.position(); + nested->deserializeTextEscaped(nested_column, istr, settings); + } + else + { + /// Otherwise, we need to place backslash back in front of istr. + ReadBufferFromMemory prefix("\\", 1); + ConcatReadBuffer prepended_istr(prefix, istr); + + nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + + /// Synchronise cursor position in original buffer. 
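The escaped-text branch above hinges on a one-character lookahead: a field starting with a backslash is NULL only if 'N' follows, otherwise the consumed backslash has to be handed back to the nested parser, either by stepping the buffer position back or by prepending a one-byte buffer. The same decision with an explicit cursor (consumeNullPrefix is an illustrative name; the real code throws on end of stream right after the backslash):

#include <cassert>
#include <cstddef>
#include <string>

static bool consumeNullPrefix(const std::string & buf, size_t & pos)
{
    if (pos >= buf.size() || buf[pos] != '\\')
        return false;          // cannot be "\N", parse as a plain value

    ++pos;                     // consume the backslash
    if (pos < buf.size() && buf[pos] == 'N')
    {
        ++pos;
        return true;           // it was NULL
    }

    --pos;                     // give the backslash back to the value parser
    return false;
}

int main()
{
    size_t pos = 0;
    assert(consumeNullPrefix("\\N\tnext", pos) && pos == 2);

    pos = 0;
    assert(!consumeNullPrefix("\\thello", pos) && pos == 0);
}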
+ + if (prepended_istr.count() > 1) + istr.position() = prepended_istr.position(); + } + }); + } +} + +void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeCString("NULL", ostr); + else + nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings); +} + + +void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextQuotedImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return safeDeserialize(column, *nested, + [&istr] + { + return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr); + }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); }); +} + + +void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeWholeTextImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return safeDeserialize(column, *nested, + [&istr] + { + return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr) + || checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr); + }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeWholeText(nested_column, istr, settings); }); +} + + +void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeCString("\\N", ostr); + else + nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextCSVImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + constexpr char const * null_literal = "NULL"; + constexpr size_t len = 4; + size_t null_prefix_len = 0; + + auto check_for_null = [&istr, &settings, &null_prefix_len] + { + if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr)) + return true; + if (!settings.csv.unquoted_null_literal_as_null) + return false; + + /// Check for unquoted NULL + while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position()) + { + ++null_prefix_len; + ++istr.position(); + } + if (null_prefix_len == len) + return true; + + /// Value and "NULL" have common prefix, but value is not "NULL". + /// Restore previous buffer position if possible. 
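The CSV branch above may consume part of the literal NULL before discovering that the field is something else (for example an unquoted string starting with "NU"), and then has to restore or prepend those characters. A sketch of just the prefix matching (matchUnquotedNull is an illustrative name):

#include <cassert>
#include <cstddef>
#include <string>

// Consume characters while they match "NULL"; report how many were consumed
// so the caller can restore them (or prepend them) when the match fails.
static bool matchUnquotedNull(const std::string & buf, size_t & pos, size_t & null_prefix_len)
{
    static const char * null_literal = "NULL";
    null_prefix_len = 0;
    while (pos < buf.size() && null_prefix_len < 4 && buf[pos] == null_literal[null_prefix_len])
    {
        ++pos;
        ++null_prefix_len;
    }
    return null_prefix_len == 4;
}

int main()
{
    size_t pos = 0, prefix = 0;
    assert(matchUnquotedNull("NULL,next", pos, prefix) && prefix == 4);

    pos = 0;
    assert(!matchUnquotedNull("NUMBER,next", pos, prefix) && prefix == 2);  // "NU" was consumed and must be given back
}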
+ if (null_prefix_len <= istr.offset()) + { + istr.position() -= null_prefix_len; + null_prefix_len = 0; + } + return false; + }; + + auto deserialize_nested = [&nested, &settings, &istr, &null_prefix_len] (IColumn & nested_column) + { + if (likely(!null_prefix_len)) + nested->deserializeTextCSV(nested_column, istr, settings); + else + { + /// Previous buffer position was not restored, + /// so we need to prepend extracted characters (rare case) + ReadBufferFromMemory prepend(null_literal, null_prefix_len); + ConcatReadBuffer buf(prepend, istr); + nested->deserializeTextCSV(nested_column, buf, settings); + + /// Check if all extracted characters were read by nested parser and update buffer position + if (null_prefix_len < buf.count()) + istr.position() = buf.position(); + else if (null_prefix_len > buf.count()) + { + /// It can happen only if there is an unquoted string instead of a number + /// or if someone uses 'U' or 'L' as delimiter in CSV. + /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. + if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L') + throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly " + "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); + WriteBufferFromOwnString parsed_value; + nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); + throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len) + + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable" + + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(null_literal, buf.count()) + + "\", which was deserialized as \"" + + parsed_value.str() + "\". It seems that input data is ill-formatted.", + ErrorCodes::CANNOT_READ_ALL_DATA); + } + } + }; + + return safeDeserialize(column, *nested, check_for_null, deserialize_nested); +} + +void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + /// In simple text format (like 'Pretty' format) (these formats are suitable only for output and cannot be parsed back), + /// data is printed without escaping. + /// It makes theoretically impossible to distinguish between NULL and some string value, regardless on how do we print NULL. + /// For this reason, we output NULL in a bit strange way. + /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange. 
+ + if (col.isNullAt(row_num)) + { + if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) + writeCString("ᴺᵁᴸᴸ", ostr); + else + writeCString("NULL", ostr); + } + else + nested->serializeText(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeCString("null", ostr); + else + nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return safeDeserialize(column, *nested, + [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); }); +} + +void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeCString("\\N", ostr); + else + nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings); +} + +template bool SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + +} diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h new file mode 100644 index 00000000000..b0b96c021d3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationNullable : public ISerialization +{ +private: + SerializationPtr nested; + +public: + SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {} + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & 
settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + /** It is questionable, how NULL values could be represented in CSV. There are three variants: + * 1. \N + * 2. empty string (without quotes) + * 3. NULL + * We support all of them (however, second variant is supported by CSVRowInputStream, not by deserializeTextCSV). + * (see also input_format_defaults_for_omitted_fields and input_format_csv_unquoted_null_literal_as_null settings) + * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
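    * For example, the unquoted field \N is parsed as NULL, and with input_format_csv_unquoted_null_literal_as_null
    * enabled an unquoted NULL field is parsed as NULL as well, while a quoted "\N" field is parsed as the
    * ordinary two-character string (which is why such strings must be quoted).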
+ */ + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) + /// If ReturnType is void, deserialize Nullable(T) + template + static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template + static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template + static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + template + static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template + static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +}; + +} diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp new file mode 100644 index 00000000000..b0a91b11716 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNumber.cpp @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +template +void SerializationNumber::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeText(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationNumber::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + + if constexpr (is_integer_v && is_arithmetic_v) + readIntTextUnsafe(x, istr); + else + readText(x, istr); + + assert_cast &>(column).getData().push_back(x); +} + +template +static inline void writeDenormalNumber(T x, WriteBuffer & ostr) +{ + if constexpr (std::is_floating_point_v) + { + if (std::signbit(x)) + { + if (isNaN(x)) + writeCString("-nan", ostr); + else + writeCString("-inf", ostr); + } + else + { + if (isNaN(x)) + writeCString("nan", ostr); + else + writeCString("inf", ostr); + } + } + else + { + /// This function is not called for non floating point numbers. 
+ (void)x; + } +} + + +template +void SerializationNumber::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto x = assert_cast &>(column).getData()[row_num]; + bool is_finite = isFinite(x); + + const bool need_quote = (is_integer_v && (sizeof(T) >= 8) && settings.json.quote_64bit_integers) + || (settings.json.quote_denormals && !is_finite); + + if (need_quote) + writeChar('"', ostr); + + if (is_finite) + writeText(x, ostr); + else if (!settings.json.quote_denormals) + writeCString("null", ostr); + else + writeDenormalNumber(x, ostr); + + if (need_quote) + writeChar('"', ostr); +} + +template +void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + bool has_quote = false; + if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. + { + has_quote = true; + ++istr.position(); + } + + FieldType x; + + /// null + if (!has_quote && !istr.eof() && *istr.position() == 'n') + { + ++istr.position(); + assertString("ull", istr); + + x = NaNOrZero(); + } + else + { + static constexpr bool is_uint8 = std::is_same_v; + static constexpr bool is_int8 = std::is_same_v; + + if (is_uint8 || is_int8) + { + // extra conditions to parse true/false strings into 1/0 + if (istr.eof()) + throwReadAfterEOF(); + if (*istr.position() == 't' || *istr.position() == 'f') + { + bool tmp = false; + readBoolTextWord(tmp, istr); + x = tmp; + } + else + readText(x, istr); + } + else + { + readText(x, istr); + } + + if (has_quote) + assertChar('"', istr); + } + + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationNumber::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + FieldType x; + readCSV(x, istr); + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationNumber::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + /// ColumnVector::ValueType is a narrower type. 
For example, UInt8, when the Field type is UInt64 + typename ColumnVector::ValueType x = get(field); + writeBinary(x, ostr); +} + +template +void SerializationNumber::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + typename ColumnVector::ValueType x; + readBinary(x, istr); + field = NearestFieldType(x); +} + +template +void SerializationNumber::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + writeBinary(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationNumber::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + typename ColumnVector::ValueType x; + readBinary(x, istr); + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationNumber::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&x[offset]), sizeof(typename ColumnVector::ValueType) * limit); +} + +template +void SerializationNumber::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(typename ColumnVector::ValueType) * limit); + x.resize(initial_size + size / sizeof(typename ColumnVector::ValueType)); +} + +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; // base for UUID +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; +template class SerializationNumber; + +} diff --git a/src/DataTypes/Serializations/SerializationNumber.h b/src/DataTypes/Serializations/SerializationNumber.h new file mode 100644 index 00000000000..09976a4bc4f --- /dev/null +++ b/src/DataTypes/Serializations/SerializationNumber.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +namespace DB +{ + +template +class SerializationNumber : public SimpleTextSerialization +{ + static_assert(IsNumber); + +public: + using FieldType = T; + using ColumnType = ColumnVector; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + /** Format is platform-dependent. 
*/ + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp new file mode 100644 index 00000000000..c3c24ed6749 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -0,0 +1,300 @@ +#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include + +#ifdef __SSE2__ + #include +#endif + +namespace DB +{ + +void SerializationString::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const String & s = get(field); + writeVarUInt(s.size(), ostr); + writeString(s, ostr); +} + + +void SerializationString::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + UInt64 size; + readVarUInt(size, istr); + field = String(); + String & s = get(field); + s.resize(size); + istr.readStrict(s.data(), size); +} + + +void SerializationString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + const StringRef & s = assert_cast(column).getDataAt(row_num); + writeVarUInt(s.size, ostr); + writeString(s, ostr); +} + + +void SerializationString::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + ColumnString & column_string = assert_cast(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + + UInt64 size; + readVarUInt(size, istr); + + size_t old_chars_size = data.size(); + size_t offset = old_chars_size + size + 1; + offsets.push_back(offset); + + try + { + data.resize(offset); + istr.readStrict(reinterpret_cast(&data[offset - size - 1]), size); + data.back() = 0; + } + catch (...) + { + offsets.pop_back(); + data.resize_assume_reserved(old_chars_size); + throw; + } +} + + +void SerializationString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnString & column_string = typeid_cast(column); + const ColumnString::Chars & data = column_string.getChars(); + const ColumnString::Offsets & offsets = column_string.getOffsets(); + + size_t size = column.size(); + if (!size) + return; + + size_t end = limit && offset + limit < size + ? 
offset + limit + : size; + + if (offset == 0) + { + UInt64 str_size = offsets[0] - 1; + writeVarUInt(str_size, ostr); + ostr.write(reinterpret_cast(data.data()), str_size); + + ++offset; + } + + for (size_t i = offset; i < end; ++i) + { + UInt64 str_size = offsets[i] - offsets[i - 1] - 1; + writeVarUInt(str_size, ostr); + ostr.write(reinterpret_cast(&data[offsets[i - 1]]), str_size); + } +} + + +template +static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit) +{ + size_t offset = data.size(); + for (size_t i = 0; i < limit; ++i) + { + if (istr.eof()) + break; + + UInt64 size; + readVarUInt(size, istr); + + offset += size + 1; + offsets.push_back(offset); + + data.resize(offset); + + if (size) + { +#ifdef __SSE2__ + /// An optimistic branch in which more efficient copying is possible. + if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end()) + { + const __m128i * sse_src_pos = reinterpret_cast(istr.position()); + const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES; + __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]); + + while (sse_src_pos < sse_src_end) + { + for (size_t j = 0; j < UNROLL_TIMES; ++j) + _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j)); + + sse_src_pos += UNROLL_TIMES; + sse_dst_pos += UNROLL_TIMES; + } + + istr.position() += size; + } + else +#endif + { + istr.readStrict(reinterpret_cast(&data[offset - size - 1]), size); + } + } + + data[offset - 1] = 0; + } +} + + +void SerializationString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const +{ + ColumnString & column_string = typeid_cast(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + + double avg_chars_size = 1; /// By default reserve only for empty strings. + + if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0])) + { + /// Randomly selected. + constexpr auto avg_value_size_hint_reserve_multiplier = 1.2; + + avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier; + } + + size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size); + + /// Never reserve for too big size. 
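A minimal standalone sketch of the reservation heuristic above, with a hypothetical avg_value_size_hint and row limit, and assuming 8-byte offsets (sizeof(offsets[0]) == 8); it illustrates the arithmetic only and is not the actual ColumnString code:

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>

int main()
{
    double avg_value_size_hint = 40; /// hypothetical average value size observed by previous reads
    size_t limit = 100000;           /// number of rows requested by this bulk read

    double avg_chars_size = 1;       /// by default reserve only for empty strings
    if (avg_value_size_hint > 8)     /// the hint must cover more than the 8-byte offset itself
        avg_chars_size = (avg_value_size_hint - 8) * 1.2;

    const auto size_to_reserve = static_cast<size_t>(std::ceil(limit * avg_chars_size));
    if (size_to_reserve < 256 * 1024 * 1024) /// never pre-reserve 256 MiB or more in one go
        std::cout << "reserve about " << size_to_reserve << " extra bytes for chars\n"; /// 3840000 here

    /// The same estimate selects the unroll factor of the SSE2 copy loop (4, 3, 2 or 1).
    const int unroll = avg_chars_size >= 64 ? 4 : avg_chars_size >= 48 ? 3 : avg_chars_size >= 32 ? 2 : 1;
    std::cout << "SSE2 unroll factor: " << unroll << "\n"; /// 2 here
}
```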
+ if (size_to_reserve < 256 * 1024 * 1024) + { + try + { + data.reserve(size_to_reserve); + } + catch (Exception & e) + { + e.addMessage( + "(avg_value_size_hint = " + toString(avg_value_size_hint) + + ", avg_chars_size = " + toString(avg_chars_size) + + ", limit = " + toString(limit) + ")"); + throw; + } + } + + offsets.reserve(offsets.size() + limit); + + if (avg_chars_size >= 64) + deserializeBinarySSE2<4>(data, offsets, istr, limit); + else if (avg_chars_size >= 48) + deserializeBinarySSE2<3>(data, offsets, istr, limit); + else if (avg_chars_size >= 32) + deserializeBinarySSE2<2>(data, offsets, istr, limit); + else + deserializeBinarySSE2<1>(data, offsets, istr, limit); +} + + +void SerializationString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(assert_cast(column).getDataAt(row_num), ostr); +} + + +void SerializationString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(assert_cast(column).getDataAt(row_num), ostr); +} + + +template +static inline void read(IColumn & column, Reader && reader) +{ + ColumnString & column_string = assert_cast(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + size_t old_chars_size = data.size(); + size_t old_offsets_size = offsets.size(); + try + { + reader(data); + data.push_back(0); + offsets.push_back(data.size()); + } + catch (...) + { + offsets.resize_assume_reserved(old_offsets_size); + data.resize_assume_reserved(old_chars_size); + throw; + } +} + + +void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); }); +} + + +void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); +} + + +void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); +} + + +void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); +} + + +void SerializationString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(assert_cast(column).getDataAt(row_num), ostr, settings); +} + + +void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); +} + + +void SerializationString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(assert_cast(column).getDataAt(row_num), ostr); +} + + +void SerializationString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSVString<>(assert_cast(column).getDataAt(row_num), ostr); +} + + +void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + read(column, [&](ColumnString::Chars & data) { 
readCSVStringInto(data, istr, settings.csv); }); +} + + +} diff --git a/src/DataTypes/Serializations/SerializationString.h b/src/DataTypes/Serializations/SerializationString.h new file mode 100644 index 00000000000..ee5de2c18f1 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationString.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationString final : public ISerialization +{ +public: + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp new file mode 100644 index 00000000000..bdeea80477e --- /dev/null +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -0,0 +1,408 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; + extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int LOGICAL_ERROR; +} + + +static inline IColumn & extractElementColumn(IColumn & column, size_t idx) +{ + return assert_cast(column).getColumn(idx); +} + +static inline const IColumn & extractElementColumn(const IColumn & column, size_t idx) +{ + return assert_cast(column).getColumn(idx); +} + +void SerializationTuple::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + const auto & tuple = get(field); + for (const auto idx_elem : ext::enumerate(elems)) + idx_elem.second->serializeBinary(tuple[idx_elem.first], ostr); +} + +void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + const size_t size = elems.size(); + + Tuple tuple(size); + for (const auto i : 
ext::range(0, size)) + elems[i]->deserializeBinary(tuple[i], istr); + + field = tuple; +} + +void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + for (const auto idx_elem : ext::enumerate(elems)) + idx_elem.second->serializeBinary(extractElementColumn(column, idx_elem.first), row_num, ostr); +} + + +template +static void addElementSafe(size_t num_elems, IColumn & column, F && impl) +{ + /// We use the assumption that tuples of zero size do not exist. + size_t old_size = column.size(); + + try + { + impl(); + + // Check that all columns now have the same size. + size_t new_size = column.size(); + for (auto i : ext::range(1, num_elems)) + { + const auto & element_column = extractElementColumn(column, i); + if (element_column.size() != new_size) + { + // This is not a logical error because it may work with + // user-supplied data. + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + } + } + } + catch (...) + { + for (const auto & i : ext::range(0, num_elems)) + { + auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) + element_column.popBack(1); + } + + throw; + } +} + +void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + addElementSafe(elems.size(), column, [&] + { + for (const auto & i : ext::range(0, ext::size(elems))) + elems[i]->deserializeBinary(extractElementColumn(column, i), istr); + }); +} + +void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('(', ostr); + for (const auto i : ext::range(0, ext::size(elems))) + { + if (i != 0) + writeChar(',', ostr); + elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar(')', ostr); +} + +void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + const size_t size = elems.size(); + assertChar('(', istr); + + addElementSafe(elems.size(), column, [&] + { + for (const auto i : ext::range(0, size)) + { + skipWhitespaceIfAny(istr); + if (i != 0) + { + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); + } + }); + + // Special format for one element tuple (1,) + if (1 == elems.size()) + { + skipWhitespaceIfAny(istr); + // Allow both (1) and (1,) + checkChar(',', istr); + } + skipWhitespaceIfAny(istr); + assertChar(')', istr); +} + +void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + if (settings.json.named_tuples_as_objects + && have_explicit_names) + { + writeChar('{', ostr); + for (const auto i : ext::range(0, ext::size(elems))) + { + if (i != 0) + { + writeChar(',', ostr); + } + writeJSONString(elems[i]->getElementName(), ostr, settings); + writeChar(':', ostr); + elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar('}', ostr); + } + else + { + writeChar('[', ostr); + for (const auto i : ext::range(0, ext::size(elems))) + { + if (i != 0) + writeChar(',', ostr); + elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar(']', ostr); + } +} + +void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & 
settings) const +{ + if (settings.json.named_tuples_as_objects + && have_explicit_names) + { + skipWhitespaceIfAny(istr); + assertChar('{', istr); + skipWhitespaceIfAny(istr); + + addElementSafe(elems.size(), column, [&] + { + // Require all elements but in arbitrary order. + for (auto i : ext::range(0, ext::size(elems))) + { + if (i > 0) + { + skipWhitespaceIfAny(istr); + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + + std::string name; + readDoubleQuotedString(name, istr); + skipWhitespaceIfAny(istr); + assertChar(':', istr); + skipWhitespaceIfAny(istr); + + const size_t element_pos = getPositionByName(name); + auto & element_column = extractElementColumn(column, element_pos); + elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + } + }); + + skipWhitespaceIfAny(istr); + assertChar('}', istr); + } + else + { + const size_t size = elems.size(); + assertChar('[', istr); + + addElementSafe(elems.size(), column, [&] + { + for (const auto i : ext::range(0, size)) + { + skipWhitespaceIfAny(istr); + if (i != 0) + { + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); + } + }); + + skipWhitespaceIfAny(istr); + assertChar(']', istr); + } +} + +void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeCString("", ostr); + for (const auto i : ext::range(0, ext::size(elems))) + { + writeCString("", ostr); + elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings); + writeCString("", ostr); + } + writeCString("", ostr); +} + +void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + for (const auto i : ext::range(0, ext::size(elems))) + { + if (i != 0) + writeChar(',', ostr); + elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings); + } +} + +void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (const auto i : ext::range(0, size)) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + assertChar(settings.csv.delimiter, istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); + } + }); +} + +void SerializationTuple::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + for (const auto & elem : elems) + elem->enumerateStreams(callback, path); +} + +struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + +static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(ISerialization::SerializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); + + auto * tuple_state = typeid_cast(state.get()); + if (!tuple_state) + { + auto & state_ref = *state; + throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. 
Expected: " + + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return tuple_state; +} + +static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(ISerialization::DeserializeBinaryBulkStatePtr & state) +{ + if (!state) + throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR); + + auto * tuple_state = typeid_cast(state.get()); + if (!tuple_state) + { + auto & state_ref = *state; + throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: " + + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got " + + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR); + } + + return tuple_state; +} + +void SerializationTuple::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = checkAndGetTupleSerializeState(state); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]); +} + +void SerializationTuple::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = checkAndGetTupleSerializeState(state); + + for (const auto i : ext::range(0, ext::size(elems))) + { + const auto & element_col = extractElementColumn(column, i); + elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]); + } +} + +void SerializationTuple::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * tuple_state = checkAndGetTupleDeserializeState(state); + + auto mutable_column = column->assumeMutable(); + auto & column_tuple = assert_cast(*mutable_column); + + settings.avg_value_size_hint = 0; + for (const auto i : ext::range(0, ext::size(elems))) + elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache); +} + +size_t SerializationTuple::getPositionByName(const String & name) const +{ + size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + if (elems[i]->getElementName() == name) + return i; + throw Exception("Tuple doesn't have element with name '" + name + "'", ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); +} + +} diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h new file mode 100644 index 00000000000..13668572fff 
--- /dev/null +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationTuple final : public SimpleTextSerialization +{ +public: + using ElementSerializationPtr = std::shared_ptr; + using ElementSerializations = std::vector; + + SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_) + : elems(elems_), have_explicit_names(have_explicit_names_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Each sub-column in a tuple is serialized in separate stream. 
+ */ + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + ElementSerializations elems; + bool have_explicit_names; + + size_t getPositionByName(const String & name) const; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationTupleElement.cpp b/src/DataTypes/Serializations/SerializationTupleElement.cpp new file mode 100644 index 00000000000..4b50810fcd6 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationTupleElement.cpp @@ -0,0 +1,73 @@ +#include + +namespace DB +{ + +void SerializationTupleElement::enumerateStreams( + const StreamCallback & callback, + SubstreamPath & path) const +{ + addToPath(path); + nested_serialization->enumerateStreams(callback, path); + path.pop_back(); +} + +void SerializationTupleElement::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + +void SerializationTupleElement::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + +void SerializationTupleElement::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + +void SerializationTupleElement::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); + settings.path.pop_back(); +} + +void SerializationTupleElement::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); + settings.path.pop_back(); +} + +void SerializationTupleElement::addToPath(SubstreamPath & path) const +{ + path.push_back(Substream::TupleElement); + path.back().tuple_element_name = name; + path.back().escape_tuple_delimiter = escape_delimiter; +} + +} diff --git 
a/src/DataTypes/Serializations/SerializationTupleElement.h b/src/DataTypes/Serializations/SerializationTupleElement.h new file mode 100644 index 00000000000..b85014c9e64 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationTupleElement.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationTupleElement final : public SerializationWrapper +{ +private: + String name; + bool escape_delimiter; + +public: + SerializationTupleElement(const SerializationPtr & nested_, const String & name_, bool escape_delimiter_ = true) + : SerializationWrapper(nested_) + , name(name_), escape_delimiter(escape_delimiter_) + { + } + + const String & getElementName() const { return name; } + + void enumerateStreams( + const StreamCallback & callback, + SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + void addToPath(SubstreamPath & path) const; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp new file mode 100644 index 00000000000..1a0640a5e69 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +void SerializationUUID::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeText(UUID(assert_cast(column).getData()[row_num]), ostr); +} + +void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + readText(x, istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeText(column, istr, settings); +} + +void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + assertChar('\'', istr); + readText(x, istr); + assertChar('\'', istr); + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
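The comment above states the exception-safety convention these serializations follow (compare the read() helper in SerializationString.cpp and addElementSafe() in SerializationTuple.cpp): a column may only grow after the value has been fully parsed, and any partial growth is rolled back before rethrowing. A minimal sketch of that pattern, using a plain std::vector as a hypothetical stand-in for a column's chars buffer:

```cpp
#include <cstddef>
#include <vector>

/// Let `reader` append raw bytes to `data`; if it throws halfway through,
/// restore the previous size so the "column" stays consistent.
template <typename Reader>
void readSafely(std::vector<char> & data, Reader && reader)
{
    const size_t old_size = data.size();
    try
    {
        reader(data);         /// may grow `data` incrementally and throw in the middle
        data.push_back('\0'); /// finalize the value only after the reader succeeded
    }
    catch (...)
    {
        data.resize(old_size); /// roll back partial growth, then rethrow
        throw;
    }
}
```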
+} + +void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + assertChar('"', istr); + readText(x, istr); + assertChar('"', istr); + assert_cast(column).getData().push_back(x); +} + +void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID value; + readCSV(value, istr); + assert_cast(column).getData().push_back(value); +} + +} diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h new file mode 100644 index 00000000000..93bf166bbd9 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace DB +{ + +class SerializationUUID : public SerializationNumber +{ +public: + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp new file mode 100644 index 00000000000..f75c9a1dd8b --- /dev/null +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -0,0 +1,140 @@ +#include +#include + +namespace DB +{ + +void SerializationWrapper::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +{ + nested_serialization->enumerateStreams(callback, path); +} + +void SerializationWrapper::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested_serialization->serializeBinaryBulkStatePrefix(settings, state); +} + +void SerializationWrapper::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested_serialization->serializeBinaryBulkStateSuffix(settings, state); +} + +void SerializationWrapper::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + 
DeserializeBinaryBulkStatePtr & state) const +{ + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); +} + +void SerializationWrapper::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + + nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); +} + +void SerializationWrapper::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + + nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); +} + +void SerializationWrapper::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + nested_serialization->serializeBinaryBulk(column, ostr, offset, limit); +} + +void SerializationWrapper::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const +{ + nested_serialization->deserializeBinaryBulk(column, istr, limit, avg_value_size_hint); +} + +void SerializationWrapper::serializeBinary(const Field & field, WriteBuffer & ostr) const +{ + nested_serialization->serializeBinary(field, ostr); +} + +void SerializationWrapper::deserializeBinary(Field & field, ReadBuffer & istr) const +{ + nested_serialization->deserializeBinary(field, istr); +} + +void SerializationWrapper::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + nested_serialization->serializeBinary(column, row_num, ostr); +} + +void SerializationWrapper::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + nested_serialization->deserializeBinary(column, istr); +} + +void SerializationWrapper::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextEscaped(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextEscaped(column, istr, settings); +} + +void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextQuoted(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextQuoted(column, istr, settings); +} + +void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextCSV(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextCSV(column, istr, settings); +} + +void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeText(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + 
nested_serialization->deserializeWholeText(column, istr, settings); +} + +void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextJSON(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextJSON(column, istr, settings); +} + +void SerializationWrapper::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextXML(column, row_num, ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h new file mode 100644 index 00000000000..399d3b198b3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Wrapper for serialization, which calls methods, which are not overridden, from nested serialization. +/// You can inherit this class, when you need to override bunch of methods, to avoid boilerplate code. +class SerializationWrapper : public ISerialization +{ +protected: + SerializationPtr nested_serialization; + +public: + SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {} + + void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; +}; + +} diff --git a/src/DataTypes/DataTypeWithSimpleSerialization.h b/src/DataTypes/Serializations/SimpleTextSerialization.h similarity index 87% rename from src/DataTypes/DataTypeWithSimpleSerialization.h rename to src/DataTypes/Serializations/SimpleTextSerialization.h index 6f6120deb4f..04c5b190203 100644 --- a/src/DataTypes/DataTypeWithSimpleSerialization.h +++ b/src/DataTypes/Serializations/SimpleTextSerialization.h @@ -1,16 +1,15 @@ #pragma once -#include +#include namespace DB { -/// Helper class to define same IDataType text (de)serialization for all the variants (escaped, quoted, JSON, CSV). +/// Helper class to define same ISerialization text (de)serialization for all the variants (escaped, quoted, JSON, CSV). /// You need to define serializeText() and deserializeText() in derived class. -class DataTypeWithSimpleSerialization : public IDataType +class SimpleTextSerialization : public ISerialization { protected: - DataTypeWithSimpleSerialization() - {} + SimpleTextSerialization() = default; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override { diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index da0e2ed8ce4..6710313349b 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -162,6 +163,36 @@ DataTypePtr getLeastSupertype(const DataTypes & types) } } + /// For maps + { + bool have_maps = false; + bool all_maps = true; + DataTypes key_types; + DataTypes value_types; + key_types.reserve(types.size()); + value_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeMap * type_map = typeid_cast(type.get())) + { + have_maps = true; + key_types.emplace_back(type_map->getKeyType()); + value_types.emplace_back(type_map->getValueType()); + } + else + all_maps = false; + } + + if (have_maps) + { + if (!all_maps) + throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); + } + } + /// For LowCardinality. This is above Nullable, because LowCardinality can contain Nullable but cannot be inside Nullable. 
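A small sketch of the rule the new Map branch of getLeastSupertype encodes, using a hypothetical toy Type in place of DataTypePtr; the essential point is the recursion into the key and value supertypes:

```cpp
#include <functional>
#include <stdexcept>
#include <vector>

struct Type { bool is_map = false; std::vector<Type> args; /* {key, value} when is_map */ };

/// Either none of the types is a Map, or all of them are; the common Map type is built
/// from the common key type and the common value type.
Type leastSupertypeOfMaps(const std::vector<Type> & types,
                          const std::function<Type(const std::vector<Type> &)> & least_supertype)
{
    std::vector<Type> key_types, value_types;
    for (const auto & type : types)
    {
        if (!type.is_map)
            throw std::runtime_error("no common type: some of the types are Maps and some are not");
        key_types.push_back(type.args[0]);
        value_types.push_back(type.args[1]);
    }
    return Type{true, {least_supertype(key_types), least_supertype(value_types)}};
}
```

For instance, the common type of Map(UInt8, String) and Map(UInt64, String) is Map(UInt64, String), while mixing a Map with a non-Map type raises NO_COMMON_TYPE, exactly as in the branch above.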
{ bool have_low_cardinality = false; diff --git a/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp b/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp index 48e2f0d80a0..9d8c32c92b6 100644 --- a/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp +++ b/src/DataTypes/tests/gtest_DataType_deserializeAsText.cpp @@ -67,7 +67,7 @@ TEST_P(ParseDataTypeTest, parseStringValue) for (const auto & value : p.values) { ReadBuffer buffer(const_cast(value.data()), value.size(), 0); - data_type->deserializeAsWholeText(*col, buffer, FormatSettings{}); + data_type->getDefaultSerialization()->deserializeWholeText(*col, buffer, FormatSettings{}); } ASSERT_EQ(p.expected_values.size(), col->size()) << "Actual items: " << *col; diff --git a/src/DataTypes/ya.make b/src/DataTypes/ya.make index 356424af8dd..e7294c298e5 100644 --- a/src/DataTypes/ya.make +++ b/src/DataTypes/ya.make @@ -15,7 +15,6 @@ SRCS( DataTypeCustomGeo.cpp DataTypeCustomIPv4AndIPv6.cpp DataTypeCustomSimpleAggregateFunction.cpp - DataTypeCustomSimpleTextSerialization.cpp DataTypeDate.cpp DataTypeDateTime.cpp DataTypeDateTime64.cpp @@ -32,15 +31,37 @@ SRCS( DataTypeNothing.cpp DataTypeNullable.cpp DataTypeNumberBase.cpp - DataTypeOneElementTuple.cpp DataTypeString.cpp DataTypeTuple.cpp DataTypeUUID.cpp DataTypesDecimal.cpp DataTypesNumber.cpp + EnumValues.cpp FieldToDataType.cpp IDataType.cpp NestedUtils.cpp + Serializations/ISerialization.cpp + Serializations/SerializationAggregateFunction.cpp + Serializations/SerializationArray.cpp + Serializations/SerializationCustomSimpleText.cpp + Serializations/SerializationDate.cpp + Serializations/SerializationDateTime.cpp + Serializations/SerializationDateTime64.cpp + Serializations/SerializationDecimal.cpp + Serializations/SerializationDecimalBase.cpp + Serializations/SerializationEnum.cpp + Serializations/SerializationFixedString.cpp + Serializations/SerializationIP.cpp + Serializations/SerializationLowCardinality.cpp + Serializations/SerializationMap.cpp + Serializations/SerializationNothing.cpp + Serializations/SerializationNullable.cpp + Serializations/SerializationNumber.cpp + Serializations/SerializationString.cpp + Serializations/SerializationTuple.cpp + Serializations/SerializationTupleElement.cpp + Serializations/SerializationUUID.cpp + Serializations/SerializationWrapper.cpp convertMySQLDataType.cpp getLeastSupertype.cpp getMostSubtype.cpp diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 71e0effb2d2..6d564bc29a3 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -35,10 +35,10 @@ public: UUID uuid() const override { return table()->getStorageID().uuid; } }; -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, const Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, ContextPtr context_) : DatabaseOrdinary(name_, std::move(metadata_path_), "store/", logger_name, context_) - , path_to_table_symlinks(global_context.getPath() + "data/" + escapeForFileName(name_) + "/") - , path_to_metadata_symlink(global_context.getPath() + "metadata/" + escapeForFileName(name_)) + , path_to_table_symlinks(getContext()->getPath() + "data/" + escapeForFileName(name_) + "/") + , path_to_metadata_symlink(getContext()->getPath() + "metadata/" + escapeForFileName(name_)) , db_uuid(uuid) { assert(db_uuid != UUIDHelpers::Nil); @@ -46,7 +46,7 @@ DatabaseAtomic::DatabaseAtomic(String name_, String 
metadata_path_, UUID uuid, c tryCreateMetadataSymlink(); } -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, ContextPtr context_) : DatabaseAtomic(name_, std::move(metadata_path_), uuid, "DatabaseAtomic (" + name_ + ")", context_) { } @@ -68,7 +68,7 @@ String DatabaseAtomic::getTableDataPath(const ASTCreateQuery & query) const return tmp; } -void DatabaseAtomic::drop(const Context &) +void DatabaseAtomic::drop(ContextPtr) { assert(tables.empty()); try @@ -106,8 +106,15 @@ StoragePtr DatabaseAtomic::detachTable(const String & name) return table; } -void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay) +void DatabaseAtomic::dropTable(ContextPtr local_context, const String & table_name, bool no_delay) { + if (auto * mv = dynamic_cast(tryGetTable(table_name, local_context).get())) + { + /// Remove the inner table (if any) to avoid deadlock + /// (due to attempt to execute DROP from the worker thread) + mv->dropInnerTable(no_delay, local_context); + } + String table_metadata_path = getObjectMetadataPath(table_name); String table_metadata_path_drop; StoragePtr table; @@ -115,8 +122,8 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); - auto txn = context.getZooKeeperMetadataTransaction(); - if (txn && !context.isInternalSubquery()) + auto txn = local_context->getZooKeeperMetadataTransaction(); + if (txn && !local_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -131,16 +138,13 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam } if (table->storesDataOnDisk()) tryRemoveSymlink(table_name); - /// Remove the inner table (if any) to avoid deadlock - /// (due to attempt to execute DROP from the worker thread) - if (auto * mv = dynamic_cast(table.get())) - mv->dropInnerTable(no_delay, context); + /// Notify DatabaseCatalog that table was dropped. It will remove table data in background. /// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete. 
DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay); } -void DatabaseAtomic::renameTable(const Context & context, const String & table_name, IDatabase & to_database, +void DatabaseAtomic::renameTable(ContextPtr local_context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) { if (typeid(*this) != typeid(to_database)) @@ -148,7 +152,7 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n if (!typeid_cast(&to_database)) throw Exception("Moving tables between databases of different engines is not supported", ErrorCodes::NOT_IMPLEMENTED); /// Allow moving tables between Atomic and Ordinary (with table lock) - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); + DatabaseOnDisk::renameTable(local_context, table_name, to_database, to_table_name, exchange, dictionary); return; } @@ -244,8 +248,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here - auto txn = context.getZooKeeperMetadataTransaction(); - if (txn && !context.isInternalSubquery()) + auto txn = local_context->getZooKeeperMetadataTransaction(); + if (txn && !local_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -287,7 +291,7 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, - const Context & query_context) + ContextPtr query_context) { DetachedTables not_in_use; auto table_data_path = getTableDataPath(query); @@ -305,8 +309,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; - auto txn = query_context.getZooKeeperMetadataTransaction(); - if (txn && !query_context.isInternalSubquery()) + auto txn = query_context->getZooKeeperMetadataTransaction(); + if (txn && !query_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) 
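/// The hunks in this file follow the pattern applied throughout src/Databases in this change:
/// methods that took `const Context &` now take a `ContextPtr`, and the stored `global_context`
/// member gives way to `getContext()` (provided by the `WithContext` base, as in DatabaseDictionary
/// further down). A minimal sketch of the new calling convention, assuming the usual ClickHouse
/// headers; `inspectDatabasePath` is an illustrative name only, not a function from this diff:

#include <Interpreters/Context.h>

namespace DB
{

/// Previously: void inspectDatabasePath(const Context & context)
void inspectDatabasePath(ContextPtr local_context)
{
    /// Context members are now reached through '->' instead of '.'
    const auto & settings = local_context->getSettingsRef();
    const String metadata_dir = local_context->getPath() + "metadata/";

    if (settings.fsync_metadata && !metadata_dir.empty())
    {
        /// e.g. rewrite and fsync the table's .sql file, as DatabaseOrdinary::alterTable does below
    }
}

}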
@@ -329,7 +333,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora } void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, - const String & /*statement*/, const Context & query_context) + const String & /*statement*/, ContextPtr query_context) { bool check_file_exists = true; SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); }); @@ -340,8 +344,8 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - auto txn = query_context.getZooKeeperMetadataTransaction(); - if (txn && !query_context.isInternalSubquery()) + auto txn = query_context->getZooKeeperMetadataTransaction(); + if (txn && !query_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -361,8 +365,8 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid) /// 4. INSERT INTO table ...; (both Storage instances writes data without any synchronization) /// To avoid it, we remember UUIDs of detached tables and does not allow ATTACH table with such UUID until detached instance still in use. if (detached_tables.count(uuid)) - throw Exception("Cannot attach table with UUID " + toString(uuid) + - ", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS); + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Cannot attach table with UUID {}, " + "because it was detached but still used by some query. Retry later.", toString(uuid)); } void DatabaseAtomic::setDetachedTableNotInUseForce(const UUID & uuid) @@ -405,26 +409,27 @@ void DatabaseAtomic::assertCanBeDetached(bool cleanup) "because some tables are still in use. 
Retry later.", ErrorCodes::DATABASE_NOT_EMPTY); } -DatabaseTablesIteratorPtr DatabaseAtomic::getTablesIterator(const Context & context, const IDatabase::FilterByNameFunction & filter_by_table_name) +DatabaseTablesIteratorPtr +DatabaseAtomic::getTablesIterator(ContextPtr local_context, const IDatabase::FilterByNameFunction & filter_by_table_name) { - auto base_iter = DatabaseWithOwnTablesBase::getTablesIterator(context, filter_by_table_name); + auto base_iter = DatabaseWithOwnTablesBase::getTablesIterator(local_context, filter_by_table_name); return std::make_unique(std::move(typeid_cast(*base_iter))); } UUID DatabaseAtomic::tryGetTableUUID(const String & table_name) const { - if (auto table = tryGetTable(table_name, global_context)) + if (auto table = tryGetTable(table_name, getContext())) return table->getStorageID().uuid; return UUIDHelpers::Nil; } -void DatabaseAtomic::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) +void DatabaseAtomic::loadStoredObjects(ContextPtr local_context, bool has_force_restore_data_flag, bool force_attach) { /// Recreate symlinks to table data dirs in case of force restore, because some of them may be broken if (has_force_restore_data_flag) Poco::File(path_to_table_symlinks).remove(true); - DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag, force_attach); + DatabaseOrdinary::loadStoredObjects(local_context, has_force_restore_data_flag, force_attach); if (has_force_restore_data_flag) { @@ -445,7 +450,7 @@ void DatabaseAtomic::tryCreateSymlink(const String & table_name, const String & try { String link = path_to_table_symlinks + escapeForFileName(table_name); - Poco::File data = Poco::Path(global_context.getPath()).makeAbsolute().toString() + actual_data_path; + Poco::File data = Poco::Path(getContext()->getPath()).makeAbsolute().toString() + actual_data_path; if (!if_data_path_exist || data.exists()) data.linkTo(link, Poco::File::LINK_SYMBOLIC); } @@ -505,8 +510,8 @@ void DatabaseAtomic::renameDatabase(const String & new_name) } auto new_name_escaped = escapeForFileName(new_name); - auto old_database_metadata_path = global_context.getPath() + "metadata/" + escapeForFileName(getDatabaseName()) + ".sql"; - auto new_database_metadata_path = global_context.getPath() + "metadata/" + new_name_escaped + ".sql"; + auto old_database_metadata_path = getContext()->getPath() + "metadata/" + escapeForFileName(getDatabaseName()) + ".sql"; + auto new_database_metadata_path = getContext()->getPath() + "metadata/" + new_name_escaped + ".sql"; renameNoReplace(old_database_metadata_path, new_database_metadata_path); String old_path_to_table_symlinks; @@ -531,9 +536,9 @@ void DatabaseAtomic::renameDatabase(const String & new_name) renameDictionaryInMemoryUnlocked(old_name, name); } - path_to_metadata_symlink = global_context.getPath() + "metadata/" + new_name_escaped; + path_to_metadata_symlink = getContext()->getPath() + "metadata/" + new_name_escaped; old_path_to_table_symlinks = path_to_table_symlinks; - path_to_table_symlinks = global_context.getPath() + "data/" + new_name_escaped + "/"; + path_to_table_symlinks = getContext()->getPath() + "data/" + new_name_escaped + "/"; } Poco::File(old_path_to_table_symlinks).renameTo(path_to_table_symlinks); @@ -563,17 +568,11 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name auto result = external_loader.getLoadResult(toString(old_name.uuid)); if (!result.object) return; - const auto & dict = dynamic_cast(*result.object); + const auto & 
dict = dynamic_cast(*result.object); dict.updateDictionaryName(new_name); } void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) { - { - std::lock_guard lock{mutex}; - if (detached_tables.count(uuid) == 0) - return; - } - /// Table is in use while its shared_ptr counter is greater than 1. /// We cannot trigger condvar on shared_ptr destruction, so it's busy wait. while (true) @@ -589,5 +588,13 @@ void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) } } +void DatabaseAtomic::checkDetachedTableNotInUse(const UUID & uuid) +{ + DetachedTables not_in_use; + std::lock_guard lock{mutex}; + not_in_use = cleanupDetachedTables(); + assertDetachedTableNotInUse(uuid); +} + } diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 09cdf269b35..695d22360ca 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -19,8 +19,8 @@ namespace DB class DatabaseAtomic : public DatabaseOrdinary { public: - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, const Context & context_); - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, ContextPtr context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, ContextPtr context_); String getEngineName() const override { return "Atomic"; } UUID getUUID() const override { return db_uuid; } @@ -28,14 +28,14 @@ public: void renameDatabase(const String & new_name) override; void renameTable( - const Context & context, + ContextPtr context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; - void dropTable(const Context & context, const String & table_name, bool no_delay) override; + void dropTable(ContextPtr context, const String & table_name, bool no_delay) override; void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; StoragePtr detachTable(const String & name) override; @@ -43,11 +43,11 @@ public: String getTableDataPath(const String & table_name) const override; String getTableDataPath(const ASTCreateQuery & query) const override; - void drop(const Context & /*context*/) override; + void drop(ContextPtr /*context*/) override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; + void loadStoredObjects(ContextPtr context, bool has_force_restore_data_flag, bool force_attach) override; /// Atomic database cannot be detached if there is detached table which still in use void assertCanBeDetached(bool cleanup) override; @@ -58,15 +58,16 @@ public: void tryRemoveSymlink(const String & table_name); void waitDetachedTableNotInUse(const UUID & uuid) override; + void checkDetachedTableNotInUse(const UUID & uuid) override; void setDetachedTableNotInUseForce(const UUID & uuid); protected: - void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; + void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const 
String & table_metadata_path, const String & statement, ContextPtr query_context) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override; + const String & table_metadata_tmp_path, const String & table_metadata_path, ContextPtr query_context) override; void assertDetachedTableNotInUse(const UUID & uuid); - typedef std::unordered_map DetachedTables; + using DetachedTables = std::unordered_map; [[nodiscard]] DetachedTables cleanupDetachedTables(); void tryCreateMetadataSymlink(); @@ -74,7 +75,7 @@ protected: void renameDictionaryInMemoryUnlocked(const StorageID & old_name, const StorageID & new_name); //TODO store path in DatabaseWithOwnTables::tables - typedef std::unordered_map NameToPathMap; + using NameToPathMap = std::unordered_map; NameToPathMap table_name_to_path; DetachedTables detached_tables; diff --git a/src/Databases/DatabaseDictionary.cpp b/src/Databases/DatabaseDictionary.cpp index ff5510f0bf9..c00201145eb 100644 --- a/src/Databases/DatabaseDictionary.cpp +++ b/src/Databases/DatabaseDictionary.cpp @@ -44,17 +44,16 @@ namespace } } -DatabaseDictionary::DatabaseDictionary(const String & name_, const Context & context_) - : IDatabase(name_) +DatabaseDictionary::DatabaseDictionary(const String & name_, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()) , log(&Poco::Logger::get("DatabaseDictionary(" + database_name + ")")) - , global_context(context_.getGlobalContext()) { } Tables DatabaseDictionary::listTables(const FilterByNameFunction & filter_by_name) { Tables tables; - auto load_results = global_context.getExternalDictionariesLoader().getLoadResults(filter_by_name); + auto load_results = getContext()->getExternalDictionariesLoader().getLoadResults(filter_by_name); String db_name = getDatabaseName(); for (auto & load_result : load_results) { @@ -65,34 +64,34 @@ Tables DatabaseDictionary::listTables(const FilterByNameFunction & filter_by_nam return tables; } -bool DatabaseDictionary::isTableExist(const String & table_name, const Context &) const +bool DatabaseDictionary::isTableExist(const String & table_name, ContextPtr) const { - return global_context.getExternalDictionariesLoader().getCurrentStatus(table_name) != ExternalLoader::Status::NOT_EXIST; + return getContext()->getExternalDictionariesLoader().getCurrentStatus(table_name) != ExternalLoader::Status::NOT_EXIST; } -StoragePtr DatabaseDictionary::tryGetTable(const String & table_name, const Context &) const +StoragePtr DatabaseDictionary::tryGetTable(const String & table_name, ContextPtr) const { - auto load_result = global_context.getExternalDictionariesLoader().getLoadResult(table_name); + auto load_result = getContext()->getExternalDictionariesLoader().getLoadResult(table_name); return createStorageDictionary(getDatabaseName(), load_result); } -DatabaseTablesIteratorPtr DatabaseDictionary::getTablesIterator(const Context &, const FilterByNameFunction & filter_by_table_name) +DatabaseTablesIteratorPtr DatabaseDictionary::getTablesIterator(ContextPtr, const FilterByNameFunction & filter_by_table_name) { return std::make_unique(listTables(filter_by_table_name), getDatabaseName()); } bool DatabaseDictionary::empty() const { - return !global_context.getExternalDictionariesLoader().hasObjects(); + return !getContext()->getExternalDictionariesLoader().hasObjects(); } -ASTPtr DatabaseDictionary::getCreateTableQueryImpl(const String & table_name, 
const Context &, bool throw_on_error) const +ASTPtr DatabaseDictionary::getCreateTableQueryImpl(const String & table_name, ContextPtr, bool throw_on_error) const { String query; { WriteBufferFromString buffer(query); - auto load_result = global_context.getExternalDictionariesLoader().getLoadResult(table_name); + auto load_result = getContext()->getExternalDictionariesLoader().getLoadResult(table_name); if (!load_result.config) { if (throw_on_error) @@ -106,7 +105,7 @@ ASTPtr DatabaseDictionary::getCreateTableQueryImpl(const String & table_name, co buffer << ") Engine = Dictionary(" << backQuoteIfNeed(table_name) << ")"; } - auto settings = global_context.getSettingsRef(); + auto settings = getContext()->getSettingsRef(); ParserCreateQuery parser; const char * pos = query.data(); std::string error_message; @@ -126,7 +125,7 @@ ASTPtr DatabaseDictionary::getCreateDatabaseQuery() const WriteBufferFromString buffer(query); buffer << "CREATE DATABASE " << backQuoteIfNeed(getDatabaseName()) << " ENGINE = Dictionary"; } - auto settings = global_context.getSettingsRef(); + auto settings = getContext()->getSettingsRef(); ParserCreateQuery parser; return parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); } diff --git a/src/Databases/DatabaseDictionary.h b/src/Databases/DatabaseDictionary.h index 2cfc6ef3285..06402a96381 100644 --- a/src/Databases/DatabaseDictionary.h +++ b/src/Databases/DatabaseDictionary.h @@ -1,11 +1,12 @@ #pragma once -#include -#include #include #include #include +#include +#include + namespace Poco { @@ -19,21 +20,21 @@ namespace DB /* Database to store StorageDictionary tables * automatically creates tables for all dictionaries */ -class DatabaseDictionary final : public IDatabase +class DatabaseDictionary final : public IDatabase, WithContext { public: - DatabaseDictionary(const String & name_, const Context & context_); + DatabaseDictionary(const String & name_, ContextPtr context_); String getEngineName() const override { return "Dictionary"; } - bool isTableExist(const String & table_name, const Context & context) const override; + bool isTableExist(const String & table_name, ContextPtr context) const override; - StoragePtr tryGetTable(const String & table_name, const Context & context) const override; + StoragePtr tryGetTable(const String & table_name, ContextPtr context) const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; bool empty() const override; @@ -44,11 +45,10 @@ public: void shutdown() override; protected: - ASTPtr getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const override; + ASTPtr getCreateTableQueryImpl(const String & table_name, ContextPtr context, bool throw_on_error) const override; private: Poco::Logger * log; - const Context & global_context; Tables listTables(const FilterByNameFunction & filter_by_name); }; diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index cd0143556c9..9d09ac731d2 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -30,13 +30,14 @@ #endif #if USE_MYSQL || USE_LIBPQXX +#include #include #include #endif #if USE_LIBPQXX #include // Y_IGNORE -#include +#include #endif namespace DB @@ -50,7 +51,7 @@ namespace ErrorCodes extern const int 
CANNOT_CREATE_DATABASE; } -DatabasePtr DatabaseFactory::get(const ASTCreateQuery & create, const String & metadata_path, Context & context) +DatabasePtr DatabaseFactory::get(const ASTCreateQuery & create, const String & metadata_path, ContextPtr context) { bool created = false; @@ -65,8 +66,8 @@ DatabasePtr DatabaseFactory::get(const ASTCreateQuery & create, const String & m DatabasePtr impl = getImpl(create, metadata_path, context); - if (impl && context.hasQueryContext() && context.getSettingsRef().log_queries) - context.getQueryContext().addQueryFactoriesInfo(Context::QueryLogFactories::Database, impl->getEngineName()); + if (impl && context->hasQueryContext() && context->getSettingsRef().log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Database, impl->getEngineName()); return impl; @@ -91,7 +92,7 @@ static inline ValueType safeGetLiteralValue(const ASTPtr &ast, const String &eng return ast->as()->value.safeGet(); } -DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String & metadata_path, Context & context) +DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String & metadata_path, ContextPtr context) { auto * engine_define = create.storage; const String & database_name = create.database; @@ -133,19 +134,20 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String ASTs & arguments = engine->arguments->children; arguments[1] = evaluateConstantExpressionOrIdentifierAsLiteral(arguments[1], context); - const auto & host_name_and_port = safeGetLiteralValue(arguments[0], engine_name); + const auto & host_port = safeGetLiteralValue(arguments[0], engine_name); const auto & mysql_database_name = safeGetLiteralValue(arguments[1], engine_name); const auto & mysql_user_name = safeGetLiteralValue(arguments[2], engine_name); const auto & mysql_user_password = safeGetLiteralValue(arguments[3], engine_name); try { - const auto & [remote_host_name, remote_port] = parseAddress(host_name_and_port, 3306); - auto mysql_pool = mysqlxx::Pool(mysql_database_name, remote_host_name, mysql_user_name, mysql_user_password, remote_port); - if (engine_name == "MySQL") { auto mysql_database_settings = std::make_unique(); + /// Split into replicas if needed. 
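/// Illustration only, not part of this diff: for the MySQL (and, below, PostgreSQL) database
/// engines, the host argument is no longer parsed as a single host:port with parseAddress().
/// It is expanded by parseRemoteDescriptionForExternalDatabase() into up to
/// glob_expansion_max_elements replica addresses, which are handed to mysqlxx::PoolWithFailover.
/// A description such as 'mysql-host-{1..2}:3306' presumably yields two replicas; the exact
/// expansion syntax is an assumption here, not something stated in this diff.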
+ size_t max_addresses = context->getSettingsRef().glob_expansion_max_elements; + auto addresses = parseRemoteDescriptionForExternalDatabase(host_port, max_addresses, 3306); + auto mysql_pool = mysqlxx::PoolWithFailover(mysql_database_name, addresses, mysql_user_name, mysql_user_password); mysql_database_settings->loadFromQueryContext(context); mysql_database_settings->loadFromQuery(*engine_define); /// higher priority @@ -154,7 +156,10 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String context, database_name, metadata_path, engine_define, mysql_database_name, std::move(mysql_database_settings), std::move(mysql_pool)); } + const auto & [remote_host_name, remote_port] = parseAddress(host_port, 3306); MySQLClient client(remote_host_name, remote_port, mysql_user_name, mysql_user_password); + auto mysql_pool = mysqlxx::Pool(mysql_database_name, remote_host_name, mysql_user_name, mysql_user_password, remote_port); + auto materialize_mode_settings = std::make_unique(); @@ -204,9 +209,9 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String String shard_name = safeGetLiteralValue(arguments[1], "Replicated"); String replica_name = safeGetLiteralValue(arguments[2], "Replicated"); - zookeeper_path = context.getMacros()->expand(zookeeper_path); - shard_name = context.getMacros()->expand(shard_name); - replica_name = context.getMacros()->expand(replica_name); + zookeeper_path = context->getMacros()->expand(zookeeper_path); + shard_name = context->getMacros()->expand(shard_name); + replica_name = context->getMacros()->expand(replica_name); DatabaseReplicatedSettings database_replicated_settings{}; if (engine_define->settings) @@ -243,14 +248,20 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine->arguments->children.size() == 5) use_table_cache = safeGetLiteralValue(engine_args[4], engine_name); - auto parsed_host_port = parseAddress(host_port, 5432); + /// Split into replicas if needed. 
+ size_t max_addresses = context->getSettingsRef().glob_expansion_max_elements; + auto addresses = parseRemoteDescriptionForExternalDatabase(host_port, max_addresses, 5432); /// no connection is made here - auto connection = std::make_shared( - postgres_database_name, parsed_host_port.first, parsed_host_port.second, username, password); + auto connection_pool = std::make_shared( + postgres_database_name, + addresses, + username, password, + context->getSettingsRef().postgresql_connection_pool_size, + context->getSettingsRef().postgresql_connection_pool_wait_timeout); return std::make_shared( - context, metadata_path, engine_define, database_name, postgres_database_name, connection, use_table_cache); + context, metadata_path, engine_define, database_name, postgres_database_name, connection_pool, use_table_cache); } #endif diff --git a/src/Databases/DatabaseFactory.h b/src/Databases/DatabaseFactory.h index 88d33dc1cd5..8992ea27093 100644 --- a/src/Databases/DatabaseFactory.h +++ b/src/Databases/DatabaseFactory.h @@ -11,9 +11,9 @@ class ASTCreateQuery; class DatabaseFactory { public: - static DatabasePtr get(const ASTCreateQuery & create, const String & metadata_path, Context & context); + static DatabasePtr get(const ASTCreateQuery & create, const String & metadata_path, ContextPtr context); - static DatabasePtr getImpl(const ASTCreateQuery & create, const String & metadata_path, Context & context); + static DatabasePtr getImpl(const ASTCreateQuery & create, const String & metadata_path, ContextPtr context); }; } diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index f297bf2c82f..d498cb96062 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include @@ -27,7 +27,7 @@ namespace ErrorCodes } -DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_) +DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, ContextPtr context_) : DatabaseOnDisk(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseLazy (" + name_ + ")", context_) , expiration_time(expiration_time_) { @@ -35,11 +35,11 @@ DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, void DatabaseLazy::loadStoredObjects( - Context & context, + ContextPtr local_context, bool /* has_force_restore_data_flag */, bool /*force_attach*/) { - iterateMetadataFiles(context, [this](const String & file_name) + iterateMetadataFiles(local_context, [this](const String & file_name) { const std::string table_name = file_name.substr(0, file_name.size() - 4); @@ -56,15 +56,15 @@ void DatabaseLazy::loadStoredObjects( void DatabaseLazy::createTable( - const Context & context, + ContextPtr local_context, const String & table_name, const StoragePtr & table, const ASTPtr & query) { - SCOPE_EXIT({ clearExpiredTables(); }); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); if (!endsWith(table->getName(), "Log")) throw Exception("Lazy engine can be used only with *Log tables.", ErrorCodes::UNSUPPORTED_METHOD); - DatabaseOnDisk::createTable(context, table_name, table, query); + DatabaseOnDisk::createTable(local_context, table_name, table, query); /// DatabaseOnDisk::createTable renames file, so we need to get new metadata_modification_time. 
std::lock_guard lock(mutex); @@ -74,24 +74,24 @@ void DatabaseLazy::createTable( } void DatabaseLazy::dropTable( - const Context & context, + ContextPtr local_context, const String & table_name, bool no_delay) { - SCOPE_EXIT({ clearExpiredTables(); }); - DatabaseOnDisk::dropTable(context, table_name, no_delay); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); + DatabaseOnDisk::dropTable(local_context, table_name, no_delay); } void DatabaseLazy::renameTable( - const Context & context, + ContextPtr local_context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) { - SCOPE_EXIT({ clearExpiredTables(); }); - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); + DatabaseOnDisk::renameTable(local_context, table_name, to_database, to_table_name, exchange, dictionary); } @@ -105,7 +105,7 @@ time_t DatabaseLazy::getObjectMetadataModificationTime(const String & table_name } void DatabaseLazy::alterTable( - const Context & /* context */, + ContextPtr /* context */, const StorageID & /*table_id*/, const StorageInMemoryMetadata & /* metadata */) { @@ -115,14 +115,14 @@ void DatabaseLazy::alterTable( bool DatabaseLazy::isTableExist(const String & table_name) const { - SCOPE_EXIT({ clearExpiredTables(); }); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); std::lock_guard lock(mutex); return tables_cache.find(table_name) != tables_cache.end(); } StoragePtr DatabaseLazy::tryGetTable(const String & table_name) const { - SCOPE_EXIT({ clearExpiredTables(); }); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); { std::lock_guard lock(mutex); auto it = tables_cache.find(table_name); @@ -142,7 +142,7 @@ StoragePtr DatabaseLazy::tryGetTable(const String & table_name) const return loadTable(table_name); } -DatabaseTablesIteratorPtr DatabaseLazy::getTablesIterator(const Context &, const FilterByNameFunction & filter_by_table_name) +DatabaseTablesIteratorPtr DatabaseLazy::getTablesIterator(ContextPtr, const FilterByNameFunction & filter_by_table_name) { std::lock_guard lock(mutex); Strings filtered_tables; @@ -224,7 +224,7 @@ DatabaseLazy::~DatabaseLazy() StoragePtr DatabaseLazy::loadTable(const String & table_name) const { - SCOPE_EXIT({ clearExpiredTables(); }); + SCOPE_EXIT_MEMORY_SAFE({ clearExpiredTables(); }); LOG_DEBUG(log, "Load table {} to cache.", backQuote(table_name)); @@ -233,9 +233,9 @@ StoragePtr DatabaseLazy::loadTable(const String & table_name) const try { StoragePtr table; - Context context_copy(global_context); /// some tables can change context, but not LogTables + auto context_copy = Context::createCopy(context); /// some tables can change context, but not LogTables - auto ast = parseQueryFromMetadata(log, global_context, table_metadata_path, /*throw_on_error*/ true, /*remove_empty*/false); + auto ast = parseQueryFromMetadata(log, getContext(), table_metadata_path, /*throw_on_error*/ true, /*remove_empty*/false); if (ast) { const auto & ast_create = ast->as(); diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h index 2d091297c91..99a71b342fa 100644 --- a/src/Databases/DatabaseLazy.h +++ b/src/Databases/DatabaseLazy.h @@ -18,7 +18,7 @@ class Context; class DatabaseLazy final : public DatabaseOnDisk { public: - DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); + DatabaseLazy(const String & name_, const String & metadata_path_, time_t 
expiration_time_, ContextPtr context_); String getEngineName() const override { return "Lazy"; } @@ -27,22 +27,22 @@ public: bool canContainDistributedTables() const override { return false; } void loadStoredObjects( - Context & context, + ContextPtr context, bool has_force_restore_data_flag, bool force_attach) override; void createTable( - const Context & context, + ContextPtr context, const String & table_name, const StoragePtr & table, const ASTPtr & query) override; void dropTable( - const Context & context, + ContextPtr context, const String & table_name, bool no_delay) override; void renameTable( - const Context & context, + ContextPtr context, const String & table_name, IDatabase & to_database, const String & to_table_name, @@ -50,21 +50,21 @@ public: bool dictionary) override; void alterTable( - const Context & context, + ContextPtr context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; time_t getObjectMetadataModificationTime(const String & table_name) const override; - bool isTableExist(const String & table_name, const Context &) const override { return isTableExist(table_name); } + bool isTableExist(const String & table_name, ContextPtr) const override { return isTableExist(table_name); } bool isTableExist(const String & table_name) const; - StoragePtr tryGetTable(const String & table_name, const Context &) const override { return tryGetTable(table_name); } + StoragePtr tryGetTable(const String & table_name, ContextPtr) const override { return tryGetTable(table_name); } StoragePtr tryGetTable(const String & table_name) const; bool empty() const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; void attachTable(const String & table_name, const StoragePtr & table, const String & relative_table_path) override; diff --git a/src/Databases/DatabaseMemory.cpp b/src/Databases/DatabaseMemory.cpp index 357acb32371..f21a145df55 100644 --- a/src/Databases/DatabaseMemory.cpp +++ b/src/Databases/DatabaseMemory.cpp @@ -16,13 +16,13 @@ namespace ErrorCodes extern const int UNKNOWN_TABLE; } -DatabaseMemory::DatabaseMemory(const String & name_, const Context & context) - : DatabaseWithOwnTablesBase(name_, "DatabaseMemory(" + name_ + ")", context) +DatabaseMemory::DatabaseMemory(const String & name_, ContextPtr context_) + : DatabaseWithOwnTablesBase(name_, "DatabaseMemory(" + name_ + ")", context_) , data_path("data/" + escapeForFileName(database_name) + "/") {} void DatabaseMemory::createTable( - const Context & /*context*/, + ContextPtr /*context*/, const String & table_name, const StoragePtr & table, const ASTPtr & query) @@ -33,7 +33,7 @@ void DatabaseMemory::createTable( } void DatabaseMemory::dropTable( - const Context & /*context*/, + ContextPtr /*context*/, const String & table_name, bool /*no_delay*/) { @@ -67,7 +67,7 @@ ASTPtr DatabaseMemory::getCreateDatabaseQuery() const return create_query; } -ASTPtr DatabaseMemory::getCreateTableQueryImpl(const String & table_name, const Context &, bool throw_on_error) const +ASTPtr DatabaseMemory::getCreateTableQueryImpl(const String & table_name, ContextPtr, bool throw_on_error) const { std::lock_guard lock{mutex}; auto it = create_queries.find(table_name); @@ -83,15 +83,15 @@ ASTPtr DatabaseMemory::getCreateTableQueryImpl(const String & table_name, const UUID DatabaseMemory::tryGetTableUUID(const 
String & table_name) const { - if (auto table = tryGetTable(table_name, global_context)) + if (auto table = tryGetTable(table_name, getContext())) return table->getStorageID().uuid; return UUIDHelpers::Nil; } -void DatabaseMemory::drop(const Context & context) +void DatabaseMemory::drop(ContextPtr local_context) { /// Remove data on explicit DROP DATABASE - std::filesystem::remove_all(context.getPath() + data_path); + std::filesystem::remove_all(local_context->getPath() + data_path); } } diff --git a/src/Databases/DatabaseMemory.h b/src/Databases/DatabaseMemory.h index 40cc808e42b..8c9cf86ec19 100644 --- a/src/Databases/DatabaseMemory.h +++ b/src/Databases/DatabaseMemory.h @@ -19,22 +19,22 @@ namespace DB class DatabaseMemory final : public DatabaseWithOwnTablesBase { public: - DatabaseMemory(const String & name_, const Context & context); + DatabaseMemory(const String & name_, ContextPtr context); String getEngineName() const override { return "Memory"; } void createTable( - const Context & context, + ContextPtr context, const String & table_name, const StoragePtr & table, const ASTPtr & query) override; void dropTable( - const Context & context, + ContextPtr context, const String & table_name, bool no_delay) override; - ASTPtr getCreateTableQueryImpl(const String & name, const Context & context, bool throw_on_error) const override; + ASTPtr getCreateTableQueryImpl(const String & name, ContextPtr context, bool throw_on_error) const override; ASTPtr getCreateDatabaseQuery() const override; /// DatabaseMemory allows to create tables, which store data on disk. @@ -46,7 +46,7 @@ public: UUID tryGetTableUUID(const String & table_name) const override; - void drop(const Context & context) override; + void drop(ContextPtr context) override; private: String data_path; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index e5d2b23ace0..14ad1c7e4c5 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -46,7 +46,7 @@ std::pair createTableFromAST( ASTCreateQuery ast_create_query, const String & database_name, const String & table_data_path_relative, - Context & context, + ContextPtr context, bool has_force_restore_data_flag) { ast_create_query.attach = true; @@ -58,7 +58,7 @@ std::pair createTableFromAST( auto table_function = factory.get(ast_create_query.as_table_function, context); ColumnsDescription columns; if (ast_create_query.columns_list && ast_create_query.columns_list->columns) - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, false); + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); StoragePtr storage = table_function->execute(ast_create_query.as_table_function, context, ast_create_query.table, std::move(columns)); storage->renameInMemory(ast_create_query); return {ast_create_query.table, storage}; @@ -69,7 +69,7 @@ std::pair createTableFromAST( if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns) throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); - ColumnsDescription columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, false); + ColumnsDescription columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); ConstraintsDescription constraints = 
InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); return @@ -79,7 +79,7 @@ std::pair createTableFromAST( ast_create_query, table_data_path_relative, context, - context.getGlobalContext(), + context->getGlobalContext(), columns, constraints, has_force_restore_data_flag) @@ -188,23 +188,23 @@ DatabaseOnDisk::DatabaseOnDisk( const String & metadata_path_, const String & data_path_, const String & logger, - const Context & context) - : DatabaseWithOwnTablesBase(name, logger, context) + ContextPtr local_context) + : DatabaseWithOwnTablesBase(name, logger, local_context) , metadata_path(metadata_path_) , data_path(data_path_) { - Poco::File(context.getPath() + data_path).createDirectories(); + Poco::File(local_context->getPath() + data_path).createDirectories(); Poco::File(metadata_path).createDirectories(); } void DatabaseOnDisk::createTable( - const Context & context, + ContextPtr local_context, const String & table_name, const StoragePtr & table, const ASTPtr & query) { - const auto & settings = context.getSettingsRef(); + const auto & settings = local_context->getSettingsRef(); const auto & create = query->as(); assert(table_name == create.table); @@ -221,18 +221,20 @@ void DatabaseOnDisk::createTable( /// But there is protection from it - see using DDLGuard in InterpreterCreateQuery. if (isDictionaryExist(table_name)) - throw Exception(ErrorCodes::DICTIONARY_ALREADY_EXISTS, "Dictionary {}.{} already exists", backQuote(getDatabaseName()), backQuote(table_name)); + throw Exception( + ErrorCodes::DICTIONARY_ALREADY_EXISTS, "Dictionary {}.{} already exists", backQuote(getDatabaseName()), backQuote(table_name)); - if (isTableExist(table_name, global_context)) - throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists", backQuote(getDatabaseName()), backQuote(table_name)); + if (isTableExist(table_name, getContext())) + throw Exception( + ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists", backQuote(getDatabaseName()), backQuote(table_name)); String table_metadata_path = getObjectMetadataPath(table_name); if (create.attach_short_syntax) { /// Metadata already exists, table was detached + removeDetachedPermanentlyFlag(local_context, table_name, table_metadata_path, true); attachTable(table_name, table, getTableDataPath(create)); - removeDetachedPermanentlyFlag(table_name, table_metadata_path); return; } @@ -241,7 +243,7 @@ void DatabaseOnDisk::createTable( if (create.attach && Poco::File(table_metadata_path).exists()) { - ASTPtr ast_detached = parseQueryFromMetadata(log, context, table_metadata_path); + ASTPtr ast_detached = parseQueryFromMetadata(log, local_context, table_metadata_path); auto & create_detached = ast_detached->as(); // either both should be Nil, either values should be equal @@ -268,14 +270,14 @@ void DatabaseOnDisk::createTable( out.close(); } - commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context); + commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, local_context); - removeDetachedPermanentlyFlag(table_name, table_metadata_path); + removeDetachedPermanentlyFlag(local_context, table_name, table_metadata_path, false); } /// If the table was detached permanently we will have a flag file with /// .sql.detached extension, is not needed anymore since we attached the table back -void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const +void 
DatabaseOnDisk::removeDetachedPermanentlyFlag(ContextPtr, const String & table_name, const String & table_metadata_path, bool) const { try { @@ -293,7 +295,7 @@ void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, co void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, - const Context & /*query_context*/) + ContextPtr /*query_context*/) { try { @@ -311,7 +313,7 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora } } -void DatabaseOnDisk::detachTablePermanently(const Context &, const String & table_name) +void DatabaseOnDisk::detachTablePermanently(ContextPtr, const String & table_name) { auto table = detachTable(table_name); @@ -327,7 +329,7 @@ void DatabaseOnDisk::detachTablePermanently(const Context &, const String & tabl } } -void DatabaseOnDisk::dropTable(const Context & context, const String & table_name, bool /*no_delay*/) +void DatabaseOnDisk::dropTable(ContextPtr local_context, const String & table_name, bool /*no_delay*/) { String table_metadata_path = getObjectMetadataPath(table_name); String table_metadata_path_drop = table_metadata_path + drop_suffix; @@ -349,7 +351,7 @@ void DatabaseOnDisk::dropTable(const Context & context, const String & table_nam table->drop(); table->is_dropped = true; - Poco::File table_data_dir{context.getPath() + table_data_path_relative}; + Poco::File table_data_dir{local_context->getPath() + table_data_path_relative}; if (table_data_dir.exists()) table_data_dir.remove(true); } @@ -387,7 +389,7 @@ void DatabaseOnDisk::checkMetadataFilenameAvailabilityUnlocked(const String & to } void DatabaseOnDisk::renameTable( - const Context & context, + ContextPtr local_context, const String & table_name, IDatabase & to_database, const String & to_table_name, @@ -418,15 +420,16 @@ void DatabaseOnDisk::renameTable( String table_metadata_path; ASTPtr attach_query; /// DatabaseLazy::detachTable may return nullptr even if table exists, so we need tryGetTable for this case. 
- StoragePtr table = tryGetTable(table_name, global_context); + StoragePtr table = tryGetTable(table_name, getContext()); detachTable(table_name); UUID prev_uuid = UUIDHelpers::Nil; try { - table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + table_lock = table->lockExclusively( + local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); table_metadata_path = getObjectMetadataPath(table_name); - attach_query = parseQueryFromMetadata(log, context, table_metadata_path); + attach_query = parseQueryFromMetadata(log, local_context, table_metadata_path); auto & create = attach_query->as(); create.database = to_database.getDatabaseName(); create.table = to_table_name; @@ -454,7 +457,7 @@ void DatabaseOnDisk::renameTable( } /// Now table data are moved to new database, so we must add metadata and attach table to new database - to_database.createTable(context, to_table_name, table, attach_query); + to_database.createTable(local_context, to_table_name, table, attach_query); Poco::File(table_metadata_path).remove(); @@ -473,10 +476,10 @@ void DatabaseOnDisk::renameTable( /// It returns create table statement (even if table is detached) -ASTPtr DatabaseOnDisk::getCreateTableQueryImpl(const String & table_name, const Context &, bool throw_on_error) const +ASTPtr DatabaseOnDisk::getCreateTableQueryImpl(const String & table_name, ContextPtr, bool throw_on_error) const { ASTPtr ast; - bool has_table = tryGetTable(table_name, global_context) != nullptr; + bool has_table = tryGetTable(table_name, getContext()) != nullptr; auto table_metadata_path = getObjectMetadataPath(table_name); try { @@ -497,11 +500,11 @@ ASTPtr DatabaseOnDisk::getCreateDatabaseQuery() const { ASTPtr ast; - auto settings = global_context.getSettingsRef(); + auto settings = getContext()->getSettingsRef(); { std::lock_guard lock(mutex); - auto database_metadata_path = global_context.getPath() + "metadata/" + escapeForFileName(database_name) + ".sql"; - ast = parseQueryFromMetadata(log, global_context, database_metadata_path, true); + auto database_metadata_path = getContext()->getPath() + "metadata/" + escapeForFileName(database_name) + ".sql"; + ast = parseQueryFromMetadata(log, getContext(), database_metadata_path, true); auto & ast_create_query = ast->as(); ast_create_query.attach = false; ast_create_query.database = database_name; @@ -518,10 +521,10 @@ ASTPtr DatabaseOnDisk::getCreateDatabaseQuery() const return ast; } -void DatabaseOnDisk::drop(const Context & context) +void DatabaseOnDisk::drop(ContextPtr local_context) { assert(tables.empty()); - Poco::File(context.getPath() + getDataPath()).remove(false); + Poco::File(local_context->getPath() + getDataPath()).remove(false); Poco::File(getMetadataPath()).remove(false); } @@ -541,14 +544,14 @@ time_t DatabaseOnDisk::getObjectMetadataModificationTime(const String & object_n return static_cast(0); } -void DatabaseOnDisk::iterateMetadataFiles(const Context & context, const IteratingFunction & process_metadata_file) const +void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const IteratingFunction & process_metadata_file) const { auto process_tmp_drop_metadata_file = [&](const String & file_name) { assert(getUUID() == UUIDHelpers::Nil); static const char * tmp_drop_ext = ".sql.tmp_drop"; const std::string object_name = file_name.substr(0, file_name.size() - strlen(tmp_drop_ext)); - if (Poco::File(context.getPath() + getDataPath() + '/' + object_name).exists()) + if 
(Poco::File(local_context->getPath() + getDataPath() + '/' + object_name).exists()) { Poco::File(getMetadataPath() + file_name).renameTo(getMetadataPath() + object_name + ".sql"); LOG_WARNING(log, "Object {} was not dropped previously and will be restored", backQuote(object_name)); @@ -615,7 +618,12 @@ void DatabaseOnDisk::iterateMetadataFiles(const Context & context, const Iterati pool.wait(); } -ASTPtr DatabaseOnDisk::parseQueryFromMetadata(Poco::Logger * logger, const Context & context, const String & metadata_file_path, bool throw_on_error /*= true*/, bool remove_empty /*= false*/) +ASTPtr DatabaseOnDisk::parseQueryFromMetadata( + Poco::Logger * logger, + ContextPtr local_context, + const String & metadata_file_path, + bool throw_on_error /*= true*/, + bool remove_empty /*= false*/) { String query; @@ -643,7 +651,7 @@ ASTPtr DatabaseOnDisk::parseQueryFromMetadata(Poco::Logger * logger, const Conte return nullptr; } - auto settings = context.getSettingsRef(); + auto settings = local_context->getSettingsRef(); ParserCreateQuery parser; const char * pos = query.data(); std::string error_message; @@ -662,7 +670,12 @@ ASTPtr DatabaseOnDisk::parseQueryFromMetadata(Poco::Logger * logger, const Conte table_name = unescapeForFileName(table_name); if (create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER && logger) - LOG_WARNING(logger, "File {} contains both UUID and table name. Will use name `{}` instead of `{}`", metadata_file_path, table_name, create.table); + LOG_WARNING( + logger, + "File {} contains both UUID and table name. Will use name `{}` instead of `{}`", + metadata_file_path, + table_name, + create.table); create.table = table_name; } @@ -671,7 +684,7 @@ ASTPtr DatabaseOnDisk::parseQueryFromMetadata(Poco::Logger * logger, const Conte ASTPtr DatabaseOnDisk::getCreateQueryFromMetadata(const String & database_metadata_path, bool throw_on_error) const { - ASTPtr ast = parseQueryFromMetadata(log, global_context, database_metadata_path, throw_on_error); + ASTPtr ast = parseQueryFromMetadata(log, getContext(), database_metadata_path, throw_on_error); if (ast) { diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index fefe6e91606..677465e306e 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -16,7 +16,7 @@ std::pair createTableFromAST( ASTCreateQuery ast_create_query, const String & database_name, const String & table_data_path_relative, - Context & context, + ContextPtr context, bool has_force_restore_data_flag); /** Get the string with the table definition based on the CREATE query. 
@@ -33,23 +33,23 @@ void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemo class DatabaseOnDisk : public DatabaseWithOwnTablesBase { public: - DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context); void createTable( - const Context & context, + ContextPtr context, const String & table_name, const StoragePtr & table, const ASTPtr & query) override; - void detachTablePermanently(const Context & context, const String & table_name) override; + void detachTablePermanently(ContextPtr context, const String & table_name) override; void dropTable( - const Context & context, + ContextPtr context, const String & table_name, bool no_delay) override; void renameTable( - const Context & context, + ContextPtr context, const String & table_name, IDatabase & to_database, const String & to_table_name, @@ -58,7 +58,7 @@ public: ASTPtr getCreateDatabaseQuery() const override; - void drop(const Context & context) override; + void drop(ContextPtr context) override; String getObjectMetadataPath(const String & object_name) const override; @@ -69,7 +69,7 @@ public: String getTableDataPath(const ASTCreateQuery & query) const override { return getTableDataPath(query.table); } String getMetadataPath() const override { return metadata_path; } - static ASTPtr parseQueryFromMetadata(Poco::Logger * log, const Context & context, const String & metadata_file_path, bool throw_on_error = true, bool remove_empty = false); + static ASTPtr parseQueryFromMetadata(Poco::Logger * log, ContextPtr context, const String & metadata_file_path, bool throw_on_error = true, bool remove_empty = false); /// will throw when the table we want to attach already exists (in active / detached / detached permanently form) void checkMetadataFilenameAvailability(const String & to_table_name) const; @@ -82,23 +82,22 @@ protected: using IteratingFunction = std::function; - void iterateMetadataFiles(const Context & context, const IteratingFunction & process_metadata_file) const; + void iterateMetadataFiles(ContextPtr context, const IteratingFunction & process_metadata_file) const; ASTPtr getCreateTableQueryImpl( const String & table_name, - const Context & context, + ContextPtr context, bool throw_on_error) const override; ASTPtr getCreateQueryFromMetadata(const String & metadata_path, bool throw_on_error) const; virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context); + const String & table_metadata_tmp_path, const String & table_metadata_path, ContextPtr query_context); + + virtual void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach) const; const String metadata_path; const String data_path; - -private: - void removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const; }; } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index a94668dacf7..840be0e006a 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -36,7 +36,7 @@ static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; namespace { void tryAttachTable( - Context & context, + ContextPtr context, const ASTCreateQuery & 
query, DatabaseOrdinary & database, const String & database_name, @@ -62,7 +62,7 @@ namespace } - void tryAttachDictionary(const ASTPtr & query, DatabaseOrdinary & database, const String & metadata_path, const Context & context) + void tryAttachDictionary(const ASTPtr & query, DatabaseOrdinary & database, const String & metadata_path, ContextPtr context) { auto & create_query = query->as(); assert(create_query.is_dictionary); @@ -94,18 +94,18 @@ namespace } -DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context_) +DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, ContextPtr context_) : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_) { } DatabaseOrdinary::DatabaseOrdinary( - const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_) + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context_) : DatabaseWithDictionaries(name_, metadata_path_, data_path_, logger, context_) { } -void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool /*force_attach*/) +void DatabaseOrdinary::loadStoredObjects(ContextPtr local_context, bool has_force_restore_data_flag, bool /*force_attach*/) { /** Tables load faster if they are loaded in sorted (by name) order. * Otherwise (for the ext4 filesystem), `DirectoryIterator` iterates through them in some order, @@ -117,7 +117,8 @@ void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_resto size_t total_dictionaries = 0; - auto process_metadata = [&context, &file_names, &total_dictionaries, &file_names_mutex, this](const String & file_name) + auto process_metadata = [context_weak = ContextWeakPtr(local_context), &file_names, &total_dictionaries, &file_names_mutex, this]( + const String & file_name) { fs::path path(getMetadataPath()); fs::path file_path(file_name); @@ -125,7 +126,7 @@ void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_resto try { - auto ast = parseQueryFromMetadata(log, context, full_path.string(), /*throw_on_error*/ true, /*remove_empty*/ false); + auto ast = parseQueryFromMetadata(log, getContext(), full_path.string(), /*throw_on_error*/ true, /*remove_empty*/ false); if (ast) { auto * create_query = ast->as(); @@ -155,7 +156,7 @@ void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_resto } }; - iterateMetadataFiles(context, process_metadata); + iterateMetadataFiles(local_context, process_metadata); size_t total_tables = file_names.size() - total_dictionaries; @@ -180,7 +181,7 @@ void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_resto auto create_query = query->as(); if (create_query.is_dictionary) { - tryAttachDictionary(query, *this, getMetadataPath() + name, context); + tryAttachDictionary(query, *this, getMetadataPath() + name, local_context); /// Messages, so that it's not boring to wait for the server to load for a long time. 
logAboutProgress(log, ++dictionaries_processed, total_dictionaries, watch); @@ -195,7 +196,7 @@ void DatabaseOrdinary::loadStoredObjects(Context & context, bool has_force_resto pool.scheduleOrThrowOnError([&]() { tryAttachTable( - context, + local_context, create_query, *this, database_name, @@ -245,7 +246,7 @@ void DatabaseOrdinary::startupTables(ThreadPool & thread_pool) thread_pool.wait(); } -void DatabaseOrdinary::alterTable(const Context & context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) +void DatabaseOrdinary::alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) { String table_name = table_id.table_name; /// Read the definition of the table and replace the necessary parts with new ones. @@ -265,7 +266,7 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab statement.data() + statement.size(), "in file " + table_metadata_path, 0, - context.getSettingsRef().max_parser_depth); + local_context->getSettingsRef().max_parser_depth); applyMetadataChangesToCreateQuery(ast, metadata); @@ -274,15 +275,15 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab WriteBufferFromFile out(table_metadata_tmp_path, statement.size(), O_WRONLY | O_CREAT | O_EXCL); writeString(statement, out); out.next(); - if (context.getSettingsRef().fsync_metadata) + if (local_context->getSettingsRef().fsync_metadata) out.sync(); out.close(); } - commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, context); + commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, local_context); } -void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, const Context & /*query_context*/) +void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, ContextPtr /*query_context*/) { try { diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index c1ad32345f6..4cf58cef9f0 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -14,23 +14,26 @@ namespace DB class DatabaseOrdinary : public DatabaseWithDictionaries { public: - DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); - DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_); + DatabaseOrdinary(const String & name_, const String & metadata_path_, ContextPtr context); + DatabaseOrdinary( + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context_); String getEngineName() const override { return "Ordinary"; } - void loadStoredObjects( - Context & context, - bool has_force_restore_data_flag, - bool force_attach) override; + void loadStoredObjects(ContextPtr context, bool has_force_restore_data_flag, bool force_attach) override; void alterTable( - const Context & context, + ContextPtr context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; protected: - virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context); + virtual void commitAlterTable( + const StorageID & 
table_id, + const String & table_metadata_tmp_path, + const String & table_metadata_path, + const String & statement, + ContextPtr query_context); void startupTables(ThreadPool & thread_pool); }; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 12cff3407d3..46d67e275ba 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -18,10 +18,12 @@ #include #include #include +#include #include #include #include #include +#include namespace DB { @@ -45,12 +47,12 @@ static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables"; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { - return global_context.getZooKeeper(); + return getContext()->getZooKeeper(); } -static inline String getHostID(const Context & global_context, const UUID & db_uuid) +static inline String getHostID(ContextPtr global_context, const UUID & db_uuid) { - return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()) + ':' + toString(db_uuid); + return Cluster::Address::toString(getFQDNOrHostName(), global_context->getTCPPort()) + ':' + toString(db_uuid); } @@ -64,7 +66,7 @@ DatabaseReplicated::DatabaseReplicated( const String & shard_name_, const String & replica_name_, DatabaseReplicatedSettings db_settings_, - const Context & context_) + ContextPtr context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , shard_name(shard_name_) @@ -105,11 +107,26 @@ std::pair DatabaseReplicated::parseFullReplicaName(const String ClusterPtr DatabaseReplicated::getCluster() const { - /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables + std::lock_guard lock{mutex}; + if (cluster) + return cluster; + + cluster = getClusterImpl(); + return cluster; +} + +void DatabaseReplicated::setCluster(ClusterPtr && new_cluster) +{ + std::lock_guard lock{mutex}; + cluster = std::move(new_cluster); +} + +ClusterPtr DatabaseReplicated::getClusterImpl() const +{ Strings hosts; Strings host_ids; - auto zookeeper = global_context.getZooKeeper(); + auto zookeeper = getContext()->getZooKeeper(); constexpr int max_retries = 10; int iteration = 0; bool success = false; @@ -120,7 +137,7 @@ ClusterPtr DatabaseReplicated::getCluster() const hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat); if (hosts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); - Int32 cver = stat.cversion; + Int32 cversion = stat.cversion; std::sort(hosts.begin(), hosts.end()); std::vector futures; @@ -139,7 +156,9 @@ ClusterPtr DatabaseReplicated::getCluster() const } zookeeper->get(zookeeper_path + "/replicas", &stat); - if (success && cver == stat.version) + if (cversion != stat.cversion) + success = false; + if (success) break; } if (!success) @@ -157,34 +176,35 @@ ClusterPtr DatabaseReplicated::getCluster() const if (id == DROPPED_MARK) continue; auto [shard, replica] = parseFullReplicaName(hosts[i]); - auto pos = id.find(':'); - String host = id.substr(0, pos); + auto pos = id.rfind(':'); + String host_port = id.substr(0, pos); if (shard != current_shard) { current_shard = shard; if (!shards.back().empty()) shards.emplace_back(); } - shards.back().emplace_back(unescapeForFileName(host)); + shards.back().emplace_back(unescapeForFileName(host_port)); } - /// TODO make it configurable - String username = "default"; - String password; + String username = db_settings.cluster_username; + String password = 
db_settings.cluster_password; + UInt16 default_port = getContext()->getTCPPort(); + bool secure = db_settings.cluster_secure_connection; - return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); + return std::make_shared(getContext()->getSettingsRef(), shards, username, password, default_port, false, secure); } void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) { try { - if (!global_context.hasZooKeeper()) + if (!getContext()->hasZooKeeper()) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } - auto current_zookeeper = global_context.getZooKeeper(); + auto current_zookeeper = getContext()->getZooKeeper(); if (!current_zookeeper->exists(zookeeper_path)) { @@ -197,7 +217,7 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) { - String host_id = getHostID(global_context, db_uuid); + String host_id = getHostID(getContext(), db_uuid); if (replica_host_id != host_id) throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", @@ -253,11 +273,8 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP __builtin_unreachable(); } -void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) +void DatabaseReplicated::createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper) { - /// Write host name to replica_path, it will protect from multiple replicas with the same name - auto host_id = getHostID(global_context, db_uuid); - /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
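
The empty log entry mentioned in the comment above gets its sequence number from an ephemeral-sequential counter node: the numeric suffix ZooKeeper assigns to the counter is reused for the persistent `query-N` node, and the whole group of requests is submitted as one multi-request. A string-only sketch of the suffix reuse, with made-up paths:

#include <cassert>
#include <iostream>
#include <string>

int main()
{
    /// What ZooKeeper would return for an ephemeral-sequential create under counter_prefix.
    const std::string counter_prefix = "/clickhouse/db/counter/cnt-";
    const std::string counter_path   = "/clickhouse/db/counter/cnt-0000000007";

    const std::string query_path_prefix = "/clickhouse/db/log/query-";

    /// Reuse the sequence number allocated for the counter node for the log entry node.
    const std::string query_path = query_path_prefix + counter_path.substr(counter_prefix.size());

    assert(query_path == "/clickhouse/db/log/query-0000000007");
    std::cout << query_path << '\n';

    /// In the real code the creates of query_path and query_path + "/committed" plus the
    /// removal of the counter node go into a single multi-request, so either all of them
    /// happen or none do.
}
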
DDLLogEntry entry{}; @@ -266,36 +283,95 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); String query_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(query_path + "/committed", getFullReplicaName(), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); +} + +void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) +{ + /// Write host name to replica_path, it will protect from multiple replicas with the same name + auto host_id = getHostID(getContext(), db_uuid); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + createEmptyLogEntry(ops, current_zookeeper); current_zookeeper->multi(ops); } -void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) +void DatabaseReplicated::loadStoredObjects(ContextPtr local_context, bool has_force_restore_data_flag, bool force_attach) { tryConnectToZooKeeperAndInitDatabase(force_attach); - DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); + DatabaseAtomic::loadStoredObjects(local_context, has_force_restore_data_flag, force_attach); - ddl_worker = std::make_unique(this, global_context); + ddl_worker = std::make_unique(this, getContext()); ddl_worker->startup(); } -BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context) +void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_context) const { - if (is_readonly) - throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); - - if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) - throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. 
ON CLUSTER is not allowed for Replicated database."); - /// Replicas will set correct name of current database in query context (database name can be different on replicas) - if (auto * ddl_query = query->as()) + if (auto * ddl_query = dynamic_cast(query.get())) + { + if (ddl_query->database != getDatabaseName()) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); ddl_query->database.clear(); + if (auto * create = query->as()) + { + bool replicated_table = create->storage && create->storage->engine && startsWith(create->storage->engine->name, "Replicated"); + if (!replicated_table || !create->storage->engine->arguments) + return; + + ASTs & args = create->storage->engine->arguments->children; + if (args.size() < 2) + return; + + ASTLiteral * arg1 = args[0]->as(); + ASTLiteral * arg2 = args[1]->as(); + if (!arg1 || !arg2 || arg1->value.getType() != Field::Types::String || arg2->value.getType() != Field::Types::String) + return; + + String maybe_path = arg1->value.get(); + String maybe_replica = arg2->value.get(); + + /// Looks like it's ReplicatedMergeTree with explicit zookeeper_path and replica_name arguments. + /// Let's ensure that some macros are used. + /// NOTE: we cannot check here that substituted values will be actually different on shards and replicas. + + Macros::MacroExpansionInfo info; + info.table_id = {getDatabaseName(), create->table, create->uuid}; + query_context->getMacros()->expand(maybe_path, info); + bool maybe_shard_macros = info.expanded_other; + info.expanded_other = false; + query_context->getMacros()->expand(maybe_replica, info); + bool maybe_replica_macros = info.expanded_other; + bool enable_functional_tests_helper = getContext()->getConfigRef().has("_functional_tests_helper_database_replicated_replace_args_macros"); + + if (!enable_functional_tests_helper) + LOG_WARNING(log, "It's not recommended to explicitly specify zookeeper_path and replica_name in ReplicatedMergeTree arguments"); + + if (maybe_shard_macros && maybe_replica_macros) + return; + + if (enable_functional_tests_helper) + { + if (maybe_path.empty() || maybe_path.back() != '/') + maybe_path += '/'; + arg1->value = maybe_path + "auto_{shard}"; + arg2->value = maybe_replica + "auto_{replica}"; + return; + } + + throw Exception(ErrorCodes::INCORRECT_QUERY, + "Explicit zookeeper_path and replica_name are specified in ReplicatedMergeTree arguments. " + "If you really want to specify it explicitly, then you should use some macros " + "to distinguish different shards and replicas"); + } + } + if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->children) @@ -305,34 +381,47 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const } } + if (auto * query_drop = query->as()) + { + if (query_drop->kind == ASTDropQuery::Kind::Detach && query_context->getSettingsRef().database_replicated_always_detach_permanently) + query_drop->permanently = true; + if (query_drop->kind == ASTDropQuery::Kind::Detach && !query_drop->permanently) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. 
" + "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA or set " + "database_replicated_always_detach_permanently to 1"); + } +} + +BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context) +{ + if (is_readonly) + throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); + + if (query_context->getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); + + checkQueryValid(query, query_context); LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); - /// TODO maybe write current settings to log entry? DDLLogEntry entry; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); + entry.setSettingsIfRequired(query_context); String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context); - BlockIO io; - if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) - return io; - Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); - auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); - if (query_context.getSettingsRef().database_replicated_ddl_output) - io.in = std::move(stream); - return io; + return getDistributedDDLStatus(node_path, entry, query_context, hosts_to_wait); } -static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context) +static UUID getTableUUIDIfReplicated(const String & metadata, ContextPtr context) { bool looks_like_replicated = metadata.find("ReplicatedMergeTree") != std::string::npos; if (!looks_like_replicated) return UUIDHelpers::Nil; ParserCreateQuery parser; - auto size = context.getSettingsRef().max_query_size; - auto depth = context.getSettingsRef().max_parser_depth; + auto size = context->getSettingsRef().max_query_size; + auto depth = context->getSettingsRef().max_parser_depth; ASTPtr query = parseQuery(parser, metadata, size, depth); const ASTCreateQuery & create = query->as(); if (!create.storage || !create.storage->engine) @@ -368,7 +457,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep std::unordered_map zk_replicated_id_to_name; for (const auto & zk_table : table_name_to_metadata) { - UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, global_context); + UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, getContext()); if (zk_replicated_id != UUIDHelpers::Nil) zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first); } @@ -378,7 +467,8 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep std::vector> replicated_tables_to_rename; size_t total_tables = 0; std::vector replicated_ids; - for (auto existing_tables_it = getTablesIterator(global_context, {}); existing_tables_it->isValid(); existing_tables_it->next(), ++total_tables) + for (auto existing_tables_it = getTablesIterator(getContext(), {}); existing_tables_it->isValid(); + existing_tables_it->next(), ++total_tables) { String name = existing_tables_it->name(); UUID local_replicated_id = UUIDHelpers::Nil; @@ -417,7 +507,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep /// We use Ordinary engine for destination database, because it's the only way to discard table UUID /// and make possible creation of new table with the same UUID. 
String query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Ordinary", backQuoteIfNeed(to_db_name)); - Context query_context = global_context; + auto query_context = Context::createCopy(getContext()); executeQuery(query, query_context, true); } @@ -430,12 +520,12 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep if (getDatabaseName() != db_name) throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); - auto table = tryGetTable(table_name, global_context); + auto table = tryGetTable(table_name, getContext()); if (isDictionaryExist(table_name)) { /// We can safely drop any dictionaries because they do not store data LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); - DatabaseAtomic::removeDictionary(global_context, table_name); + DatabaseAtomic::removeDictionary(getContext(), table_name); ++dropped_dicts; } else if (!table->storesDataOnDisk()) @@ -443,7 +533,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name)); dropped_tables.push_back(tryGetTableUUID(table_name)); table->shutdown(); - DatabaseAtomic::dropTable(global_context, table_name, true); + DatabaseAtomic::dropTable(getContext(), table_name, true); } else { @@ -453,7 +543,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep assert(db_name < to_db_name); DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name); - DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false); + DatabaseAtomic::renameTable(getContext(), table_name, *to_db_ptr, to_name, false, false); ++moved_tables; } } @@ -472,7 +562,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep /// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names? 
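
The renames below take the two DDL guards in a fixed (lexicographic) order, the usual way to avoid deadlock when two operations need the same pair of named locks. A simplified sketch with plain mutexes standing in for DDL guards (single-threaded demo; the real catalog has its own synchronization):

#include <algorithm>
#include <iostream>
#include <map>
#include <mutex>
#include <stdexcept>
#include <string>

std::map<std::string, std::mutex> named_locks;   /// stand-in for per-table DDL guards

void renameLocked(const std::string & from, const std::string & to)
{
    if (from == to)
        throw std::runtime_error("Cannot rename table to itself");

    /// Always lock the lexicographically smaller name first, so concurrent renames
    /// A->B and B->A cannot each hold one guard while waiting for the other.
    const std::string & first = std::min(from, to);
    const std::string & second = std::max(from, to);

    std::lock_guard<std::mutex> first_guard(named_locks[first]);
    std::lock_guard<std::mutex> second_guard(named_locks[second]);

    std::cout << "renaming " << from << " -> " << to << '\n';
}

int main()
{
    renameLocked("t1", "t2");
    renameLocked("t2", "t1");
}
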
DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to)); DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to)); - DatabaseAtomic::renameTable(global_context, from, *this, to, false, false); + DatabaseAtomic::renameTable(getContext(), from, *this, to, false, false); } for (const auto & id : dropped_tables) @@ -480,7 +570,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep for (const auto & name_and_meta : table_name_to_metadata) { - if (isTableExist(name_and_meta.first, global_context)) + if (isTableExist(name_and_meta.first, getContext())) { assert(name_and_meta.second == readMetadataFile(name_and_meta.first)); continue; @@ -488,11 +578,11 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second); - Context query_context = global_context; - query_context.makeQueryContext(); - query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - query_context.setCurrentDatabase(database_name); - query_context.setCurrentQueryId(""); // generate random query_id + auto query_context = Context::createCopy(getContext()); + query_context->makeQueryContext(); + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + query_context->setCurrentDatabase(database_name); + query_context->setCurrentQueryId(""); // generate random query_id LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); InterpreterCreateQuery(query_ast, query_context).execute(); @@ -554,24 +644,31 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node { ParserCreateQuery parser; String description = "in ZooKeeper " + zookeeper_path + "/metadata/" + node_name; - auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth); + auto ast = parseQuery(parser, query, description, 0, getContext()->getSettingsRef().max_parser_depth); auto & create = ast->as(); - if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || ! 
create.database.empty()) + if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || !create.database.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query); + bool is_materialized_view_with_inner_table = create.is_materialized_view && create.to_table_id.empty(); + create.database = getDatabaseName(); create.table = unescapeForFileName(node_name); - create.attach = false; + create.attach = is_materialized_view_with_inner_table; return ast; } -void DatabaseReplicated::drop(const Context & context_) +void DatabaseReplicated::drop(ContextPtr context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->set(replica_path, DROPPED_MARK); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeSetRequest(replica_path, DROPPED_MARK, -1)); + createEmptyLogEntry(ops, current_zookeeper); + current_zookeeper->multi(ops); + DatabaseAtomic::drop(context_); + current_zookeeper->tryRemoveRecursive(replica_path); /// TODO it may leave garbage in ZooKeeper if the last node lost connection here if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK) @@ -595,22 +692,22 @@ void DatabaseReplicated::shutdown() } -void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) +void DatabaseReplicated::dropTable(ContextPtr local_context, const String & table_name, bool no_delay) { - auto txn = context.getZooKeeperMetadataTransaction(); - assert(!ddl_worker->isCurrentlyActive() || txn); + auto txn = local_context->getZooKeeperMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id.")); if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } - DatabaseAtomic::dropTable(context, table_name, no_delay); + DatabaseAtomic::dropTable(local_context, table_name, no_delay); } -void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database, +void DatabaseReplicated::renameTable(ContextPtr local_context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) { - auto txn = context.getZooKeeperMetadataTransaction(); + auto txn = local_context->getZooKeeperMetadataTransaction(); assert(txn); if (txn->isInitialQuery()) @@ -619,9 +716,9 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); if (table_name == to_table_name) throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself"); - if (!isTableExist(table_name, context)) + if (!isTableExist(table_name, local_context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); - if (exchange && !to_database.isTableExist(to_table_name, context)) + if (exchange && !to_database.isTableExist(to_table_name, local_context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); String statement = readMetadataFile(table_name); @@ -637,14 +734,14 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab txn->addOp(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); } - DatabaseAtomic::renameTable(context, table_name, to_database, 
to_table_name, exchange, dictionary); + DatabaseAtomic::renameTable(local_context, table_name, to_database, to_table_name, exchange, dictionary); } void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, - const Context & query_context) + ContextPtr query_context) { - auto txn = query_context.getZooKeeperMetadataTransaction(); + auto txn = query_context->getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { @@ -658,9 +755,9 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S void DatabaseReplicated::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, - const String & statement, const Context & query_context) + const String & statement, ContextPtr query_context) { - auto txn = query_context.getZooKeeperMetadataTransaction(); + auto txn = query_context->getZooKeeperMetadataTransaction(); if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); @@ -669,11 +766,11 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); } -void DatabaseReplicated::createDictionary(const Context & context, +void DatabaseReplicated::createDictionary(ContextPtr local_context, const String & dictionary_name, const ASTPtr & query) { - auto txn = context.getZooKeeperMetadataTransaction(); + auto txn = local_context->getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { @@ -681,33 +778,49 @@ void DatabaseReplicated::createDictionary(const Context & context, String statement = getObjectDefinitionFromCreateQuery(query->clone()); txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); } - DatabaseAtomic::createDictionary(context, dictionary_name, query); + DatabaseAtomic::createDictionary(local_context, dictionary_name, query); } -void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name) +void DatabaseReplicated::removeDictionary(ContextPtr local_context, const String & dictionary_name) { - auto txn = context.getZooKeeperMetadataTransaction(); + auto txn = local_context->getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } - DatabaseAtomic::removeDictionary(context, dictionary_name); + DatabaseAtomic::removeDictionary(local_context, dictionary_name); } -void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name) +void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const String & table_name) { - auto txn = context.getZooKeeperMetadataTransaction(); + auto txn = local_context->getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { + /// We have to remove metadata from zookeeper, because we do not distinguish permanently detached tables + /// from attached tables when recovering replica. 
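
Each of these overrides follows the same shape: if the current query is the initial one and a metadata transaction is attached to the context, queue an extra ZooKeeper operation, then delegate to the `DatabaseAtomic` implementation. A toy sketch of that shape, with a stand-in transaction type instead of the real `ZooKeeperMetadataTransaction`:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

/// Toy stand-in: a transaction that only collects operation descriptions.
struct MetadataTransaction
{
    bool initial_query = true;
    std::vector<std::string> ops;
    void addOp(std::string op) { ops.push_back(std::move(op)); }
};
using MetadataTransactionPtr = std::shared_ptr<MetadataTransaction>;

struct AtomicDatabase
{
    virtual ~AtomicDatabase() = default;
    virtual void dropTable(const std::string & name, MetadataTransactionPtr)
    {
        std::cout << "local drop of " << name << '\n';
    }
};

struct ReplicatedDatabase : AtomicDatabase
{
    void dropTable(const std::string & name, MetadataTransactionPtr txn) override
    {
        /// If this replica initiated the query, also schedule removal of the table's
        /// metadata node in the shared store, then do the usual local work.
        if (txn && txn->initial_query)
            txn->addOp("remove /metadata/" + name);
        AtomicDatabase::dropTable(name, txn);
    }
};

int main()
{
    auto txn = std::make_shared<MetadataTransaction>();
    ReplicatedDatabase db;
    db.dropTable("t1", txn);
    for (const auto & op : txn->ops)
        std::cout << "queued op: " << op << '\n';
}
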
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } - DatabaseAtomic::detachTablePermanently(context, table_name); + DatabaseAtomic::detachTablePermanently(local_context, table_name); } +void DatabaseReplicated::removeDetachedPermanentlyFlag(ContextPtr local_context, const String & table_name, const String & table_metadata_path, bool attach) const +{ + auto txn = local_context->getZooKeeperMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->isInitialQuery() && attach) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String statement = readMetadataFile(table_name); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::removeDetachedPermanentlyFlag(local_context, table_name, table_metadata_path, attach); +} + + String DatabaseReplicated::readMetadataFile(const String & table_name) const { String statement; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fde53cf2c29..5220535f095 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -24,31 +24,32 @@ public: DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, DatabaseReplicatedSettings db_settings_, - const Context & context); + ContextPtr context); ~DatabaseReplicated() override; String getEngineName() const override { return "Replicated"; } /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current ZooKeeperMetadataTransaction. - void dropTable(const Context &, const String & table_name, bool no_delay) override; - void renameTable(const Context & context, const String & table_name, IDatabase & to_database, + void dropTable(ContextPtr, const String & table_name, bool no_delay) override; + void renameTable(ContextPtr context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, - const Context & query_context) override; + ContextPtr query_context) override; void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, - const String & statement, const Context & query_context) override; - void createDictionary(const Context & context, + const String & statement, ContextPtr query_context) override; + void createDictionary(ContextPtr context, const String & dictionary_name, const ASTPtr & query) override; - void removeDictionary(const Context & context, const String & dictionary_name) override; - void detachTablePermanently(const Context & context, const String & table_name) override; + void removeDictionary(ContextPtr context, const String & dictionary_name) override; + void detachTablePermanently(ContextPtr context, const String & table_name) override; + void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach) const override; /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. 
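
The header also declares the cluster cache introduced earlier in this diff (`getCluster()` / `getClusterImpl()` / `setCluster()` with a `mutable ClusterPtr`). The general shape is a mutex-protected lazy cache that a `const` getter is allowed to fill; a standalone sketch with stand-in types, not the real `DB::Cluster`:

#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

using ClusterPtr = std::shared_ptr<const std::vector<std::string>>;   /// stand-in for a cluster object

class ReplicaSet
{
public:
    ClusterPtr getCluster() const
    {
        std::lock_guard<std::mutex> lock{mutex};
        if (cluster)
            return cluster;          /// cheap path: reuse the cached value
        cluster = getClusterImpl();  /// expensive path: rebuild, e.g. from ZooKeeper
        return cluster;
    }

    void setCluster(ClusterPtr && new_cluster)   /// called when replicas are added or removed
    {
        std::lock_guard<std::mutex> lock{mutex};
        cluster = std::move(new_cluster);
    }

private:
    ClusterPtr getClusterImpl() const
    {
        return std::make_shared<const std::vector<std::string>>(
            std::vector<std::string>{"replica1:9000", "replica2:9000"});
    }

    mutable std::mutex mutex;
    mutable ClusterPtr cluster;   /// mutable, so getCluster() can stay const for callers
};

int main()
{
    ReplicaSet set;
    std::cout << set.getCluster()->size() << " replicas\n";
}
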
- BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context); + BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context); void stopReplication(); @@ -58,9 +59,9 @@ public: /// Returns cluster consisting of database replicas ClusterPtr getCluster() const; - void drop(const Context & /*context*/) override; + void drop(ContextPtr /*context*/) override; - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; + void loadStoredObjects(ContextPtr context, bool has_force_restore_data_flag, bool force_attach) override; void shutdown() override; friend struct DatabaseReplicatedTask; @@ -70,12 +71,19 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); + void checkQueryValid(const ASTPtr & query, ContextPtr query_context) const; + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr); std::map tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); String readMetadataFile(const String & table_name) const; + ClusterPtr getClusterImpl() const; + void setCluster(ClusterPtr && new_cluster); + + void createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper); + String zookeeper_path; String shard_name; String replica_name; @@ -86,6 +94,8 @@ private: std::atomic_bool is_readonly = true; std::unique_ptr ddl_worker; + + mutable ClusterPtr cluster; }; } diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h index 11d5b3820e4..43003af1120 100644 --- a/src/Databases/DatabaseReplicatedSettings.h +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -11,6 +11,9 @@ class ASTStorage; M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ + M(String, cluster_username, "default", "Username to use when connecting to hosts of cluster", 0) \ + M(String, cluster_password, "", "Password to use when connecting to hosts of cluster", 0) \ + M(Bool, cluster_secure_connection, false, "Enable TLS when connecting to hosts of cluster", 0) \ DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index e0c5717711c..9ae4d026bf0 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -13,7 +13,7 @@ namespace ErrorCodes extern const int UNFINISHED; } -DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) +DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, ContextPtr context_) : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName())) , database(db) { @@ -22,7 +22,7 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db /// We also need 
similar graph to load tables on server startup in order of topsort. } -void DatabaseReplicatedDDLWorker::initializeMainThread() +bool DatabaseReplicatedDDLWorker::initializeMainThread() { while (!stop_flag) { @@ -33,7 +33,7 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() database->tryConnectToZooKeeperAndInitDatabase(false); initializeReplication(); initialized = true; - return; + return true; } catch (...) { @@ -41,6 +41,8 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() sleepForSeconds(5); } } + + return false; } void DatabaseReplicatedDDLWorker::shutdown() @@ -61,7 +63,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication() if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr); else - last_skipped_entry_name.emplace(log_ptr_str); + last_skipped_entry_name.emplace(DDLTaskBase::getLogEntryName(our_log_ptr)); } String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) @@ -89,7 +91,7 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } -String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context) +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, ContextPtr query_context) { /// NOTE Possibly it would be better to execute initial query on the most up-to-date node, /// but it requires more complex logic around /try node. @@ -113,7 +115,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->is_initial_query = true; LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); - UInt64 timeout = query_context.getSettingsRef().database_replicated_initial_query_timeout_sec; + UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { std::unique_lock lock{mutex}; bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() @@ -123,7 +125,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr }); if (!processed) - throw Exception(ErrorCodes::UNFINISHED, "Timeout: Cannot enqueue query on this replica," + throw Exception(ErrorCodes::UNFINISHED, "Timeout: Cannot enqueue query on this replica, " "most likely because replica is busy with previous queue entries"); } @@ -235,6 +237,8 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->entry.query.empty()) { + /// Some replica is added or removed, let's update cached cluster + database->setCluster(database->getClusterImpl()); out_reason = fmt::format("Entry {} is a dummy task", entry_name); return {}; } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 6ba46a98bca..16ad100b81a 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -21,16 +21,16 @@ class DatabaseReplicated; class DatabaseReplicatedDDLWorker : public DDLWorker { public: - DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_); + DatabaseReplicatedDDLWorker(DatabaseReplicated * db, ContextPtr context_); String enqueueQuery(DDLLogEntry & entry) override; - String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context); + String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, ContextPtr query_context); void shutdown() override; private: - void initializeMainThread() override; + bool 
initializeMainThread() override; void initializeReplication(); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index d92f0f1897e..c97417e292c 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -49,7 +49,7 @@ void DatabaseWithDictionaries::attachDictionary(const String & dictionary_name, /// Attach the dictionary as table too. try { - /// TODO Make StorageDictionary an owner of IDictionaryBase objects. + /// TODO Make StorageDictionary an owner of IDictionary objects. /// All DDL operations with dictionaries will work with StorageDictionary table, /// and StorageDictionary will be responsible for loading of DDL dictionaries. /// ExternalLoaderDatabaseConfigRepository and other hacks related to ExternalLoader @@ -125,9 +125,9 @@ void DatabaseWithDictionaries::detachDictionaryImpl(const String & dictionary_na detachTable(dictionary_name); } -void DatabaseWithDictionaries::createDictionary(const Context & context, const String & dictionary_name, const ASTPtr & query) +void DatabaseWithDictionaries::createDictionary(ContextPtr local_context, const String & dictionary_name, const ASTPtr & query) { - const auto & settings = context.getSettingsRef(); + const auto & settings = local_context->getSettingsRef(); /** The code is based on the assumption that all threads share the same order of operations: * - create the .sql.tmp file; @@ -151,7 +151,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S throw Exception(ErrorCodes::DICTIONARY_ALREADY_EXISTS, "Dictionary {} already exists.", dict_id.getFullNameNotQuoted()); - if (isTableExist(dictionary_name, global_context)) + if (isTableExist(dictionary_name, getContext())) throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {} already exists.", dict_id.getFullTableName()); String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); @@ -169,17 +169,28 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S } bool succeeded = false; + bool uuid_locked = false; SCOPE_EXIT({ if (!succeeded) + { + if (uuid_locked) + DatabaseCatalog::instance().removeUUIDMappingFinally(dict_id.uuid); Poco::File(dictionary_metadata_tmp_path).remove(); + } }); + if (dict_id.uuid != UUIDHelpers::Nil) + { + DatabaseCatalog::instance().addUUIDMapping(dict_id.uuid); + uuid_locked = true; + } + /// Add a temporary repository containing the dictionary. /// We need this temp repository to try loading the dictionary before actually attaching it to the database. 
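
The `succeeded` / `uuid_locked` flags above drive a scope-exit rollback: cleanup runs only if the function leaves before reaching its commit point. A sketch of that idiom with a hand-rolled guard instead of the real `SCOPE_EXIT` macro:

#include <functional>
#include <iostream>
#include <stdexcept>

/// Minimal stand-in for SCOPE_EXIT: runs a callback when the scope is left.
struct ScopeGuard
{
    std::function<void()> on_exit;
    ~ScopeGuard() { if (on_exit) on_exit(); }
};

void createObject(bool fail_midway)
{
    bool succeeded = false;
    bool resource_locked = false;

    ScopeGuard cleanup{[&]
    {
        if (!succeeded)
        {
            if (resource_locked)
                std::cout << "rolling back: releasing lock\n";
            std::cout << "rolling back: removing temporary file\n";
        }
    }};

    resource_locked = true;            /// e.g. registering a UUID mapping
    if (fail_midway)
        throw std::runtime_error("loading failed");

    succeeded = true;                  /// commit: disables the rollback above
}

int main()
{
    try { createObject(/*fail_midway=*/ true); }
    catch (const std::exception & e) { std::cout << "error: " << e.what() << '\n'; }

    createObject(/*fail_midway=*/ false);   /// succeeds, so the guard does nothing
}
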
auto temp_repository = external_loader.addConfigRepository(std::make_unique( - getDatabaseName(), dictionary_metadata_tmp_path, getDictionaryConfigurationFromAST(query->as(), context))); + getDatabaseName(), dictionary_metadata_tmp_path, getDictionaryConfigurationFromAST(query->as(), local_context))); - bool lazy_load = context.getConfigRef().getBool("dictionaries_lazy_load", true); + bool lazy_load = local_context->getConfigRef().getBool("dictionaries_lazy_load", true); if (!lazy_load) { /// load() is called here to force loading the dictionary, wait until the loading is finished, @@ -187,15 +198,15 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S external_loader.load(dict_id.getInternalDictionaryName()); } - auto config = getDictionaryConfigurationFromAST(query->as(), context); + auto config = getDictionaryConfigurationFromAST(query->as(), local_context); attachDictionary(dictionary_name, DictionaryAttachInfo{query, config, time(nullptr)}); SCOPE_EXIT({ if (!succeeded) detachDictionary(dictionary_name); }); - auto txn = context.getZooKeeperMetadataTransaction(); - if (txn && !context.isInternalSubquery()) + auto txn = local_context->getZooKeeperMetadataTransaction(); + if (txn && !local_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// If it was ATTACH query and file with dictionary metadata already exist @@ -210,7 +221,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S succeeded = true; } -void DatabaseWithDictionaries::removeDictionary(const Context & context, const String & dictionary_name) +void DatabaseWithDictionaries::removeDictionary(ContextPtr local_context, const String & dictionary_name) { DictionaryAttachInfo attach_info; detachDictionaryImpl(dictionary_name, attach_info); @@ -219,8 +230,8 @@ void DatabaseWithDictionaries::removeDictionary(const Context & context, const S { String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); - auto txn = context.getZooKeeperMetadataTransaction(); - if (txn && !context.isInternalSubquery()) + auto txn = local_context->getZooKeeperMetadataTransaction(); + if (txn && !local_context->isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database Poco::File(dictionary_metadata_path).remove(); @@ -335,7 +346,7 @@ void DatabaseWithDictionaries::reloadDictionaryConfig(const String & full_name) /// Ensure that this database is attached to ExternalLoader as a config repository. 
if (!database_as_config_repo_for_external_loader.load()) { - auto repository = std::make_unique(*this, global_context); + auto repository = std::make_unique(*this, getContext()); auto remove_repository_callback = external_loader.addConfigRepository(std::move(repository)); database_as_config_repo_for_external_loader = boost::make_shared(std::move(remove_repository_callback)); } @@ -359,9 +370,9 @@ void DatabaseWithDictionaries::shutdown() DatabaseWithDictionaries::DatabaseWithDictionaries( - const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context) - : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context) - , external_loader(context.getExternalDictionariesLoader()) + const String & name, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context_) + : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context_) + , external_loader(context_->getExternalDictionariesLoader()) { } diff --git a/src/Databases/DatabaseWithDictionaries.h b/src/Databases/DatabaseWithDictionaries.h index d69289d7456..d10908c7c06 100644 --- a/src/Databases/DatabaseWithDictionaries.h +++ b/src/Databases/DatabaseWithDictionaries.h @@ -17,11 +17,11 @@ public: void detachDictionary(const String & dictionary_name) override; - void createDictionary(const Context & context, + void createDictionary(ContextPtr context, const String & dictionary_name, const ASTPtr & query) override; - void removeDictionary(const Context & context, const String & dictionary_name) override; + void removeDictionary(ContextPtr context, const String & dictionary_name) override; bool isDictionaryExist(const String & dictionary_name) const override; @@ -38,7 +38,7 @@ public: ~DatabaseWithDictionaries() override; protected: - DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context); ASTPtr getCreateDictionaryQueryImpl(const String & dictionary_name, bool throw_on_error) const override; diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 2d3d206162b..9329b0a4210 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -20,18 +20,18 @@ namespace ErrorCodes extern const int UNKNOWN_DATABASE; } -DatabaseWithOwnTablesBase::DatabaseWithOwnTablesBase(const String & name_, const String & logger, const Context & context) - : IDatabase(name_), log(&Poco::Logger::get(logger)), global_context(context.getGlobalContext()) +DatabaseWithOwnTablesBase::DatabaseWithOwnTablesBase(const String & name_, const String & logger, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()), log(&Poco::Logger::get(logger)) { } -bool DatabaseWithOwnTablesBase::isTableExist(const String & table_name, const Context &) const +bool DatabaseWithOwnTablesBase::isTableExist(const String & table_name, ContextPtr) const { std::lock_guard lock(mutex); return tables.find(table_name) != tables.end(); } -StoragePtr DatabaseWithOwnTablesBase::tryGetTable(const String & table_name, const Context &) const +StoragePtr DatabaseWithOwnTablesBase::tryGetTable(const String & table_name, ContextPtr) const { std::lock_guard lock(mutex); auto it = tables.find(table_name); @@ -40,7 +40,7 @@ StoragePtr DatabaseWithOwnTablesBase::tryGetTable(const 
String & table_name, con return {}; } -DatabaseTablesIteratorPtr DatabaseWithOwnTablesBase::getTablesIterator(const Context &, const FilterByNameFunction & filter_by_table_name) +DatabaseTablesIteratorPtr DatabaseWithOwnTablesBase::getTablesIterator(ContextPtr, const FilterByNameFunction & filter_by_table_name) { std::lock_guard lock(mutex); if (!filter_by_table_name) diff --git a/src/Databases/DatabasesCommon.h b/src/Databases/DatabasesCommon.h index 5e1e555a524..da1bd6c1852 100644 --- a/src/Databases/DatabasesCommon.h +++ b/src/Databases/DatabasesCommon.h @@ -16,12 +16,12 @@ namespace DB class Context; /// A base class for databases that manage their own list of tables. -class DatabaseWithOwnTablesBase : public IDatabase +class DatabaseWithOwnTablesBase : public IDatabase, protected WithContext { public: - bool isTableExist(const String & table_name, const Context & context) const override; + bool isTableExist(const String & table_name, ContextPtr context) const override; - StoragePtr tryGetTable(const String & table_name, const Context & context) const override; + StoragePtr tryGetTable(const String & table_name, ContextPtr context) const override; bool empty() const override; @@ -29,7 +29,7 @@ public: StoragePtr detachTable(const String & table_name) override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; void shutdown() override; @@ -38,9 +38,8 @@ public: protected: Tables tables; Poco::Logger * log; - const Context & global_context; - DatabaseWithOwnTablesBase(const String & name_, const String & logger, const Context & context); + DatabaseWithOwnTablesBase(const String & name_, const String & logger, ContextPtr context); void attachTableUnlocked(const String & table_name, const StoragePtr & table, std::unique_lock & lock); StoragePtr detachTableUnlocked(const String & table_name, std::unique_lock & lock); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 3a196f827b7..8c356b88460 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -10,6 +10,7 @@ #include #include + #include #include #include @@ -18,7 +19,6 @@ namespace DB { -class Context; struct Settings; struct ConstraintsDescription; struct IndicesDescription; @@ -153,10 +153,10 @@ public: /// Load a set of existing tables. /// You can call only once, right after the object is created. - virtual void loadStoredObjects(Context & /*context*/, bool /*has_force_restore_data_flag*/, bool /*force_attach*/ = false) {} + virtual void loadStoredObjects(ContextPtr /*context*/, bool /*has_force_restore_data_flag*/, bool /*force_attach*/ = false) {} /// Check the existence of the table. - virtual bool isTableExist(const String & name, const Context & context) const = 0; + virtual bool isTableExist(const String & name, ContextPtr context) const = 0; /// Check the existence of the dictionary virtual bool isDictionaryExist(const String & /*name*/) const @@ -165,7 +165,7 @@ public: } /// Get the table for work. Return nullptr if there is no table. 
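
`DatabaseWithOwnTablesBase` now inherits from `WithContext` instead of keeping its own `const Context & global_context` member, so subclasses simply call `getContext()`. A simplified sketch of such a mixin (the real one may hold the context weakly; the types here are stand-ins):

#include <iostream>
#include <memory>
#include <string>

struct Context { std::string name = "global"; };
using ContextPtr = std::shared_ptr<const Context>;

/// Rough shape of a WithContext mixin: stores a context and exposes getContext().
class WithContext
{
public:
    explicit WithContext(ContextPtr context_) : context(std::move(context_)) {}
    ContextPtr getContext() const { return context; }

private:
    ContextPtr context;
};

class DatabaseBase : protected WithContext
{
public:
    explicit DatabaseBase(ContextPtr global_context) : WithContext(std::move(global_context)) {}

    void report() const { std::cout << "using context: " << getContext()->name << '\n'; }
};

int main()
{
    auto global_context = std::make_shared<const Context>();
    DatabaseBase db(global_context);
    db.report();
}
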
- virtual StoragePtr tryGetTable(const String & name, const Context & context) const = 0; + virtual StoragePtr tryGetTable(const String & name, ContextPtr context) const = 0; virtual UUID tryGetTableUUID(const String & /*table_name*/) const { return UUIDHelpers::Nil; } @@ -173,7 +173,7 @@ public: /// Get an iterator that allows you to pass through all the tables. /// It is possible to have "hidden" tables that are not visible when passing through, but are visible if you get them by name using the functions above. - virtual DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name = {}) = 0; + virtual DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name = {}) = 0; /// Get an iterator to pass through all the dictionaries. virtual DatabaseDictionariesIteratorPtr getDictionariesIterator([[maybe_unused]] const FilterByNameFunction & filter_by_dictionary_name = {}) @@ -186,7 +186,7 @@ public: /// Add the table to the database. Record its presence in the metadata. virtual void createTable( - const Context & /*context*/, + ContextPtr /*context*/, const String & /*name*/, const StoragePtr & /*table*/, const ASTPtr & /*query*/) @@ -196,7 +196,7 @@ public: /// Add the dictionary to the database. Record its presence in the metadata. virtual void createDictionary( - const Context & /*context*/, + ContextPtr /*context*/, const String & /*dictionary_name*/, const ASTPtr & /*query*/) { @@ -205,7 +205,7 @@ public: /// Delete the table from the database, drop table and delete the metadata. virtual void dropTable( - const Context & /*context*/, + ContextPtr /*context*/, const String & /*name*/, [[maybe_unused]] bool no_delay = false) { @@ -214,7 +214,7 @@ public: /// Delete the dictionary from the database. Delete the metadata. virtual void removeDictionary( - const Context & /*context*/, + ContextPtr /*context*/, const String & /*dictionary_name*/) { throw Exception("There is no DROP DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); @@ -249,14 +249,14 @@ public: /// Forget about the table without deleting it's data, but rename metadata file to prevent reloading it /// with next restart. The database may not support this method. - virtual void detachTablePermanently(const Context & /*context*/, const String & /*name*/) + virtual void detachTablePermanently(ContextPtr /*context*/, const String & /*name*/) { throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Rename the table and possibly move the table to another database. virtual void renameTable( - const Context & /*context*/, + ContextPtr /*context*/, const String & /*name*/, IDatabase & /*to_database*/, const String & /*to_name*/, @@ -271,7 +271,7 @@ public: /// Change the table structure in metadata. /// You must call under the alter_lock of the corresponding table . If engine_modifier is empty, then engine does not change. virtual void alterTable( - const Context & /*context*/, + ContextPtr /*context*/, const StorageID & /*table_id*/, const StorageInMemoryMetadata & /*metadata*/) { @@ -285,12 +285,12 @@ public: } /// Get the CREATE TABLE query for the table. It can also provide information for detached tables for which there is metadata. 
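
`IDatabase` keeps the convention that optional operations have a default implementation which throws, so each engine overrides only what it actually supports. A small sketch of that convention with made-up class names:

#include <iostream>
#include <stdexcept>
#include <string>

/// Optional operations get a throwing default; engines override what they support.
class IDatabaseLike
{
public:
    virtual ~IDatabaseLike() = default;

    virtual std::string getEngineName() const = 0;

    virtual void createTable(const std::string & /*name*/)
    {
        throw std::runtime_error("There is no CREATE TABLE query for Database" + getEngineName());
    }
};

class ReadOnlyDatabase : public IDatabaseLike
{
public:
    std::string getEngineName() const override { return "ReadOnly"; }
    /// createTable is intentionally not overridden: the default throws.
};

class OrdinaryDatabase : public IDatabaseLike
{
public:
    std::string getEngineName() const override { return "Ordinary"; }
    void createTable(const std::string & name) override { std::cout << "created " << name << '\n'; }
};

int main()
{
    OrdinaryDatabase ordinary;
    ordinary.createTable("t1");

    ReadOnlyDatabase read_only;
    try { read_only.createTable("t1"); }
    catch (const std::exception & e) { std::cout << "error: " << e.what() << '\n'; }
}
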
- ASTPtr tryGetCreateTableQuery(const String & name, const Context & context) const noexcept + ASTPtr tryGetCreateTableQuery(const String & name, ContextPtr context) const noexcept { return getCreateTableQueryImpl(name, context, false); } - ASTPtr getCreateTableQuery(const String & name, const Context & context) const + ASTPtr getCreateTableQuery(const String & name, ContextPtr context) const { return getCreateTableQueryImpl(name, context, true); } @@ -345,18 +345,19 @@ public: virtual void assertCanBeDetached(bool /*cleanup*/) {} - virtual void waitDetachedTableNotInUse(const UUID & /*uuid*/) { assert(false); } + virtual void waitDetachedTableNotInUse(const UUID & /*uuid*/) { } + virtual void checkDetachedTableNotInUse(const UUID & /*uuid*/) { } /// Ask all tables to complete the background threads they are using and delete all table objects. virtual void shutdown() = 0; /// Delete data and metadata stored inside the database, if exists. - virtual void drop(const Context & /*context*/) {} + virtual void drop(ContextPtr /*context*/) {} - virtual ~IDatabase() {} + virtual ~IDatabase() = default; protected: - virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, const Context & /*context*/, bool throw_on_error) const + virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, ContextPtr /*context*/, bool throw_on_error) const { if (throw_on_error) throw Exception("There is no SHOW CREATE TABLE query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); diff --git a/src/Databases/MySQL/ConnectionMySQLSettings.cpp b/src/Databases/MySQL/ConnectionMySQLSettings.cpp index fa92e793225..1026d14018b 100644 --- a/src/Databases/MySQL/ConnectionMySQLSettings.cpp +++ b/src/Databases/MySQL/ConnectionMySQLSettings.cpp @@ -50,12 +50,12 @@ void ConnectionMySQLSettings::loadFromQuery(ASTStorage & storage_def) #undef ADD_IF_ABSENT } -void ConnectionMySQLSettings::loadFromQueryContext(const Context & context) +void ConnectionMySQLSettings::loadFromQueryContext(ContextPtr context) { - if (!context.hasQueryContext()) + if (!context->hasQueryContext()) return; - const Settings & settings = context.getQueryContext().getSettingsRef(); + const Settings & settings = context->getQueryContext()->getSettingsRef(); if (settings.mysql_datatypes_support_level.value != mysql_datatypes_support_level.value) set("mysql_datatypes_support_level", settings.mysql_datatypes_support_level.toString()); diff --git a/src/Databases/MySQL/ConnectionMySQLSettings.h b/src/Databases/MySQL/ConnectionMySQLSettings.h index ce2773307c5..f05985a0cda 100644 --- a/src/Databases/MySQL/ConnectionMySQLSettings.h +++ b/src/Databases/MySQL/ConnectionMySQLSettings.h @@ -1,13 +1,13 @@ #pragma once -#include #include +#include #include +#include namespace DB { -class Context; class ASTStorage; #define LIST_OF_CONNECTION_MYSQL_SETTINGS(M) \ @@ -27,7 +27,7 @@ struct ConnectionMySQLSettings : public BaseSettings settings_, mysqlxx::Pool && pool) +DatabaseConnectionMySQL::DatabaseConnectionMySQL( + ContextPtr context_, + const String & database_name_, + const String & metadata_path_, + const ASTStorage * database_engine_define_, + const String & database_name_in_mysql_, + std::unique_ptr settings_, + mysqlxx::PoolWithFailover && pool) : IDatabase(database_name_) - , global_context(context.getGlobalContext()) + , WithContext(context_->getGlobalContext()) , metadata_path(metadata_path_) , database_engine_define(database_engine_define_->clone()) , database_name_in_mysql(database_name_in_mysql_) @@ -63,7 +69,7 @@ bool 
DatabaseConnectionMySQL::empty() const { std::lock_guard lock(mutex); - fetchTablesIntoLocalCache(global_context); + fetchTablesIntoLocalCache(getContext()); if (local_tables_cache.empty()) return true; @@ -75,12 +81,12 @@ bool DatabaseConnectionMySQL::empty() const return true; } -DatabaseTablesIteratorPtr DatabaseConnectionMySQL::getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) +DatabaseTablesIteratorPtr DatabaseConnectionMySQL::getTablesIterator(ContextPtr local_context, const FilterByNameFunction & filter_by_table_name) { Tables tables; std::lock_guard lock(mutex); - fetchTablesIntoLocalCache(context); + fetchTablesIntoLocalCache(local_context); for (const auto & [table_name, modify_time_and_storage] : local_tables_cache) if (!remove_or_detach_tables.count(table_name) && (!filter_by_table_name || filter_by_table_name(table_name))) @@ -89,16 +95,16 @@ DatabaseTablesIteratorPtr DatabaseConnectionMySQL::getTablesIterator(const Conte return std::make_unique(tables, database_name); } -bool DatabaseConnectionMySQL::isTableExist(const String & name, const Context & context) const +bool DatabaseConnectionMySQL::isTableExist(const String & name, ContextPtr local_context) const { - return bool(tryGetTable(name, context)); + return bool(tryGetTable(name, local_context)); } -StoragePtr DatabaseConnectionMySQL::tryGetTable(const String & mysql_table_name, const Context & context) const +StoragePtr DatabaseConnectionMySQL::tryGetTable(const String & mysql_table_name, ContextPtr local_context) const { std::lock_guard lock(mutex); - fetchTablesIntoLocalCache(context); + fetchTablesIntoLocalCache(local_context); if (!remove_or_detach_tables.count(mysql_table_name) && local_tables_cache.find(mysql_table_name) != local_tables_cache.end()) return local_tables_cache[mysql_table_name].second; @@ -153,11 +159,11 @@ static ASTPtr getCreateQueryFromStorage(const StoragePtr & storage, const ASTPtr return create_table_query; } -ASTPtr DatabaseConnectionMySQL::getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const +ASTPtr DatabaseConnectionMySQL::getCreateTableQueryImpl(const String & table_name, ContextPtr local_context, bool throw_on_error) const { std::lock_guard lock(mutex); - fetchTablesIntoLocalCache(context); + fetchTablesIntoLocalCache(local_context); if (local_tables_cache.find(table_name) == local_tables_cache.end()) { @@ -174,7 +180,7 @@ time_t DatabaseConnectionMySQL::getObjectMetadataModificationTime(const String & { std::lock_guard lock(mutex); - fetchTablesIntoLocalCache(global_context); + fetchTablesIntoLocalCache(getContext()); if (local_tables_cache.find(table_name) == local_tables_cache.end()) throw Exception("MySQL table " + database_name_in_mysql + "." 
+ table_name + " doesn't exist.", ErrorCodes::UNKNOWN_TABLE); @@ -190,12 +196,12 @@ ASTPtr DatabaseConnectionMySQL::getCreateDatabaseQuery() const return create_query; } -void DatabaseConnectionMySQL::fetchTablesIntoLocalCache(const Context & context) const +void DatabaseConnectionMySQL::fetchTablesIntoLocalCache(ContextPtr local_context) const { - const auto & tables_with_modification_time = fetchTablesWithModificationTime(); + const auto & tables_with_modification_time = fetchTablesWithModificationTime(local_context); destroyLocalCacheExtraTables(tables_with_modification_time); - fetchLatestTablesStructureIntoCache(tables_with_modification_time, context); + fetchLatestTablesStructureIntoCache(tables_with_modification_time, local_context); } void DatabaseConnectionMySQL::destroyLocalCacheExtraTables(const std::map & tables_with_modification_time) const @@ -212,7 +218,8 @@ void DatabaseConnectionMySQL::destroyLocalCacheExtraTables(const std::map &tables_modification_time, const Context & context) const +void DatabaseConnectionMySQL::fetchLatestTablesStructureIntoCache( + const std::map & tables_modification_time, ContextPtr local_context) const { std::vector wait_update_tables_name; for (const auto & table_modification_time : tables_modification_time) @@ -224,7 +231,7 @@ void DatabaseConnectionMySQL::fetchLatestTablesStructureIntoCache(const std::map wait_update_tables_name.emplace_back(table_modification_time.first); } - std::map tables_and_columns = fetchTablesColumnsList(wait_update_tables_name, context); + std::map tables_and_columns = fetchTablesColumnsList(wait_update_tables_name, local_context); for (const auto & table_and_columns : tables_and_columns) { @@ -241,11 +248,11 @@ void DatabaseConnectionMySQL::fetchLatestTablesStructureIntoCache(const std::map local_tables_cache[table_name] = std::make_pair(table_modification_time, StorageMySQL::create( StorageID(database_name, table_name), std::move(mysql_pool), database_name_in_mysql, table_name, - false, "", ColumnsDescription{columns_name_and_type}, ConstraintsDescription{}, global_context)); + false, "", ColumnsDescription{columns_name_and_type}, ConstraintsDescription{}, getContext())); } } -std::map DatabaseConnectionMySQL::fetchTablesWithModificationTime() const +std::map DatabaseConnectionMySQL::fetchTablesWithModificationTime(ContextPtr local_context) const { Block tables_status_sample_block { @@ -261,7 +268,8 @@ std::map DatabaseConnectionMySQL::fetchTablesWithModificationTim " WHERE TABLE_SCHEMA = " << quote << database_name_in_mysql; std::map tables_with_modification_time; - MySQLBlockInputStream result(mysql_pool.get(), query.str(), tables_status_sample_block, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(local_context->getSettingsRef()); + MySQLBlockInputStream result(mysql_pool.get(), query.str(), tables_status_sample_block, mysql_input_stream_settings); while (Block block = result.read()) { @@ -276,15 +284,16 @@ std::map DatabaseConnectionMySQL::fetchTablesWithModificationTim return tables_with_modification_time; } -std::map DatabaseConnectionMySQL::fetchTablesColumnsList(const std::vector & tables_name, const Context & context) const +std::map +DatabaseConnectionMySQL::fetchTablesColumnsList(const std::vector & tables_name, ContextPtr local_context) const { - const auto & settings = context.getSettingsRef(); + const auto & settings = local_context->getSettingsRef(); return DB::fetchTablesColumnsList( mysql_pool, database_name_in_mysql, tables_name, - settings.external_table_functions_use_nulls, + 
settings, database_settings->mysql_datatypes_support_level); } @@ -303,7 +312,7 @@ void DatabaseConnectionMySQL::shutdown() local_tables_cache.clear(); } -void DatabaseConnectionMySQL::drop(const Context & /*context*/) +void DatabaseConnectionMySQL::drop(ContextPtr /*context*/) { Poco::File(getMetadataPath()).remove(true); } @@ -378,7 +387,7 @@ String DatabaseConnectionMySQL::getMetadataPath() const return metadata_path; } -void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_attach*/) +void DatabaseConnectionMySQL::loadStoredObjects(ContextPtr, bool, bool /*force_attach*/) { std::lock_guard lock{mutex}; @@ -395,7 +404,7 @@ void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_at } } -void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const String & table_name) +void DatabaseConnectionMySQL::detachTablePermanently(ContextPtr, const String & table_name) { std::lock_guard lock{mutex}; @@ -429,9 +438,9 @@ void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const Stri table_iter->second.second->is_dropped = true; } -void DatabaseConnectionMySQL::dropTable(const Context & context, const String & table_name, bool /*no_delay*/) +void DatabaseConnectionMySQL::dropTable(ContextPtr local_context, const String & table_name, bool /*no_delay*/) { - detachTablePermanently(context, table_name); + detachTablePermanently(local_context, table_name); } DatabaseConnectionMySQL::~DatabaseConnectionMySQL() @@ -456,7 +465,7 @@ DatabaseConnectionMySQL::~DatabaseConnectionMySQL() } } -void DatabaseConnectionMySQL::createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) +void DatabaseConnectionMySQL::createTable(ContextPtr, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) { const auto & create = create_query->as(); @@ -467,7 +476,7 @@ void DatabaseConnectionMySQL::createTable(const Context &, const String & table_ /// XXX: hack /// In order to prevent users from broken the table structure by executing attach table database_name.table_name (...) /// we should compare the old and new create_query to make them completely consistent - const auto & origin_create_query = getCreateTableQuery(table_name, global_context); + const auto & origin_create_query = getCreateTableQuery(table_name, getContext()); origin_create_query->as()->attach = true; if (queryToString(origin_create_query) != queryToString(create_query)) diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.h b/src/Databases/MySQL/DatabaseConnectionMySQL.h index d0a5c041d7b..7e81003e9a9 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.h +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -30,15 +31,19 @@ enum class MySQLDataTypesSupport; * It doesn't make any manipulations with filesystem. 
* All tables are created by calling code after real-time pull-out structure from remote MySQL */ -class DatabaseConnectionMySQL final : public IDatabase +class DatabaseConnectionMySQL final : public IDatabase, WithContext { public: ~DatabaseConnectionMySQL() override; DatabaseConnectionMySQL( - const Context & context, const String & database_name, const String & metadata_path, - const ASTStorage * database_engine_define, const String & database_name_in_mysql, std::unique_ptr settings_, - mysqlxx::Pool && pool); + ContextPtr context, + const String & database_name, + const String & metadata_path, + const ASTStorage * database_engine_define, + const String & database_name_in_mysql, + std::unique_ptr settings_, + mysqlxx::PoolWithFailover && pool); String getEngineName() const override { return "MySQL"; } @@ -50,39 +55,38 @@ public: bool empty() const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; ASTPtr getCreateDatabaseQuery() const override; - bool isTableExist(const String & name, const Context & context) const override; + bool isTableExist(const String & name, ContextPtr context) const override; - StoragePtr tryGetTable(const String & name, const Context & context) const override; + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; time_t getObjectMetadataModificationTime(const String & name) const override; void shutdown() override; - void drop(const Context & /*context*/) override; + void drop(ContextPtr /*context*/) override; String getMetadataPath() const override; - void createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override; + void createTable(ContextPtr, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override; - void loadStoredObjects(Context &, bool, bool force_attach) override; + void loadStoredObjects(ContextPtr, bool, bool force_attach) override; StoragePtr detachTable(const String & table_name) override; - void detachTablePermanently(const Context & context, const String & table_name) override; + void detachTablePermanently(ContextPtr context, const String & table_name) override; - void dropTable(const Context & context, const String & table_name, bool no_delay) override; + void dropTable(ContextPtr context, const String & table_name, bool no_delay) override; void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override; protected: - ASTPtr getCreateTableQueryImpl(const String & name, const Context & context, bool throw_on_error) const override; + ASTPtr getCreateTableQueryImpl(const String & name, ContextPtr context, bool throw_on_error) const override; private: - const Context & global_context; String metadata_path; ASTPtr database_engine_define; String database_name_in_mysql; @@ -91,7 +95,7 @@ private: std::atomic quit{false}; std::condition_variable cond; - using MySQLPool = mysqlxx::Pool; + using MySQLPool = mysqlxx::PoolWithFailover; using ModifyTimeAndStorage = std::pair; mutable MySQLPool mysql_pool; @@ -102,15 +106,15 @@ private: void cleanOutdatedTables(); - void fetchTablesIntoLocalCache(const Context & context) const; + void fetchTablesIntoLocalCache(ContextPtr context) const; - std::map fetchTablesWithModificationTime() const; + std::map 
fetchTablesWithModificationTime(ContextPtr local_context) const; - std::map fetchTablesColumnsList(const std::vector & tables_name, const Context & context) const; + std::map fetchTablesColumnsList(const std::vector & tables_name, ContextPtr context) const; void destroyLocalCacheExtraTables(const std::map & tables_with_modification_time) const; - void fetchLatestTablesStructureIntoCache(const std::map & tables_modification_time, const Context & context) const; + void fetchLatestTablesStructureIntoCache(const std::map & tables_modification_time, ContextPtr context) const; ThreadFromGlobalPool thread; }; diff --git a/src/Databases/MySQL/DatabaseMaterializeMySQL.cpp b/src/Databases/MySQL/DatabaseMaterializeMySQL.cpp index 6a9f1e37f8e..62a66b22c93 100644 --- a/src/Databases/MySQL/DatabaseMaterializeMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMaterializeMySQL.cpp @@ -26,27 +26,40 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -template<> +template <> DatabaseMaterializeMySQL::DatabaseMaterializeMySQL( - const Context & context, const String & database_name_, const String & metadata_path_, UUID /*uuid*/, - const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, std::unique_ptr settings_) - : DatabaseOrdinary(database_name_ - , metadata_path_ - , "data/" + escapeForFileName(database_name_) + "/" - , "DatabaseMaterializeMySQL (" + database_name_ + ")", context - ) + ContextPtr context_, + const String & database_name_, + const String & metadata_path_, + UUID /*uuid*/, + const String & mysql_database_name_, + mysqlxx::Pool && pool_, + MySQLClient && client_, + std::unique_ptr settings_) + : DatabaseOrdinary( + database_name_, + metadata_path_, + "data/" + escapeForFileName(database_name_) + "/", + "DatabaseMaterializeMySQL (" + database_name_ + ")", + context_) , settings(std::move(settings_)) - , materialize_thread(context, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), settings.get()) + , materialize_thread(context_, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), settings.get()) { } -template<> +template <> DatabaseMaterializeMySQL::DatabaseMaterializeMySQL( - const Context & context, const String & database_name_, const String & metadata_path_, UUID uuid, - const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, std::unique_ptr settings_) - : DatabaseAtomic(database_name_, metadata_path_, uuid, "DatabaseMaterializeMySQL (" + database_name_ + ")", context) + ContextPtr context_, + const String & database_name_, + const String & metadata_path_, + UUID uuid, + const String & mysql_database_name_, + mysqlxx::Pool && pool_, + MySQLClient && client_, + std::unique_ptr settings_) + : DatabaseAtomic(database_name_, metadata_path_, uuid, "DatabaseMaterializeMySQL (" + database_name_ + ")", context_) , settings(std::move(settings_)) - , materialize_thread(context, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), settings.get()) + , materialize_thread(context_, database_name_, mysql_database_name_, std::move(pool_), std::move(client_), settings.get()) { } @@ -79,35 +92,28 @@ void DatabaseMaterializeMySQL::setException(const std::exception_ptr & exc } template -void DatabaseMaterializeMySQL::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) +void DatabaseMaterializeMySQL::loadStoredObjects(ContextPtr context_, bool has_force_restore_data_flag, bool force_attach) { - Base::loadStoredObjects(context, 
has_force_restore_data_flag, force_attach); - try - { - materialize_thread.startSynchronization(); - started_up = true; - } - catch (...) - { - tryLogCurrentException(Base::log, "Cannot load MySQL nested database stored objects."); + Base::loadStoredObjects(context_, has_force_restore_data_flag, force_attach); + if (!force_attach) + materialize_thread.assertMySQLAvailable(); - if (!force_attach) - throw; - } + materialize_thread.startSynchronization(); + started_up = true; } template -void DatabaseMaterializeMySQL::createTable(const Context & context, const String & name, const StoragePtr & table, const ASTPtr & query) +void DatabaseMaterializeMySQL::createTable(ContextPtr context_, const String & name, const StoragePtr & table, const ASTPtr & query) { assertCalledFromSyncThreadOrDrop("create table"); - Base::createTable(context, name, table, query); + Base::createTable(context_, name, table, query); } template -void DatabaseMaterializeMySQL::dropTable(const Context & context, const String & name, bool no_delay) +void DatabaseMaterializeMySQL::dropTable(ContextPtr context_, const String & name, bool no_delay) { assertCalledFromSyncThreadOrDrop("drop table"); - Base::dropTable(context, name, no_delay); + Base::dropTable(context_, name, no_delay); } template @@ -125,7 +131,7 @@ StoragePtr DatabaseMaterializeMySQL::detachTable(const String & name) } template -void DatabaseMaterializeMySQL::renameTable(const Context & context, const String & name, IDatabase & to_database, const String & to_name, bool exchange, bool dictionary) +void DatabaseMaterializeMySQL::renameTable(ContextPtr context_, const String & name, IDatabase & to_database, const String & to_name, bool exchange, bool dictionary) { assertCalledFromSyncThreadOrDrop("rename table"); @@ -138,18 +144,18 @@ void DatabaseMaterializeMySQL::renameTable(const Context & context, const if (to_database.getDatabaseName() != Base::getDatabaseName()) throw Exception("Cannot rename with other database for MaterializeMySQL database.", ErrorCodes::NOT_IMPLEMENTED); - Base::renameTable(context, name, *this, to_name, exchange, dictionary); + Base::renameTable(context_, name, *this, to_name, exchange, dictionary); } template -void DatabaseMaterializeMySQL::alterTable(const Context & context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) +void DatabaseMaterializeMySQL::alterTable(ContextPtr context_, const StorageID & table_id, const StorageInMemoryMetadata & metadata) { assertCalledFromSyncThreadOrDrop("alter table"); - Base::alterTable(context, table_id, metadata); + Base::alterTable(context_, table_id, metadata); } template -void DatabaseMaterializeMySQL::drop(const Context & context) +void DatabaseMaterializeMySQL::drop(ContextPtr context_) { /// Remove metadata info Poco::File metadata(Base::getMetadataPath() + "/.metadata"); @@ -157,15 +163,15 @@ void DatabaseMaterializeMySQL::drop(const Context & context) if (metadata.exists()) metadata.remove(false); - Base::drop(context); + Base::drop(context_); } template -StoragePtr DatabaseMaterializeMySQL::tryGetTable(const String & name, const Context & context) const +StoragePtr DatabaseMaterializeMySQL::tryGetTable(const String & name, ContextPtr context_) const { if (!MaterializeMySQLSyncThread::isMySQLSyncThread()) { - StoragePtr nested_storage = Base::tryGetTable(name, context); + StoragePtr nested_storage = Base::tryGetTable(name, context_); if (!nested_storage) return {}; @@ -173,19 +179,20 @@ StoragePtr DatabaseMaterializeMySQL::tryGetTable(const String & name, cons return 
std::make_shared(std::move(nested_storage), this); } - return Base::tryGetTable(name, context); + return Base::tryGetTable(name, context_); } -template -DatabaseTablesIteratorPtr DatabaseMaterializeMySQL::getTablesIterator(const Context & context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) +template +DatabaseTablesIteratorPtr +DatabaseMaterializeMySQL::getTablesIterator(ContextPtr context_, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) { if (!MaterializeMySQLSyncThread::isMySQLSyncThread()) { - DatabaseTablesIteratorPtr iterator = Base::getTablesIterator(context, filter_by_table_name); + DatabaseTablesIteratorPtr iterator = Base::getTablesIterator(context_, filter_by_table_name); return std::make_unique(std::move(iterator), this); } - return Base::getTablesIterator(context, filter_by_table_name); + return Base::getTablesIterator(context_, filter_by_table_name); } template diff --git a/src/Databases/MySQL/DatabaseMaterializeMySQL.h b/src/Databases/MySQL/DatabaseMaterializeMySQL.h index e1229269a33..cd9fe640239 100644 --- a/src/Databases/MySQL/DatabaseMaterializeMySQL.h +++ b/src/Databases/MySQL/DatabaseMaterializeMySQL.h @@ -23,7 +23,7 @@ class DatabaseMaterializeMySQL : public Base public: DatabaseMaterializeMySQL( - const Context & context, const String & database_name_, const String & metadata_path_, UUID uuid, + ContextPtr context, const String & database_name_, const String & metadata_path_, UUID uuid, const String & mysql_database_name_, mysqlxx::Pool && pool_, MySQLClient && client_, std::unique_ptr settings_); @@ -43,25 +43,25 @@ protected: public: String getEngineName() const override { return "MaterializeMySQL"; } - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; + void loadStoredObjects(ContextPtr context_, bool has_force_restore_data_flag, bool force_attach) override; - void createTable(const Context & context, const String & name, const StoragePtr & table, const ASTPtr & query) override; + void createTable(ContextPtr context_, const String & name, const StoragePtr & table, const ASTPtr & query) override; - void dropTable(const Context & context, const String & name, bool no_delay) override; + void dropTable(ContextPtr context_, const String & name, bool no_delay) override; void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; StoragePtr detachTable(const String & name) override; - void renameTable(const Context & context, const String & name, IDatabase & to_database, const String & to_name, bool exchange, bool dictionary) override; + void renameTable(ContextPtr context_, const String & name, IDatabase & to_database, const String & to_name, bool exchange, bool dictionary) override; - void alterTable(const Context & context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; + void alterTable(ContextPtr context_, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; - void drop(const Context & context) override; + void drop(ContextPtr context_) override; - StoragePtr tryGetTable(const String & name, const Context & context) const override; + StoragePtr tryGetTable(const String & name, ContextPtr context_) const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context_, const 
DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) override; void assertCalledFromSyncThreadOrDrop(const char * method) const; diff --git a/src/Databases/MySQL/FetchTablesColumnsList.cpp b/src/Databases/MySQL/FetchTablesColumnsList.cpp index 3e25c703a1d..cfd01d4ddc4 100644 --- a/src/Databases/MySQL/FetchTablesColumnsList.cpp +++ b/src/Databases/MySQL/FetchTablesColumnsList.cpp @@ -41,10 +41,10 @@ namespace DB { std::map fetchTablesColumnsList( - mysqlxx::Pool & pool, + mysqlxx::PoolWithFailover & pool, const String & database_name, const std::vector & tables_name, - bool external_table_functions_use_nulls, + const Settings & settings, MultiEnum type_support) { std::map tables_and_columns; @@ -72,13 +72,18 @@ std::map fetchTablesColumnsList( " IS_NULLABLE = 'YES' AS is_nullable," " COLUMN_TYPE LIKE '%unsigned' AS is_unsigned," " CHARACTER_MAXIMUM_LENGTH AS length," - " NUMERIC_PRECISION as ''," + " NUMERIC_PRECISION as numeric_precision," " IF(ISNULL(NUMERIC_SCALE), DATETIME_PRECISION, NUMERIC_SCALE) AS scale" // we know DATETIME_PRECISION as a scale in CH " FROM INFORMATION_SCHEMA.COLUMNS" - " WHERE TABLE_SCHEMA = " << quote << database_name - << " AND TABLE_NAME IN " << toQueryStringWithQuote(tables_name) << " ORDER BY ORDINAL_POSITION"; + " WHERE "; - MySQLBlockInputStream result(pool.get(), query.str(), tables_columns_sample_block, DEFAULT_BLOCK_SIZE); + if (!database_name.empty()) + query << " TABLE_SCHEMA = " << quote << database_name << " AND "; + + query << " TABLE_NAME IN " << toQueryStringWithQuote(tables_name) << " ORDER BY ORDINAL_POSITION"; + + StreamSettings mysql_input_stream_settings(settings); + MySQLBlockInputStream result(pool.get(), query.str(), tables_columns_sample_block, mysql_input_stream_settings); while (Block block = result.read()) { const auto & table_name_col = *block.getByPosition(0).column; @@ -99,7 +104,7 @@ std::map fetchTablesColumnsList( convertMySQLDataType( type_support, column_type_col[i].safeGet(), - external_table_functions_use_nulls && is_nullable_col[i].safeGet(), + settings.external_table_functions_use_nulls && is_nullable_col[i].safeGet(), is_unsigned_col[i].safeGet(), char_max_length_col[i].safeGet(), precision_col[i].safeGet(), diff --git a/src/Databases/MySQL/FetchTablesColumnsList.h b/src/Databases/MySQL/FetchTablesColumnsList.h index 52191c2ecb8..55f18e0115f 100644 --- a/src/Databases/MySQL/FetchTablesColumnsList.h +++ b/src/Databases/MySQL/FetchTablesColumnsList.h @@ -3,7 +3,7 @@ #include "config_core.h" #if USE_MYSQL -#include +#include #include #include @@ -12,15 +12,16 @@ #include #include +#include namespace DB { std::map fetchTablesColumnsList( - mysqlxx::Pool & pool, + mysqlxx::PoolWithFailover & pool, const String & database_name, const std::vector & tables_name, - bool external_table_functions_use_nulls, + const Settings & settings, MultiEnum type_support); } diff --git a/src/Databases/MySQL/MaterializeMetadata.cpp b/src/Databases/MySQL/MaterializeMetadata.cpp index 84f44771bf1..c389ab5a1b0 100644 --- a/src/Databases/MySQL/MaterializeMetadata.cpp +++ b/src/Databases/MySQL/MaterializeMetadata.cpp @@ -24,7 +24,8 @@ namespace ErrorCodes } static std::unordered_map fetchTablesCreateQuery( - const mysqlxx::PoolWithFailover::Entry & connection, const String & database_name, const std::vector & fetch_tables) + const mysqlxx::PoolWithFailover::Entry & connection, const String & database_name, + const std::vector & fetch_tables, const Settings & global_settings) { std::unordered_map tables_create_query; for (const auto & 
fetch_table_name : fetch_tables) @@ -34,9 +35,10 @@ static std::unordered_map fetchTablesCreateQuery( {std::make_shared(), "Create Table"}, }; + StreamSettings mysql_input_stream_settings(global_settings, false, true); MySQLBlockInputStream show_create_table( connection, "SHOW CREATE TABLE " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(fetch_table_name), - show_create_table_header, DEFAULT_BLOCK_SIZE, false, true); + show_create_table_header, mysql_input_stream_settings); Block create_query_block = show_create_table.read(); if (!create_query_block || create_query_block.rows() != 1) @@ -49,13 +51,14 @@ static std::unordered_map fetchTablesCreateQuery( } -static std::vector fetchTablesInDB(const mysqlxx::PoolWithFailover::Entry & connection, const std::string & database) +static std::vector fetchTablesInDB(const mysqlxx::PoolWithFailover::Entry & connection, const std::string & database, const Settings & global_settings) { Block header{{std::make_shared(), "table_name"}}; - String query = "SELECT TABLE_NAME AS table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = " + quoteString(database); + String query = "SELECT TABLE_NAME AS table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE != 'VIEW' AND TABLE_SCHEMA = " + quoteString(database); std::vector tables_in_db; - MySQLBlockInputStream input(connection, query, header, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(global_settings); + MySQLBlockInputStream input(connection, query, header, mysql_input_stream_settings); while (Block block = input.read()) { @@ -77,7 +80,8 @@ void MaterializeMetadata::fetchMasterStatus(mysqlxx::PoolWithFailover::Entry & c {std::make_shared(), "Executed_Gtid_Set"}, }; - MySQLBlockInputStream input(connection, "SHOW MASTER STATUS;", header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream input(connection, "SHOW MASTER STATUS;", header, mysql_input_stream_settings); Block master_status = input.read(); if (!master_status || master_status.rows() != 1) @@ -99,7 +103,8 @@ void MaterializeMetadata::fetchMasterVariablesValue(const mysqlxx::PoolWithFailo }; const String & fetch_query = "SHOW VARIABLES WHERE Variable_name = 'binlog_checksum'"; - MySQLBlockInputStream variables_input(connection, fetch_query, variables_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream variables_input(connection, fetch_query, variables_header, mysql_input_stream_settings); while (Block variables_block = variables_input.read()) { @@ -114,7 +119,7 @@ void MaterializeMetadata::fetchMasterVariablesValue(const mysqlxx::PoolWithFailo } } -static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & connection, WriteBuffer & out) +static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & connection, const Settings & global_settings, WriteBuffer & out) { Block sync_user_privs_header { @@ -122,7 +127,8 @@ static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & conne }; String grants_query, sub_privs; - MySQLBlockInputStream input(connection, "SHOW GRANTS FOR CURRENT_USER();", sync_user_privs_header, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(global_settings); + MySQLBlockInputStream input(connection, "SHOW GRANTS FOR CURRENT_USER();", sync_user_privs_header, mysql_input_stream_settings); while (Block block = input.read()) { for (size_t index = 0; index < block.rows(); ++index) 
@@ -146,11 +152,11 @@ static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & conne return false; } -static void checkSyncUserPriv(const mysqlxx::PoolWithFailover::Entry & connection) +static void checkSyncUserPriv(const mysqlxx::PoolWithFailover::Entry & connection, const Settings & global_settings) { WriteBufferFromOwnString out; - if (!checkSyncUserPrivImpl(connection, out)) + if (!checkSyncUserPrivImpl(connection, global_settings, out)) throw Exception("MySQL SYNC USER ACCESS ERR: mysql sync user needs " "at least GLOBAL PRIVILEGES:'RELOAD, REPLICATION SLAVE, REPLICATION CLIENT' " "and SELECT PRIVILEGE on MySQL Database." @@ -159,12 +165,16 @@ static void checkSyncUserPriv(const mysqlxx::PoolWithFailover::Entry & connectio bool MaterializeMetadata::checkBinlogFileExists(const mysqlxx::PoolWithFailover::Entry & connection) const { + if (binlog_file.empty()) + return false; + Block logs_header { {std::make_shared(), "Log_name"}, {std::make_shared(), "File_size"} }; - MySQLBlockInputStream input(connection, "SHOW MASTER LOGS", logs_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream input(connection, "SHOW MASTER LOGS", logs_header, mysql_input_stream_settings); while (Block block = input.read()) { @@ -219,13 +229,8 @@ void MaterializeMetadata::transaction(const MySQLReplication::Position & positio commitMetadata(std::move(fun), persistent_tmp_path, persistent_path); } -MaterializeMetadata::MaterializeMetadata( - mysqlxx::PoolWithFailover::Entry & connection, const String & path_, - const String & database, bool & opened_transaction) - : persistent_path(path_) +MaterializeMetadata::MaterializeMetadata(const String & path_, const Settings & settings_) : persistent_path(path_), settings(settings_) { - checkSyncUserPriv(connection); - if (Poco::File(persistent_path).exists()) { ReadBufferFromFile in(persistent_path, DBMS_DEFAULT_BUFFER_SIZE); @@ -239,9 +244,17 @@ MaterializeMetadata::MaterializeMetadata( assertString("\nData Version:\t", in); readIntText(data_version, in); - if (checkBinlogFileExists(connection)) - return; } +} + +void MaterializeMetadata::startReplication( + mysqlxx::PoolWithFailover::Entry & connection, const String & database, + bool & opened_transaction, std::unordered_map & need_dumping_tables) +{ + checkSyncUserPriv(connection, settings); + + if (checkBinlogFileExists(connection)) + return; bool locked_tables = false; @@ -257,7 +270,7 @@ MaterializeMetadata::MaterializeMetadata( connection->query("START TRANSACTION /*!40100 WITH CONSISTENT SNAPSHOT */;").execute(); opened_transaction = true; - need_dumping_tables = fetchTablesCreateQuery(connection, database, fetchTablesInDB(connection, database)); + need_dumping_tables = fetchTablesCreateQuery(connection, database, fetchTablesInDB(connection, database, settings), settings); connection->query("UNLOCK TABLES;").execute(); } catch (...) 
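In the `MaterializeMetadata` hunks above, the constructor no longer takes a connection: it only restores the persisted binlog state from disk, while the connection-dependent work (sync-user privilege check, binlog-file check, collecting the tables that need a full dump) moves into the new `startReplication()` method, and `need_dumping_tables` becomes an out-parameter instead of a member. A rough sketch of the new call order, using placeholder types rather than the real mysqlxx/ClickHouse classes:

```cpp
#include <string>
#include <unordered_map>

struct Connection {};   // stands in for mysqlxx::PoolWithFailover::Entry
struct Settings {};     // stands in for the copied DB::Settings member

struct MaterializeMetadataSketch
{
    std::string persistent_path;
    Settings settings;

    MaterializeMetadataSketch(std::string path_, Settings settings_)
        : persistent_path(std::move(path_)), settings(std::move(settings_))
    {
        // the constructor now only restores binlog file/position/GTID set from disk, if present
    }

    void startReplication(
        Connection & /*connection*/,
        const std::string & /*mysql_database*/,
        bool & opened_transaction,
        std::unordered_map<std::string, std::string> & need_dumping_tables)
    {
        // 1. check the sync user's privileges
        // 2. return early if the persisted binlog file still exists on the master
        // 3. otherwise lock tables, read master status, open a consistent-snapshot
        //    transaction and collect the CREATE TABLE statements to dump
        opened_transaction = true;
        need_dumping_tables.clear();
    }
};

int main()
{
    Connection connection;
    MaterializeMetadataSketch metadata("/path/to/.metadata", Settings{});

    bool opened_transaction = false;
    std::unordered_map<std::string, std::string> need_dumping_tables;  // no longer a member
    metadata.startReplication(connection, "mysql_db", opened_transaction, need_dumping_tables);
}
```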
diff --git a/src/Databases/MySQL/MaterializeMetadata.h b/src/Databases/MySQL/MaterializeMetadata.h index f4cd2970fce..079786c261c 100644 --- a/src/Databases/MySQL/MaterializeMetadata.h +++ b/src/Databases/MySQL/MaterializeMetadata.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -25,6 +26,7 @@ namespace DB struct MaterializeMetadata { const String persistent_path; + const Settings settings; String binlog_file; UInt64 binlog_position; @@ -35,7 +37,6 @@ struct MaterializeMetadata size_t data_version = 1; size_t meta_version = 2; String binlog_checksum = "CRC32"; - std::unordered_map need_dumping_tables; void fetchMasterStatus(mysqlxx::PoolWithFailover::Entry & connection); @@ -45,9 +46,13 @@ struct MaterializeMetadata void transaction(const MySQLReplication::Position & position, const std::function & fun); - MaterializeMetadata( - mysqlxx::PoolWithFailover::Entry & connection, const String & path - , const String & database, bool & opened_transaction); + void startReplication( + mysqlxx::PoolWithFailover::Entry & connection, + const String & database, + bool & opened_transaction, + std::unordered_map & need_dumping_tables); + + MaterializeMetadata(const String & path_, const Settings & settings_); }; } diff --git a/src/Databases/MySQL/MaterializeMySQLSettings.h b/src/Databases/MySQL/MaterializeMySQLSettings.h index 07de219c72f..9bd05b5382b 100644 --- a/src/Databases/MySQL/MaterializeMySQLSettings.h +++ b/src/Databases/MySQL/MaterializeMySQLSettings.h @@ -14,7 +14,7 @@ class ASTStorage; M(UInt64, max_rows_in_buffers, DEFAULT_BLOCK_SIZE, "Max rows that data is allowed to cache in memory(for database and the cache data unable to query). when rows is exceeded, the data will be materialized", 0) \ M(UInt64, max_bytes_in_buffers, DBMS_DEFAULT_BUFFER_SIZE, "Max bytes that data is allowed to cache in memory(for database and the cache data unable to query). when rows is exceeded, the data will be materialized", 0) \ M(UInt64, max_flush_data_time, 1000, "Max milliseconds that data is allowed to cache in memory(for database and the cache data unable to query). when this time is exceeded, the data will be materialized", 0) \ - M(UInt64, max_wait_time_when_mysql_unavailable, 1000, "Dump full data retry interval when MySQL is not available(milliseconds).", 0) \ + M(Int64, max_wait_time_when_mysql_unavailable, 1000, "Retry interval when MySQL is not available (milliseconds). 
Negative value disable retry.", 0) \ M(Bool, allows_query_when_mysql_lost, false, "Allow query materialized table when mysql is lost.", 0) \ DECLARE_SETTINGS_TRAITS(MaterializeMySQLSettingsTraits, LIST_OF_MATERIALIZE_MODE_SETTINGS) diff --git a/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp index 08d170768f4..82161ac5c8d 100644 --- a/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp @@ -35,34 +35,36 @@ namespace ErrorCodes extern const int ILLEGAL_MYSQL_VARIABLE; extern const int SYNC_MYSQL_USER_ACCESS_ERROR; extern const int UNKNOWN_DATABASE; + extern const int UNKNOWN_EXCEPTION; + extern const int CANNOT_READ_ALL_DATA; } static constexpr auto MYSQL_BACKGROUND_THREAD_NAME = "MySQLDBSync"; -static Context createQueryContext(const Context & global_context) +static ContextPtr createQueryContext(ContextPtr context) { - Settings new_query_settings = global_context.getSettings(); + Settings new_query_settings = context->getSettings(); new_query_settings.insert_allow_materialized_columns = true; /// To avoid call AST::format /// TODO: We need to implement the format function for MySQLAST new_query_settings.enable_global_with_statement = false; - Context query_context(global_context); - query_context.setSettings(new_query_settings); + auto query_context = Context::createCopy(context); + query_context->setSettings(new_query_settings); CurrentThread::QueryScope query_scope(query_context); - query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - query_context.setCurrentQueryId(""); // generate random query_id + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + query_context->setCurrentQueryId(""); // generate random query_id return query_context; } -static BlockIO tryToExecuteQuery(const String & query_to_execute, Context & query_context, const String & database, const String & comment) +static BlockIO tryToExecuteQuery(const String & query_to_execute, ContextPtr query_context, const String & database, const String & comment) { try { if (!database.empty()) - query_context.setCurrentDatabase(database); + query_context->setCurrentDatabase(database); return executeQuery("/*" + comment + "*/ " + query_to_execute, query_context, true); } @@ -88,7 +90,7 @@ MaterializeMySQLSyncThread::~MaterializeMySQLSyncThread() } } -static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) +static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection, const Settings & settings) { Block variables_header{ {std::make_shared(), "Variable_name"}, @@ -102,19 +104,19 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) "OR (Variable_name = 'default_authentication_plugin' AND upper(Value) = 'MYSQL_NATIVE_PASSWORD') " "OR (Variable_name = 'log_bin_use_v1_row_events' AND upper(Value) = 'OFF');"; - MySQLBlockInputStream variables_input(connection, check_query, variables_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream variables_input(connection, check_query, variables_header, mysql_input_stream_settings); - Block variables_block = variables_input.read(); - if (!variables_block || variables_block.rows() != 5) + std::unordered_map variables_error_message{ + {"log_bin", "log_bin = 'ON'"}, + {"binlog_format", "binlog_format='ROW'"}, + {"binlog_row_image", "binlog_row_image='FULL'"}, + {"default_authentication_plugin", 
"default_authentication_plugin='mysql_native_password'"}, + {"log_bin_use_v1_row_events", "log_bin_use_v1_row_events='OFF'"} + }; + + while (Block variables_block = variables_input.read()) { - std::unordered_map variables_error_message{ - {"log_bin", "log_bin = 'ON'"}, - {"binlog_format", "binlog_format='ROW'"}, - {"binlog_row_image", "binlog_row_image='FULL'"}, - {"default_authentication_plugin", "default_authentication_plugin='mysql_native_password'"}, - {"log_bin_use_v1_row_events", "log_bin_use_v1_row_events='OFF'"} - }; - ColumnPtr variable_name_column = variables_block.getByName("Variable_name").column; for (size_t index = 0; index < variables_block.rows(); ++index) @@ -124,7 +126,10 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) if (error_message_it != variables_error_message.end()) variables_error_message.erase(error_message_it); } + } + if (!variables_error_message.empty()) + { bool first = true; WriteBufferFromOwnString error_message; error_message << "Illegal MySQL variables, the MaterializeMySQL engine requires "; @@ -141,10 +146,19 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) } MaterializeMySQLSyncThread::MaterializeMySQLSyncThread( - const Context & context, const String & database_name_, const String & mysql_database_name_, - mysqlxx::Pool && pool_, MySQLClient && client_, MaterializeMySQLSettings * settings_) - : log(&Poco::Logger::get("MaterializeMySQLSyncThread")), global_context(context.getGlobalContext()), database_name(database_name_) - , mysql_database_name(mysql_database_name_), pool(std::move(pool_)), client(std::move(client_)), settings(settings_) + ContextPtr context_, + const String & database_name_, + const String & mysql_database_name_, + mysqlxx::Pool && pool_, + MySQLClient && client_, + MaterializeMySQLSettings * settings_) + : WithContext(context_->getGlobalContext()) + , log(&Poco::Logger::get("MaterializeMySQLSyncThread")) + , database_name(database_name_) + , mysql_database_name(mysql_database_name_) + , pool(std::move(pool_)) + , client(std::move(client_)) + , settings(settings_) { query_prefix = "EXTERNAL DDL FROM MySQL(" + backQuoteIfNeed(database_name) + ", " + backQuoteIfNeed(mysql_database_name) + ") "; } @@ -155,32 +169,50 @@ void MaterializeMySQLSyncThread::synchronization() try { - if (std::optional metadata = prepareSynchronized()) + MaterializeMetadata metadata( + DatabaseCatalog::instance().getDatabase(database_name)->getMetadataPath() + "/.metadata", getContext()->getSettingsRef()); + bool need_reconnect = true; + + Stopwatch watch; + Buffers buffers(database_name); + + while (!isCancelled()) { - Stopwatch watch; - Buffers buffers(database_name); - - while (!isCancelled()) + if (need_reconnect) + { + if (!prepareSynchronized(metadata)) + break; + need_reconnect = false; + } + + /// TODO: add gc task for `sign = -1`(use alter table delete, execute by interval. need final state) + UInt64 max_flush_time = settings->max_flush_data_time; + + try { - /// TODO: add gc task for `sign = -1`(use alter table delete, execute by interval. 
need final state) - UInt64 max_flush_time = settings->max_flush_data_time; BinlogEventPtr binlog_event = client.readOneBinlogEvent(std::max(UInt64(1), max_flush_time - watch.elapsedMilliseconds())); + if (binlog_event) + onEvent(buffers, binlog_event, metadata); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::CANNOT_READ_ALL_DATA || settings->max_wait_time_when_mysql_unavailable < 0) + throw; - { - if (binlog_event) - onEvent(buffers, binlog_event, *metadata); - - if (watch.elapsedMilliseconds() > max_flush_time || buffers.checkThresholds( - settings->max_rows_in_buffer, settings->max_bytes_in_buffer, - settings->max_rows_in_buffers, settings->max_bytes_in_buffers) - ) - { - watch.restart(); - - if (!buffers.data.empty()) - flushBuffersData(buffers, *metadata); - } - } + flushBuffersData(buffers, metadata); + LOG_INFO(log, "Lost connection to MySQL"); + need_reconnect = true; + setSynchronizationThreadException(std::current_exception()); + sleepForMilliseconds(settings->max_wait_time_when_mysql_unavailable); + continue; + } + if (watch.elapsedMilliseconds() > max_flush_time || buffers.checkThresholds( + settings->max_rows_in_buffer, settings->max_bytes_in_buffer, + settings->max_rows_in_buffers, settings->max_bytes_in_buffers) + ) + { + watch.restart(); + flushBuffersData(buffers, metadata); } } } @@ -188,8 +220,7 @@ void MaterializeMySQLSyncThread::synchronization() { client.disconnect(); tryLogCurrentException(log); - auto db = DatabaseCatalog::instance().getDatabase(database_name); - setSynchronizationThreadException(db, std::current_exception()); + setSynchronizationThreadException(std::current_exception()); } } @@ -204,35 +235,32 @@ void MaterializeMySQLSyncThread::stopSynchronization() } void MaterializeMySQLSyncThread::startSynchronization() +{ + background_thread_pool = std::make_unique([this]() { synchronization(); }); +} + +void MaterializeMySQLSyncThread::assertMySQLAvailable() { try { - checkMySQLVariables(pool.get()); - background_thread_pool = std::make_unique([this]() { synchronization(); }); + checkMySQLVariables(pool.get(), getContext()->getSettingsRef()); } - catch (...) 
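The reworked synchronization loop above no longer lets a lost MySQL connection kill the sync thread: the read error is caught, buffered data is flushed, the error is recorded on the database, and the thread sleeps for `max_wait_time_when_mysql_unavailable` milliseconds before reconnecting, while a negative setting value keeps the old fail-fast behaviour. A simplified sketch of that retry policy (all names below are illustrative stand-ins, not the real classes):

```cpp
#include <chrono>
#include <functional>
#include <stdexcept>
#include <thread>

struct LostConnection : std::runtime_error { using std::runtime_error::runtime_error; };

/// One iteration of a simplified sync-loop body with the retry policy applied.
void runOnce(const std::function<void()> & read_and_apply_event,
             long long max_wait_when_unavailable_ms, bool & need_reconnect)
{
    try
    {
        read_and_apply_event();  // the real loop reads one binlog event and applies it
    }
    catch (const LostConnection &)
    {
        if (max_wait_when_unavailable_ms < 0)
            throw;  // a negative setting keeps the old fail-fast behaviour

        // the real code also flushes buffered rows and records the exception on the database
        need_reconnect = true;   // forces prepareSynchronized() on the next pass
        std::this_thread::sleep_for(std::chrono::milliseconds(max_wait_when_unavailable_ms));
    }
}

int main()
{
    bool need_reconnect = false;
    runOnce([] { throw LostConnection("lost connection to MySQL"); }, 1000, need_reconnect);
    return need_reconnect ? 0 : 1;
}
```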
+ catch (const mysqlxx::ConnectionFailed & e) { - try - { + if (e.errnum() == ER_ACCESS_DENIED_ERROR + || e.errnum() == ER_DBACCESS_DENIED_ERROR) + throw Exception("MySQL SYNC USER ACCESS ERR: mysql sync user needs " + "at least GLOBAL PRIVILEGES:'RELOAD, REPLICATION SLAVE, REPLICATION CLIENT' " + "and SELECT PRIVILEGE on Database " + mysql_database_name + , ErrorCodes::SYNC_MYSQL_USER_ACCESS_ERROR); + else if (e.errnum() == ER_BAD_DB_ERROR) + throw Exception("Unknown database '" + mysql_database_name + "' on MySQL", ErrorCodes::UNKNOWN_DATABASE); + else throw; - } - catch (mysqlxx::ConnectionFailed & e) - { - if (e.errnum() == ER_ACCESS_DENIED_ERROR - || e.errnum() == ER_DBACCESS_DENIED_ERROR) - throw Exception("MySQL SYNC USER ACCESS ERR: mysql sync user needs " - "at least GLOBAL PRIVILEGES:'RELOAD, REPLICATION SLAVE, REPLICATION CLIENT' " - "and SELECT PRIVILEGE on Database " + mysql_database_name - , ErrorCodes::SYNC_MYSQL_USER_ACCESS_ERROR); - else if (e.errnum() == ER_BAD_DB_ERROR) - throw Exception("Unknown database '" + mysql_database_name + "' on MySQL", ErrorCodes::UNKNOWN_DATABASE); - else - throw; - } } } -static inline void cleanOutdatedTables(const String & database_name, const Context & context) +static inline void cleanOutdatedTables(const String & database_name, ContextPtr context) { String cleaning_table_name; try @@ -242,7 +270,7 @@ static inline void cleanOutdatedTables(const String & database_name, const Conte for (auto iterator = clean_database->getTablesIterator(context); iterator->isValid(); iterator->next()) { - Context query_context = createQueryContext(context); + auto query_context = createQueryContext(context); String comment = "Materialize MySQL step 1: execute MySQL DDL for dump data"; cleaning_table_name = backQuoteIfNeed(database_name) + "." 
+ backQuoteIfNeed(iterator->name()); tryToExecuteQuery(" DROP TABLE " + cleaning_table_name, query_context, database_name, comment); @@ -255,7 +283,8 @@ static inline void cleanOutdatedTables(const String & database_name, const Conte } } -static inline BlockOutputStreamPtr getTableOutput(const String & database_name, const String & table_name, Context & query_context, bool insert_materialized = false) +static inline BlockOutputStreamPtr +getTableOutput(const String & database_name, const String & table_name, ContextPtr query_context, bool insert_materialized = false) { const StoragePtr & storage = DatabaseCatalog::instance().getTable(StorageID(database_name, table_name), query_context); @@ -285,24 +314,25 @@ static inline BlockOutputStreamPtr getTableOutput(const String & database_name, } static inline void dumpDataForTables( - mysqlxx::Pool::Entry & connection, MaterializeMetadata & master_info, + mysqlxx::Pool::Entry & connection, const std::unordered_map & need_dumping_tables, const String & query_prefix, const String & database_name, const String & mysql_database_name, - const Context & context, const std::function & is_cancelled) + ContextPtr context, const std::function & is_cancelled) { - auto iterator = master_info.need_dumping_tables.begin(); - for (; iterator != master_info.need_dumping_tables.end() && !is_cancelled(); ++iterator) + auto iterator = need_dumping_tables.begin(); + for (; iterator != need_dumping_tables.end() && !is_cancelled(); ++iterator) { try { const auto & table_name = iterator->first; - Context query_context = createQueryContext(context); + auto query_context = createQueryContext(context); String comment = "Materialize MySQL step 1: execute MySQL DDL for dump data"; tryToExecuteQuery(query_prefix + " " + iterator->second, query_context, database_name, comment); /// create table. auto out = std::make_shared(getTableOutput(database_name, table_name, query_context)); + StreamSettings mysql_input_stream_settings(context->getSettingsRef()); MySQLBlockInputStream input( connection, "SELECT * FROM " + backQuoteIfNeed(mysql_database_name) + "." 
+ backQuoteIfNeed(table_name), - out->getHeader(), DEFAULT_BLOCK_SIZE); + out->getHeader(), mysql_input_stream_settings); Stopwatch watch; copyData(input, *out, is_cancelled); @@ -329,7 +359,7 @@ static inline UInt32 randomNumber() return dist6(rng); } -std::optional MaterializeMySQLSyncThread::prepareSynchronized() +bool MaterializeMySQLSyncThread::prepareSynchronized(MaterializeMetadata & metadata) { bool opened_transaction = false; mysqlxx::PoolWithFailover::Entry connection; @@ -338,21 +368,34 @@ std::optional MaterializeMySQLSyncThread::prepareSynchroniz { try { - connection = pool.get(); + connection = pool.tryGet(); + if (connection.isNull()) + { + if (settings->max_wait_time_when_mysql_unavailable < 0) + throw Exception("Unable to connect to MySQL", ErrorCodes::UNKNOWN_EXCEPTION); + sleepForMilliseconds(settings->max_wait_time_when_mysql_unavailable); + continue; + } + opened_transaction = false; - MaterializeMetadata metadata( - connection, DatabaseCatalog::instance().getDatabase(database_name)->getMetadataPath() + "/.metadata", mysql_database_name, opened_transaction); + checkMySQLVariables(connection, getContext()->getSettingsRef()); + std::unordered_map need_dumping_tables; + metadata.startReplication(connection, mysql_database_name, opened_transaction, need_dumping_tables); - if (!metadata.need_dumping_tables.empty()) + if (!need_dumping_tables.empty()) { Position position; position.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set); metadata.transaction(position, [&]() { - cleanOutdatedTables(database_name, global_context); - dumpDataForTables(connection, metadata, query_prefix, database_name, mysql_database_name, global_context, [this] { return isCancelled(); }); + cleanOutdatedTables(database_name, getContext()); + dumpDataForTables( + connection, need_dumping_tables, query_prefix, database_name, mysql_database_name, getContext(), [this] + { + return isCancelled(); + }); }); const auto & position_message = [&]() @@ -369,7 +412,9 @@ std::optional MaterializeMySQLSyncThread::prepareSynchroniz client.connect(); client.startBinlogDumpGTID(randomNumber(), mysql_database_name, metadata.executed_gtid_set, metadata.binlog_checksum); - return metadata; + + setSynchronizationThreadException(nullptr); + return true; } catch (...) { @@ -382,20 +427,29 @@ std::optional MaterializeMySQLSyncThread::prepareSynchroniz { throw; } - catch (const mysqlxx::ConnectionFailed &) + catch (const mysqlxx::ConnectionFailed &) {} + catch (const mysqlxx::BadQuery & e) { - /// Avoid busy loop when MySQL is not available. - sleepForMilliseconds(settings->max_wait_time_when_mysql_unavailable); + // Lost connection to MySQL server during query + if (e.code() != CR_SERVER_LOST || settings->max_wait_time_when_mysql_unavailable < 0) + throw; } + + setSynchronizationThreadException(std::current_exception()); + /// Avoid busy loop when MySQL is not available. 
+ sleepForMilliseconds(settings->max_wait_time_when_mysql_unavailable); } } - return {}; + return false; } void MaterializeMySQLSyncThread::flushBuffersData(Buffers & buffers, MaterializeMetadata & metadata) { - metadata.transaction(client.getPosition(), [&]() { buffers.commit(global_context); }); + if (buffers.data.empty()) + return; + + metadata.transaction(client.getPosition(), [&]() { buffers.commit(getContext()); }); const auto & position_message = [&]() { @@ -628,21 +682,21 @@ void MaterializeMySQLSyncThread::onEvent(Buffers & buffers, const BinlogEventPtr if (receive_event->type() == MYSQL_WRITE_ROWS_EVENT) { WriteRowsEvent & write_rows_event = static_cast(*receive_event); - Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(write_rows_event.table, global_context); + Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(write_rows_event.table, getContext()); size_t bytes = onWriteOrDeleteData<1>(write_rows_event.rows, buffer->first, ++metadata.data_version); buffers.add(buffer->first.rows(), buffer->first.bytes(), write_rows_event.rows.size(), bytes); } else if (receive_event->type() == MYSQL_UPDATE_ROWS_EVENT) { UpdateRowsEvent & update_rows_event = static_cast(*receive_event); - Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(update_rows_event.table, global_context); + Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(update_rows_event.table, getContext()); size_t bytes = onUpdateData(update_rows_event.rows, buffer->first, ++metadata.data_version, buffer->second); buffers.add(buffer->first.rows(), buffer->first.bytes(), update_rows_event.rows.size(), bytes); } else if (receive_event->type() == MYSQL_DELETE_ROWS_EVENT) { DeleteRowsEvent & delete_rows_event = static_cast(*receive_event); - Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(delete_rows_event.table, global_context); + Buffers::BufferAndSortingColumnsPtr buffer = buffers.getTableDataBuffer(delete_rows_event.table, getContext()); size_t bytes = onWriteOrDeleteData<-1>(delete_rows_event.rows, buffer->first, ++metadata.data_version); buffers.add(buffer->first.rows(), buffer->first.bytes(), delete_rows_event.rows.size(), bytes); } @@ -651,7 +705,7 @@ void MaterializeMySQLSyncThread::onEvent(Buffers & buffers, const BinlogEventPtr QueryEvent & query_event = static_cast(*receive_event); Position position_before_ddl; position_before_ddl.update(metadata.binlog_position, metadata.binlog_file, metadata.executed_gtid_set); - metadata.transaction(position_before_ddl, [&]() { buffers.commit(global_context); }); + metadata.transaction(position_before_ddl, [&]() { buffers.commit(getContext()); }); metadata.transaction(client.getPosition(),[&](){ executeDDLAtomic(query_event); }); } else @@ -682,7 +736,7 @@ void MaterializeMySQLSyncThread::executeDDLAtomic(const QueryEvent & query_event { try { - Context query_context = createQueryContext(global_context); + auto query_context = createQueryContext(getContext()); String comment = "Materialize MySQL step 2: execute MySQL DDL for sync data"; String event_database = query_event.schema == mysql_database_name ? 
database_name : ""; tryToExecuteQuery(query_prefix + query_event.query, query_context, event_database, comment); @@ -705,6 +759,12 @@ bool MaterializeMySQLSyncThread::isMySQLSyncThread() return getThreadName() == MYSQL_BACKGROUND_THREAD_NAME; } +void MaterializeMySQLSyncThread::setSynchronizationThreadException(const std::exception_ptr & exception) +{ + auto db = DatabaseCatalog::instance().getDatabase(database_name); + DB::setSynchronizationThreadException(db, exception); +} + void MaterializeMySQLSyncThread::Buffers::add(size_t block_rows, size_t block_bytes, size_t written_rows, size_t written_bytes) { total_blocks_rows += written_rows; @@ -719,13 +779,13 @@ bool MaterializeMySQLSyncThread::Buffers::checkThresholds(size_t check_block_row || total_blocks_bytes >= check_total_bytes; } -void MaterializeMySQLSyncThread::Buffers::commit(const Context & context) +void MaterializeMySQLSyncThread::Buffers::commit(ContextPtr context) { try { for (auto & table_name_and_buffer : data) { - Context query_context = createQueryContext(context); + auto query_context = createQueryContext(context); OneBlockInputStream input(table_name_and_buffer.second->first); BlockOutputStreamPtr out = getTableOutput(database, table_name_and_buffer.first, query_context, true); copyData(input, *out); @@ -745,7 +805,7 @@ void MaterializeMySQLSyncThread::Buffers::commit(const Context & context) } MaterializeMySQLSyncThread::Buffers::BufferAndSortingColumnsPtr MaterializeMySQLSyncThread::Buffers::getTableDataBuffer( - const String & table_name, const Context & context) + const String & table_name, ContextPtr context) { const auto & iterator = data.find(table_name); if (iterator == data.end()) diff --git a/src/Databases/MySQL/MaterializeMySQLSyncThread.h b/src/Databases/MySQL/MaterializeMySQLSyncThread.h index 26934b87511..03958fe10cc 100644 --- a/src/Databases/MySQL/MaterializeMySQLSyncThread.h +++ b/src/Databases/MySQL/MaterializeMySQLSyncThread.h @@ -36,24 +36,29 @@ namespace DB * real-time pull incremental data: * We will pull the binlog event of MySQL to parse and execute when the full data synchronization is completed. 
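As the loop above shows, the sync thread accumulates binlog rows in per-table buffers and flushes them either when `max_flush_data_time` elapses or when `checkThresholds()` reports that a row or byte limit was reached. The sketch below illustrates that flush decision with made-up limit values; the real settings object and buffer bookkeeping are more elaborate:

```cpp
#include <cstddef>
#include <iostream>

struct BufferCounters
{
    std::size_t block_rows = 0;    // rows in a single table's buffered block
    std::size_t block_bytes = 0;   // bytes in a single table's buffered block
    std::size_t total_rows = 0;    // rows buffered across all tables
    std::size_t total_bytes = 0;   // bytes buffered across all tables

    bool checkThresholds(std::size_t max_block_rows, std::size_t max_block_bytes,
                         std::size_t max_total_rows, std::size_t max_total_bytes) const
    {
        return block_rows >= max_block_rows || block_bytes >= max_block_bytes
            || total_rows >= max_total_rows || total_bytes >= max_total_bytes;
    }
};

// Flush either on the time budget (max_flush_data_time) or on any size threshold.
// The limit values passed here are purely illustrative.
bool shouldFlush(const BufferCounters & counters, std::size_t elapsed_ms, std::size_t max_flush_time_ms)
{
    return elapsed_ms > max_flush_time_ms
        || counters.checkThresholds(/*max_rows_in_buffer*/ 65536, /*max_bytes_in_buffer*/ 1 << 20,
                                    /*max_rows_in_buffers*/ 65536, /*max_bytes_in_buffers*/ 1 << 20);
}

int main()
{
    BufferCounters counters;
    counters.total_rows = 100;
    counters.total_bytes = 4096;
    std::cout << std::boolalpha << shouldFlush(counters, /*elapsed_ms*/ 1500, /*max_flush_time_ms*/ 1000) << '\n';
}
```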
*/ -class MaterializeMySQLSyncThread +class MaterializeMySQLSyncThread : WithContext { public: ~MaterializeMySQLSyncThread(); MaterializeMySQLSyncThread( - const Context & context, const String & database_name_, const String & mysql_database_name_ - , mysqlxx::Pool && pool_, MySQLClient && client_, MaterializeMySQLSettings * settings_); + ContextPtr context, + const String & database_name_, + const String & mysql_database_name_, + mysqlxx::Pool && pool_, + MySQLClient && client_, + MaterializeMySQLSettings * settings_); void stopSynchronization(); void startSynchronization(); + void assertMySQLAvailable(); + static bool isMySQLSyncThread(); private: Poco::Logger * log; - const Context & global_context; String database_name; String mysql_database_name; @@ -69,6 +74,9 @@ private: const int ER_DBACCESS_DENIED_ERROR = 1044; const int ER_BAD_DB_ERROR = 1049; + // https://dev.mysql.com/doc/mysql-errors/8.0/en/client-error-reference.html + const int CR_SERVER_LOST = 2013; + struct Buffers { String database; @@ -85,20 +93,20 @@ private: Buffers(const String & database_) : database(database_) {} - void commit(const Context & context); + void commit(ContextPtr context); void add(size_t block_rows, size_t block_bytes, size_t written_rows, size_t written_bytes); bool checkThresholds(size_t check_block_rows, size_t check_block_bytes, size_t check_total_rows, size_t check_total_bytes) const; - BufferAndSortingColumnsPtr getTableDataBuffer(const String & table, const Context & context); + BufferAndSortingColumnsPtr getTableDataBuffer(const String & table, ContextPtr context); }; void synchronization(); bool isCancelled() { return sync_quit.load(std::memory_order_relaxed); } - std::optional prepareSynchronized(); + bool prepareSynchronized(MaterializeMetadata & metadata); void flushBuffersData(Buffers & buffers, MaterializeMetadata & metadata); @@ -107,6 +115,8 @@ private: std::atomic sync_quit{false}; std::unique_ptr background_thread_pool; void executeDDLAtomic(const QueryEvent & query_event); + + void setSynchronizationThreadException(const std::exception_ptr & exception); }; } diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp index 722b9c64edb..e12d7f975ce 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -16,6 +15,8 @@ #include #include #include +#include +#include namespace DB @@ -34,22 +35,22 @@ static const auto suffix = ".removed"; static const auto cleaner_reschedule_ms = 60000; DatabasePostgreSQL::DatabasePostgreSQL( - const Context & context, + ContextPtr context_, const String & metadata_path_, const ASTStorage * database_engine_define_, const String & dbname_, const String & postgres_dbname, - PostgreSQLConnectionPtr connection_, + postgres::PoolWithFailoverPtr connection_pool_, const bool cache_tables_) : IDatabase(dbname_) - , global_context(context.getGlobalContext()) + , WithContext(context_->getGlobalContext()) , metadata_path(metadata_path_) , database_engine_define(database_engine_define_->clone()) , dbname(postgres_dbname) - , connection(std::move(connection_)) + , connection_pool(std::move(connection_pool_)) , cache_tables(cache_tables_) { - cleaner_task = context.getSchedulePool().createTask("PostgreSQLCleanerTask", [this]{ removeOutdatedTables(); }); + cleaner_task = getContext()->getSchedulePool().createTask("PostgreSQLCleanerTask", [this]{ removeOutdatedTables(); }); 
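`DatabasePostgreSQL` above switches from a single long-lived connection to a `postgres::PoolWithFailover`, so each operation (`fetchTablesList`, `checkPostgresTable`, `fetchTable`) now borrows a connection only for the duration of its query. A toy sketch of that borrow-per-operation pattern follows; the real pool, with failover ordering and connection reuse, is considerably more involved:

```cpp
#include <memory>
#include <mutex>
#include <string>
#include <vector>

struct PgConnection
{
    std::string dsn;   // the real code wraps a pqxx connection instead
};

class SimplePool
{
public:
    explicit SimplePool(std::vector<std::string> dsns) : dsns_(std::move(dsns)) {}

    // Borrow a connection for the duration of one query. A real PoolWithFailover
    // would try replicas in priority order and reuse idle connections.
    std::shared_ptr<PgConnection> get()
    {
        std::lock_guard<std::mutex> lock(mutex_);
        return std::make_shared<PgConnection>(PgConnection{dsns_.front()});
    }

private:
    std::vector<std::string> dsns_;
    std::mutex mutex_;
};

// Mirrors the shape of DatabasePostgreSQL::fetchTablesList() after this patch:
// acquire a connection from the pool, run one catalog query, let it go.
std::vector<std::string> fetchTablesList(SimplePool & pool)
{
    auto connection = pool.get();
    // here the real code runs:
    //   SELECT tablename FROM pg_catalog.pg_tables
    //   WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'
    (void)connection;
    return {};
}

int main()
{
    SimplePool pool({"host=primary dbname=postgres", "host=replica dbname=postgres"});
    auto tables = fetchTablesList(pool);
    return static_cast<int>(tables.size());
}
```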
cleaner_task->deactivate(); } @@ -68,8 +69,7 @@ bool DatabasePostgreSQL::empty() const } -DatabaseTablesIteratorPtr DatabasePostgreSQL::getTablesIterator( - const Context & context, const FilterByNameFunction & /* filter_by_table_name */) +DatabaseTablesIteratorPtr DatabasePostgreSQL::getTablesIterator(ContextPtr local_context, const FilterByNameFunction & /* filter_by_table_name */) { std::lock_guard lock(mutex); @@ -78,7 +78,7 @@ DatabaseTablesIteratorPtr DatabasePostgreSQL::getTablesIterator( for (const auto & table_name : table_names) if (!detached_or_dropped.count(table_name)) - tables[table_name] = fetchTable(table_name, context, true); + tables[table_name] = fetchTable(table_name, local_context, true); return std::make_unique(tables, database_name); } @@ -89,7 +89,8 @@ std::unordered_set DatabasePostgreSQL::fetchTablesList() const std::unordered_set tables; std::string query = "SELECT tablename FROM pg_catalog.pg_tables " "WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'"; - pqxx::read_transaction tx(*connection->conn()); + auto connection = connection_pool->get(); + pqxx::read_transaction tx(connection->conn()); for (auto table_name : tx.stream(query)) tables.insert(std::get<0>(table_name)); @@ -107,7 +108,8 @@ bool DatabasePostgreSQL::checkPostgresTable(const String & table_name) const "PostgreSQL table name cannot contain single quote or backslash characters, passed {}", table_name); } - pqxx::nontransaction tx(*connection->conn()); + auto connection = connection_pool->get(); + pqxx::nontransaction tx(connection->conn()); try { @@ -132,7 +134,7 @@ bool DatabasePostgreSQL::checkPostgresTable(const String & table_name) const } -bool DatabasePostgreSQL::isTableExist(const String & table_name, const Context & /* context */) const +bool DatabasePostgreSQL::isTableExist(const String & table_name, ContextPtr /* context */) const { std::lock_guard lock(mutex); @@ -143,33 +145,33 @@ bool DatabasePostgreSQL::isTableExist(const String & table_name, const Context & } -StoragePtr DatabasePostgreSQL::tryGetTable(const String & table_name, const Context & context) const +StoragePtr DatabasePostgreSQL::tryGetTable(const String & table_name, ContextPtr local_context) const { std::lock_guard lock(mutex); if (!detached_or_dropped.count(table_name)) - return fetchTable(table_name, context, false); + return fetchTable(table_name, local_context, false); return StoragePtr{}; } -StoragePtr DatabasePostgreSQL::fetchTable(const String & table_name, const Context & context, const bool table_checked) const +StoragePtr DatabasePostgreSQL::fetchTable(const String & table_name, ContextPtr local_context, const bool table_checked) const { if (!cache_tables || !cached_tables.count(table_name)) { if (!table_checked && !checkPostgresTable(table_name)) return StoragePtr{}; - auto use_nulls = context.getSettingsRef().external_table_functions_use_nulls; - auto columns = fetchPostgreSQLTableStructure(connection->conn(), table_name, use_nulls); + auto use_nulls = local_context->getSettingsRef().external_table_functions_use_nulls; + auto columns = fetchPostgreSQLTableStructure(connection_pool->get(), doubleQuoteString(table_name), use_nulls); if (!columns) return StoragePtr{}; auto storage = StoragePostgreSQL::create( - StorageID(database_name, table_name), table_name, std::make_shared(connection->conn_str()), - ColumnsDescription{*columns}, ConstraintsDescription{}, context); + StorageID(database_name, table_name), *connection_pool, table_name, + ColumnsDescription{*columns}, 
ConstraintsDescription{}, local_context); if (cache_tables) cached_tables[table_name] = storage; @@ -229,7 +231,7 @@ StoragePtr DatabasePostgreSQL::detachTable(const String & table_name) } -void DatabasePostgreSQL::createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) +void DatabasePostgreSQL::createTable(ContextPtr, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) { const auto & create = create_query->as(); @@ -240,7 +242,7 @@ void DatabasePostgreSQL::createTable(const Context &, const String & table_name, } -void DatabasePostgreSQL::dropTable(const Context &, const String & table_name, bool /* no_delay */) +void DatabasePostgreSQL::dropTable(ContextPtr, const String & table_name, bool /* no_delay */) { std::lock_guard lock{mutex}; @@ -268,13 +270,13 @@ void DatabasePostgreSQL::dropTable(const Context &, const String & table_name, b } -void DatabasePostgreSQL::drop(const Context & /*context*/) +void DatabasePostgreSQL::drop(ContextPtr /*context*/) { Poco::File(getMetadataPath()).remove(true); } -void DatabasePostgreSQL::loadStoredObjects(Context & /* context */, bool, bool /*force_attach*/) +void DatabasePostgreSQL::loadStoredObjects(ContextPtr /* context */, bool, bool /*force_attach*/) { { std::lock_guard lock{mutex}; @@ -346,9 +348,9 @@ ASTPtr DatabasePostgreSQL::getCreateDatabaseQuery() const } -ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const +ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, ContextPtr local_context, bool throw_on_error) const { - auto storage = fetchTable(table_name, context, false); + auto storage = fetchTable(table_name, local_context, false); if (!storage) { if (throw_on_error) diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.h b/src/Databases/PostgreSQL/DatabasePostgreSQL.h index 56ea6645f15..3505c38e499 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.h +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.h @@ -9,14 +9,13 @@ #include #include #include +#include namespace DB { class Context; -class PostgreSQLConnection; -using PostgreSQLConnectionPtr = std::shared_ptr; /** Real-time access to table list and table structure from remote PostgreSQL. @@ -24,18 +23,18 @@ using PostgreSQLConnectionPtr = std::shared_ptr; * If `cache_tables` == 1 (default: 0) table structure is cached and not checked for being modififed, * but it will be updated during detach->attach. 
*/ -class DatabasePostgreSQL final : public IDatabase +class DatabasePostgreSQL final : public IDatabase, WithContext { public: DatabasePostgreSQL( - const Context & context, + ContextPtr context, const String & metadata_path_, const ASTStorage * database_engine_define, const String & dbname_, const String & postgres_dbname, - PostgreSQLConnectionPtr connection_, - const bool cache_tables_); + postgres::PoolWithFailoverPtr connection_pool_, + bool cache_tables_); String getEngineName() const override { return "PostgreSQL"; } String getMetadataPath() const override { return metadata_path; } @@ -48,31 +47,30 @@ public: bool empty() const override; - void loadStoredObjects(Context &, bool, bool force_attach) override; + void loadStoredObjects(ContextPtr, bool, bool force_attach) override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) override; - bool isTableExist(const String & name, const Context & context) const override; - StoragePtr tryGetTable(const String & name, const Context & context) const override; + bool isTableExist(const String & name, ContextPtr context) const override; + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; - void createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override; - void dropTable(const Context &, const String & table_name, bool no_delay) override; + void createTable(ContextPtr, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override; + void dropTable(ContextPtr, const String & table_name, bool no_delay) override; void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override; StoragePtr detachTable(const String & table_name) override; - void drop(const Context & /*context*/) override; + void drop(ContextPtr /*context*/) override; void shutdown() override; protected: - ASTPtr getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const override; + ASTPtr getCreateTableQueryImpl(const String & table_name, ContextPtr context, bool throw_on_error) const override; private: - const Context & global_context; String metadata_path; ASTPtr database_engine_define; String dbname; - PostgreSQLConnectionPtr connection; + postgres::PoolWithFailoverPtr connection_pool; const bool cache_tables; mutable Tables cached_tables; @@ -81,7 +79,7 @@ private: bool checkPostgresTable(const String & table_name) const; std::unordered_set fetchTablesList() const; - StoragePtr fetchTable(const String & table_name, const Context & context, const bool table_checked) const; + StoragePtr fetchTable(const String & table_name, ContextPtr context, bool table_checked) const; void removeOutdatedTables(); ASTPtr getColumnDeclaration(const DataTypePtr & data_type) const; }; diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index a6e5ded3efd..066090d02d6 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -40,6 +40,8 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared(); else if (type == "bigint") res = std::make_shared(); + else if (type == "boolean") + 
res = std::make_shared(); else if (type == "real") res = std::make_shared(); else if (type == "double precision") @@ -54,19 +56,32 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared(); else if (type.starts_with("numeric")) { - /// Numeric and decimal will both end up here as numeric. - res = DataTypeFactory::instance().get(type); - uint32_t precision = getDecimalPrecision(*res); - uint32_t scale = getDecimalScale(*res); + /// Numeric and decimal will both end up here as numeric. If it has type and precision, + /// there will be Numeric(x, y), otherwise just Numeric + UInt32 precision, scale; + if (type.ends_with(")")) + { + res = DataTypeFactory::instance().get(type); + precision = getDecimalPrecision(*res); + scale = getDecimalScale(*res); - if (precision <= DecimalUtils::max_precision) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::max_precision) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::max_precision) + if (precision <= DecimalUtils::max_precision) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::max_precision) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::max_precision) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::max_precision) + res = std::make_shared>(precision, scale); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Precision {} and scale {} are too big and not supported", precision, scale); + } + else + { + precision = DecimalUtils::max_precision; + scale = precision / 2; res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::max_precision) - res = std::make_shared>(precision, scale); + } } if (!res) @@ -81,7 +96,7 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl std::shared_ptr fetchPostgreSQLTableStructure( - std::shared_ptr connection, const String & postgres_table_name, bool use_nulls) + postgres::ConnectionHolderPtr connection, const String & postgres_table_name, bool use_nulls) { auto columns = NamesAndTypesList(); @@ -100,7 +115,7 @@ std::shared_ptr fetchPostgreSQLTableStructure( "AND NOT attisdropped AND attnum > 0", postgres_table_name); try { - pqxx::read_transaction tx(*connection); + pqxx::read_transaction tx(connection->conn()); pqxx::stream_from stream(tx, pqxx::from_query, std::string_view(query)); std::tuple row; @@ -120,7 +135,7 @@ std::shared_ptr fetchPostgreSQLTableStructure( { throw Exception(fmt::format( "PostgreSQL table {}.{} does not exist", - connection->dbname(), postgres_table_name), ErrorCodes::UNKNOWN_TABLE); + connection->conn().dbname(), postgres_table_name), ErrorCodes::UNKNOWN_TABLE); } catch (Exception & e) { diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h index bbbb379541b..f40929aa91d 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h @@ -12,7 +12,7 @@ namespace DB { std::shared_ptr fetchPostgreSQLTableStructure( - std::shared_ptr connection, const String & postgres_table_name, bool use_nulls); + postgres::ConnectionHolderPtr connection, const String & postgres_table_name, bool use_nulls); } diff --git a/src/Dictionaries/BucketCache.h b/src/Dictionaries/BucketCache.h deleted file mode 100644 index 381110066a6..00000000000 --- a/src/Dictionaries/BucketCache.h +++ /dev/null @@ -1,226 
+0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -namespace -{ - inline size_t roundUpToPowerOfTwoOrZero(size_t x) - { - size_t r = 8; - while (x > r) - r <<= 1; - return r; - } -} - -struct EmptyDeleter {}; - -struct Int64Hasher -{ - size_t operator()(const size_t x) const - { - return intHash64(x); - } -}; - - -/* - Class for storing cache index. - It consists of two arrays. - The first one is split into buckets (each stores 8 elements (cells)) determined by hash of the element key. - The second one is split into 4bit numbers, which are positions in bucket for next element write (So cache uses FIFO eviction algorithm inside each bucket). -*/ -template -class BucketCacheIndex -{ - struct Cell - { - K key; - V index; - }; - -public: - template >> - BucketCacheIndex(size_t cells_) - : buckets(roundUpToPowerOfTwoOrZero(cells_) / bucket_size) - , bucket_mask(buckets - 1) - , cells(buckets * bucket_size) - , positions((buckets / 2) + 1) - { - for (auto & cell : cells) - cell.index.setNotExists(); - for (size_t bucket = 0; bucket < buckets; ++bucket) - setPosition(bucket, 0); - } - - template >> - BucketCacheIndex(size_t cells_, Deleter deleter_) - : deleter(deleter_) - , buckets(roundUpToPowerOfTwoOrZero(cells_) / bucket_size) - , bucket_mask(buckets - 1) - , cells(buckets * bucket_size) - , positions((buckets / 2) + 1) - { - for (auto & cell : cells) - cell.index.setNotExists(); - for (size_t bucket = 0; bucket < buckets; ++bucket) - setPosition(bucket, 0); - } - - void set(K key, V val) - { - const size_t bucket = (hash(key) & bucket_mask); - const size_t idx = getCellIndex(key, bucket); - if (!cells[idx].index.exists()) - { - incPosition(bucket); - ++sz; - } - - cells[idx].key = key; - cells[idx].index = val; - } - - template >> - void setWithDelete(K key, V val) - { - const size_t bucket = (hash(key) & bucket_mask); - const size_t idx = getCellIndex(key, bucket); - if (!cells[idx].index.exists()) - { - incPosition(bucket); - ++sz; - } - else - { - deleter(cells[idx].key); - } - - cells[idx].key = key; - cells[idx].index = val; - } - - bool get(K key, V & val) const - { - const size_t bucket = (hash(key) & bucket_mask); - const size_t idx = getCellIndex(key, bucket); - if (!cells[idx].index.exists() || cells[idx].key != key) - return false; - val = cells[idx].index; - return true; - } - - bool getKeyAndValue(K & key, V & val) const - { - const size_t bucket = (hash(key) & bucket_mask); - const size_t idx = getCellIndex(key, bucket); - if (!cells[idx].index.exists() || cells[idx].key != key) - return false; - key = cells[idx].key; - val = cells[idx].index; - return true; - } - - bool erase(K key) - { - const size_t bucket = (hash(key) & bucket_mask); - const size_t idx = getCellIndex(key, bucket); - if (!cells[idx].index.exists() || cells[idx].key != key) - return false; - - cells[idx].index.setNotExists(); - --sz; - if constexpr (!std::is_same_v) - deleter(cells[idx].key); - - return true; - } - - size_t size() const - { - return sz; - } - - size_t capacity() const - { - return cells.size(); - } - - auto keys() const - { - std::vector res; - for (const auto & cell : cells) - { - if (cell.index.exists()) - { - res.push_back(cell.key); - } - } - return res; - } - -private: - /// Searches for the key in the bucket. - /// Returns index of cell with provided key. 
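The `numeric`/`decimal` handling added in `fetchPostgreSQLTableStructure.cpp` above picks the narrowest ClickHouse `Decimal` that can hold the declared precision, and falls back to a wide default when the PostgreSQL column is a bare `numeric` without parameters. The standalone sketch below illustrates only that dispatch; it does not use ClickHouse's `DataTypeFactory` or `DecimalUtils`, the thresholds 9/18/38/76 are the documented maximum precisions of `Decimal32`/`Decimal64`/`Decimal128`/`Decimal256`, and the bare-`numeric` fallback is an assumption because the template arguments are not visible in the hunk.

```cpp
// Standalone sketch of the precision dispatch only; no ClickHouse headers are used.
#include <cstdio>
#include <stdexcept>
#include <string>

enum class DecimalWidth { Decimal32, Decimal64, Decimal128, Decimal256 };

struct DecimalType
{
    DecimalWidth width;
    unsigned precision;
    unsigned scale;
};

DecimalType mapPostgresNumeric(const std::string & pg_type)
{
    unsigned precision = 0;
    unsigned scale = 0;

    if (!pg_type.empty() && pg_type.back() == ')')
    {
        /// "numeric(P,S)" or "numeric(P)" carries an explicit precision (and maybe scale).
        if (std::sscanf(pg_type.c_str(), "numeric(%u,%u)", &precision, &scale) < 1)
            throw std::invalid_argument("cannot parse: " + pg_type);
    }
    else
    {
        /// Bare "numeric" declares no precision; fall back to a wide default
        /// (assumption: the exact fallback type is not visible in the hunk above).
        precision = 38;
        scale = precision / 2;
    }

    /// Documented maximum precisions: Decimal32 = 9, Decimal64 = 18, Decimal128 = 38, Decimal256 = 76.
    if (precision <= 9)
        return {DecimalWidth::Decimal32, precision, scale};
    if (precision <= 18)
        return {DecimalWidth::Decimal64, precision, scale};
    if (precision <= 38)
        return {DecimalWidth::Decimal128, precision, scale};
    if (precision <= 76)
        return {DecimalWidth::Decimal256, precision, scale};

    throw std::invalid_argument("precision " + std::to_string(precision) + " is too big and not supported");
}

int main()
{
    auto t = mapPostgresNumeric("numeric(20,4)");
    std::printf("Decimal width #%d, precision %u, scale %u\n", static_cast<int>(t.width), t.precision, t.scale);
    return 0;
}
```

Choosing the narrowest type that covers the declared precision keeps fixed-point storage and arithmetic as cheap as possible while still honouring the column definition.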
- size_t getCellIndex(const K key, const size_t bucket) const - { - const size_t pos = getPosition(bucket); - for (int idx = 7; idx >= 0; --idx) - { - const size_t cur = ((pos + 1 + idx) & pos_mask); - if (cells[bucket * bucket_size + cur].index.exists() && - cells[bucket * bucket_size + cur].key == key) - { - return bucket * bucket_size + cur; - } - } - - return bucket * bucket_size + pos; - } - - /// Returns current position for write in the bucket. - size_t getPosition(const size_t bucket) const - { - const size_t idx = (bucket >> 1); - if ((bucket & 1) == 0) - return ((positions[idx] >> 4) & pos_mask); - return (positions[idx] & pos_mask); - } - - /// Sets current posiotion in the bucket. - void setPosition(const size_t bucket, const size_t pos) - { - const size_t idx = bucket >> 1; - if ((bucket & 1) == 0) - positions[idx] = ((pos << 4) | (positions[idx] & ((1 << 4) - 1))); - else - positions[idx] = (pos | (positions[idx] & (((1 << 4) - 1) << 4))); - } - - void incPosition(const size_t bucket) - { - setPosition(bucket, (getPosition(bucket) + 1) & pos_mask); - } - - static constexpr size_t bucket_size = 8; - static constexpr size_t pos_size = 3; - static constexpr size_t pos_mask = (1 << pos_size) - 1; - - Hasher hash; - Deleter deleter; - - size_t buckets; - size_t bucket_mask; - - std::vector cells; - std::vector positions; - size_t sz = 0; -}; - -} diff --git a/src/Dictionaries/CMakeLists.txt b/src/Dictionaries/CMakeLists.txt index 4d6ab4b85f8..563c0f3914b 100644 --- a/src/Dictionaries/CMakeLists.txt +++ b/src/Dictionaries/CMakeLists.txt @@ -20,6 +20,10 @@ target_link_libraries(clickhouse_dictionaries string_utils ) +target_link_libraries(clickhouse_dictionaries + PUBLIC + abseil_swiss_tables) + if(USE_CASSANDRA) target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${CASSANDRA_INCLUDE_DIR}) endif() diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 67bcab109ea..b1b8ebed5bd 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -1,25 +1,21 @@ #include "CacheDictionary.h" #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include + #include #include #include #include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include + +#include +#include +#include +#include +#include +#include + +#include +#include namespace ProfileEvents { @@ -40,88 +36,82 @@ namespace CurrentMetrics extern const Metric DictCacheRequests; } - namespace DB { namespace ErrorCodes { extern const int CACHE_DICTIONARY_UPDATE_FAIL; - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; extern const int UNSUPPORTED_METHOD; - extern const int TOO_SMALL_BUFFER_SIZE; - extern const int TIMEOUT_EXCEEDED; } - -inline size_t CacheDictionary::getCellIdx(const Key id) const -{ - const auto hash = intHash64(id); - const auto idx = hash & size_overlap_mask; - return idx; -} - - -CacheDictionary::CacheDictionary( +template +CacheDictionary::CacheDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, + CacheDictionaryStoragePtr cache_storage_ptr_, + CacheDictionaryUpdateQueueConfiguration update_queue_configuration_, DictionaryLifetime dict_lifetime_, - size_t strict_max_lifetime_seconds_, - size_t size_, - bool allow_read_expired_keys_, - size_t max_update_queue_size_, - size_t update_queue_push_timeout_milliseconds_, - size_t query_wait_timeout_milliseconds_, - 
size_t max_threads_for_updates_) + bool allow_read_expired_keys_) : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} + , cache_storage_ptr(cache_storage_ptr_) + , update_queue( + dict_id_.getNameForLogs(), + update_queue_configuration_, + [this](CacheDictionaryUpdateUnitPtr unit_to_update) + { + update(unit_to_update); + }) , dict_lifetime(dict_lifetime_) - , strict_max_lifetime_seconds(strict_max_lifetime_seconds_) - , allow_read_expired_keys(allow_read_expired_keys_) - , max_update_queue_size(max_update_queue_size_) - , update_queue_push_timeout_milliseconds(update_queue_push_timeout_milliseconds_) - , query_wait_timeout_milliseconds(query_wait_timeout_milliseconds_) - , max_threads_for_updates(max_threads_for_updates_) , log(&Poco::Logger::get("ExternalDictionaries")) - , size{roundUpToPowerOfTwoOrZero(std::max(size_, size_t(max_collision_length)))} - , size_overlap_mask{this->size - 1} - , cells{this->size} + , allow_read_expired_keys(allow_read_expired_keys_) , rnd_engine(randomSeed()) - , update_queue(max_update_queue_size_) - , update_pool(max_threads_for_updates) { if (!source_ptr->supportsSelectiveLoad()) - throw Exception{full_name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - createAttributes(); - for (size_t i = 0; i < max_threads_for_updates; ++i) - update_pool.scheduleOrThrowOnError([this] { updateThreadFunction(); }); + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "{}: source cannot be used with CacheDictionary", full_name); } -CacheDictionary::~CacheDictionary() +template +CacheDictionary::~CacheDictionary() { - finished = true; - update_queue.clear(); - for (size_t i = 0; i < max_threads_for_updates; ++i) - { - auto empty_finishing_ptr = std::make_shared(std::vector()); - update_queue.push(empty_finishing_ptr); - } - update_pool.wait(); + update_queue.stopAndWait(); } -size_t CacheDictionary::getBytesAllocated() const +template +size_t CacheDictionary::getElementCount() const +{ + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + return cache_storage_ptr->getSize(); +} + +template +size_t CacheDictionary::getBytesAllocated() const { /// In case of existing string arena we check the size of it. /// But the same appears in setAttributeValue() function, which is called from update() function /// which in turn is called from another thread. const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - return bytes_allocated + (string_arena ? string_arena->size() : 0); + return cache_storage_ptr->getBytesAllocated(); } -const IDictionarySource * CacheDictionary::getSource() const +template +double CacheDictionary::getLoadFactor() const +{ + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + return cache_storage_ptr->getLoadFactor(); +} + +template +std::exception_ptr CacheDictionary::getLastException() const +{ + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + return last_exception; +} + +template +const IDictionarySource * CacheDictionary::getSource() const { /// Mutex required here because of the getSourceAndUpdateIfNeeded() function /// which is used from another thread. 
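In the rewritten constructor above, `CacheDictionary` no longer owns a `ThreadPool` and a hand-rolled cell array: it takes a `CacheDictionaryStoragePtr` plus a `CacheDictionaryUpdateQueueConfiguration` and constructs the update queue with a lambda that forwards each update unit back into its own `update()` method, while the destructor simply calls `stopAndWait()`. The sketch below shows only that ownership/callback pattern with made-up names (`UpdateQueue`, `Dictionary`); it is not ClickHouse's API, and the queue here runs the callback synchronously instead of on background threads.

```cpp
// Hypothetical, heavily simplified sketch (not ClickHouse's actual classes) of the
// pattern used above: the dictionary hands an update callback to a dedicated queue
// instead of managing its own thread pool and update loop.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct UpdateUnit
{
    std::vector<std::string> requested_keys;   // keys that were missing or expired
};
using UpdateUnitPtr = std::shared_ptr<UpdateUnit>;

/// Stand-in for CacheDictionaryUpdateQueue: stores the callback; in this sketch it
/// runs the callback synchronously, whereas the real queue uses background threads.
class UpdateQueue
{
public:
    explicit UpdateQueue(std::function<void(UpdateUnitPtr)> callback_) : callback(std::move(callback_)) {}
    void push(UpdateUnitPtr unit) { callback(std::move(unit)); }
    void stopAndWait() {}   // no-op here; the real queue drains pending units and joins its threads
private:
    std::function<void(UpdateUnitPtr)> callback;
};

class Dictionary
{
public:
    Dictionary() : update_queue([this](UpdateUnitPtr unit) { update(unit); }) {}
    ~Dictionary() { update_queue.stopAndWait(); }

    void requestKeys(std::vector<std::string> keys)
    {
        update_queue.push(std::make_shared<UpdateUnit>(UpdateUnit{std::move(keys)}));
    }

private:
    void update(const UpdateUnitPtr & unit)
    {
        for (const auto & key : unit->requested_keys)
            std::cout << "updating key from source: " << key << '\n';
    }

    UpdateQueue update_queue;
};

int main()
{
    Dictionary dictionary;
    dictionary.requestKeys({"k1", "k2"});
    return 0;
}
```

Keeping the queue and the storage as separate, templated components is what lets both simple- and complex-key cache dictionaries reuse the same update machinery through the new `dictionary_key_type` template parameter.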
@@ -129,1154 +119,444 @@ const IDictionarySource * CacheDictionary::getSource() const return source_ptr.get(); } -void CacheDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_value); - DictionaryDefaultValueExtractor default_value_extractor(null_value); - getItemsNumberImpl(*hierarchical_attribute, ids, out, default_value_extractor); -} - - -/// Allow to use single value in same way as array. -static inline CacheDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline CacheDictionary::Key getAt(const CacheDictionary::Key & value, const size_t) -{ - return value; -} - - -template -void CacheDictionary::isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - /// Transform all children to parents until ancestor id or null_value will be reached. - - size_t out_size = out.size(); - memset(out.data(), 0xFF, out_size); /// 0xFF means "not calculated" - - const auto null_value = std::get(hierarchical_attribute->null_value); - - PaddedPODArray children(out_size, 0); - PaddedPODArray parents(child_ids.begin(), child_ids.end()); - - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - size_t out_idx = 0; - size_t parents_idx = 0; - size_t new_children_idx = 0; - - while (out_idx < out_size) - { - /// Already calculated - if (out[out_idx] != 0xFF) - { - ++out_idx; - continue; - } - - /// No parent - if (parents[parents_idx] == null_value) - { - out[out_idx] = 0; - } - /// Found ancestor - else if (parents[parents_idx] == getAt(ancestor_ids, parents_idx)) - { - out[out_idx] = 1; - } - /// Loop detected - else if (children[new_children_idx] == parents[parents_idx]) - { - out[out_idx] = 1; - } - /// Found intermediate parent, add this value to search at next loop iteration - else - { - children[new_children_idx] = parents[parents_idx]; - ++new_children_idx; - } - - ++out_idx; - ++parents_idx; - } - - if (new_children_idx == 0) - break; - - /// Transform all children to its parents. - children.resize(new_children_idx); - parents.resize(new_children_idx); - - toParent(children, parents); - } -} - -void CacheDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void CacheDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void CacheDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - /// Special case with single child value. - - const auto null_value = std::get(hierarchical_attribute->null_value); - - PaddedPODArray child(1, child_id); - PaddedPODArray parent(1); - std::vector ancestors(1, child_id); - - /// Iteratively find all ancestors for child. - for (size_t i = 0; i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - toParent(child, parent); - - if (parent[0] == null_value) - break; - - child[0] = parent[0]; - ancestors.push_back(parent[0]); - } - - /// Assuming short hierarchy, so linear search is Ok. 
- for (size_t i = 0, out_size = out.size(); i < out_size; ++i) - out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end(); -} - -ColumnPtr CacheDictionary::getColumn( +template +ColumnPtr CacheDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, - const DataTypes &, - const ColumnPtr default_values_column) const + const DataTypes & key_types, + const ColumnPtr & default_values_column) const { - ColumnPtr result; + return getColumns({attribute_name}, {result_type}, key_columns, key_types, {default_values_column}).front(); +} - PaddedPODArray backup_storage; - const auto & keys = getColumnVectorData(this, key_columns.front(), backup_storage); - auto keys_size = keys.size(); +template +Columns CacheDictionary::getColumns( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) const +{ + /** + * Flow of getColumsImpl + * 1. Get fetch result from storage + * 2. If all keys are found in storage and not expired + * 2.1. If storage returns fetched columns in order of keys then result is returned to client. + * 2.2. If storage does not return fetched columns in order of keys then reorder + * result columns and return result to client. + * 3. If all keys are found in storage but some of them are expired and we allow to read expired keys + * start async request to source and perform actions from step 2 for result returned from storage. + * 4. If some keys are found and some are not, start sync update from source. + * 5. Aggregate columns returned from storage and source, if key is not found in storage and in source + * use default value. + */ - auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + auto keys = extractor.extractAllKeys(); + + DictionaryStorageFetchRequest request(dict_struct, attribute_names, result_types, default_values_columns); + + FetchResult result_of_fetch_from_storage; - auto type_call = [&](const auto &dictionary_attribute_type) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ColumnProvider = DictionaryAttributeColumnProvider; + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); + } - const auto & null_value = std::get(attribute.null_value); - DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); + size_t found_keys_size = result_of_fetch_from_storage.found_keys_size; + size_t expired_keys_size = result_of_fetch_from_storage.expired_keys_size; + size_t not_found_keys_size = result_of_fetch_from_storage.not_found_keys_size; - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, found_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, expired_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, not_found_keys_size); - if constexpr (std::is_same_v) - { - getItemsString(attribute, keys, column.get(), 
default_value_extractor); - } + query_count.fetch_add(keys.size()); + hit_count.fetch_add(found_keys_size); + + MutableColumns & fetched_columns_from_storage = result_of_fetch_from_storage.fetched_columns; + const PaddedPODArray & key_index_to_state_from_storage = result_of_fetch_from_storage.key_index_to_state; + + bool source_returns_fetched_columns_in_order_of_keys = cache_storage_ptr->returnsFetchedColumnsInOrderOfRequestedKeys(); + + if (not_found_keys_size == 0 && expired_keys_size == 0) + { + /// All keys were found in storage + + if (source_returns_fetched_columns_in_order_of_keys) + return request.filterRequestedColumns(fetched_columns_from_storage); else { - auto & out = column->getData(); - getItemsNumberImpl(attribute, keys, out, default_value_extractor); - } + /// Reorder result from storage to requested keys indexes + MutableColumns aggregated_columns = aggregateColumnsInOrderOfKeys( + keys, + request, + fetched_columns_from_storage, + key_index_to_state_from_storage); - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -template -void CacheDictionary::getItemsNumberImpl( - Attribute & attribute, - const PaddedPODArray & ids, - ResultArrayType & out, - DefaultValueExtractor & default_value_extractor) const -{ - /// First fill everything with default values - const auto rows = ext::size(ids); - for (const auto row : ext::range(0, rows)) - out[row] = default_value_extractor[row]; - - /// Maybe there are duplicate keys, so we remember their indices. - std::unordered_map> cache_expired_or_not_found_ids; - - auto & attribute_array = std::get>(attribute.arrays); - - size_t cache_hit = 0; - size_t cache_not_found_count = 0; - size_t cache_expired_cound = 0; - - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - auto insert_to_answer_routine = [&](size_t row, size_t idx) - { - auto & cell = cells[idx]; - if (!cell.isDefault()) - out[row] = static_cast(attribute_array[idx]); - }; - - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows)) - { - const auto id = ids[row]; - - /** cell should be updated if either: - * 1. ids do not match, - * 2. cell has expired, - * 3. explicit defaults were specified and cell was set default. 
*/ - - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - ++cache_hit; - insert_to_answer_routine(row, cell_idx); - } - else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) - { - ++cache_not_found_count; - cache_expired_or_not_found_ids[id].push_back(row); - } - else if (state == ResultState::FoundButExpired) - { - cache_expired_cound++; - cache_expired_or_not_found_ids[id].push_back(row); - - if (allow_read_expired_keys) - insert_to_answer_routine(row, cell_idx); - } + return request.filterRequestedColumns(aggregated_columns); } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_cound); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); + size_t keys_to_update_size = not_found_keys_size + expired_keys_size; + auto update_unit = std::make_shared>(key_columns, key_index_to_state_from_storage, request, keys_to_update_size); - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - cache_not_found_count - cache_expired_cound, std::memory_order_release); + HashMap requested_keys_to_fetched_columns_during_update_index; + MutableColumns fetched_columns_during_update = request.makeAttributesResultColumns(); - if (!cache_not_found_count) + if (not_found_keys_size == 0 && expired_keys_size > 0 && allow_read_expired_keys) { - /// Nothing to update - return - if (!cache_expired_cound) - return; + /// Start async update only if allow read expired keys and all keys are found + update_queue.tryPushToUpdateQueueOrThrow(update_unit); - /// Update async only if allow_read_expired_keys_is_enabledadd condvar usage and better code - if (allow_read_expired_keys) - { - std::vector required_expired_ids; - required_expired_ids.reserve(cache_expired_cound); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); - - /// request new values - auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - - /// Nothing to do - return - return; - } - } - - /// From this point we have to update all keys sync. - /// Maybe allow_read_expired_keys_from_cache_dictionary is disabled - /// and there no cache_not_found_ids but some cache_expired. - - std::vector required_ids; - required_ids.reserve(cache_not_found_count + cache_expired_cound); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); - - /// Request new values - auto update_unit_ptr = std::make_shared(std::move(required_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - waitForCurrentUpdateFinish(update_unit_ptr); - - /// Add updated keys to answer. 
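The numbered "Flow of getColumnsImpl" comment and the interleaved new code above reduce to a three-way decision taken right after the storage fetch: answer from storage when nothing is missing or expired; answer from storage and schedule an asynchronous update when only expired keys remain and reading expired keys is allowed; otherwise push a synchronous update and wait before aggregating storage and source results. Below is a compact sketch of just that decision, with illustrative names rather than the real ClickHouse types.

```cpp
// Simplified decision helper mirroring the branch structure of the new
// CacheDictionary::getColumns() above. The enum and function names are
// illustrative, not ClickHouse's API.
#include <cassert>
#include <cstddef>

enum class UpdatePlan
{
    AnswerFromStorage,        // every key found and fresh
    AnswerAndUpdateAsync,     // only expired keys, and reading them is allowed
    UpdateSyncAndAggregate,   // some keys are missing, must query the source now
};

UpdatePlan chooseUpdatePlan(std::size_t not_found_keys, std::size_t expired_keys, bool allow_read_expired_keys)
{
    if (not_found_keys == 0 && expired_keys == 0)
        return UpdatePlan::AnswerFromStorage;

    if (not_found_keys == 0 && expired_keys > 0 && allow_read_expired_keys)
        return UpdatePlan::AnswerAndUpdateAsync;

    return UpdatePlan::UpdateSyncAndAggregate;
}

int main()
{
    assert(chooseUpdatePlan(0, 0, false) == UpdatePlan::AnswerFromStorage);
    assert(chooseUpdatePlan(0, 5, true) == UpdatePlan::AnswerAndUpdateAsync);
    assert(chooseUpdatePlan(3, 5, true) == UpdatePlan::UpdateSyncAndAggregate);
    return 0;
}
```

The same branch structure reappears in the new `hasKeys()` further below, which the diff's own comment describes as "similar to getColumns".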
- - const size_t attribute_index = getAttributeIndex(attribute.name); - - for (auto & [key, value] : update_unit_ptr->found_ids) - { - if (value.found) - { - for (const size_t row : cache_expired_or_not_found_ids[key]) - out[row] = std::get(value.values[attribute_index]); - } - } -} - -void CacheDictionary::getItemsString( - Attribute & attribute, - const PaddedPODArray & ids, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const -{ - const auto rows = ext::size(ids); - - /// Save on some allocations. - out->getOffsets().reserve(rows); - - auto & attribute_array = std::get>(attribute.arrays); - - auto found_outdated_values = false; - - /// Perform optimistic version, fallback to pessimistic if failed. - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - /// Fetch up-to-date values, discard on fail. - for (const auto row : ext::range(0, rows)) - { - const auto id = ids[row]; - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; - out->insertData(string_ref.data, string_ref.size); - } - else - { - found_outdated_values = true; - break; - } - } - } - - /// Optimistic code completed successfully. - if (!found_outdated_values) - { - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows, std::memory_order_release); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, ids.size()); - return; - } - - /// Now onto the pessimistic one, discard possible partial results from the optimistic path. - out->getChars().resize_assume_reserved(0); - out->getOffsets().resize_assume_reserved(0); - - /// Mapping: -> { all indices `i` of `ids` such that `ids[i]` = } - std::unordered_map> cache_expired_or_not_found_ids; - /// we are going to store every string separately - std::unordered_map local_cache; - - size_t cache_not_found_count = 0; - size_t cache_expired_count = 0; - - size_t total_length = 0; - size_t cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - auto insert_value_routine = [&](size_t row, size_t id, size_t cell_idx) - { - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; - - /// Do not store default, but count it in total length. 
- if (!cell.isDefault()) - local_cache[id] = String{string_ref}; - - total_length += string_ref.size + 1; - }; - - for (const auto row : ext::range(0, ids.size())) - { - const auto id = ids[row]; - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - ++cache_hit; - insert_value_routine(row, id, cell_idx); - } - else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) - { - ++cache_not_found_count; - cache_expired_or_not_found_ids[id].push_back(row); - } - else if (state == ResultState::FoundButExpired) - { - ++cache_expired_count; - cache_expired_or_not_found_ids[id].push_back(row); - - if (allow_read_expired_keys) - insert_value_routine(row, id, cell_idx); - } - } - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release); - - /// Async update of expired keys. - if (!cache_not_found_count) - { - if (allow_read_expired_keys && cache_expired_count) - { - std::vector required_expired_ids; - required_expired_ids.reserve(cache_expired_count); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - - /// Insert all found keys and defaults to output array. - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(ids))) - { - const auto id = ids[row]; - StringRef value; - - /// Previously we stored found keys in map. - const auto it = local_cache.find(id); - if (it != local_cache.end()) - value = StringRef(it->second); - else - value = default_value_extractor[row]; - - out->insertData(value.data, value.size); - } - - /// Nothing to do else. - return; - } - } - - /// We will request both cache_not_found_ids and cache_expired_ids sync. - std::vector required_ids; - required_ids.reserve(cache_not_found_count + cache_expired_count); - std::transform( - std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - waitForCurrentUpdateFinish(update_unit_ptr); - - const size_t attribute_index = getAttributeIndex(attribute.name); - - /// Only calculate the total length. 
- for (auto & [key, value] : update_unit_ptr->found_ids) - { - if (value.found) - { - const auto found_value_ref = std::get(value.values[attribute_index]); - total_length += (found_value_ref.size() + 1) * cache_expired_or_not_found_ids[key].size(); - } + if (source_returns_fetched_columns_in_order_of_keys) + return request.filterRequestedColumns(fetched_columns_from_storage); else { - for (const auto row : cache_expired_or_not_found_ids[key]) - total_length += default_value_extractor[row].size + 1; + /// Reorder result from storage to requested keys indexes + MutableColumns aggregated_columns = aggregateColumnsInOrderOfKeys( + keys, + request, + fetched_columns_from_storage, + key_index_to_state_from_storage); + + return request.filterRequestedColumns(aggregated_columns); } } - - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(ids))) + else { - const auto id = ids[row]; - StringRef value; + /// Start sync update + update_queue.tryPushToUpdateQueueOrThrow(update_unit); + update_queue.waitForCurrentUpdateFinish(update_unit); - /// We have two maps: found in cache and found in source. - const auto local_it = local_cache.find(id); - if (local_it != local_cache.end()) - value = StringRef(local_it->second); - else - { - const auto found_it = update_unit_ptr->found_ids.find(id); - - /// Previously we didn't store defaults in local cache. - if (found_it != update_unit_ptr->found_ids.end() && found_it->second.found) - value = std::get(found_it->second.values[attribute_index]); - else - value = default_value_extractor[row]; - } - - out->insertData(value.data, value.size); + requested_keys_to_fetched_columns_during_update_index = std::move(update_unit->requested_keys_to_fetched_columns_during_update_index); + fetched_columns_during_update = std::move(update_unit->fetched_columns_during_update); } + + MutableColumns aggregated_columns = aggregateColumns( + keys, + request, + fetched_columns_from_storage, + key_index_to_state_from_storage, + fetched_columns_during_update, + requested_keys_to_fetched_columns_during_update_index); + + return request.filterRequestedColumns(aggregated_columns); } - -template -struct Overloaded : Ts... {using Ts::operator()...;}; - -template -Overloaded(Ts...) -> Overloaded; - -std::string CacheDictionary::AttributeValuesForKey::dump() +template +ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { - WriteBufferFromOwnString os; - for (auto & attr : values) - std::visit(Overloaded { - [&os](UInt8 arg) { os << "type: UInt8, value: " << std::to_string(arg) << "\n"; }, - [&os](UInt16 arg) { os << "type: UInt16, value: " << std::to_string(arg) << "\n"; }, - [&os](UInt32 arg) { os << "type: UInt32, value: " << std::to_string(arg) << "\n"; }, - [&os](UInt64 arg) { os << "type: UInt64, value: " << std::to_string(arg) << "\n"; }, - [&os](UInt128 arg) { os << "type: UInt128, value: " << arg.toHexString() << "\n"; }, - [&os](Int8 arg) { os << "type: Int8, value: " << std::to_string(arg) << "\n"; }, - [&os](Int16 arg) { os << "type: Int16, value: " << std::to_string(arg) << "\n"; }, - [&os](Int32 arg) { os << "type: Int32, value: " << std::to_string(arg) << "\n"; }, - [&os](Int64 arg) { os << "type: Int64, value: " << std::to_string(arg) << "\n"; }, - [&os](Decimal32 arg) { os << "type: Decimal32, value: " << std::to_string(arg) << "\n"; }, - [&os](Decimal64 arg) { os << "type: Decimal64, value: " << std::to_string(arg) << "\n"; }, - [&os](Decimal128) { os << "type: Decimal128, value: ???" 
<< "\n" ; }, - [&os](Float32 arg) { os << "type: Float32, value: " << std::to_string(arg) << "\n"; }, - [&os](Float64 arg) { os << "type: Float64, value: " << std::to_string(arg) << "\n"; }, - [&os](String arg) { os << "type: String, value: " << arg + "\n"; } - }, attr); - return os.str(); -}; + /** + * Flow of hasKeys. It is similar to getColumns. But there is an important detail, if key is identified with default value in storage + * it means that in hasKeys result this key will be false. + * + * 1. Get fetch result from storage + * 2. If all keys are found in storage and not expired and there are no default keys return that we have all keys. + * Otherwise set allow_expired_keys_during_aggregation and go to step 5. + * 3. If all keys are found in storage and some of them are expired and allow_read_expired keys is true return that we have all keys. + * Otherwise set allow_expired_keys_during_aggregation and go to step 5. + * 4. If not all keys are found in storage start sync update from source. + * 5. Start aggregation of keys from source and storage. + * If we allow read expired keys from step 2 or 3 then count them as founded in storage. + * Check if key was found in storage not default for that key set true in result array. + * Check that key was fetched during update for that key set true in result array. + */ + + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); -std::string CacheDictionary::UpdateUnit::dumpFoundIds() -{ - WriteBufferFromOwnString os; - for (auto it : found_ids) + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto keys = extractor.extractAllKeys(); + + /// We make empty request just to fetch if keys exists + DictionaryStorageFetchRequest request(dict_struct, {}, {}, {}); + + FetchResult result_of_fetch_from_storage; + { - os << "Key: " << std::to_string(it.first) << "\n"; - if (it.second.found) - os << it.second.dump() << "\n"; - } - return os.str(); -}; + /// Write lock on storage + const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; -/// Returns cell_idx in handmade open addressing cache table and the state of the cell stored the key. -CacheDictionary::FindResult CacheDictionary::findCellIdxForGet(const Key & id, const time_point_t now) const -{ - auto pos = getCellIdx(id); - const auto stop = pos + max_collision_length; - for (; pos < stop; ++pos) - { - const auto cell_idx = pos & size_overlap_mask; - const auto & cell = cells[cell_idx]; - - if (cell.id != id) - continue; - - if (isExpiredPermanently(now, cell.expiresAt())) - return {cell_idx, ResultState::FoundButExpiredPermanently}; - - if (isExpired(now, cell.expiresAt())) - return {cell_idx, ResultState::FoundButExpired}; - - return {cell_idx, ResultState::FoundAndValid}; + result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } - return {pos & size_overlap_mask, ResultState::NotFound}; -} + size_t found_keys_size = result_of_fetch_from_storage.found_keys_size; + size_t expired_keys_size = result_of_fetch_from_storage.expired_keys_size; + size_t not_found_keys_size = result_of_fetch_from_storage.not_found_keys_size; -/// Returns cell_idx such that cells[cell_idx].id = id or the oldest cell in bounds of max_coolision_length. 
-size_t CacheDictionary::findCellIdxForSet(const Key & id) const -{ - auto pos = getCellIdx(id); - auto oldest_id = pos; - auto oldest_time = time_point_t::max(); - const auto stop = pos + max_collision_length; - for (; pos < stop; ++pos) + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, found_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, expired_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, not_found_keys_size); + + query_count.fetch_add(keys.size()); + hit_count.fetch_add(found_keys_size); + + size_t keys_to_update_size = expired_keys_size + not_found_keys_size; + auto update_unit = std::make_shared>(key_columns, result_of_fetch_from_storage.key_index_to_state, request, keys_to_update_size); + + HashMap requested_keys_to_fetched_columns_during_update_index; + bool allow_expired_keys_during_aggregation = false; + + if (not_found_keys_size == 0 && expired_keys_size == 0) { - const auto cell_idx = pos & size_overlap_mask; - const auto & cell = cells[cell_idx]; + /// All keys were found in storage - if (cell.id != id) + if (result_of_fetch_from_storage.default_keys_size == 0) + return ColumnUInt8::create(keys.size(), true); + + allow_expired_keys_during_aggregation = true; + } + else if (not_found_keys_size == 0 && expired_keys_size > 0 && allow_read_expired_keys) + { + /// Start async update only if allow read expired keys and all keys are found + update_queue.tryPushToUpdateQueueOrThrow(update_unit); + + if (result_of_fetch_from_storage.default_keys_size == 0) + return ColumnUInt8::create(keys.size(), true); + + allow_expired_keys_during_aggregation = true; + } + else + { + /// Start sync update + update_queue.tryPushToUpdateQueueOrThrow(update_unit); + update_queue.waitForCurrentUpdateFinish(update_unit); + + requested_keys_to_fetched_columns_during_update_index = std::move(update_unit->requested_keys_to_fetched_columns_during_update_index); + } + + auto result = ColumnUInt8::create(keys.size(), false); + auto & data = result->getData(); + + for (size_t key_index = 0; key_index < keys.size(); ++key_index) + { + auto key = keys[key_index]; + + bool valid_expired_key = allow_expired_keys_during_aggregation && result_of_fetch_from_storage.key_index_to_state[key_index].isExpired(); + + if (result_of_fetch_from_storage.key_index_to_state[key_index].isFound() || valid_expired_key) { - /// maybe we already found nearest expired cell (try minimize collision_length on insert) - if (cell.expiresAt() < oldest_time) - { - oldest_time = cell.expiresAt(); - oldest_id = cell_idx; - } - continue; + /// Check if key was fetched from cache + data[key_index] = !result_of_fetch_from_storage.key_index_to_state[key_index].isDefault(); } - /// We found the exact place for id. - return cell_idx; - } - - return oldest_id; -} - -ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - - /// There are three types of ids. - /// - Valid ids. These ids are presented in local cache and their lifetime is not expired. - /// - CacheExpired ids. Ids that are in local cache, but their values are rotted (lifetime is expired). - /// - CacheNotFound ids. We have to go to external storage to know its value. - - /// Mark everything as absent. 
- const auto rows = ext::size(ids); - for (const auto row : ext::range(0, rows)) - out[row] = false; - - /// Mapping: -> { all indices `i` of `ids` such that `ids[i]` = } - std::unordered_map> cache_expired_or_not_found_ids; - - size_t cache_hit = 0; - - size_t cache_expired_count = 0; - size_t cache_not_found_count = 0; - - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows)) + if (requested_keys_to_fetched_columns_during_update_index.has(key)) { - const auto id = ids[row]; - const auto [cell_idx, state] = findCellIdxForGet(id, now); - auto & cell = cells[cell_idx]; - - auto insert_to_answer_routine = [&] () - { - out[row] = !cell.isDefault(); - }; - - if (state == ResultState::FoundAndValid) - { - ++cache_hit; - insert_to_answer_routine(); - } - else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) - { - /// Permanently expired equals to not found semantically. - ++cache_not_found_count; - cache_expired_or_not_found_ids[id].push_back(row); - } - else if (state == ResultState::FoundButExpired) - { - cache_expired_count++; - cache_expired_or_not_found_ids[id].push_back(row); - - if (allow_read_expired_keys) - insert_to_answer_routine(); - } + /// Check if key was not in cache and was fetched during update + data[key_index] = true; } } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release); - - if (!cache_not_found_count) - { - /// Nothing to update - return; - if (!cache_expired_count) - return result; - - if (allow_read_expired_keys) - { - std::vector required_expired_ids; - required_expired_ids.reserve(cache_expired_count); - std::transform( - std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - /// Update is async - no need to wait. - return result; - } - } - - /// At this point we have two situations. - /// There may be both types of keys: expired and not_found. - /// We will update them all synchronously. 
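At the end of the new `hasKeys()` above, the per-key answer is assembled from two sources: the storage fetch states (where a stored default value means "absent", and expired entries only count when expired reads are permitted) and the set of keys that a synchronous update just brought in from the source. The following is a simplified illustration of that aggregation, using plain containers instead of `PaddedPODArray`/`HashMap` and folding the per-key state into a small enum; it is not the actual ClickHouse code.

```cpp
// Illustrative sketch of the per-key aggregation in the new CacheDictionary::hasKeys():
// a key maps to true when storage has a real (non-default) value for it, when an
// expired value may still be served, or when the synchronous update just fetched it.
// Expired-but-default entries are ignored here for brevity.
#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

enum class KeyState { Found, FoundDefault, Expired, NotFound };

std::vector<uint8_t> aggregateHasKeys(
    const std::vector<uint64_t> & keys,
    const std::vector<KeyState> & state,                      // per requested key, from storage
    const std::unordered_set<uint64_t> & fetched_during_update,
    bool allow_expired_keys_during_aggregation)
{
    std::vector<uint8_t> result(keys.size(), 0);

    for (std::size_t i = 0; i < keys.size(); ++i)
    {
        const bool valid_expired = allow_expired_keys_during_aggregation && state[i] == KeyState::Expired;

        /// Found with a real value (default values mean "absent" for hasKeys).
        if (state[i] == KeyState::Found || valid_expired)
            result[i] = 1;

        /// Missing in storage but brought in by the synchronous update.
        if (fetched_during_update.count(keys[i]))
            result[i] = 1;
    }

    return result;
}

int main()
{
    auto res = aggregateHasKeys({1, 2, 3}, {KeyState::Found, KeyState::FoundDefault, KeyState::NotFound}, {3}, false);
    for (auto v : res)
        std::printf("%d ", int(v));
    std::printf("\n");   // prints: 1 0 1
    return 0;
}
```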
- - std::vector required_ids; - required_ids.reserve(cache_not_found_count + cache_expired_count); - std::transform( - std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - waitForCurrentUpdateFinish(update_unit_ptr); - - for (auto & [key, value] : update_unit_ptr->found_ids) - { - if (value.found) - for (const auto row : cache_expired_or_not_found_ids[key]) - out[row] = true; - } - return result; } - -void CacheDictionary::createAttributes() +template +ColumnPtr CacheDictionary::getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const { - const auto attributes_size = dict_struct.attributes.size(); - attributes.reserve(attributes_size); - - bytes_allocated += size * sizeof(CellMetadata); - bytes_allocated += attributes_size * sizeof(attributes.front()); - - for (const auto & attribute : dict_struct.attributes) + if (dictionary_key_type == DictionaryKeyType::simple) { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttributeWithTypeAndName(attribute.underlying_type, attribute.name, attribute.null_value)); - - if (attribute.hierarchical) - { - hierarchical_attribute = &attributes.back(); - - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; } + else + return nullptr; } -/* For unknown reason clang-tidy wants this function to be static, but it uses bytes_allocated, which is a class member. - * NOLINT(readability-convert-member-functions-to-static) */ -CacheDictionary::Attribute CacheDictionary::createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value) +template +ColumnUInt8::Ptr CacheDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const { - Attribute attr{type, name, {}, {}}; - - switch (type) + if (dictionary_key_type == DictionaryKeyType::simple) { - /* Macro argument should be enclosed in parentheses, but if do so we cannot initialize \ - * NearestFieldType which takes TYPE as a template parameter. 
*/ -#define DISPATCH(TYPE)\ - case AttributeUnderlyingType::ut##TYPE:\ - {\ - attr.null_value = TYPE(null_value.get>()); /* NOLINT(bugprone-macro-parentheses) */ \ - attr.arrays = std::make_unique>(size); /* NOLINT(bugprone-macro-parentheses) */ \ - bytes_allocated += size * sizeof(TYPE);\ - break;\ - } - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - case AttributeUnderlyingType::utString: { - attr.null_value = null_value.get(); - attr.arrays = std::make_unique>(size); - bytes_allocated += size * sizeof(StringRef); - if (!string_arena) - string_arena = std::make_unique(); - break; - } + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; } - - return attr; + else + return nullptr; } -void CacheDictionary::setDefaultAttributeValue(Attribute & attribute, const Key idx) const +template +MutableColumns CacheDictionary::aggregateColumnsInOrderOfKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & request, + const MutableColumns & fetched_columns, + const PaddedPODArray & key_index_to_state) { - switch (attribute.type) + MutableColumns aggregated_columns = request.makeAttributesResultColumns(); + + /// If keys were returned not in order of keys, aggregate fetched columns in order of requested keys. + + for (size_t fetch_request_index = 0; fetch_request_index < request.attributesSize(); ++fetch_request_index) { - /* Macro argument should be enclosed in parentheses, but if do so we cannot initialize \ - * NearestFieldType which takes TYPE as a template parameter. 
*/ -#define DISPATCH(TYPE)\ - case AttributeUnderlyingType::ut##TYPE:\ - std::get>(attribute.arrays)[idx] = std::get(attribute.null_value); /* NOLINT(bugprone-macro-parentheses) */ \ - break; - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - case AttributeUnderlyingType::utString: - { - const auto & null_value_ref = std::get(attribute.null_value); - auto & string_ref = std::get>(attribute.arrays)[idx]; - - if (string_ref.data != null_value_ref.data()) - { - if (string_ref.data) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - string_ref = StringRef{null_value_ref}; - } - - break; - } - } -} - -void CacheDictionary::setAttributeValue(Attribute & attribute, const Key idx, const Field & value) const -{ - switch (attribute.type) - { - case AttributeUnderlyingType::utUInt8: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt16: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt128: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt8: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt16: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utFloat32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utFloat64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utDecimal32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utDecimal64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utDecimal128: - std::get>(attribute.arrays)[idx] = value.get(); - break; - - case AttributeUnderlyingType::utString: - { - const auto & string = value.get(); - auto & string_ref = std::get>(attribute.arrays)[idx]; - const auto & null_value_ref = std::get(attribute.null_value); - - /// free memory unless it points to a null_value - if (string_ref.data && string_ref.data != null_value_ref.data()) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - const auto str_size = string.size(); - if (str_size != 0) - { - auto * string_ptr = string_arena->alloc(str_size + 1); - std::copy(string.data(), string.data() + str_size + 1, string_ptr); - string_ref = StringRef{string_ptr, str_size}; - } - else - string_ref = {}; - - break; - } - } -} - -CacheDictionary::Attribute & CacheDictionary::getAttribute(const std::string & attribute_name) const -{ - const size_t attr_index = getAttributeIndex(attribute_name); - return attributes[attr_index]; -} - -size_t CacheDictionary::getAttributeIndex(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no 
such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return it->second; -} - -bool CacheDictionary::isEmptyCell(const UInt64 idx) const -{ - return (idx != zero_cell_idx && cells[idx].id == 0) || (cells[idx].deadline == time_point_t()); -} - - -PaddedPODArray CacheDictionary::getCachedIds() const -{ - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - PaddedPODArray array; - for (size_t idx = 0; idx < cells.size(); ++idx) - { - auto & cell = cells[idx]; - if (!isEmptyCell(idx) && !cells[idx].isDefault()) - array.push_back(cell.id); - } - return array; -} - -BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getCachedIds(), column_names); -} - -std::exception_ptr CacheDictionary::getLastException() const -{ - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - return last_exception; -} - -void registerDictionaryCache(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'cache'", - ErrorCodes::UNSUPPORTED_METHOD}; - - if (dict_struct.range_min || dict_struct.range_max) - throw Exception{full_name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; - const auto & layout_prefix = config_prefix + ".layout"; - - const size_t size = config.getUInt64(layout_prefix + ".cache.size_in_cells"); - if (size == 0) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have 0 cells", - ErrorCodes::TOO_SMALL_BUFFER_SIZE}; - - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - if (require_nonempty) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have 'require_nonempty' attribute set", - ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - - const size_t strict_max_lifetime_seconds = - config.getUInt64(layout_prefix + ".cache.strict_max_lifetime_seconds", static_cast(dict_lifetime.max_sec)); - - const size_t max_update_queue_size = - config.getUInt64(layout_prefix + ".cache.max_update_queue_size", 100000); - if (max_update_queue_size == 0) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have empty update queue of size 0", - ErrorCodes::TOO_SMALL_BUFFER_SIZE}; - - const bool allow_read_expired_keys = - config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); - - const size_t update_queue_push_timeout_milliseconds = - config.getUInt64(layout_prefix + ".cache.update_queue_push_timeout_milliseconds", 10); - if (update_queue_push_timeout_milliseconds < 10) - throw Exception{full_name + ": dictionary of layout 'cache' have too little update_queue_push_timeout", - ErrorCodes::BAD_ARGUMENTS}; - - const size_t query_wait_timeout_milliseconds = - config.getUInt64(layout_prefix + ".cache.query_wait_timeout_milliseconds", 60000); - - const size_t max_threads_for_updates = - 
config.getUInt64(layout_prefix + ".max_threads_for_updates", 4); - if (max_threads_for_updates == 0) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have zero threads for updates.", - ErrorCodes::BAD_ARGUMENTS}; - - return std::make_unique( - dict_id, - dict_struct, - std::move(source_ptr), - dict_lifetime, - strict_max_lifetime_seconds, - size, - allow_read_expired_keys, - max_update_queue_size, - update_queue_push_timeout_milliseconds, - query_wait_timeout_milliseconds, - max_threads_for_updates); - }; - factory.registerLayout("cache", create_layout, false); -} - -void CacheDictionary::updateThreadFunction() -{ - setThreadName("AsyncUpdater"); - while (!finished) - { - UpdateUnitPtr popped; - update_queue.pop(popped); - - if (finished) - break; - - try - { - /// Update a bunch of ids. - update(popped); - - /// Notify thread about finished updating the bunch of ids - /// where their own ids were included. - std::unique_lock lock(update_mutex); - - popped->is_done = true; - is_update_finished.notify_all(); - } - catch (...) - { - std::unique_lock lock(update_mutex); - - popped->current_exception = std::current_exception(); - is_update_finished.notify_all(); - } - } -} - -void CacheDictionary::waitForCurrentUpdateFinish(UpdateUnitPtr & update_unit_ptr) const -{ - std::unique_lock update_lock(update_mutex); - - bool result = is_update_finished.wait_for( - update_lock, - std::chrono::milliseconds(query_wait_timeout_milliseconds), - [&] { return update_unit_ptr->is_done || update_unit_ptr->current_exception; }); - - if (!result) - { - throw DB::Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Dictionary {} source seems unavailable, because {}ms timeout exceeded.", - getDictionaryID().getNameForLogs(), toString(query_wait_timeout_milliseconds)); - } - - - if (update_unit_ptr->current_exception) - { - // Don't just rethrow it, because sharing the same exception object - // between multiple threads can lead to weird effects if they decide to - // modify it, for example, by adding some error context. - try - { - std::rethrow_exception(update_unit_ptr->current_exception); - } - catch (...) - { - throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, - "Update failed for dictionary '{}': {}", - getDictionaryID().getNameForLogs(), - getCurrentExceptionMessage(true /*with stack trace*/, - true /*check embedded stack trace*/)); - } - } -} - -void CacheDictionary::tryPushToUpdateQueueOrThrow(UpdateUnitPtr & update_unit_ptr) const -{ - if (!update_queue.tryPush(update_unit_ptr, update_queue_push_timeout_milliseconds)) - throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, - "Cannot push to internal update queue in dictionary {}. " - "Timelimit of {} ms. exceeded. 
Current queue size is {}", - getDictionaryID().getNameForLogs(), std::to_string(update_queue_push_timeout_milliseconds), - std::to_string(update_queue.size())); -} - - -std::vector CacheDictionary::getAttributeValuesFromBlockAtPosition(const std::vector & column_ptrs, size_t position) -{ - std::vector answer; - answer.reserve(column_ptrs.size()); - - for (const auto * pure_column : column_ptrs) - { -#define DISPATCH(TYPE) \ - if (const auto * column = typeid_cast(pure_column)) { \ - answer.emplace_back(column->getElement(position)); \ - continue; \ - } - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal) - DISPATCH(Decimal) - DISPATCH(Decimal) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - if (const auto * column = typeid_cast(pure_column)) - { - answer.emplace_back(column->getDataAt(position).toString()); + if (!request.shouldFillResultColumnWithIndex(fetch_request_index)) continue; + + const auto & aggregated_column = aggregated_columns[fetch_request_index]; + const auto & fetched_column = fetched_columns[fetch_request_index]; + + for (size_t key_index = 0; key_index < keys.size(); ++key_index) + { + auto state = key_index_to_state[key_index]; + + if (state.isNotFound()) + continue; + + aggregated_column->insertFrom(*fetched_column, state.getFetchedColumnIndex()); } } - return answer; + + return aggregated_columns; } -void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) +template +MutableColumns CacheDictionary::aggregateColumns( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & request, + const MutableColumns & fetched_columns_from_storage, + const PaddedPODArray & key_index_to_fetched_columns_from_storage_result, + const MutableColumns & fetched_columns_during_update, + const HashMap & found_keys_to_fetched_columns_during_update_index) { + /** + * Aggregation of columns fetched from storage and from source during update. + * If key was found in storage add it to result. + * If key was found in source during update add it to result. + * If key was not found in storage or in source during update add default value. 
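As a stand-alone illustration of the merge priority described in this comment (plain std containers instead of IColumn and HashMap, a single String attribute, all names hypothetical):

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Merge one attribute in the order of the requested keys.
std::vector<std::string> aggregateAttribute(
    const std::vector<std::uint64_t> & requested_keys,
    const std::unordered_map<std::uint64_t, std::string> & from_storage,          // keys found in the cache
    const std::unordered_map<std::uint64_t, std::string> & fetched_during_update, // keys loaded from the source
    const std::string & default_value)
{
    std::vector<std::string> result;
    result.reserve(requested_keys.size());

    for (auto key : requested_keys)
    {
        if (auto it = from_storage.find(key); it != from_storage.end())
            result.push_back(it->second);            // 1. key was found in storage
        else if (auto it = fetched_during_update.find(key); it != fetched_during_update.end())
            result.push_back(it->second);            // 2. key was found in the source during update
        else
            result.push_back(default_value);         // 3. neither: fall back to the default value
    }

    return result;
}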
+ */ + + MutableColumns aggregated_columns = request.makeAttributesResultColumns(); + + for (size_t fetch_request_index = 0; fetch_request_index < request.attributesSize(); ++fetch_request_index) + { + if (!request.shouldFillResultColumnWithIndex(fetch_request_index)) + continue; + + const auto & aggregated_column = aggregated_columns[fetch_request_index]; + const auto & fetched_column_from_storage = fetched_columns_from_storage[fetch_request_index]; + const auto & fetched_column_during_update = fetched_columns_during_update[fetch_request_index]; + const auto & default_value_provider = request.defaultValueProviderAtIndex(fetch_request_index); + + for (size_t key_index = 0; key_index < keys.size(); ++key_index) + { + auto key = keys[key_index]; + + auto key_state_from_storage = key_index_to_fetched_columns_from_storage_result[key_index]; + if (key_state_from_storage.isFound()) + { + /// Check and insert value if key was fetched from cache + aggregated_column->insertFrom(*fetched_column_from_storage, key_state_from_storage.getFetchedColumnIndex()); + continue; + } + + /// Check and insert value if key was not in cache and was fetched during update + const auto * find_iterator_in_fetch_during_update = found_keys_to_fetched_columns_during_update_index.find(key); + if (find_iterator_in_fetch_during_update) + { + aggregated_column->insertFrom(*fetched_column_during_update, find_iterator_in_fetch_during_update->getMapped()); + continue; + } + + /// Insert default value + aggregated_column->insert(default_value_provider.getDefaultValue(key_index)); + } + } + + return aggregated_columns; +} + +template +BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const +{ + std::shared_ptr stream; + + { + /// Write lock on storage + const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + stream = std::make_shared(shared_from_this(), max_block_size, cache_storage_ptr->getCachedSimpleKeys(), column_names); + else + { + auto keys = cache_storage_ptr->getCachedComplexKeys(); + stream = std::make_shared(shared_from_this(), max_block_size, keys, column_names); + } + } + + return stream; +} + +template +void CacheDictionary::update(CacheDictionaryUpdateUnitPtr update_unit_ptr) +{ + /** + * Update has following flow. + * 1. Filter only necessary keys to request, keys that are expired or not found. + * And create not_found_keys hash_set including each requested key. + * In case of simple_keys we need to fill requested_keys_vector with requested value key. + * In case of complex_keys we need to fill requested_complex_key_rows with requested row. + * 2. Create stream from source with necessary keys to request using method for simple or complex keys. + * 3. Create fetched columns during update variable. This columns will aggregate columns that we fetch from source. + * 4. When block is fetched from source. Split it into keys columns and attributes columns. + * Insert attributes columns into associated fetched columns during update. + * Create KeysExtractor and extract keys from keys columns. + * Update map of requested found key to fetched column index. + * Remove found key from not_found_keys. + * 5. Add aggregated columns during update into storage. + * 6. Add not found keys as default into storage. 
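A condensed, stand-alone sketch of these six steps, with the ClickHouse types replaced by plain containers and the source stream replaced by a callback; it only illustrates the control flow, and every name here (ToyBlock, toyUpdate, the empty-string default marker) is hypothetical:

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct ToyBlock
{
    std::vector<std::uint64_t> keys;     // key column, already extracted
    std::vector<std::string> values;     // one attribute column
};

// Step 1 is assumed done: requested_keys already contains only expired or not-found keys.
void toyUpdate(
    const std::vector<std::uint64_t> & requested_keys,
    const std::function<std::vector<ToyBlock>(const std::vector<std::uint64_t> &)> & load_from_source,
    std::unordered_map<std::uint64_t, std::string> & storage)   // stands in for cache_storage_ptr
{
    std::unordered_set<std::uint64_t> not_found(requested_keys.begin(), requested_keys.end());

    // Steps 2-4: read blocks from the source and remember which requested keys were found.
    std::unordered_map<std::uint64_t, std::string> fetched_during_update;
    for (const auto & block : load_from_source(requested_keys))
    {
        for (size_t i = 0; i < block.keys.size(); ++i)
        {
            fetched_during_update[block.keys[i]] = block.values[i];
            not_found.erase(block.keys[i]);
        }
    }

    // Step 5: insert the values fetched during the update into the cache storage.
    for (const auto & [key, value] : fetched_during_update)
        storage[key] = value;

    // Step 6: remember keys unknown to the source as "default" cells, so the next
    // lookup does not immediately go back to the source (empty string is the toy marker).
    for (auto key : not_found)
        storage[key] = std::string{};
}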
+ */ CurrentMetrics::Increment metric_increment{CurrentMetrics::DictCacheRequests}; - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, update_unit_ptr->requested_ids.size()); - auto & map_ids = update_unit_ptr->found_ids; + size_t found_keys_size = 0; - size_t found_num = 0; + Arena * complex_key_arena = update_unit_ptr->complex_keys_arena_holder.getComplexKeyArena(); + DictionaryKeysExtractor requested_keys_extractor(update_unit_ptr->key_columns, complex_key_arena); + auto requested_keys = requested_keys_extractor.extractAllKeys(); + + HashSet not_found_keys; + + std::vector requested_keys_vector; + std::vector requested_complex_key_rows; + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + requested_keys_vector.reserve(requested_keys.size()); + else + requested_complex_key_rows.reserve(requested_keys.size()); + + auto & key_index_to_state_from_storage = update_unit_ptr->key_index_to_state; + + for (size_t i = 0; i < key_index_to_state_from_storage.size(); ++i) + { + if (key_index_to_state_from_storage[i].isExpired() + || key_index_to_state_from_storage[i].isNotFound()) + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + requested_keys_vector.emplace_back(requested_keys[i]); + else + requested_complex_key_rows.emplace_back(i); + + auto requested_key = requested_keys[i]; + not_found_keys.insert(requested_key); + } + } + + size_t requested_keys_size = update_unit_ptr->keys_to_update_size; + ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, requested_keys_size); + + const auto & fetch_request = update_unit_ptr->request; const auto now = std::chrono::system_clock::now(); @@ -1287,85 +567,77 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) auto current_source_ptr = getSourceAndUpdateIfNeeded(); Stopwatch watch; + BlockInputStreamPtr stream; + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + stream = current_source_ptr->loadIds(requested_keys_vector); + else + stream = current_source_ptr->loadKeys(update_unit_ptr->key_columns, requested_complex_key_rows); - BlockInputStreamPtr stream = current_source_ptr->loadIds(update_unit_ptr->requested_ids); stream->readPrefix(); - while (true) + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + PaddedPODArray found_keys_in_source; + + Columns fetched_columns_during_update = fetch_request.makeAttributesResultColumnsNonMutable(); + + while (Block block = stream->read()) { - Block block = stream->read(); - if (!block) - break; + Columns key_columns; + key_columns.reserve(skip_keys_size_offset); - const auto * id_column = typeid_cast(block.safeGetByPosition(0).column.get()); - if (!id_column) - throw Exception{ErrorCodes::TYPE_MISMATCH, - "{}: id column has type different from UInt64.", getDictionaryID().getNameForLogs()}; + auto block_columns = block.getColumns(); - const auto & ids = id_column->getData(); - - /// cache column pointers - const auto column_ptrs = ext::map( - ext::range(0, attributes.size()), [&block](size_t i) { return block.safeGetByPosition(i + 1).column.get(); }); - - found_num += ids.size(); - - for (const auto i : ext::range(0, ids.size())) + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) { - /// Modifying cache with write lock - ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - const auto id = ids[i]; + key_columns.emplace_back(*block_columns.begin()); + block_columns.erase(block_columns.begin()); + } - const auto cell_idx = 
findCellIdxForSet(id); - auto & cell = cells[cell_idx]; + DictionaryKeysExtractor keys_extractor(key_columns, complex_key_arena); + auto keys_extracted_from_block = keys_extractor.extractAllKeys(); - auto it = map_ids.find(id); + for (size_t index_of_attribute = 0; index_of_attribute < fetched_columns_during_update.size(); ++index_of_attribute) + { + auto & column_to_update = fetched_columns_during_update[index_of_attribute]; + auto column = block.safeGetByPosition(skip_keys_size_offset + index_of_attribute).column; + column_to_update->assumeMutable()->insertRangeFrom(*column, 0, keys_extracted_from_block.size()); + } - /// We have some extra keys from source. Won't add them to cache. - if (it == map_ids.end()) - continue; + for (size_t i = 0; i < keys_extracted_from_block.size(); ++i) + { + auto fetched_key_from_source = keys_extracted_from_block[i]; - auto & all_attributes = it->second; - all_attributes.found = true; - all_attributes.values = getAttributeValuesFromBlockAtPosition(column_ptrs, i); - - for (const auto attribute_idx : ext::range(0, attributes.size())) - { - const auto & attribute_column = *column_ptrs[attribute_idx]; - auto & attribute = attributes[attribute_idx]; - - setAttributeValue(attribute, cell_idx, attribute_column[i]); - } - - /// if cell id is zero and zero does not map to this cell, then the cell is unused - if (cell.id == 0 && cell_idx != zero_cell_idx) - element_count.fetch_add(1, std::memory_order_relaxed); - - cell.id = id; - setLifetime(cell, now); + not_found_keys.erase(fetched_key_from_source); + update_unit_ptr->requested_keys_to_fetched_columns_during_update_index[fetched_key_from_source] = found_keys_size; + found_keys_in_source.emplace_back(fetched_key_from_source); + ++found_keys_size; } } + PaddedPODArray not_found_keys_in_source; + not_found_keys_in_source.reserve(not_found_keys.size()); + + for (auto & cell : not_found_keys) + not_found_keys_in_source.emplace_back(cell.getKey()); + + auto & update_unit_ptr_mutable_columns = update_unit_ptr->fetched_columns_during_update; + for (const auto & fetched_column : fetched_columns_during_update) + update_unit_ptr_mutable_columns.emplace_back(fetched_column->assumeMutable()); + stream->readSuffix(); - /// Lock for cache modification - ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - - for (auto & [key, value] : update_unit_ptr->found_ids) { - if (!value.found) - { - auto cell_idx = findCellIdxForSet(key); - auto & cell = cells[cell_idx]; - cell.id = key; - setLifetime(cell, now); - cell.setDefault(); - } - } + /// Lock for cache modification + ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + cache_storage_ptr->insertColumnsForKeys(found_keys_in_source, fetched_columns_during_update); + cache_storage_ptr->insertDefaultKeys(not_found_keys_in_source); - error_count = 0; - last_exception = std::exception_ptr{}; - backoff_end_time = std::chrono::system_clock::time_point{}; + error_count = 0; + last_exception = std::exception_ptr{}; + backoff_end_time = std::chrono::system_clock::time_point{}; + } ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); } @@ -1394,10 +666,9 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) } } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, update_unit_ptr->requested_ids.size() - found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheRequests); + 
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, requested_keys_size - found_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_keys_size); + ProfileEvents::increment(ProfileEvents::DictCacheRequests); } else { @@ -1409,4 +680,7 @@ void CacheDictionary::update(UpdateUnitPtr & update_unit_ptr) } } +template class CacheDictionary; +template class CacheDictionary; + } diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index 35d38f03cbe..62cd509d006 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -3,72 +3,76 @@ #include #include #include -#include #include #include #include -#include #include -#include -#include -#include -#include -#include + #include -#include + +#include + +#include +#include #include -#include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" - -namespace CurrentMetrics -{ - extern const Metric CacheDictionaryUpdateQueueBatches; - extern const Metric CacheDictionaryUpdateQueueKeys; -} +#include +#include +#include +#include +#include +#include namespace DB { +/** CacheDictionary store keys in cache storage and can asynchronous and synchronous updates during keys fetch. -namespace ErrorCodes -{ -} + If keys are not found in storage during fetch, dictionary start update operation with update queue. -/* - * - * This dictionary is stored in a cache that has a fixed number of cells. - * These cells contain frequently used elements. - * When searching for a dictionary, the cache is searched first and special heuristic is used: - * while looking for the key, we take a look only at max_collision_length elements. - * So, our cache is not perfect. It has errors like "the key is in cache, but the cache says that it does not". - * And in this case we simply ask external source for the key which is faster. - * You have to keep this logic in mind. - * */ + During update operation necessary keys are fetched from source and inserted into storage. + + After that data from storage and source are aggregated and returned to the client. + + Typical flow: + + 1. Client request data during for example getColumn function call. + 2. CacheDictionary request data from storage and if all data is found in storage it returns result to client. + 3. If some data is not in storage cache dictionary try to perform update. + + If all keys are just expired and allow_read_expired_keys option is set dictionary starts asynchronous update and + return result to client. + + If there are not found keys dictionary start synchronous update and wait for result. + + 4. After getting result from synchronous update dictionary aggregates data that was previously fetched from + storage and data that was fetched during update and return result to client. 
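The choice between asynchronous and synchronous update described here reduces to two counters from the storage fetch. A simplified decision helper (not the real method, names are illustrative):

#include <cstddef>

enum class UpdateMode { None, Asynchronous, Synchronous };

// not_found: keys missing from the cache; expired: keys present but past their deadline.
UpdateMode chooseUpdateMode(std::size_t not_found, std::size_t expired, bool allow_read_expired_keys)
{
    if (not_found == 0 && expired == 0)
        return UpdateMode::None;             // everything is served from the cache

    if (not_found == 0 && allow_read_expired_keys)
        return UpdateMode::Asynchronous;     // return the expired values now, refresh in the background

    return UpdateMode::Synchronous;          // some keys are unknown: wait for the source
}

In the asynchronous case the update unit is only pushed to the update queue; in the synchronous case the caller additionally blocks in waitForCurrentUpdateFinish before aggregating the result.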
+ */ +template class CacheDictionary final : public IDictionary { public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by cache dictionary"); + CacheDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, + CacheDictionaryStoragePtr cache_storage_ptr_, + CacheDictionaryUpdateQueueConfiguration update_queue_configuration_, DictionaryLifetime dict_lifetime_, - size_t strict_max_lifetime_seconds, - size_t size_, - bool allow_read_expired_keys_, - size_t max_update_queue_size_, - size_t update_queue_push_timeout_milliseconds_, - size_t query_wait_timeout_milliseconds, - size_t max_threads_for_updates); + bool allow_read_expired_keys_); ~CacheDictionary() override; - std::string getTypeName() const override { return "Cache"; } + std::string getTypeName() const override { return cache_storage_ptr->getName(); } + + size_t getElementCount() const override; size_t getBytesAllocated() const override; + double getLoadFactor() const override; + size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } double getHitRate() const override @@ -76,10 +80,6 @@ public: return static_cast(hit_count.load(std::memory_order_acquire)) / query_count.load(std::memory_order_relaxed); } - size_t getElementCount() const override { return element_count.load(std::memory_order_relaxed); } - - double getLoadFactor() const override { return static_cast(element_count.load(std::memory_order_relaxed)) / size; } - bool supportUpdates() const override { return false; } std::shared_ptr clone() const override @@ -88,14 +88,10 @@ public: getDictionaryID(), dict_struct, getSourceAndUpdateIfNeeded()->clone(), + cache_storage_ptr, + update_queue.getConfiguration(), dict_lifetime, - strict_max_lifetime_seconds, - size, - allow_read_expired_keys, - max_update_queue_size, - update_queue_push_timeout_milliseconds, - query_wait_timeout_milliseconds, - max_threads_for_updates); + allow_read_expired_keys); } const IDictionarySource * getSource() const override; @@ -106,133 +102,61 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - - std::exception_ptr getLastException() const override; - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + DictionaryKeyType getKeyType() const override + { + return dictionary_key_type == DictionaryKeyType::simple ? 
DictionaryKeyType::simple : DictionaryKeyType::complex; + } ColumnPtr getColumn( const std::string& attribute_name, const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; + + Columns getColumns( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; + std::exception_ptr getLastException() const override; + + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + private: - template - using ContainerType = Value[]; - template - using ContainerPtrType = std::unique_ptr>; + using FetchResult = std::conditional_t; - using time_point_t = std::chrono::system_clock::time_point; + static MutableColumns aggregateColumnsInOrderOfKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & request, + const MutableColumns & fetched_columns, + const PaddedPODArray & key_index_to_state); - struct CellMetadata final - { - UInt64 id; - time_point_t deadline; - bool is_default{false}; + static MutableColumns aggregateColumns( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & request, + const MutableColumns & fetched_columns_from_storage, + const PaddedPODArray & key_index_to_fetched_columns_from_storage_result, + const MutableColumns & fetched_columns_during_update, + const HashMap & found_keys_to_fetched_columns_during_update_index); - time_point_t expiresAt() const { return deadline; } - void setExpiresAt(const time_point_t & t) { deadline = t; is_default = false; } - bool isDefault() const { return is_default; } - void setDefault() { is_default = true; } - }; - - using AttributeValue = std::variant< - UInt8, UInt16, UInt32, UInt64, UInt128, - Int8, Int16, Int32, Int64, - Decimal32, Decimal64, Decimal128, - Float32, Float64, String>; - - struct AttributeValuesForKey - { - bool found{false}; - std::vector values; - - std::string dump(); - }; - - using FoundValuesForKeys = std::unordered_map; - - struct Attribute final - { - AttributeUnderlyingType type; - String name; - /// Default value for each type. Could be defined in config. - AttributeValue null_value; - /// We store attribute value for all keys. It is a "row" in a hand-made open addressing hashtable, - /// where "column" is key. 
- std::variant< - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType> - arrays; - }; - - void createAttributes(); - - /* NOLINTNEXTLINE(readability-convert-member-functions-to-static) */ - Attribute createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value); - - template - void getItemsNumberImpl( - Attribute & attribute, - const PaddedPODArray & ids, - ResultArrayType & out, - DefaultValueExtractor & default_value_extractor) const; - - void getItemsString( - Attribute & attribute, - const PaddedPODArray & ids, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const; - - PaddedPODArray getCachedIds() const; - - bool isEmptyCell(const UInt64 idx) const; - - size_t getCellIdx(const Key id) const; - - void setDefaultAttributeValue(Attribute & attribute, const Key idx) const; - - void setAttributeValue(Attribute & attribute, const Key idx, const Field & value) const; - - static std::vector getAttributeValuesFromBlockAtPosition(const std::vector & column_ptrs, size_t position); - - Attribute & getAttribute(const std::string & attribute_name) const; - size_t getAttributeIndex(const std::string & attribute_name) const; - - using SharedDictionarySourcePtr = std::shared_ptr; + void update(CacheDictionaryUpdateUnitPtr update_unit_ptr); /// Update dictionary source pointer if required and return it. Thread safe. /// MultiVersion is not used here because it works with constant pointers. @@ -252,160 +176,38 @@ private: return source_ptr; } - inline void setLifetime(CellMetadata & cell, time_point_t now) - { - if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) - { - std::uniform_int_distribution distribution{dict_lifetime.min_sec, dict_lifetime.max_sec}; - cell.setExpiresAt(now + std::chrono::seconds{distribution(rnd_engine)}); - } - else - { - /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds - /// to the expiration time. And it overflows pretty well. - cell.setExpiresAt(std::chrono::time_point::max() - 2 * std::chrono::seconds(strict_max_lifetime_seconds)); - } - } - - inline bool isExpired(time_point_t now, time_point_t deadline) const - { - return now > deadline; - } - - inline bool isExpiredPermanently(time_point_t now, time_point_t deadline) const - { - return now > deadline + std::chrono::seconds(strict_max_lifetime_seconds); - } - - enum class ResultState - { - NotFound, - FoundAndValid, - FoundButExpired, - /// Here is a gap between there two states in which a key could be read - /// with an enabled setting in config enable_read_expired_keys. 
- FoundButExpiredPermanently - }; - - using FindResult = std::pair; - - FindResult findCellIdxForGet(const Key & id, const time_point_t now) const; - - size_t findCellIdxForSet(const Key & id) const; - - template - void isInImpl(const PaddedPODArray & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; /// Dictionary source should be used with mutex mutable std::mutex source_mutex; mutable SharedDictionarySourcePtr source_ptr; + CacheDictionaryStoragePtr cache_storage_ptr; + mutable CacheDictionaryUpdateQueue update_queue; + const DictionaryLifetime dict_lifetime; - const size_t strict_max_lifetime_seconds; - const bool allow_read_expired_keys; - const size_t max_update_queue_size; - const size_t update_queue_push_timeout_milliseconds; - const size_t query_wait_timeout_milliseconds; - const size_t max_threads_for_updates; Poco::Logger * log; + const bool allow_read_expired_keys; + + mutable pcg64 rnd_engine; + /// This lock is used for the inner cache state update function lock it for /// write, when it need to update cache state all other functions just /// readers. Surprisingly this lock is also used for last_exception pointer. mutable std::shared_mutex rw_lock; - /// Actual size will be increased to match power of 2 - const size_t size; - - /// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111) - const size_t size_overlap_mask; - - /// Max tries to find cell, overlapped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3 - static constexpr size_t max_collision_length = 10; - - const size_t zero_cell_idx{getCellIdx(0)}; - std::map attribute_index_by_name; - mutable std::vector attributes; - mutable std::vector cells; - Attribute * hierarchical_attribute = nullptr; - std::unique_ptr string_arena; - mutable std::exception_ptr last_exception; - mutable std::atomic error_count{0}; + mutable std::atomic error_count {0}; mutable std::atomic backoff_end_time{std::chrono::system_clock::time_point{}}; - mutable pcg64 rnd_engine; - - mutable size_t bytes_allocated = 0; - mutable std::atomic element_count{0}; mutable std::atomic hit_count{0}; mutable std::atomic query_count{0}; - /* - * How the update goes: we basically have a method like get(keys)->values. Values are cached, so sometimes we - * can return them from the cache. For values not in cache, we query them from the source, and add to the - * cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately. - * So, there is a map of found keys to all its attributes. - */ - struct UpdateUnit - { - explicit UpdateUnit(std::vector && requested_ids_) : - requested_ids(std::move(requested_ids_)), - alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, requested_ids.size()) - { - found_ids.reserve(requested_ids.size()); - for (const auto id : requested_ids) - found_ids.insert({id, {}}); - } - - std::vector requested_ids; - FoundValuesForKeys found_ids; - - std::atomic is_done{false}; - std::exception_ptr current_exception{nullptr}; - - /// While UpdateUnit is alive, it is accounted in update_queue size. - CurrentMetrics::Increment alive_batch{CurrentMetrics::CacheDictionaryUpdateQueueBatches}; - CurrentMetrics::Increment alive_keys; - - std::string dumpFoundIds(); - }; - - using UpdateUnitPtr = std::shared_ptr; - using UpdateQueue = ConcurrentBoundedQueue; - - mutable UpdateQueue update_queue; - - ThreadPool update_pool; - - /* - * Actually, we can divide all requested keys into two 'buckets'. 
There are only four possible states and they - * are described in the table. - * - * cache_not_found_ids |0|0|1|1| - * cache_expired_ids |0|1|0|1| - * - * 0 - if set is empty, 1 - otherwise - * - * Only if there are no cache_not_found_ids and some cache_expired_ids - * (with allow_read_expired_keys setting) we can perform async update. - * Otherwise we have no concatenate ids and update them sync. - * - */ - void updateThreadFunction(); - void update(UpdateUnitPtr & update_unit_ptr); - - - void tryPushToUpdateQueueOrThrow(UpdateUnitPtr & update_unit_ptr) const; - void waitForCurrentUpdateFinish(UpdateUnitPtr & update_unit_ptr) const; - - mutable std::mutex update_mutex; - mutable std::condition_variable is_update_finished; - - std::atomic finished{false}; }; +extern template class CacheDictionary; +extern template class CacheDictionary; + } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h new file mode 100644 index 00000000000..720800e6357 --- /dev/null +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -0,0 +1,711 @@ +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +struct CacheDictionaryStorageConfiguration +{ + /// Max size of storage in cells + const size_t max_size_in_cells; + /// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime. + const size_t strict_max_lifetime_seconds; + /// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds. + const DictionaryLifetime lifetime; +}; + +/** ICacheDictionaryStorage implementation that keeps key in hash table with fixed collision length. + * Value in hash table point to index in attributes arrays. 
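A compact sketch of the layout this comment describes, assuming a single UInt64 attribute; ToyCell and ToyStorage are illustrative stand-ins, not the real classes:

#include <cstddef>
#include <cstdint>
#include <ctime>
#include <vector>

struct ToyCell
{
    std::uint64_t key = 0;
    std::size_t element_index = 0;   // position of this key's values in every attribute container
    bool is_default = false;
    std::time_t deadline = 0;        // 0 means the cell was never used
};

struct ToyStorage
{
    std::vector<ToyCell> cells;                      // hash table, size is a power of two
    std::vector<std::uint64_t> attribute_values;     // one container per attribute, indexed by element_index

    explicit ToyStorage(std::size_t cells_size) : cells(cells_size) {}

    // Reading an attribute for a cell is a plain array access.
    std::uint64_t valueAt(const ToyCell & cell) const { return attribute_values[cell.element_index]; }

    // Unlike the real storage, this toy version always appends; the real one
    // reuses cell.element_index when a cell is overwritten.
    void insert(ToyCell & cell, std::uint64_t key, std::uint64_t value, std::time_t deadline)
    {
        attribute_values.push_back(value);
        cell.key = key;
        cell.element_index = attribute_values.size() - 1;
        cell.is_default = false;
        cell.deadline = deadline;
    }
};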
+ */ +template +class CacheDictionaryStorage final : public ICacheDictionaryStorage +{ + + static constexpr size_t max_collision_length = 10; + +public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); + + explicit CacheDictionaryStorage( + const DictionaryStructure & dictionary_structure, + CacheDictionaryStorageConfiguration & configuration_) + : configuration(configuration_) + , rnd_engine(randomSeed()) + { + size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length)); + + cells.resize_fill(cells_size); + size_overlap_mask = cells_size - 1; + + setup(dictionary_structure); + } + + bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } + + String getName() const override + { + if (dictionary_key_type == DictionaryKeyType::simple) + return "Cache"; + else + return "ComplexKeyCache"; + } + + bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } + + SimpleKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return fetchColumnsForKeysImpl(keys, fetch_request); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fetchColumnsForKeys is not supported for complex key storage"); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for complex key storage"); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertDefaultKeysImpl(keys); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeysImpl is not supported for complex key storage"); + } + + PaddedPODArray getCachedSimpleKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return getCachedKeysImpl(); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedSimpleKeys is not supported for complex key storage"); + } + + bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } + + ComplexKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & column_fetch_requests) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return fetchColumnsForKeysImpl(keys, column_fetch_requests); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fetchColumnsForKeys is not supported for simple key storage"); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for simple key storage"); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertDefaultKeysImpl(keys); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeysImpl is not supported for simple 
key storage"); + } + + PaddedPODArray getCachedComplexKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return getCachedKeysImpl(); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedComplexKeys is not supported for simple key storage"); + } + + size_t getSize() const override { return size; } + + double getLoadFactor() const override { return static_cast(size) / configuration.max_size_in_cells; } + + size_t getBytesAllocated() const override + { + size_t attributes_size_in_bytes = 0; + size_t attributes_size = attributes.size(); + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + attributes_size_in_bytes += container.capacity() * sizeof(container[0]); + }); + } + + return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes; + } + +private: + + struct FetchedKey + { + FetchedKey(size_t element_index_, bool is_default_) + : element_index(element_index_) + , is_default(is_default_) + {} + + size_t element_index; + bool is_default; + }; + + template + KeysStorageFetchResult fetchColumnsForKeysImpl( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) + { + KeysStorageFetchResult result; + + result.fetched_columns = fetch_request.makeAttributesResultColumns(); + result.key_index_to_state.resize_fill(keys.size()); + + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + size_t fetched_columns_index = 0; + size_t keys_size = keys.size(); + + PaddedPODArray fetched_keys; + fetched_keys.resize_fill(keys_size); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys[key_index]; + auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now); + + if (unlikely(key_state == KeyState::not_found)) + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } + + auto & cell = cells[cell_index]; + + result.expired_keys_size += static_cast(key_state == KeyState::expired); + + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default); + + ++fetched_columns_index; + + result.key_index_to_state[key_index].setDefaultValue(cell.is_default); + result.default_keys_size += cell.is_default; + } + + result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size); + + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) + { + if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + auto & attribute = attributes[attribute_index]; + const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + + size_t fetched_keys_size = fetched_keys.size(); + auto & fetched_column = *result.fetched_columns[attribute_index]; + fetched_column.reserve(fetched_keys_size); + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + 
fetched_column.insert(container[fetched_key.element_index]); + } + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; + + auto & container = std::get>(attribute.attribute_container); + ColumnType & column_typed = static_cast(fetched_column); + + if constexpr (std::is_same_v) + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + column_typed.insertData(item.data, item.size); + } + } + } + else + { + auto & data = column_typed.getData(); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + data.push_back(item); + } + } + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + } + } + + return result; + } + + void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) + { + const auto now = std::chrono::system_clock::now(); + + Field column_value; + + for (size_t key_index = 0; key_index < keys.size(); ++key_index) + { + auto key = keys[key_index]; + + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + bool cell_was_default = cell.is_default; + cell.is_default = false; + + bool was_inserted = cell.deadline == 0; + + if (was_inserted) + { + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; + + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) + { + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; + + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container.back() = column_value; + else if constexpr (std::is_same_v) + { + const String & string_value = column_value.get(); + StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_value_ref); + container.back() = inserted_value; + } + else + container.back() = column_value.get>(); + }); + } + + ++size; + } + else + { + if (cell.key != key) + { + if constexpr (std::is_same_v) + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; + } + + /// Put values into existing index + size_t index_to_use = cell.element_index; + + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) + { + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) + { + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container[index_to_use] = column_value; + else if constexpr 
(std::is_same_v) + { + const String & string_value = column_value.get(); + StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_ref_value); + + if (!cell_was_default) + { + StringRef previous_value = container[index_to_use]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + + container[index_to_use] = inserted_value; + } + else + container[index_to_use] = column_value.get>(); + }); + } + } + + setCellDeadline(cell, now); + } + } + + void insertDefaultKeysImpl(const PaddedPODArray & keys) + { + const auto now = std::chrono::system_clock::now(); + + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys[key_index]; + + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + bool was_inserted = cell.deadline == 0; + bool cell_was_default = cell.is_default; + + cell.is_default = true; + + if (was_inserted) + { + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; + }); + } + + ++size; + } + else + { + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + using ElementType = std::decay_t; + + if constexpr (std::is_same_v) + { + if (!cell_was_default) + { + StringRef previous_value = container[cell.element_index]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + } + }); + } + + if (cell.key != key) + { + if constexpr (std::is_same_v) + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; + } + } + + setCellDeadline(cell, now); + } + } + + PaddedPODArray getCachedKeysImpl() const + { + PaddedPODArray result; + result.reserve(size); + + for (auto & cell : cells) + { + if (cell.deadline == 0) + continue; + + if (cell.is_default) + continue; + + result.emplace_back(cell.key); + } + + return result; + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) + { + auto & attribute = attributes[attribute_index]; + auto & attribute_type = attribute.type; + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const + { + return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); + } + + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef 
updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + + void setup(const DictionaryStructure & dictionary_structure) + { + /// For each dictionary attribute create storage attribute + /// For simple attributes create PODArray, for complex vector of Fields + + attributes.reserve(dictionary_structure.attributes.size()); + + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + + using TimePoint = std::chrono::system_clock::time_point; + + struct Cell + { + KeyType key; + size_t element_index; + bool is_default; + time_t deadline; + }; + + struct Attribute + { + AttributeUnderlyingType type; + bool is_complex_type; + + std::variant< + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + std::vector> attribute_container; + }; + + CacheDictionaryStorageConfiguration configuration; + + pcg64 rnd_engine; + + size_t size_overlap_mask = 0; + + size_t size = 0; + + PaddedPODArray cells; + + ArenaWithFreeLists arena; + + std::vector attributes; + + inline void setCellDeadline(Cell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. 
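The comment above refers to the expiration checks, which add strict_max_lifetime_seconds to the deadline computed just below; a deadline of time_point::max() would overflow there, hence the back-off by twice that amount. A minimal illustration using std::chrono directly (helper names are illustrative):

#include <chrono>
#include <cstdint>

using TimePoint = std::chrono::system_clock::time_point;

// "Never expires" deadline: back off from max() so that the checks below,
// which add strict_max_lifetime_seconds to the deadline, cannot overflow.
TimePoint eternalDeadline(std::uint64_t strict_max_lifetime_seconds)
{
    return TimePoint::max() - 2 * std::chrono::seconds(strict_max_lifetime_seconds);
}

bool isExpired(TimePoint now, TimePoint deadline)
{
    return now > deadline;
}

bool isExpiredPermanently(TimePoint now, TimePoint deadline, std::uint64_t strict_max_lifetime_seconds)
{
    // Would overflow for deadline == TimePoint::max(); safe for eternalDeadline().
    return now > deadline + std::chrono::seconds(strict_max_lifetime_seconds);
}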
+ auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); + } + + inline size_t getCellIndex(const KeyType key) const + { + const size_t hash = DefaultHash()(key); + const size_t index = hash & size_overlap_mask; + return index; + } + + using KeyStateAndCellIndex = std::pair; + + inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + + time_t max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); + + for (; place_value < place_value_end; ++place_value) + { + const auto cell_place_value = place_value & size_overlap_mask; + const auto & cell = cells[cell_place_value]; + + if (cell.key != key) + continue; + + if (unlikely(now > cell.deadline + max_lifetime_seconds)) + return std::make_pair(KeyState::not_found, cell_place_value); + + if (unlikely(now > cell.deadline)) + return std::make_pair(KeyState::expired, cell_place_value); + + return std::make_pair(KeyState::found, cell_place_value); + } + + return std::make_pair(KeyState::not_found, place_value & size_overlap_mask); + } + + inline size_t getCellIndexForInsert(const KeyType & key) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + size_t oldest_place_value = place_value; + + time_t oldest_time = std::numeric_limits::max(); + + for (; place_value < place_value_end; ++place_value) + { + const size_t cell_place_value = place_value & size_overlap_mask; + const Cell cell = cells[cell_place_value]; + + if (cell.deadline == 0) + return cell_place_value; + + if (cell.key == key) + return cell_place_value; + + if (cell.deadline < oldest_time) + { + oldest_time = cell.deadline; + oldest_place_value = cell_place_value; + } + } + + return oldest_place_value; + } +}; + +} diff --git a/src/Dictionaries/CacheDictionaryUpdateQueue.cpp b/src/Dictionaries/CacheDictionaryUpdateQueue.cpp new file mode 100644 index 00000000000..310abed822f --- /dev/null +++ b/src/Dictionaries/CacheDictionaryUpdateQueue.cpp @@ -0,0 +1,161 @@ +#include "CacheDictionaryUpdateQueue.h" + +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CACHE_DICTIONARY_UPDATE_FAIL; + extern const int UNSUPPORTED_METHOD; + extern const int TIMEOUT_EXCEEDED; +} + +template class CacheDictionaryUpdateUnit; +template class CacheDictionaryUpdateUnit; + +template +CacheDictionaryUpdateQueue::CacheDictionaryUpdateQueue( + String dictionary_name_for_logs_, + CacheDictionaryUpdateQueueConfiguration configuration_, + UpdateFunction && update_func_) + : dictionary_name_for_logs(std::move(dictionary_name_for_logs_)) + , configuration(configuration_) + , update_func(std::move(update_func_)) + , update_queue(configuration.max_update_queue_size) + , update_pool(configuration.max_threads_for_updates) +{ + for (size_t i = 0; i < configuration.max_threads_for_updates; ++i) + update_pool.scheduleOrThrowOnError([this] { updateThreadFunction(); }); +} + 
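The cell lookup in CacheDictionaryStorage.h above (getKeyStateAndCellIndex and getCellIndexForInsert) is a bounded linear probe over a power-of-two table: at most max_collision_length slots are inspected, and an insert either reuses an empty or matching cell or evicts the cell with the oldest deadline. A stripped-down version with a trivial hash, for illustration only (ProbeCell and cellIndexForInsert are hypothetical names):

#include <cstdint>
#include <ctime>
#include <functional>
#include <limits>
#include <vector>

struct ProbeCell
{
    std::uint64_t key = 0;
    std::time_t deadline = 0;   // 0 means "never used"
};

constexpr size_t max_collision_length = 10;

// cells.size() must be a power of two so that `hash & mask` wraps around cheaply.
size_t cellIndexForInsert(const std::vector<ProbeCell> & cells, std::uint64_t key)
{
    const size_t mask = cells.size() - 1;
    const size_t start = std::hash<std::uint64_t>{}(key) & mask;

    size_t oldest_place = start;
    std::time_t oldest_deadline = std::numeric_limits<std::time_t>::max();

    for (size_t probe = 0; probe < max_collision_length; ++probe)
    {
        const size_t index = (start + probe) & mask;
        const ProbeCell & cell = cells[index];

        if (cell.deadline == 0 || cell.key == key)
            return index;                        // empty slot, or the key is already here

        if (cell.deadline < oldest_deadline)     // remember the best eviction candidate
        {
            oldest_deadline = cell.deadline;
            oldest_place = index;
        }
    }

    return oldest_place;                         // no free slot within the probe window: evict the oldest
}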
+template +CacheDictionaryUpdateQueue::~CacheDictionaryUpdateQueue() +{ + try { + if (!finished) + stopAndWait(); + } + catch (...) + { + /// TODO: Write log + } +} + +template +void CacheDictionaryUpdateQueue::tryPushToUpdateQueueOrThrow(CacheDictionaryUpdateUnitPtr & update_unit_ptr) +{ + if (finished) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "CacheDictionaryUpdateQueue finished"); + + if (!update_queue.tryPush(update_unit_ptr, configuration.update_queue_push_timeout_milliseconds)) + throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, + "Cannot push to internal update queue in dictionary {}. " + "Timelimit of {} ms. exceeded. Current queue size is {}", + dictionary_name_for_logs, + std::to_string(configuration.update_queue_push_timeout_milliseconds), + std::to_string(update_queue.size())); +} + +template +void CacheDictionaryUpdateQueue::waitForCurrentUpdateFinish(CacheDictionaryUpdateUnitPtr & update_unit_ptr) const +{ + if (finished) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "CacheDictionaryUpdateQueue finished"); + + std::unique_lock update_lock(update_mutex); + + bool result = is_update_finished.wait_for( + update_lock, + std::chrono::milliseconds(configuration.query_wait_timeout_milliseconds), + [&] + { + return update_unit_ptr->is_done || update_unit_ptr->current_exception; + }); + + if (!result) + { + throw DB::Exception( + ErrorCodes::TIMEOUT_EXCEEDED, + "Dictionary {} source seems unavailable, because {} ms timeout exceeded.", + dictionary_name_for_logs, + toString(configuration.query_wait_timeout_milliseconds)); + } + + if (update_unit_ptr->current_exception) + { + // Don't just rethrow it, because sharing the same exception object + // between multiple threads can lead to weird effects if they decide to + // modify it, for example, by adding some error context. + try + { + std::rethrow_exception(update_unit_ptr->current_exception); + } + catch (...) + { + throw DB::Exception( + ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, + "Update failed for dictionary '{}': {}", + dictionary_name_for_logs, + getCurrentExceptionMessage(true /*with stack trace*/, true /*check embedded stack trace*/)); + } + } +} + +template +void CacheDictionaryUpdateQueue::stopAndWait() +{ + finished = true; + update_queue.clear(); + + for (size_t i = 0; i < configuration.max_threads_for_updates; ++i) + { + auto empty_finishing_ptr = std::make_shared>(); + update_queue.push(empty_finishing_ptr); + } + + update_pool.wait(); +} + +template +void CacheDictionaryUpdateQueue::updateThreadFunction() +{ + setThreadName("UpdQueue"); + + while (!finished) + { + CacheDictionaryUpdateUnitPtr unit_to_update; + update_queue.pop(unit_to_update); + + if (finished) + break; + + try + { + /// Update + update_func(unit_to_update); + + /// Notify thread about finished updating the bunch of ids + /// where their own ids were included. + std::unique_lock lock(update_mutex); + + unit_to_update->is_done = true; + is_update_finished.notify_all(); + } + catch (...) 
+ { + std::unique_lock lock(update_mutex); + + unit_to_update->current_exception = std::current_exception(); // NOLINT(bugprone-throw-keyword-missing) + is_update_finished.notify_all(); + } + } +} + +template class CacheDictionaryUpdateQueue; +template class CacheDictionaryUpdateQueue; + +} diff --git a/src/Dictionaries/CacheDictionaryUpdateQueue.h b/src/Dictionaries/CacheDictionaryUpdateQueue.h new file mode 100644 index 00000000000..3d27a157752 --- /dev/null +++ b/src/Dictionaries/CacheDictionaryUpdateQueue.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace CurrentMetrics +{ + extern const Metric CacheDictionaryUpdateQueueBatches; + extern const Metric CacheDictionaryUpdateQueueKeys; +} + +namespace DB +{ + +/** This class is passed between update queue and update queue client during update. + + For simple keys we pass simple keys. + + For complex keys we pass complex keys columns and requested rows to update. + + During update cache dictionary should fill requested_keys_to_fetched_columns_during_update_index and + fetched_columns_during_update. + + For complex key to extend lifetime of key complex key arena should be used. +*/ +template +class CacheDictionaryUpdateUnit +{ +public: + using KeyType = std::conditional_t; + + /// Constructor for complex keys update request + explicit CacheDictionaryUpdateUnit( + const Columns & key_columns_, + const PaddedPODArray & key_index_to_state_from_storage_, + const DictionaryStorageFetchRequest & request_, + size_t keys_to_update_size_) + : key_columns(key_columns_) + , key_index_to_state(key_index_to_state_from_storage_.begin(), key_index_to_state_from_storage_.end()) + , request(request_) + , keys_to_update_size(keys_to_update_size_) + , alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, keys_to_update_size) + {} + + CacheDictionaryUpdateUnit() + : keys_to_update_size(0) + , alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, 0) + {} + + const Columns key_columns; + const PaddedPODArray key_index_to_state; + const DictionaryStorageFetchRequest request; + const size_t keys_to_update_size; + + HashMap requested_keys_to_fetched_columns_during_update_index; + MutableColumns fetched_columns_during_update; + + /// Complex keys are serialized in this arena + DictionaryKeysArenaHolder complex_keys_arena_holder; + +private: + template + friend class CacheDictionaryUpdateQueue; + + std::atomic is_done{false}; + std::exception_ptr current_exception{nullptr}; + + /// While UpdateUnit is alive, it is accounted in update_queue size. 
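The update path shown above hands a shared update unit to a worker thread, which marks it done or stores an exception under `update_mutex` and signals `is_update_finished`, while the caller waits on the condition variable for at most `query_wait_timeout_milliseconds` and then rethrows a fresh exception instead of sharing the worker's exception object between threads. The following is a minimal, self-contained sketch of that handshake using only the standard library; `UpdateUnitSketch` and the hard-coded timeout are illustrative and not part of the patch.

```cpp
#include <chrono>
#include <condition_variable>
#include <exception>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>

/// Illustrative stand-in for the shared state of an update unit.
struct UpdateUnitSketch
{
    bool is_done = false;
    std::exception_ptr current_exception;
};

int main()
{
    std::mutex update_mutex;
    std::condition_variable is_update_finished;
    auto unit = std::make_shared<UpdateUnitSketch>();

    /// Worker: perform the update, publish the result (or the exception) under the mutex, then notify.
    std::thread worker([&]
    {
        try
        {
            /// ... fetch the requested keys from the dictionary source here ...
            std::lock_guard<std::mutex> lock(update_mutex);
            unit->is_done = true;
        }
        catch (...)
        {
            std::lock_guard<std::mutex> lock(update_mutex);
            unit->current_exception = std::current_exception();
        }
        is_update_finished.notify_all();
    });

    /// Waiter: block until the unit is done or failed, but never longer than the timeout.
    {
        std::unique_lock<std::mutex> lock(update_mutex);
        const bool finished_in_time = is_update_finished.wait_for(
            lock,
            std::chrono::milliseconds(1000),
            [&] { return unit->is_done || unit->current_exception; });

        if (!finished_in_time)
            std::cerr << "update timed out\n";
        else if (unit->current_exception)
        {
            /// Turn the stored exception into a fresh error report instead of sharing the object.
            try { std::rethrow_exception(unit->current_exception); }
            catch (const std::exception & e) { std::cerr << "update failed: " << e.what() << '\n'; }
        }
        else
            std::cout << "update finished\n";
    }

    worker.join();
    return 0;
}
```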
+    CurrentMetrics::Increment alive_batch{CurrentMetrics::CacheDictionaryUpdateQueueBatches};
+    CurrentMetrics::Increment alive_keys;
+};
+
+template <DictionaryKeyType dictionary_key_type>
+using CacheDictionaryUpdateUnitPtr = std::shared_ptr<CacheDictionaryUpdateUnit<dictionary_key_type>>;
+
+extern template class CacheDictionaryUpdateUnit<DictionaryKeyType::simple>;
+extern template class CacheDictionaryUpdateUnit<DictionaryKeyType::complex>;
+
+struct CacheDictionaryUpdateQueueConfiguration
+{
+    /// Size of update queue
+    const size_t max_update_queue_size;
+    /// Size of the thread pool serving the update queue
+    const size_t max_threads_for_updates;
+    /// Timeout for trying to push an update unit into the queue
+    const size_t update_queue_push_timeout_milliseconds;
+    /// Timeout during synchronous waiting for an update unit
+    const size_t query_wait_timeout_milliseconds;
+};
+
+/** Responsibility of this class is to provide asynchronous and synchronous update support for CacheDictionary.
+
+    It is the responsibility of CacheDictionary to perform the update with the UpdateUnit using the UpdateFunction.
+*/
+template <DictionaryKeyType dictionary_key_type>
+class CacheDictionaryUpdateQueue
+{
+public:
+    /// Client of update queue must provide this function in constructor and perform update using update unit.
+    using UpdateFunction = std::function<void (CacheDictionaryUpdateUnitPtr<dictionary_key_type>)>;
+    static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryUpdateQueue");
+
+    CacheDictionaryUpdateQueue(
+        String dictionary_name_for_logs_,
+        CacheDictionaryUpdateQueueConfiguration configuration_,
+        UpdateFunction && update_func_);
+
+    ~CacheDictionaryUpdateQueue();
+
+    /// Get configuration that was passed to constructor
+    const CacheDictionaryUpdateQueueConfiguration & getConfiguration() const { return configuration; }
+
+    /// Is queue finished
+    bool isFinished() const { return finished; }
+
+    /// Synchronously wait for the update queue to stop
+    void stopAndWait();
+
+    /** Try to add an update unit into the queue.
+
+        If the queue is full and the push cannot be performed within update_queue_push_timeout_milliseconds
+        from the configuration, an exception is thrown.
+
+        If the queue is already finished, an exception is thrown.
+    */
+    void tryPushToUpdateQueueOrThrow(CacheDictionaryUpdateUnitPtr<dictionary_key_type> & update_unit_ptr);
+
+    /** Try to synchronously wait for the update to complete.
+
+        If an exception was raised by the update function during the update, it is rethrown.
+
+        If the update does not finish within query_wait_timeout_milliseconds from the configuration,
+        an exception is thrown.
+
+        If the queue is already finished, an exception is thrown.
+ */ + void waitForCurrentUpdateFinish(CacheDictionaryUpdateUnitPtr & update_unit_ptr) const; + +private: + void updateThreadFunction(); + + using UpdateQueue = ConcurrentBoundedQueue>; + + String dictionary_name_for_logs; + + CacheDictionaryUpdateQueueConfiguration configuration; + UpdateFunction update_func; + + UpdateQueue update_queue; + ThreadPool update_pool; + + mutable std::mutex update_mutex; + mutable std::condition_variable is_update_finished; + + std::atomic finished{false}; +}; + +extern template class CacheDictionaryUpdateQueue; +extern template class CacheDictionaryUpdateQueue; + +} diff --git a/src/Dictionaries/CassandraBlockInputStream.cpp b/src/Dictionaries/CassandraBlockInputStream.cpp index 721cb44a82e..4e71c212451 100644 --- a/src/Dictionaries/CassandraBlockInputStream.cpp +++ b/src/Dictionaries/CassandraBlockInputStream.cpp @@ -142,7 +142,7 @@ void CassandraBlockInputStream::insertValue(IColumn & column, ValueType type, co break; } default: - throw Exception("Unknown type : " + std::to_string(static_cast(type)), ErrorCodes::UNKNOWN_TYPE); + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown type : {}", std::to_string(static_cast(type))); } } @@ -256,7 +256,7 @@ void CassandraBlockInputStream::assertTypes(const CassResultPtr & result) expected_text = "uuid"; break; default: - throw Exception("Unknown type : " + std::to_string(static_cast(description.types[i].first)), ErrorCodes::UNKNOWN_TYPE); + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown type : {}", std::to_string(static_cast(description.types[i].first))); } CassValueType got = cass_result_column_type(result, i); @@ -267,8 +267,10 @@ void CassandraBlockInputStream::assertTypes(const CassResultPtr & result) continue; const auto & column_name = description.sample_block.getColumnsWithTypeAndName()[i].name; - throw Exception("Type mismatch for column " + column_name + ": expected Cassandra type " + expected_text, - ErrorCodes::TYPE_MISMATCH); + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Type mismatch for column {} : expected Cassandra type {}", + column_name, + expected_text); } } diff --git a/src/Dictionaries/CassandraDictionarySource.cpp b/src/Dictionaries/CassandraDictionarySource.cpp index 643e9af54b8..b4f7f3221bc 100644 --- a/src/Dictionaries/CassandraDictionarySource.cpp +++ b/src/Dictionaries/CassandraDictionarySource.cpp @@ -17,7 +17,7 @@ void registerDictionarySourceCassandra(DictionarySourceFactory & factory) [[maybe_unused]] const Poco::Util::AbstractConfiguration & config, [[maybe_unused]] const std::string & config_prefix, [[maybe_unused]] Block & sample_block, - const Context & /* context */, + ContextPtr /* context */, const std::string & /* default_database */, bool /*check_config*/) -> DictionarySourcePtr { @@ -25,8 +25,8 @@ void registerDictionarySourceCassandra(DictionarySourceFactory & factory) setupCassandraDriverLibraryLogging(CASS_LOG_INFO); return std::make_unique(dict_struct, config, config_prefix + ".cassandra", sample_block); #else - throw Exception{"Dictionary source of type `cassandra` is disabled because ClickHouse was built without cassandra support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Dictionary source of type `cassandra` is disabled because ClickHouse was built without cassandra support."); #endif }; factory.registerSource("cassandra", create_table_source); @@ -90,7 +90,7 @@ void CassandraSettings::setConsistency(const String & config_str) else if (config_str == "LocalSerial") consistency = CASS_CONSISTENCY_LOCAL_SERIAL; 
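Most hunks in the remainder of this patch rewrite `throw Exception(message + arg, code)` into the code-first form `throw Exception(code, "message {}", arg)`, where the message is a format string with `{}` placeholders. The toy example below only illustrates the shape of the two styles, assuming the {fmt} library is available; `ToyException`, `makeOldStyle`, `makeNewStyle` and the placeholder error code are invented for illustration and do not reflect how `DB::Exception` is implemented.

```cpp
#include <iostream>
#include <string>

#include <fmt/format.h>

/// Placeholder error code for illustration only.
constexpr int INVALID_CONFIG_PARAMETER = 1;

/// Toy stand-in for an exception type carrying a code and a message.
struct ToyException
{
    int code;
    std::string message;
};

/// Old style (left-hand side of the hunks): message built by concatenation, code last.
ToyException makeOldStyle(const std::string & config_str)
{
    return {INVALID_CONFIG_PARAMETER, "Unsupported consistency level: " + config_str};
}

/// New style (right-hand side): code first, message as a format string with placeholders.
ToyException makeNewStyle(const std::string & config_str)
{
    return {INVALID_CONFIG_PARAMETER, fmt::format("Unsupported consistency level: {}", config_str)};
}

int main()
{
    std::cout << makeOldStyle("Quorum").message << '\n';
    std::cout << makeNewStyle("Quorum").message << '\n';
}
```

Keeping the arguments separate from the text avoids repeated `std::to_string` conversions at the call site and makes the error code the first thing a reader sees.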
else /// CASS_CONSISTENCY_ANY is only valid for writes - throw Exception("Unsupported consistency level: " + config_str, ErrorCodes::INVALID_CONFIG_PARAMETER); + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Unsupported consistency level: {}", config_str); } static const size_t max_block_size = 8192; @@ -156,7 +156,7 @@ BlockInputStreamPtr CassandraDictionarySource::loadIds(const std::vector BlockInputStreamPtr CassandraDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { if (requested_rows.empty()) - throw Exception("No rows requested", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No rows requested"); /// TODO is there a better way to load data by complex keys? std::unordered_map> partitions; @@ -185,7 +185,7 @@ BlockInputStreamPtr CassandraDictionarySource::loadKeys(const Columns & key_colu BlockInputStreamPtr CassandraDictionarySource::loadUpdatedAll() { - throw Exception("Method loadUpdatedAll is unsupported for CassandraDictionarySource", ErrorCodes::NOT_IMPLEMENTED); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadUpdatedAll is unsupported for CassandraDictionarySource"); } CassSessionShared CassandraDictionarySource::getSession() diff --git a/src/Dictionaries/CassandraHelpers.cpp b/src/Dictionaries/CassandraHelpers.cpp index 6de80a455c7..81f7d6d9a63 100644 --- a/src/Dictionaries/CassandraHelpers.cpp +++ b/src/Dictionaries/CassandraHelpers.cpp @@ -16,8 +16,10 @@ extern const int CASSANDRA_INTERNAL_ERROR; void cassandraCheck(CassError code) { if (code != CASS_OK) - throw Exception("Cassandra driver error " + std::to_string(code) + ": " + cass_error_desc(code), - ErrorCodes::CASSANDRA_INTERNAL_ERROR); + throw Exception(ErrorCodes::CASSANDRA_INTERNAL_ERROR, + "Cassandra driver error {}: {}", + std::to_string(code), + cass_error_desc(code)); } @@ -31,8 +33,12 @@ void cassandraWaitAndCheck(CassFuturePtr & future) const char * message; size_t message_len; cass_future_error_message(future, &message, & message_len); - std::string full_message = "Cassandra driver error " + std::to_string(code) + ": " + cass_error_desc(code) + ": " + message; - throw Exception(full_message, ErrorCodes::CASSANDRA_INTERNAL_ERROR); + + throw Exception(ErrorCodes::CASSANDRA_INTERNAL_ERROR, + "Cassandra driver error {}: {}: {}", + std::to_string(code), + cass_error_desc(code), + message); } static std::once_flag setup_logging_flag; diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 8d733bcd90a..fdb0d76a8d7 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -18,111 +18,85 @@ namespace DB { -static const size_t MAX_CONNECTIONS = 16; - -inline static UInt16 getPortFromContext(const Context & context, bool secure) +namespace ErrorCodes { - return secure ? context.getTCPPortSecure().value_or(0) : context.getTCPPort(); + extern const int BAD_ARGUMENTS; } -static ConnectionPoolWithFailoverPtr createPool( - const std::string & host, - UInt16 port, - bool secure, - const std::string & db, - const std::string & user, - const std::string & password) +namespace { - ConnectionPoolPtrs pools; - pools.emplace_back(std::make_shared( - MAX_CONNECTIONS, - host, - port, - db, - user, - password, - "", /* cluster */ - "", /* cluster_secret */ - "ClickHouseDictionarySource", - Protocol::Compression::Enable, - secure ? 
Protocol::Secure::Enable : Protocol::Secure::Disable)); - return std::make_shared(pools, LoadBalancing::RANDOM); -} + constexpr size_t MAX_CONNECTIONS = 16; + inline UInt16 getPortFromContext(ContextPtr context, bool secure) + { + return secure ? context->getTCPPortSecure().value_or(0) : context->getTCPPort(); + } + + ConnectionPoolWithFailoverPtr createPool(const ClickHouseDictionarySource::Configuration & configuration) + { + if (configuration.is_local) + return nullptr; + + ConnectionPoolPtrs pools; + pools.emplace_back(std::make_shared( + MAX_CONNECTIONS, + configuration.host, + configuration.port, + configuration.db, + configuration.user, + configuration.password, + "", /* cluster */ + "", /* cluster_secret */ + "ClickHouseDictionarySource", + Protocol::Compression::Enable, + configuration.secure ? Protocol::Secure::Enable : Protocol::Secure::Disable)); + + return std::make_shared(pools, LoadBalancing::RANDOM); + } + +} ClickHouseDictionarySource::ClickHouseDictionarySource( const DictionaryStructure & dict_struct_, - const Poco::Util::AbstractConfiguration & config, - const std::string & path_to_settings, - const std::string & config_prefix, + const Configuration & configuration_, const Block & sample_block_, - const Context & context_, - const std::string & default_database) + ContextPtr context_) : update_time{std::chrono::system_clock::from_time_t(0)} , dict_struct{dict_struct_} - , secure(config.getBool(config_prefix + ".secure", false)) - , host{config.getString(config_prefix + ".host", "localhost")} - , port(config.getInt(config_prefix + ".port", getPortFromContext(context_, secure))) - , user{config.getString(config_prefix + ".user", "default")} - , password{config.getString(config_prefix + ".password", "")} - , db{config.getString(config_prefix + ".db", default_database)} - , table{config.getString(config_prefix + ".table")} - , where{config.getString(config_prefix + ".where", "")} - , update_field{config.getString(config_prefix + ".update_field", "")} - , invalidate_query{config.getString(config_prefix + ".invalidate_query", "")} - , query_builder{dict_struct, db, "", table, where, IdentifierQuotingStyle::Backticks} + , configuration{configuration_} + , query_builder{dict_struct, configuration.db, "", configuration.table, configuration.where, IdentifierQuotingStyle::Backticks} , sample_block{sample_block_} - , context(context_) - , is_local{isLocalAddress({host, port}, getPortFromContext(context_, secure))} - , pool{is_local ? nullptr : createPool(host, port, secure, db, user, password)} + , context(Context::createCopy(context_)) + , pool{createPool(configuration)} , load_all_query{query_builder.composeLoadAllQuery()} { - /// We should set user info even for the case when the dictionary is loaded in-process (without TCP communication). - if (is_local) - { - context.setUser(user, password, Poco::Net::SocketAddress("127.0.0.1", 0)); - context = copyContextAndApplySettings(path_to_settings, context, config); - } - /// Query context is needed because some code in executeQuery function may assume it exists. /// Current example is Context::getSampleBlockCache from InterpreterSelectWithUnionQuery::getSampleBlock. 
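The refactored `ClickHouseDictionarySource` keeps all connection settings in a single `Configuration` struct and builds the failover pool from it in one place, returning no pool at all when the source is local, since local queries are executed in-process. A reduced sketch of that shape, using stand-in types instead of ClickHouse's `ConnectionPool`/`ConnectionPoolWithFailover`, might look like this:

```cpp
#include <cstdint>
#include <memory>
#include <string>

/// Stand-ins for ClickHouse's connection pool types, for illustration only.
struct ConnectionPoolSketch
{
    std::string host;
    uint16_t port;
};
using ConnectionPoolSketchPtr = std::shared_ptr<ConnectionPoolSketch>;

/// Reduced version of the Configuration idea: every connection setting lives in one struct.
struct SourceConfigurationSketch
{
    const std::string host;
    const uint16_t port;
    const bool is_local;
};

/// Mirrors the shape of createPool above: local sources run queries in-process and get no
/// pool at all; remote sources get a pool built from the configuration in a single place.
ConnectionPoolSketchPtr createPoolSketch(const SourceConfigurationSketch & configuration)
{
    if (configuration.is_local)
        return nullptr;

    return std::make_shared<ConnectionPoolSketch>(ConnectionPoolSketch{configuration.host, configuration.port});
}
```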
- context.makeQueryContext(); + context->makeQueryContext(); } - ClickHouseDictionarySource::ClickHouseDictionarySource(const ClickHouseDictionarySource & other) : update_time{other.update_time} , dict_struct{other.dict_struct} - , secure{other.secure} - , host{other.host} - , port{other.port} - , user{other.user} - , password{other.password} - , db{other.db} - , table{other.table} - , where{other.where} - , update_field{other.update_field} - , invalidate_query{other.invalidate_query} + , configuration{other.configuration} , invalidate_query_response{other.invalidate_query_response} - , query_builder{dict_struct, db, "", table, where, IdentifierQuotingStyle::Backticks} + , query_builder{dict_struct, configuration.db, "", configuration.table, configuration.where, IdentifierQuotingStyle::Backticks} , sample_block{other.sample_block} - , context(other.context) - , is_local{other.is_local} - , pool{is_local ? nullptr : createPool(host, port, secure, db, user, password)} + , context(Context::createCopy(other.context)) + , pool{createPool(configuration)} , load_all_query{other.load_all_query} { - context.makeQueryContext(); + context->makeQueryContext(); } std::string ClickHouseDictionarySource::getUpdateFieldAndDate() { if (update_time != std::chrono::system_clock::from_time_t(0)) { - auto tmp_time = update_time; + time_t hr_time = std::chrono::system_clock::to_time_t(update_time) - 1; + std::string str_time = DateLUT::instance().timeToString(hr_time); update_time = std::chrono::system_clock::now(); - time_t hr_time = std::chrono::system_clock::to_time_t(tmp_time) - 1; - std::string str_time = std::to_string(LocalDateTime(hr_time)); - return query_builder.composeUpdateQuery(update_field, str_time); + return query_builder.composeUpdateQuery(configuration.update_field, str_time); } else { @@ -133,48 +107,32 @@ std::string ClickHouseDictionarySource::getUpdateFieldAndDate() BlockInputStreamPtr ClickHouseDictionarySource::loadAll() { - /** Query to local ClickHouse is marked internal in order to avoid - * the necessity of holding process_list_element shared pointer. 
- */ - if (is_local) - { - auto stream = executeQuery(load_all_query, context, true).getInputStream(); - /// FIXME res.in may implicitly use some objects owned be res, but them will be destructed after return - stream = std::make_shared(stream, sample_block, ConvertingBlockInputStream::MatchColumnsMode::Position); - return stream; - } - return std::make_shared(pool, load_all_query, sample_block, context); + return createStreamForQuery(load_all_query); } BlockInputStreamPtr ClickHouseDictionarySource::loadUpdatedAll() { - std::string load_update_query = getUpdateFieldAndDate(); - if (is_local) - { - auto stream = executeQuery(load_update_query, context, true).getInputStream(); - stream = std::make_shared(stream, sample_block, ConvertingBlockInputStream::MatchColumnsMode::Position); - return stream; - } - return std::make_shared(pool, load_update_query, sample_block, context); + String load_update_query = getUpdateFieldAndDate(); + return createStreamForQuery(load_update_query); } BlockInputStreamPtr ClickHouseDictionarySource::loadIds(const std::vector & ids) { - return createStreamForSelectiveLoad(query_builder.composeLoadIdsQuery(ids)); + return createStreamForQuery(query_builder.composeLoadIdsQuery(ids)); } BlockInputStreamPtr ClickHouseDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { - return createStreamForSelectiveLoad( - query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::IN_WITH_TUPLES)); + String query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::IN_WITH_TUPLES); + return createStreamForQuery(query); } bool ClickHouseDictionarySource::isModified() const { - if (!invalidate_query.empty()) + if (!configuration.invalidate_query.empty()) { - auto response = doInvalidateQuery(invalidate_query); + auto response = doInvalidateQuery(configuration.invalidate_query); LOG_TRACE(log, "Invalidate query has returned: {}, previous value: {}", response, invalidate_query_response); if (invalidate_query_response == response) return false; @@ -185,34 +143,36 @@ bool ClickHouseDictionarySource::isModified() const bool ClickHouseDictionarySource::hasUpdateField() const { - return !update_field.empty(); + return !configuration.update_field.empty(); } std::string ClickHouseDictionarySource::toString() const { - return "ClickHouse: " + db + '.' + table + (where.empty() ? "" : ", where: " + where); + const std::string & where = configuration.where; + return "ClickHouse: " + configuration.db + '.' + configuration.table + (where.empty() ? 
"" : ", where: " + where); } - -BlockInputStreamPtr ClickHouseDictionarySource::createStreamForSelectiveLoad(const std::string & query) +BlockInputStreamPtr ClickHouseDictionarySource::createStreamForQuery(const String & query) { - if (is_local) + /// Sample block should not contain first row default values + auto empty_sample_block = sample_block.cloneEmpty(); + + if (configuration.is_local) { - auto res = executeQuery(query, context, true).getInputStream(); - res = std::make_shared( - res, sample_block, ConvertingBlockInputStream::MatchColumnsMode::Position); - return res; + auto stream = executeQuery(query, context, true).getInputStream(); + stream = std::make_shared(stream, empty_sample_block, ConvertingBlockInputStream::MatchColumnsMode::Position); + return stream; } - return std::make_shared(pool, query, sample_block, context); + return std::make_shared(pool, query, empty_sample_block, context); } std::string ClickHouseDictionarySource::doInvalidateQuery(const std::string & request) const { LOG_TRACE(log, "Performing invalidate query"); - if (is_local) + if (configuration.is_local) { - Context query_context = context; + auto query_context = Context::createCopy(context); auto input_block = executeQuery(request, query_context, true).getInputStream(); return readInvalidateQuery(*input_block); } @@ -225,20 +185,55 @@ std::string ClickHouseDictionarySource::doInvalidateQuery(const std::string & re } } - void registerDictionarySourceClickHouse(DictionarySourceFactory & factory) { auto create_table_source = [=](const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, - const std::string & default_database, + ContextPtr context, + const std::string & default_database [[maybe_unused]], bool /* check_config */) -> DictionarySourcePtr { - return std::make_unique( - dict_struct, config, config_prefix, config_prefix + ".clickhouse", sample_block, context, default_database); + bool secure = config.getBool(config_prefix + ".secure", false); + auto context_copy = Context::createCopy(context); + + UInt16 default_port = getPortFromContext(context_copy, secure); + + std::string settings_config_prefix = config_prefix + ".clickhouse"; + std::string host = config.getString(settings_config_prefix + ".host", "localhost"); + UInt16 port = static_cast(config.getUInt(settings_config_prefix + ".port", default_port)); + + ClickHouseDictionarySource::Configuration configuration { + .secure = config.getBool(settings_config_prefix + ".secure", false), + .host = host, + .port = port, + .user = config.getString(settings_config_prefix + ".user", "default"), + .password = config.getString(settings_config_prefix + ".password", ""), + .db = config.getString(settings_config_prefix + ".db", default_database), + .table = config.getString(settings_config_prefix + ".table"), + .where = config.getString(settings_config_prefix + ".where", ""), + .update_field = config.getString(settings_config_prefix + ".update_field", ""), + .invalidate_query = config.getString(settings_config_prefix + ".invalidate_query", ""), + .is_local = isLocalAddress({host, port}, default_port) + }; + + /// We should set user info even for the case when the dictionary is loaded in-process (without TCP communication). 
+ if (configuration.is_local) + { + context_copy->setUser(configuration.user, configuration.password, Poco::Net::SocketAddress("127.0.0.1", 0)); + context_copy = copyContextAndApplySettings(config_prefix, context_copy, config); + } + + String dictionary_name = config.getString(".dictionary.name", ""); + String dictionary_database = config.getString(".dictionary.database", ""); + + if (dictionary_name == configuration.table && dictionary_database == configuration.db) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouseDictionarySource table cannot be dictionary table"); + + return std::make_unique(dict_struct, configuration, sample_block, context_copy); }; + factory.registerSource("clickhouse", create_table_source); } diff --git a/src/Dictionaries/ClickHouseDictionarySource.h b/src/Dictionaries/ClickHouseDictionarySource.h index 9ef77d061fd..21c290ab23b 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.h +++ b/src/Dictionaries/ClickHouseDictionarySource.h @@ -18,14 +18,26 @@ namespace DB class ClickHouseDictionarySource final : public IDictionarySource { public: + struct Configuration + { + const bool secure; + const std::string host; + const UInt16 port; + const std::string user; + const std::string password; + const std::string db; + const std::string table; + const std::string where; + const std::string update_field; + const std::string invalidate_query; + const bool is_local; + }; + ClickHouseDictionarySource( const DictionaryStructure & dict_struct_, - const Poco::Util::AbstractConfiguration & config, - const std::string & path_to_settings, - const std::string & config_prefix, + const Configuration & configuration_, const Block & sample_block_, - const Context & context, - const std::string & default_database); + ContextPtr context); /// copy-constructor is provided in order to support cloneability ClickHouseDictionarySource(const ClickHouseDictionarySource & other); @@ -50,32 +62,22 @@ public: /// Used for detection whether the hashtable should be preallocated /// (since if there is WHERE then it can filter out too much) - bool hasWhere() const { return !where.empty(); } + bool hasWhere() const { return !configuration.where.empty(); } private: std::string getUpdateFieldAndDate(); - BlockInputStreamPtr createStreamForSelectiveLoad(const std::string & query); + BlockInputStreamPtr createStreamForQuery(const String & query); std::string doInvalidateQuery(const std::string & request) const; std::chrono::time_point update_time; const DictionaryStructure dict_struct; - const bool secure; - const std::string host; - const UInt16 port; - const std::string user; - const std::string password; - const std::string db; - const std::string table; - const std::string where; - const std::string update_field; - std::string invalidate_query; + const Configuration configuration; mutable std::string invalidate_query_response; ExternalQueryBuilder query_builder; Block sample_block; - Context context; - const bool is_local; + ContextPtr context; ConnectionPoolWithFailoverPtr pool; const std::string load_all_query; Poco::Logger * log = &Poco::Logger::get("ClickHouseDictionarySource"); diff --git a/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/src/Dictionaries/ComplexKeyCacheDictionary.cpp deleted file mode 100644 index cbb57f81793..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ /dev/null @@ -1,915 +0,0 @@ -#include "ComplexKeyCacheDictionary.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include 
"DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include - -namespace ProfileEvents -{ -extern const Event DictCacheKeysRequested; -extern const Event DictCacheKeysRequestedMiss; -extern const Event DictCacheKeysRequestedFound; -extern const Event DictCacheKeysExpired; -extern const Event DictCacheKeysNotFound; -extern const Event DictCacheKeysHit; -extern const Event DictCacheRequestTimeNs; -extern const Event DictCacheLockWriteNs; -extern const Event DictCacheLockReadNs; -} - -namespace CurrentMetrics -{ -extern const Metric DictCacheRequests; -} - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int UNSUPPORTED_METHOD; - extern const int TOO_SMALL_BUFFER_SIZE; -} - - -inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const -{ - const auto hash = StringRefHash{}(key); - const auto idx = hash & size_overlap_mask; - return idx; -} - - -ComplexKeyCacheDictionary::ComplexKeyCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - const size_t size_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} - , dict_lifetime(dict_lifetime_) - , size{roundUpToPowerOfTwoOrZero(std::max(size_, size_t(max_collision_length)))} - , size_overlap_mask{this->size - 1} - , rnd_engine(randomSeed()) -{ - if (!this->source_ptr->supportsSelectiveLoad()) - throw Exception{full_name + ": source cannot be used with ComplexKeyCacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - createAttributes(); -} - -ColumnPtr ComplexKeyCacheDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const -{ - dict_struct.validateKeyTypes(key_types); - - ColumnPtr result; - - auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto & null_value = std::get(attribute.null_values); - DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - getItemsString(attribute, key_columns, out, default_value_extractor); - } - else - { - auto & out = column->getData(); - getItemsNumberImpl(attribute, key_columns, out, default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -/// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag, -/// true false found and valid -/// false true not found (something outdated, maybe our cell) -/// false false not found (other id stored with valid data) -/// true true impossible -/// -/// todo: split this func to two: find_for_get and find_for_set -ComplexKeyCacheDictionary::FindResult -ComplexKeyCacheDictionary::findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const -{ - auto pos = 
hash; - auto oldest_id = pos; - auto oldest_time = CellMetadata::time_point_t::max(); - const auto stop = pos + max_collision_length; - - for (; pos < stop; ++pos) - { - const auto cell_idx = pos & size_overlap_mask; - const auto & cell = cells[cell_idx]; - - if (cell.hash != hash || cell.key != key) - { - /// maybe we already found nearest expired cell - if (oldest_time > now && oldest_time > cell.expiresAt()) - { - oldest_time = cell.expiresAt(); - oldest_id = cell_idx; - } - - continue; - } - - if (cell.expiresAt() < now) - { - return {cell_idx, false, true}; - } - - return {cell_idx, true, false}; - } - - oldest_id &= size_overlap_mask; - return {oldest_id, false, false}; -} - -ColumnUInt8::Ptr ComplexKeyCacheDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - const auto rows_num = key_columns.front()->size(); - - auto result = ColumnUInt8::create(rows_num); - auto& out = result->getData(); - - for (const auto row : ext::range(0, rows_num)) - out[row] = false; - - /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } - MapType> outdated_keys; - - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - PODArray keys_array(rows_num); - - size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - keys_array[row] = key; - const auto find_result = findCellIdx(key, now); - const auto & cell_idx = find_result.cell_idx; - /** cell should be updated if either: - * 1. keys (or hash) do not match, - * 2. cell has expired, - * 3. explicit defaults were specified and cell was set default. 
*/ - if (!find_result.valid) - { - outdated_keys[key].push_back(row); - if (find_result.outdated) - ++cache_expired; - else - ++cache_not_found; - } - else - { - ++cache_hit; - const auto & cell = cells[cell_idx]; - out[row] = !cell.isDefault(); - } - } - } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); - - if (outdated_keys.empty()) - return result; - - std::vector required_rows(outdated_keys.size()); - std::transform( - std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); }); - - /// request new values - update( - key_columns, - keys_array, - required_rows, - [&](const StringRef key, const auto) - { - for (const auto out_idx : outdated_keys[key]) - out[out_idx] = true; - }, - [&](const StringRef key, const auto) - { - for (const auto out_idx : outdated_keys[key]) - out[out_idx] = false; - }); - - return result; -} - - -template -void ComplexKeyCacheDictionary::getItemsNumberImpl( - Attribute & attribute, - const Columns & key_columns, - PaddedPODArray & out, - DefaultValueExtractor & default_value_extractor) const -{ - /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } - MapType> outdated_keys; - auto & attribute_array = std::get>(attribute.arrays); - - const auto rows_num = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - PODArray keys_array(rows_num); - - size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - keys_array[row] = key; - const auto find_result = findCellIdx(key, now); - - /** cell should be updated if either: - * 1. keys (or hash) do not match, - * 2. cell has expired, - * 3. explicit defaults were specified and cell was set default. */ - - if (!find_result.valid) - { - outdated_keys[key].push_back(row); - if (find_result.outdated) - ++cache_expired; - else - ++cache_not_found; - } - else - { - ++cache_hit; - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - out[row] = cell.isDefault() ? 
default_value_extractor[row] : static_cast(attribute_array[cell_idx]); - } - } - } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); - - if (outdated_keys.empty()) - return; - - std::vector required_rows(outdated_keys.size()); - std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) - { - return pair.getMapped().front(); - }); - - /// request new values - update( - key_columns, - keys_array, - required_rows, - [&](const StringRef key, const size_t cell_idx) - { - for (const auto row : outdated_keys[key]) - out[row] = static_cast(attribute_array[cell_idx]); - }, - [&](const StringRef key, const size_t) - { - for (const auto row : outdated_keys[key]) - out[row] = default_value_extractor[row]; - }); -} - -void ComplexKeyCacheDictionary::getItemsString( - Attribute & attribute, - const Columns & key_columns, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const -{ - const auto rows_num = key_columns.front()->size(); - /// save on some allocations - out->getOffsets().reserve(rows_num); - - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - - auto & attribute_array = std::get>(attribute.arrays); - - auto found_outdated_values = false; - - /// perform optimistic version, fallback to pessimistic if failed - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, discard on fail - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - SCOPE_EXIT(temporary_keys_pool.rollback(key.size)); - const auto find_result = findCellIdx(key, now); - - if (!find_result.valid) - { - found_outdated_values = true; - break; - } - else - { - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? 
default_value_extractor[row] : attribute_array[cell_idx]; - out->insertData(string_ref.data, string_ref.size); - } - } - } - - /// optimistic code completed successfully - if (!found_outdated_values) - { - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num, std::memory_order_release); - return; - } - - /// now onto the pessimistic one, discard possible partial results from the optimistic path - out->getChars().resize_assume_reserved(0); - out->getOffsets().resize_assume_reserved(0); - - /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } - MapType> outdated_keys; - /// we are going to store every string separately - MapType map; - PODArray keys_array(rows_num); - - size_t total_length = 0; - size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - keys_array[row] = key; - const auto find_result = findCellIdx(key, now); - - if (!find_result.valid) - { - outdated_keys[key].push_back(row); - if (find_result.outdated) - ++cache_expired; - else - ++cache_not_found; - } - else - { - ++cache_hit; - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; - - if (!cell.isDefault()) - map[key] = copyIntoArena(string_ref, temporary_keys_pool); - - total_length += string_ref.size + 1; - } - } - } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); - - /// request new values - if (!outdated_keys.empty()) - { - std::vector required_rows(outdated_keys.size()); - std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) - { - return pair.getMapped().front(); - }); - - update( - key_columns, - keys_array, - required_rows, - [&](const StringRef key, const size_t cell_idx) - { - const StringRef attribute_value = attribute_array[cell_idx]; - - /// We must copy key and value to own memory, because it may be replaced with another - /// in next iterations of inner loop of update. - const StringRef copied_key = copyIntoArena(key, temporary_keys_pool); - const StringRef copied_value = copyIntoArena(attribute_value, temporary_keys_pool); - - map[copied_key] = copied_value; - total_length += (attribute_value.size + 1) * outdated_keys[key].size(); - }, - [&](const StringRef key, const size_t) - { - for (const auto row : outdated_keys[key]) - total_length += default_value_extractor[row].size + 1; - }); - } - - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(keys_array))) - { - const StringRef key = keys_array[row]; - auto * const it = map.find(key); - const auto string_ref = it ? 
it->getMapped() : default_value_extractor[row]; - out->insertData(string_ref.data, string_ref.size); - } -} - -template -void ComplexKeyCacheDictionary::update( - const Columns & in_key_columns, - const PODArray & in_keys, - const std::vector & in_requested_rows, - PresentKeyHandler && on_cell_updated, - AbsentKeyHandler && on_key_not_found) const -{ - MapType remaining_keys{in_requested_rows.size()}; - for (const auto row : in_requested_rows) - remaining_keys.insert({in_keys[row], false}); - - std::uniform_int_distribution distribution(dict_lifetime.min_sec, dict_lifetime.max_sec); - - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - { - Stopwatch watch; - auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows); - stream->readPrefix(); - - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - const auto now = std::chrono::system_clock::now(); - - while (const auto block = stream->read()) - { - /// cache column pointers - const auto key_columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_columns = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - - const auto rows_num = block.rows(); - - for (const auto row : ext::range(0, rows_num)) - { - auto key = allocKey(row, key_columns, keys); - const auto hash = StringRefHash{}(key); - const auto find_result = findCellIdx(key, now, hash); - const auto & cell_idx = find_result.cell_idx; - auto & cell = cells[cell_idx]; - - for (const auto attribute_idx : ext::range(0, attributes.size())) - { - const auto & attribute_column = *attribute_columns[attribute_idx]; - auto & attribute = attributes[attribute_idx]; - - setAttributeValue(attribute, cell_idx, attribute_column[row]); - } - - /// if cell id is zero and zero does not map to this cell, then the cell is unused - if (cell.key == StringRef{} && cell_idx != zero_cell_idx) - element_count.fetch_add(1, std::memory_order_relaxed); - - /// handle memory allocated for old key - if (key == cell.key) - { - freeKey(key); - key = cell.key; - } - else - { - /// new key is different from the old one - if (cell.key.data) - freeKey(cell.key); - - cell.key = key; - } - - cell.hash = hash; - - if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) - cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); - else - cell.setExpiresAt(std::chrono::time_point::max()); - - /// inform caller - on_cell_updated(key, cell_idx); - /// mark corresponding id as found - remaining_keys[key] = true; - } - } - - stream->readSuffix(); - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, in_requested_rows.size()); - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); - } - - size_t found_num = 0; - size_t not_found_num = 0; - - const auto now = std::chrono::system_clock::now(); - - /// Check which ids have not been found and require setting null_value - for (const auto & key_found_pair : remaining_keys) - { - if (key_found_pair.getMapped()) - { - ++found_num; - continue; - } - - ++not_found_num; - - auto key = key_found_pair.getKey(); - const auto hash = StringRefHash{}(key); - const auto find_result = findCellIdx(key, now, hash); - const auto & cell_idx = find_result.cell_idx; - auto & cell = 
cells[cell_idx]; - - /// Set null_value for each attribute - for (auto & attribute : attributes) - setDefaultAttributeValue(attribute, cell_idx); - - /// Check if cell had not been occupied before and increment element counter if it hadn't - if (cell.key == StringRef{} && cell_idx != zero_cell_idx) - element_count.fetch_add(1, std::memory_order_relaxed); - - if (key == cell.key) - key = cell.key; - else - { - if (cell.key.data) - freeKey(cell.key); - - /// copy key from temporary pool - key = copyKey(key); - cell.key = key; - } - - cell.hash = hash; - - if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) - cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); - else - cell.setExpiresAt(std::chrono::time_point::max()); - - cell.setDefault(); - - /// inform caller that the cell has not been found - on_key_not_found(key, cell_idx); - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); -} - - -void ComplexKeyCacheDictionary::createAttributes() -{ - const auto attributes_size = dict_struct.attributes.size(); - attributes.reserve(attributes_size); - - bytes_allocated += size * sizeof(CellMetadata); - bytes_allocated += attributes_size * sizeof(attributes.front()); - - for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -ComplexKeyCacheDictionary::Attribute & ComplexKeyCacheDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -void ComplexKeyCacheDictionary::setDefaultAttributeValue(Attribute & attribute, const size_t idx) const -{ - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - const auto & null_value_ref = std::get(attribute.null_values); - auto & string_ref = std::get>(attribute.arrays)[idx]; - - if (string_ref.data != null_value_ref.data()) - { - if (string_ref.data) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - string_ref = StringRef{null_value_ref}; - } - } - else - { - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); -} - -ComplexKeyCacheDictionary::Attribute -ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) -{ - Attribute attr{type, {}, {}}; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - attr.null_values = null_value.get(); - attr.arrays = std::make_unique>(size); - bytes_allocated += size * sizeof(StringRef); - if (!string_arena) - string_arena = std::make_unique(); - } - else - { - attr.null_values = AttributeType(null_value.get>()); 
/* NOLINT */ - attr.arrays = std::make_unique>(size); /* NOLINT */ - bytes_allocated += size * sizeof(AttributeType); - } - }; - - callOnDictionaryAttributeType(type, type_call); - - return attr; -} - -void ComplexKeyCacheDictionary::setAttributeValue(Attribute & attribute, const size_t idx, const Field & value) const -{ - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - const auto & string = value.get(); - auto & string_ref = std::get>(attribute.arrays)[idx]; - const auto & null_value_ref = std::get(attribute.null_values); - - /// free memory unless it points to a null_value - if (string_ref.data && string_ref.data != null_value_ref.data()) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - const auto str_size = string.size(); - if (str_size != 0) - { - auto * str_ptr = string_arena->alloc(str_size); - std::copy(string.data(), string.data() + str_size, str_ptr); - string_ref = StringRef{str_ptr, str_size}; - } - else - string_ref = {}; - } - else - { - std::get>(attribute.arrays)[idx] = value.get>(); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); -} - -StringRef ComplexKeyCacheDictionary::allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) const -{ - if (key_size_is_fixed) - return placeKeysInFixedSizePool(row, key_columns); - - return placeKeysInPool(row, key_columns, keys, *dict_struct.key, *keys_pool); -} - -void ComplexKeyCacheDictionary::freeKey(const StringRef key) const -{ - if (key_size_is_fixed) - fixed_size_keys_pool->free(const_cast(key.data)); - else - keys_pool->free(const_cast(key.data), key.size); -} - -template -StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t row, const Columns & key_columns, StringRefs & keys, const std::vector & key_attributes, Pool & pool) -{ - const auto keys_size = key_columns.size(); - size_t sum_keys_size{}; - - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->getDataAt(row); - sum_keys_size += keys[j].size; - if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString) - sum_keys_size += sizeof(size_t) + 1; - } - - auto place = pool.alloc(sum_keys_size); - - auto key_start = place; - for (size_t j = 0; j < keys_size; ++j) - { - if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString) - { - auto start = key_start; - auto key_size = keys[j].size + 1; - memcpy(key_start, &key_size, sizeof(size_t)); - key_start += sizeof(size_t); - memcpy(key_start, keys[j].data, keys[j].size); - key_start += keys[j].size; - *key_start = '\0'; - ++key_start; - keys[j].data = start; - keys[j].size += sizeof(size_t) + 1; - } - else - { - memcpy(key_start, keys[j].data, keys[j].size); - keys[j].data = key_start; - key_start += keys[j].size; - } - } - - return {place, sum_keys_size}; -} - -/// Explicit instantiations. 
- -template StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t row, - const Columns & key_columns, - StringRefs & keys, - const std::vector & key_attributes, - Arena & pool); - -template StringRef ComplexKeyCacheDictionary::placeKeysInPool( - const size_t row, - const Columns & key_columns, - StringRefs & keys, - const std::vector & key_attributes, - ArenaWithFreeLists & pool); - - -StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool(const size_t row, const Columns & key_columns) const -{ - auto * res = fixed_size_keys_pool->alloc(); - auto * place = res; - - for (const auto & key_column : key_columns) - { - const StringRef key = key_column->getDataAt(row); - memcpy(place, key.data, key.size); - place += key.size; - } - - return {res, key_size}; -} - -StringRef ComplexKeyCacheDictionary::copyIntoArena(StringRef src, Arena & arena) -{ - char * allocated = arena.alloc(src.size); - memcpy(allocated, src.data, src.size); - return {allocated, src.size}; -} - -StringRef ComplexKeyCacheDictionary::copyKey(const StringRef key) const -{ - auto * res = key_size_is_fixed ? fixed_size_keys_pool->alloc() : keys_pool->alloc(key.size); - memcpy(res, key.data, key.size); - - return {res, key.size}; -} - -bool ComplexKeyCacheDictionary::isEmptyCell(const UInt64 idx) const -{ - return ( - cells[idx].key == StringRef{} - && (idx != zero_cell_idx || cells[idx].data == ext::safe_bit_cast(CellMetadata::time_point_t()))); -} - -BlockInputStreamPtr ComplexKeyCacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - std::vector keys; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - for (auto idx : ext::range(0, cells.size())) - if (!isEmptyCell(idx) && !cells[idx].isDefault()) - keys.push_back(cells[idx].key); - } - - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, keys, column_names); -} - -void registerDictionaryComplexKeyCache(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (!dict_struct.key) - throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS}; - const auto & layout_prefix = config_prefix + ".layout"; - const auto size = config.getInt(layout_prefix + ".complex_key_cache.size_in_cells"); - if (size == 0) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have 0 cells", ErrorCodes::TOO_SMALL_BUFFER_SIZE}; - - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - if (require_nonempty) - throw Exception{full_name + ": dictionary of layout 'cache' cannot have 'require_nonempty' attribute set", - ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, size); - }; - factory.registerLayout("complex_key_cache", create_layout, true); -} - - -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary.h b/src/Dictionaries/ComplexKeyCacheDictionary.h deleted file mode 100644 index f5643fc799c..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary.h +++ /dev/null @@ -1,276 
+0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include -#include "DictionaryHelpers.h" - -namespace ProfileEvents -{ -extern const Event DictCacheKeysRequested; -extern const Event DictCacheKeysRequestedMiss; -extern const Event DictCacheKeysRequestedFound; -extern const Event DictCacheKeysExpired; -extern const Event DictCacheKeysNotFound; -extern const Event DictCacheKeysHit; -extern const Event DictCacheRequestTimeNs; -extern const Event DictCacheLockWriteNs; -extern const Event DictCacheLockReadNs; -} - -namespace DB -{ -class ComplexKeyCacheDictionary final : public IDictionaryBase -{ -public: - ComplexKeyCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - const size_t size_); - - std::string getKeyDescription() const { return key_description; } - - std::string getTypeName() const override { return "ComplexKeyCache"; } - - size_t getBytesAllocated() const override - { - return bytes_allocated + (key_size_is_fixed ? fixed_size_keys_pool->size() : keys_pool->size()) - + (string_arena ? string_arena->size() : 0); - } - - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - - double getHitRate() const override - { - return static_cast(hit_count.load(std::memory_order_acquire)) / query_count.load(std::memory_order_relaxed); - } - - size_t getElementCount() const override { return element_count.load(std::memory_order_relaxed); } - - double getLoadFactor() const override { return static_cast(element_count.load(std::memory_order_relaxed)) / size; } - - bool supportUpdates() const override { return false; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, size); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; - } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - template - using MapType = HashMapWithSavedHash; - template - using ContainerType = Value[]; - template - using ContainerPtrType = std::unique_ptr>; - - struct CellMetadata final - { - using time_point_t = std::chrono::system_clock::time_point; - using time_point_rep_t = time_point_t::rep; - using time_point_urep_t = std::make_unsigned_t; - - static constexpr UInt64 EXPIRES_AT_MASK = std::numeric_limits::max(); - static constexpr UInt64 IS_DEFAULT_MASK = ~EXPIRES_AT_MASK; - - 
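// The EXPIRES_AT_MASK / IS_DEFAULT_MASK pair above packs an expiration timestamp and an
// `is_default` flag into a single unsigned integer: the most significant bit holds the
// flag and the remaining bits hold the time. A hedged sketch of that bit layout, using a
// plain integer instead of std::chrono; the struct name and members are illustrative only.
#include <cstdint>

struct PackedCell
{
    static constexpr std::uint64_t EXPIRES_AT_MASK = std::uint64_t(-1) >> 1; // low 63 bits
    static constexpr std::uint64_t IS_DEFAULT_MASK = ~EXPIRES_AT_MASK;       // top bit only

    std::uint64_t data = 0;

    std::uint64_t expiresAt() const { return data & EXPIRES_AT_MASK; }
    void setExpiresAt(std::uint64_t t) { data = t & EXPIRES_AT_MASK; } // clears the flag too
    bool isDefault() const { return (data & IS_DEFAULT_MASK) != 0; }
    void setDefault() { data |= IS_DEFAULT_MASK; }
};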
StringRef key; - decltype(StringRefHash{}(key)) hash; - /// Stores both expiration time and `is_default` flag in the most significant bit - time_point_urep_t data; - - /// Sets expiration time, resets `is_default` flag to false - time_point_t expiresAt() const { return ext::safe_bit_cast(data & EXPIRES_AT_MASK); } - void setExpiresAt(const time_point_t & t) { data = ext::safe_bit_cast(t); } - - bool isDefault() const { return (data & IS_DEFAULT_MASK) == IS_DEFAULT_MASK; } - void setDefault() { data |= IS_DEFAULT_MASK; } - }; - - struct Attribute final - { - AttributeUnderlyingType type; - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - String> - null_values; - std::variant< - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType, - ContainerPtrType> - arrays; - }; - - void createAttributes(); - - Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); - - template - void getItemsNumberImpl( - Attribute & attribute, - const Columns & key_columns, - PaddedPODArray & out, - DefaultValueExtractor & default_value_extractor) const; - - void getItemsString( - Attribute & attribute, - const Columns & key_columns, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const; - - template - void update( - const Columns & in_key_columns, - const PODArray & in_keys, - const std::vector & in_requested_rows, - PresentKeyHandler && on_cell_updated, - AbsentKeyHandler && on_key_not_found) const; - - UInt64 getCellIdx(const StringRef key) const; - - void setDefaultAttributeValue(Attribute & attribute, const size_t idx) const; - - void setAttributeValue(Attribute & attribute, const size_t idx, const Field & value) const; - - Attribute & getAttribute(const std::string & attribute_name) const; - - StringRef allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) const; - - void freeKey(const StringRef key) const; - - template - static StringRef placeKeysInPool( - const size_t row, - const Columns & key_columns, - StringRefs & keys, - const std::vector & key_attributes, - Arena & pool); - - StringRef placeKeysInFixedSizePool(const size_t row, const Columns & key_columns) const; - - static StringRef copyIntoArena(StringRef src, Arena & arena); - StringRef copyKey(const StringRef key) const; - - struct FindResult - { - const size_t cell_idx; - const bool valid; - const bool outdated; - }; - - FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const; - FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now) const - { - const auto hash = StringRefHash{}(key); - return findCellIdx(key, now, hash); - } - - bool isEmptyCell(const UInt64 idx) const; - - const DictionaryStructure dict_struct; - const DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - const std::string key_description{dict_struct.getKeyDescription()}; - - mutable std::shared_mutex rw_lock; - - /// Actual size will be increased to match power of 2 - const size_t size; - - /// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111) - const size_t size_overlap_mask; - - /// Max tries to find cell, overlapped with mask: if size = 16 and 
start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3 - static constexpr size_t max_collision_length = 10; - - const UInt64 zero_cell_idx{getCellIdx(StringRef{})}; - std::map attribute_index_by_name; - mutable std::vector attributes; - mutable std::vector cells{size}; - const bool key_size_is_fixed{dict_struct.isKeySizeFixed()}; - size_t key_size{key_size_is_fixed ? dict_struct.getKeySize() : 0}; - std::unique_ptr keys_pool = key_size_is_fixed ? nullptr : std::make_unique(); - std::unique_ptr fixed_size_keys_pool = key_size_is_fixed ? std::make_unique(key_size) : nullptr; - std::unique_ptr string_arena; - - mutable pcg64 rnd_engine; - - mutable size_t bytes_allocated = 0; - mutable std::atomic element_count{0}; - mutable std::atomic hit_count{0}; - mutable std::atomic query_count{0}; - - const std::chrono::time_point creation_time = std::chrono::system_clock::now(); -}; - -} diff --git a/src/Dictionaries/ComplexKeyDirectDictionary.cpp b/src/Dictionaries/ComplexKeyDirectDictionary.cpp deleted file mode 100644 index 391b5c47980..00000000000 --- a/src/Dictionaries/ComplexKeyDirectDictionary.cpp +++ /dev/null @@ -1,403 +0,0 @@ -#include "ComplexKeyDirectDictionary.h" -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int UNSUPPORTED_METHOD; -} - - -ComplexKeyDirectDictionary::ComplexKeyDirectDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} - , saved_block{std::move(saved_block_)} -{ - if (!this->source_ptr->supportsSelectiveLoad()) - throw Exception{full_name + ": source cannot be used with ComplexKeyDirectDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - - createAttributes(); -} - -ColumnPtr ComplexKeyDirectDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const -{ - dict_struct.validateKeyTypes(key_types); - - ColumnPtr result; - - const auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to = nullptr; - if (attribute.is_nullable) - { - col_null_map_to = ColumnUInt8::create(keys_size, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto attribute_null_value = std::get(attribute.null_values); - AttributeType null_value = static_cast(attribute_null_value); - DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const String value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = 
is_null; - - const auto ref = StringRef{value}; - out->insertData(ref.data, ref.size); - }, - default_value_extractor); - } - else - { - auto & out = column->getData(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const auto value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out[row] = value; - }, - default_value_extractor); - } - - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - if (attribute.is_nullable) - { - result = ColumnNullable::create(result, std::move(col_null_map_to)); - } - - return result; -} - -ColumnUInt8::Ptr ComplexKeyDirectDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - auto size = key_columns.front()->size(); - auto result = ColumnUInt8::create(size); - auto& out = result->getData(); - - const auto rows = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); - StringRefs keys_array(keys_size); - MapType has_key; - Arena temporary_keys_pool; - std::vector to_load(rows); - PODArray keys(rows); - - for (const auto row : ext::range(0, rows)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); - keys[row] = key; - has_key[key] = 0; - to_load[row] = row; - } - - auto stream = source_ptr->loadKeys(key_columns, to_load); - - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - Arena pool; - - StringRefs keys_temp(keys_size); - - const auto columns_size = columns.front()->size(); - - for (const auto row_idx : ext::range(0, columns_size)) - { - const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); - if (has_key.has(key)) - { - has_key[key] = 1; - } - } - } - - stream->readSuffix(); - - for (const auto row : ext::range(0, rows)) - { - out[row] = has_key[keys[row]]; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); - - return result; -} - -void ComplexKeyDirectDictionary::createAttributes() -{ - const auto size = dict_struct.attributes.size(); - attributes.reserve(size); - - for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attribute_name_by_index.emplace(attributes.size(), attribute.name); - attributes.push_back(createAttribute(attribute, attribute.null_value, attribute.name)); - - if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -template -void ComplexKeyDirectDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = T(null_value.get>()); -} - -template <> -void ComplexKeyDirectDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); -} - - -ComplexKeyDirectDictionary::Attribute ComplexKeyDirectDictionary::createAttribute( - const DictionaryAttribute & attribute, const Field & null_value, const std::string & attr_name) -{ - Attribute 
attr{attribute.underlying_type, attribute.is_nullable, {}, {}, attr_name}; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - -template -StringRef ComplexKeyDirectDictionary::placeKeysInPool( - const size_t row, const Columns & key_columns, StringRefs & keys, const std::vector & key_attributes, Pool & pool) const -{ - const auto keys_size = key_columns.size(); - size_t sum_keys_size{}; - - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->getDataAt(row); - sum_keys_size += keys[j].size; - if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString) - sum_keys_size += sizeof(size_t) + 1; - } - - auto place = pool.alloc(sum_keys_size); - - auto key_start = place; - for (size_t j = 0; j < keys_size; ++j) - { - if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString) - { - auto start = key_start; - auto key_size = keys[j].size + 1; - memcpy(key_start, &key_size, sizeof(size_t)); - key_start += sizeof(size_t); - memcpy(key_start, keys[j].data, keys[j].size); - key_start += keys[j].size; - *key_start = '\0'; - ++key_start; - keys[j].data = start; - keys[j].size += sizeof(size_t) + 1; - } - else - { - memcpy(key_start, keys[j].data, keys[j].size); - keys[j].data = key_start; - key_start += keys[j].size; - } - } - - return {place, sum_keys_size}; -} - - -template -void ComplexKeyDirectDictionary::getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto rows = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); - StringRefs keys_array(keys_size); - MapType value_by_key; - HashMapWithSavedHash value_is_null; - Arena temporary_keys_pool; - std::vector to_load(rows); - PODArray keys(rows); - - for (const auto row : ext::range(0, rows)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); - keys[row] = key; - value_by_key[key] = static_cast(default_value_extractor[row]); - to_load[row] = row; - value_is_null[key] = false; - } - - auto stream = source_ptr->loadKeys(key_columns, to_load); - const auto attributes_size = attributes.size(); - - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_columns = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - if (attribute.name != attribute_name_by_index.at(attribute_idx)) - { - continue; - } - - const IColumn & attribute_column = *attribute_columns[attribute_idx]; - Arena pool; - - StringRefs keys_temp(keys_size); - - const auto columns_size = columns.front()->size(); - - for (const auto row_idx : ext::range(0, columns_size)) - { - const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); - - if (value_by_key.has(key)) - { - auto value = attribute_column[row_idx]; - - if (value.isNull()) - value_is_null[key] = true; - else - value_by_key[key] = 
static_cast(value.template get>()); - } - } - } - } - - stream->readSuffix(); - - for (const auto row : ext::range(0, rows)) - { - auto key = keys[row]; - set_value(row, value_by_key[key], value_is_null[key]); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -const ComplexKeyDirectDictionary::Attribute & ComplexKeyDirectDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -BlockInputStreamPtr ComplexKeyDirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const -{ - return source_ptr->loadAll(); -} - - -void registerDictionaryComplexKeyDirect(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (!dict_struct.key) - throw Exception{"'key' is required for dictionary of layout 'complex_key_direct'", ErrorCodes::BAD_ARGUMENTS}; - - if (dict_struct.range_min || dict_struct.range_max) - throw Exception{full_name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - - if (config.has(config_prefix + ".lifetime.min") || config.has(config_prefix + ".lifetime.max")) - throw Exception{"'lifetime' parameter is redundant for the dictionary' of layout 'direct'", ErrorCodes::BAD_ARGUMENTS}; - - - return std::make_unique(dict_id, dict_struct, std::move(source_ptr)); - }; - factory.registerLayout("complex_key_direct", create_layout, true); -} - - -} diff --git a/src/Dictionaries/ComplexKeyDirectDictionary.h b/src/Dictionaries/ComplexKeyDirectDictionary.h deleted file mode 100644 index 0e191321daa..00000000000 --- a/src/Dictionaries/ComplexKeyDirectDictionary.h +++ /dev/null @@ -1,147 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryStructure.h" -#include "DictionaryHelpers.h" - -namespace DB -{ - -class ComplexKeyDirectDictionary final : public IDictionaryBase -{ -public: - ComplexKeyDirectDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_ = nullptr); - - std::string getTypeName() const override { return "ComplexKeyDirect"; } - - size_t getBytesAllocated() const override { return 0; } - - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - - double getHitRate() const override { return 1.0; } - - size_t getElementCount() const override { return 0; } - - double getLoadFactor() const override { return 0; } - - std::string getKeyDescription() const { return key_description; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { 
return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; - } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - template - using MapType = HashMapWithSavedHash; - - struct Attribute final - { - AttributeUnderlyingType type; - bool is_nullable; - - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - StringRef> - null_values; - std::unique_ptr string_arena; - std::string name; - }; - - void createAttributes(); - - template - void addAttributeSize(const Attribute & attribute); - - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value, const std::string & name); - - template - StringRef placeKeysInPool( - const size_t row, const Columns & key_columns, StringRefs & keys, const std::vector & key_attributes, Pool & pool) const; - - template - void getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template - void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); - - void setAttributeValue(Attribute & attribute, const Key id, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - const DictionaryStructure dict_struct; - const DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - - std::map attribute_index_by_name; - std::map attribute_name_by_index; - std::vector attributes; - - mutable std::atomic query_count{0}; - - BlockPtr saved_block; - const std::string key_description{dict_struct.getKeyDescription()}; -}; - -} diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/src/Dictionaries/ComplexKeyHashedDictionary.cpp deleted file mode 100644 index a0784b5a417..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ /dev/null @@ -1,588 +0,0 @@ -#include "ComplexKeyHashedDictionary.h" -#include -#include -#include -#include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" - -namespace DB -{ -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; - extern const int DICTIONARY_IS_EMPTY; -} - -ComplexKeyHashedDictionary::ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} - , dict_lifetime(dict_lifetime_) - , require_nonempty(require_nonempty_) - , 
saved_block{std::move(saved_block_)} -{ - createAttributes(); - loadData(); - calculateBytesAllocated(); -} - -ColumnPtr ComplexKeyHashedDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const -{ - dict_struct.validateKeyTypes(key_types); - - ColumnPtr result; - - const auto & attribute = getAttribute(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to = nullptr; - if (attribute.is_nullable) - { - col_null_map_to = ColumnUInt8::create(keys_size, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto attribute_null_value = std::get(attribute.null_values); - AttributeType null_value = static_cast(attribute_null_value); - DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const StringRef value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out->insertData(value.data, value.size); - }, - default_value_extractor); - } - else - { - auto & out = column->getData(); - - getItemsImpl( - attribute, - key_columns, - [&](const size_t row, const auto value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out[row] = value; - }, - default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - if (attribute.is_nullable) - { - result = ColumnNullable::create(result, std::move(col_null_map_to)); - } - - return result; -} - -ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - auto size = key_columns.front()->size(); - auto result = ColumnUInt8::create(size); - auto& out = result->getData(); - - const auto & attribute = attributes.front(); - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - has(attribute, key_columns, out); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -void ComplexKeyHashedDictionary::createAttributes() -{ - const auto size = dict_struct.attributes.size(); - attributes.reserve(size); - - for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -void ComplexKeyHashedDictionary::blockToAttributes(const Block & block) -{ - /// created upfront to avoid 
excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - const auto rows = block.rows(); - element_count += rows; - - const auto key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_column_ptrs = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - - for (const auto row_idx : ext::range(0, rows)) - { - /// calculate key once per row - const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool); - - auto should_rollback = false; - - for (const auto attribute_idx : ext::range(0, attributes_size)) - { - const auto & attribute_column = *attribute_column_ptrs[attribute_idx]; - auto & attribute = attributes[attribute_idx]; - const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]); - if (!inserted) - should_rollback = true; - } - - /// @note on multiple equal keys the mapped value for the first one is stored - if (should_rollback) - keys_pool.rollback(key.size); - } -} - -void ComplexKeyHashedDictionary::updateData() -{ - /// created upfront to avoid excess allocations - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - - if (!saved_block || saved_block->rows() == 0) - { - auto stream = source_ptr->loadUpdatedAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - { - /// We are using this method to keep saved data if input stream consists of multiple blocks - if (!saved_block) - saved_block = std::make_shared(block.cloneEmpty()); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); - MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable(); - saved_column->insertRangeFrom(update_column, 0, update_column.size()); - } - } - stream->readSuffix(); - } - else - { - auto stream = source_ptr->loadUpdatedAll(); - - stream->readPrefix(); - while (Block block = stream->read()) - { - const auto saved_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return saved_block->safeGetByPosition(key_idx).column; }); - - const auto update_key_column_ptrs = ext::map( - ext::range(0, keys_size), [&](const size_t key_idx) { return block.safeGetByPosition(key_idx).column; }); - - Arena temp_key_pool; - ContainerType> update_key_hash; - - for (size_t i = 0; i < block.rows(); ++i) - { - const auto u_key = placeKeysInPool(i, update_key_column_ptrs, keys, temp_key_pool); - update_key_hash[u_key].push_back(i); - } - - const size_t rows = saved_block->rows(); - IColumn::Filter filter(rows); - - for (size_t i = 0; i < saved_block->rows(); ++i) - { - const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool); - auto * it = update_key_hash.find(s_key); - if (it) - filter[i] = 0; - else - filter[i] = 1; - } - - auto block_columns = block.mutateColumns(); - for (const auto attribute_idx : ext::range(0, keys_size + attributes_size)) - { - auto & column = saved_block->safeGetByPosition(attribute_idx).column; - const auto & filtered_column = column->filter(filter, -1); - - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); - 
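// The updateData() path above merges an incremental load into rows saved from previous
// loads: saved rows whose key reappears in the update are filtered out, then the update
// rows are appended, so the latest value for each key wins. A standalone sketch of that
// merge over simple structs (std::unordered_set stands in for the arena-backed hash used
// above; names are illustrative, not the ClickHouse API).
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

struct Row
{
    std::string key;
    std::string value;
};

std::vector<Row> mergeUpdate(std::vector<Row> saved, const std::vector<Row> & update)
{
    std::unordered_set<std::string> updated_keys;
    for (const auto & row : update)
        updated_keys.insert(row.key);

    std::vector<Row> result;
    result.reserve(saved.size() + update.size());

    // Keep only saved rows whose key was not refreshed by the update.
    for (auto & row : saved)
        if (updated_keys.find(row.key) == updated_keys.end())
            result.push_back(std::move(row));

    // Append the freshly loaded rows.
    result.insert(result.end(), update.begin(), update.end());
    return result;
}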
} - - saved_block->setColumns(std::move(block_columns)); - } - stream->readSuffix(); - } - - if (saved_block) - blockToAttributes(*saved_block.get()); -} - -void ComplexKeyHashedDictionary::loadData() -{ - if (!source_ptr->hasUpdateField()) - { - auto stream = source_ptr->loadAll(); - stream->readPrefix(); - - while (const auto block = stream->read()) - blockToAttributes(block); - - stream->readSuffix(); - } - else - updateData(); - - if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; -} - -template -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); -} - -template <> -void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); - bucket_count = map_ref.getBufferSizeInCells(); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void ComplexKeyHashedDictionary::calculateBytesAllocated() -{ - bytes_allocated += attributes.size() * sizeof(attributes.front()); - - for (const auto & attribute : attributes) - { - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - addAttributeSize(attribute); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - } - - bytes_allocated += keys_pool.size(); -} - -template -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = T(null_value.get>()); - attribute.maps.emplace>(); -} - -template <> -void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - attribute.maps.emplace>(); -} - -ComplexKeyHashedDictionary::Attribute -ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? 
std::make_unique() : nullptr; - Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}}; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void ComplexKeyHashedDictionary::getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto & attr = std::get>(attribute.maps); - - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - - const auto rows = key_columns.front()->size(); - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - - if (it) - { - set_value(i, static_cast(it->getMapped()), false); - } - else - { - if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr) - set_value(i, default_value_extractor[i], true); - else - set_value(i, default_value_extractor[i], false); - } - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -template -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value) -{ - auto & map = std::get>(attribute.maps); - const auto pair = map.insert({key, value}); - return pair.second; -} - -template <> -bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); -} - -bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value) -{ - bool result = false; - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.is_nullable) - { - if (value.isNull()) - { - attribute.nullable_set->insert(key); - result = true; - return; - } - else - { - attribute.nullable_set->erase(key); - } - } - - result = setAttributeValueImpl(attribute, key, value.get>()); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -StringRef ComplexKeyHashedDictionary::placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool) -{ - const auto keys_size = key_columns.size(); - size_t sum_keys_size{}; - - const char * block_start = nullptr; - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->serializeValueIntoArena(row, pool, block_start); - sum_keys_size += keys[j].size; - } - - const auto * key_start = block_start; 
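// getItemsImpl above follows a simple per-row pattern: serialize the composite key, look
// it up in the hash map, and fall back to a default value when the key is absent, with a
// separate set recording keys whose stored value is NULL. A hedged sketch of that flow
// with standard containers; the callback-based interface mirrors set_value above, and all
// names here are illustrative rather than the real ClickHouse types.
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

template <typename Value>
void lookupRows(
    const std::vector<std::string> & serialized_keys,
    const std::unordered_map<std::string, Value> & values,
    const std::unordered_set<std::string> & null_keys,
    const std::function<Value(std::size_t)> & default_for_row,
    const std::function<void(std::size_t, Value, bool /*is_null*/)> & set_value)
{
    for (std::size_t row = 0; row < serialized_keys.size(); ++row)
    {
        const auto & key = serialized_keys[row];
        auto it = values.find(key);
        if (it != values.end())
            set_value(row, it->second, /*is_null=*/ false);
        else
            set_value(row, default_for_row(row), /*is_null=*/ null_keys.count(key) != 0);
    }
}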
- for (size_t j = 0; j < keys_size; ++j) - { - keys[j].data = key_start; - key_start += keys[j].size; - } - - return {block_start, sum_keys_size}; -} - -template -void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const -{ - const auto & attr = std::get>(attribute.maps); - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - const auto rows = key_columns.front()->size(); - - for (const auto i : ext::range(0, rows)) - { - /// copy key data to arena so it is contiguous and return StringRef to it - const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); - - const auto it = attr.find(key); - out[i] = static_cast(it); - - if (attribute.is_nullable && !out[i]) - out[i] = attribute.nullable_set->find(key) != nullptr; - - /// free memory allocated for the key - temporary_keys_pool.rollback(key.size); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -std::vector ComplexKeyHashedDictionary::getKeys() const -{ - const Attribute & attribute = attributes.front(); - - std::vector result; - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if constexpr (std::is_same_v) - { - result = getKeys(attribute); - } - else - { - result = getKeys(attribute); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -template -std::vector ComplexKeyHashedDictionary::getKeys(const Attribute & attribute) const -{ - const ContainerType & attr = std::get>(attribute.maps); - std::vector keys; - keys.reserve(attr.size()); - for (const auto & key : attr) - keys.push_back(key.getKey()); - - if (attribute.is_nullable) - { - for (const auto & key: *attribute.nullable_set) - keys.push_back(key.getKey()); - } - - return keys; -} - -BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getKeys(), column_names); -} - -void registerDictionaryComplexKeyHashed(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string &, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (!dict_struct.key) - throw Exception{"'key' is required for dictionary of layout 'complex_key_hashed'", ErrorCodes::BAD_ARGUMENTS}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); - }; - factory.registerLayout("complex_key_hashed", create_layout, true); -} - -} diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.h b/src/Dictionaries/ComplexKeyHashedDictionary.h deleted file mode 100644 index ecc720ca0b0..00000000000 --- a/src/Dictionaries/ComplexKeyHashedDictionary.h +++ /dev/null @@ -1,185 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryStructure.h" 
-#include "DictionaryHelpers.h" - -namespace DB -{ - -class ComplexKeyHashedDictionary final : public IDictionaryBase -{ -public: - ComplexKeyHashedDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_ = nullptr); - - std::string getKeyDescription() const { return key_description; } - - std::string getTypeName() const override { return "ComplexKeyHashed"; } - - size_t getBytesAllocated() const override { return bytes_allocated; } - - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - - double getHitRate() const override { return 1.0; } - - size_t getElementCount() const override { return element_count; } - - double getLoadFactor() const override { return static_cast(element_count) / bucket_count; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; - } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - template - using ContainerType = HashMapWithSavedHash; - - using NullableSet = HashSetWithSavedHash; - - struct Attribute final - { - AttributeUnderlyingType type; - bool is_nullable; - std::unique_ptr nullable_set; - - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - StringRef> - null_values; - std::variant< - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType, - ContainerType> - maps; - std::unique_ptr string_arena; - }; - - void createAttributes(); - - void blockToAttributes(const Block & block); - - void updateData(); - - void loadData(); - - template - void addAttributeSize(const Attribute & attribute); - - void calculateBytesAllocated(); - - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value); - - template - void getItemsImpl( - const Attribute & attribute, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template - static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); - - static bool setAttributeValue(Attribute & attribute, 
const StringRef key, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - static StringRef placeKeysInPool(const size_t row, const Columns & key_columns, StringRefs & keys, Arena & pool); - - template - void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const; - - std::vector getKeys() const; - - template - std::vector getKeys(const Attribute & attribute) const; - - const DictionaryStructure dict_struct; - const DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - const bool require_nonempty; - const std::string key_description{dict_struct.getKeyDescription()}; - - std::map attribute_index_by_name; - std::vector attributes; - Arena keys_pool; - - size_t bytes_allocated = 0; - size_t element_count = 0; - size_t bucket_count = 0; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; -}; - -} diff --git a/src/Dictionaries/DictionaryBlockInputStream.cpp b/src/Dictionaries/DictionaryBlockInputStream.cpp new file mode 100644 index 00000000000..433ff211831 --- /dev/null +++ b/src/Dictionaries/DictionaryBlockInputStream.cpp @@ -0,0 +1,200 @@ +#include "DictionaryBlockInputStream.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) + : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , ids(std::move(ids_)) + , key_type(DictionaryInputStreamKeyType::Id) +{ +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const PaddedPODArray & keys, + const Names & column_names_) + : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , key_type(DictionaryInputStreamKeyType::ComplexKey) +{ + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); +} + +DictionaryBlockInputStream::DictionaryBlockInputStream( + std::shared_ptr dictionary_, + UInt64 max_block_size_, + const Columns & data_columns_, + const Names & column_names_, + GetColumnsFunction && get_key_columns_function_, + GetColumnsFunction && get_view_columns_function_) + : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) + , dictionary(dictionary_) + , column_names(column_names_) + , data_columns(data_columns_) + , get_key_columns_function(std::move(get_key_columns_function_)) + , get_view_columns_function(std::move(get_view_columns_function_)) + , key_type(DictionaryInputStreamKeyType::Callback) +{ +} + +Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const +{ + /// TODO: Rewrite + switch (key_type) + { + case DictionaryInputStreamKeyType::ComplexKey: + { + Columns columns; + ColumnsWithTypeAndName view_columns; + columns.reserve(key_columns.size()); + for (const auto & key_column : key_columns) + { + ColumnPtr column = key_column.column->cut(start, length); + columns.emplace_back(column); + view_columns.emplace_back(column, key_column.type, key_column.name); + } + return fillBlock({}, columns, {}, std::move(view_columns)); + } + + case DictionaryInputStreamKeyType::Id: + { + PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); + return 
fillBlock(ids_to_fill, {}, {}, {}); + } + + case DictionaryInputStreamKeyType::Callback: + { + Columns columns; + columns.reserve(data_columns.size()); + for (const auto & data_column : data_columns) + columns.push_back(data_column->cut(start, length)); + const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); + const auto & attributes = *dictionaty_structure.key; + ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); + ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); + DataTypes types; + columns.clear(); + for (const auto & key_column : keys_with_type_and_name) + { + columns.push_back(key_column.column); + types.push_back(key_column.type); + } + return fillBlock({}, columns, types, std::move(view_with_type_and_name)); + } + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected DictionaryInputStreamKeyType."); +} + +Block DictionaryBlockInputStream::fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const +{ + std::unordered_set names(column_names.begin(), column_names.end()); + + DataTypes data_types = types; + ColumnsWithTypeAndName block_columns; + + data_types.reserve(keys.size()); + const DictionaryStructure & dictionary_structure = dictionary->getStructure(); + if (data_types.empty() && dictionary_structure.key) + for (const auto & key : *dictionary_structure.key) + data_types.push_back(key.type); + + for (const auto & column : view) + if (names.find(column.name) != names.end()) + block_columns.push_back(column); + + const DictionaryStructure & structure = dictionary->getStructure(); + ColumnPtr ids_column = getColumnFromIds(ids_to_fill); + + if (structure.id && names.find(structure.id->name) != names.end()) + { + block_columns.emplace_back(ids_column, std::make_shared(), structure.id->name); + } + + auto dictionary_key_type = dictionary->getKeyType(); + + for (const auto idx : ext::range(0, structure.attributes.size())) + { + const DictionaryAttribute & attribute = structure.attributes[idx]; + if (names.find(attribute.name) != names.end()) + { + ColumnPtr column; + + if (dictionary_key_type == DictionaryKeyType::simple) + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + {ids_column}, + {std::make_shared()}, + nullptr /* default_values_column */); + } + else + { + column = dictionary->getColumn( + attribute.name, + attribute.type, + keys, + data_types, + nullptr /* default_values_column*/); + } + + block_columns.emplace_back(column, attribute.type, attribute.name); + } + } + + return Block(block_columns); +} + +ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) +{ + auto column_vector = ColumnVector::create(); + column_vector->getData().assign(ids_to_fill); + return column_vector; +} + +void DictionaryBlockInputStream::fillKeyColumns( + const PaddedPODArray & keys, + size_t start, + size_t size, + const DictionaryStructure & dictionary_structure, + ColumnsWithTypeAndName & result) +{ + MutableColumns columns; + columns.reserve(dictionary_structure.key->size()); + + for (const DictionaryAttribute & attribute : *dictionary_structure.key) + columns.emplace_back(attribute.type->createColumn()); + + for (auto idx : ext::range(start, size)) + { + const auto & key = keys[idx]; + const auto *ptr = key.data; + for (auto & column : columns) + ptr = column->deserializeAndInsertFromArena(ptr); + } + + for (size_t i = 0, num_columns 
= columns.size(); i < num_columns; ++i) + { + const auto & dictionary_attribute = (*dictionary_structure.key)[i]; + result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name}); + } +} + +} diff --git a/src/Dictionaries/DictionaryBlockInputStream.h b/src/Dictionaries/DictionaryBlockInputStream.h index f045d47c2c2..5197df411fa 100644 --- a/src/Dictionaries/DictionaryBlockInputStream.h +++ b/src/Dictionaries/DictionaryBlockInputStream.h @@ -16,26 +16,24 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - +/// TODO: Remove this class /* BlockInputStream implementation for external dictionaries * read() returns blocks consisting of the in-memory contents of the dictionaries */ -template class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: DictionaryBlockInputStream( - std::shared_ptr dictionary, UInt64 max_block_size, PaddedPODArray && ids, const Names & column_names); + std::shared_ptr dictionary, + UInt64 max_block_size, + PaddedPODArray && ids, + const Names & column_names); DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, - const std::vector & keys, + const PaddedPODArray & keys, const Names & column_names); using GetColumnsFunction = std::function & attributes)>; @@ -45,7 +43,7 @@ public: // and get_view_columns_function to get key representation. // Now used in trie dictionary, where columns are stored as ip and mask, and are showed as string DictionaryBlockInputStream( - std::shared_ptr dictionary, + std::shared_ptr dictionary, UInt64 max_block_size, const Columns & data_columns, const Names & column_names, @@ -55,24 +53,27 @@ public: String getName() const override { return "Dictionary"; } protected: - Block getBlock(size_t start, size_t size) const override; + Block getBlock(size_t start, size_t length) const override; private: - Block - fillBlock(const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const; + Block fillBlock( + const PaddedPODArray & ids_to_fill, + const Columns & keys, + const DataTypes & types, + ColumnsWithTypeAndName && view) const; - ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill) const; + static ColumnPtr getColumnFromIds(const PaddedPODArray & ids_to_fill); - void fillKeyColumns( - const std::vector & keys, + static void fillKeyColumns( + const PaddedPODArray & keys, size_t start, size_t size, const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & columns) const; + ColumnsWithTypeAndName & result); - std::shared_ptr dictionary; + std::shared_ptr dictionary; Names column_names; - PaddedPODArray ids; + PaddedPODArray ids; ColumnsWithTypeAndName key_columns; Columns data_columns; @@ -89,200 +90,4 @@ private: DictionaryInputStreamKeyType key_type; }; - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, UInt64 max_block_size_, PaddedPODArray && ids_, const Names & column_names_) - : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , ids(std::move(ids_)) - , key_type(DictionaryInputStreamKeyType::Id) -{ -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const std::vector & keys, - const Names & column_names_) - : DictionaryBlockInputStreamBase(keys.size(), max_block_size_) - , 
dictionary(dictionary_) - , column_names(column_names_) - , key_type(DictionaryInputStreamKeyType::ComplexKey) -{ - const DictionaryStructure & dictionary_structure = dictionary->getStructure(); - fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns); -} - -template -DictionaryBlockInputStream::DictionaryBlockInputStream( - std::shared_ptr dictionary_, - UInt64 max_block_size_, - const Columns & data_columns_, - const Names & column_names_, - GetColumnsFunction && get_key_columns_function_, - GetColumnsFunction && get_view_columns_function_) - : DictionaryBlockInputStreamBase(data_columns_.front()->size(), max_block_size_) - , dictionary(dictionary_) - , column_names(column_names_) - , data_columns(data_columns_) - , get_key_columns_function(std::move(get_key_columns_function_)) - , get_view_columns_function(std::move(get_view_columns_function_)) - , key_type(DictionaryInputStreamKeyType::Callback) -{ -} - - -template -Block DictionaryBlockInputStream::getBlock(size_t start, size_t length) const -{ - /// TODO: Rewrite - switch (key_type) - { - case DictionaryInputStreamKeyType::ComplexKey: - { - Columns columns; - ColumnsWithTypeAndName view_columns; - columns.reserve(key_columns.size()); - for (const auto & key_column : key_columns) - { - ColumnPtr column = key_column.column->cut(start, length); - columns.emplace_back(column); - view_columns.emplace_back(column, key_column.type, key_column.name); - } - return fillBlock({}, columns, {}, std::move(view_columns)); - } - - case DictionaryInputStreamKeyType::Id: - { - PaddedPODArray ids_to_fill(ids.begin() + start, ids.begin() + start + length); - return fillBlock(ids_to_fill, {}, {}, {}); - } - - case DictionaryInputStreamKeyType::Callback: - { - Columns columns; - columns.reserve(data_columns.size()); - for (const auto & data_column : data_columns) - columns.push_back(data_column->cut(start, length)); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - const auto & attributes = *dictionaty_structure.key; - ColumnsWithTypeAndName keys_with_type_and_name = get_key_columns_function(columns, attributes); - ColumnsWithTypeAndName view_with_type_and_name = get_view_columns_function(columns, attributes); - DataTypes types; - columns.clear(); - for (const auto & key_column : keys_with_type_and_name) - { - columns.push_back(key_column.column); - types.push_back(key_column.type); - } - return fillBlock({}, columns, types, std::move(view_with_type_and_name)); - } - } - - throw Exception("Unexpected DictionaryInputStreamKeyType.", ErrorCodes::LOGICAL_ERROR); -} - -template -Block DictionaryBlockInputStream::fillBlock( - const PaddedPODArray & ids_to_fill, const Columns & keys, const DataTypes & types, ColumnsWithTypeAndName && view) const -{ - std::unordered_set names(column_names.begin(), column_names.end()); - - DataTypes data_types = types; - ColumnsWithTypeAndName block_columns; - - data_types.reserve(keys.size()); - const DictionaryStructure & dictionaty_structure = dictionary->getStructure(); - if (data_types.empty() && dictionaty_structure.key) - for (const auto & key : *dictionaty_structure.key) - data_types.push_back(key.type); - - for (const auto & column : view) - if (names.find(column.name) != names.end()) - block_columns.push_back(column); - - const DictionaryStructure & structure = dictionary->getStructure(); - ColumnPtr ids_column = getColumnFromIds(ids_to_fill); - - if (structure.id && names.find(structure.id->name) != names.end()) - { - block_columns.emplace_back(ids_column, 
std::make_shared(), structure.id->name); - } - - auto dictionary_key_type = dictionary->getKeyType(); - - for (const auto idx : ext::range(0, structure.attributes.size())) - { - const DictionaryAttribute & attribute = structure.attributes[idx]; - if (names.find(attribute.name) != names.end()) - { - ColumnPtr column; - - if (dictionary_key_type == DictionaryKeyType::simple) - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - {ids_column}, - {std::make_shared()}, - nullptr /* default_values_column */); - } - else - { - column = dictionary->getColumn( - attribute.name, - attribute.type, - keys, - data_types, - nullptr /* default_values_column*/); - } - - block_columns.emplace_back(column, attribute.type, attribute.name); - } - } - - return Block(block_columns); -} - -template -ColumnPtr DictionaryBlockInputStream::getColumnFromIds(const PaddedPODArray & ids_to_fill) const -{ - auto column_vector = ColumnVector::create(); - column_vector->getData().reserve(ids_to_fill.size()); - for (UInt64 id : ids_to_fill) - column_vector->insertValue(id); - return column_vector; -} - - -template -void DictionaryBlockInputStream::fillKeyColumns( - const std::vector & keys, - size_t start, - size_t size, - const DictionaryStructure & dictionary_structure, - ColumnsWithTypeAndName & res) const -{ - MutableColumns columns; - columns.reserve(dictionary_structure.key->size()); - - for (const DictionaryAttribute & attribute : *dictionary_structure.key) - columns.emplace_back(attribute.type->createColumn()); - - for (auto idx : ext::range(start, size)) - { - const auto & key = keys[idx]; - auto ptr = key.data; - for (auto & column : columns) - ptr = column->deserializeAndInsertFromArena(ptr); - } - - for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i) - res.emplace_back( - ColumnWithTypeAndName{std::move(columns[i]), (*dictionary_structure.key)[i].type, (*dictionary_structure.key)[i].name}); -} - } diff --git a/src/Dictionaries/DictionaryFactory.cpp b/src/Dictionaries/DictionaryFactory.cpp index 0ab7d199186..413bbd4f967 100644 --- a/src/Dictionaries/DictionaryFactory.cpp +++ b/src/Dictionaries/DictionaryFactory.cpp @@ -20,7 +20,7 @@ namespace ErrorCodes void DictionaryFactory::registerLayout(const std::string & layout_type, Creator create_layout, bool is_complex) { if (!registered_layouts.emplace(layout_type, std::move(create_layout)).second) - throw Exception("DictionaryFactory: the layout name '" + layout_type + "' is not unique", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "DictionaryFactory: the layout name '{}' is not unique", layout_type); layout_complexity[layout_type] = is_complex; @@ -31,15 +31,16 @@ DictionaryPtr DictionaryFactory::create( const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const Context & context, + ContextPtr context, bool check_source_config) const { Poco::Util::AbstractConfiguration::Keys keys; const auto & layout_prefix = config_prefix + ".layout"; config.keys(layout_prefix, keys); if (keys.size() != 1) - throw Exception{name + ": element dictionary.layout should have exactly one child element", - ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG}; + throw Exception(ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG, + "{}: element dictionary.layout should have exactly one child element", + name); const DictionaryStructure dict_struct{config, config_prefix}; @@ -47,8 +48,8 @@ DictionaryPtr DictionaryFactory::create( name, config, config_prefix + ".source", dict_struct, 
context, config.getString(config_prefix + ".database", ""), check_source_config); LOG_TRACE(&Poco::Logger::get("DictionaryFactory"), "Created dictionary source '{}' for dictionary '{}'", source_ptr->toString(), name); - if (context.hasQueryContext() && context.getSettingsRef().log_queries) - context.getQueryContext().addQueryFactoriesInfo(Context::QueryLogFactories::Dictionary, name); + if (context->hasQueryContext() && context->getSettingsRef().log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Dictionary, name); const auto & layout_type = keys.front(); @@ -61,10 +62,13 @@ DictionaryPtr DictionaryFactory::create( } } - throw Exception{name + ": unknown dictionary layout type: " + layout_type, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG}; + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, + "{}: unknown dictionary layout type: {}", + name, + layout_type); } -DictionaryPtr DictionaryFactory::create(const std::string & name, const ASTCreateQuery & ast, const Context & context) const +DictionaryPtr DictionaryFactory::create(const std::string & name, const ASTCreateQuery & ast, ContextPtr context) const { auto configuration = getDictionaryConfigurationFromAST(ast, context); return DictionaryFactory::create(name, *configuration, "dictionary", context, true); @@ -77,7 +81,9 @@ bool DictionaryFactory::isComplex(const std::string & layout_type) const if (found != layout_complexity.end()) return found->second; - throw Exception{"Unknown dictionary layout type: " + layout_type, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG}; + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, + "Unknown dictionary layout type: {}", + layout_type); } diff --git a/src/Dictionaries/DictionaryFactory.h b/src/Dictionaries/DictionaryFactory.h index 3d3f793e2b2..b869550a9af 100644 --- a/src/Dictionaries/DictionaryFactory.h +++ b/src/Dictionaries/DictionaryFactory.h @@ -1,5 +1,6 @@ #pragma once +#include #include "IDictionary.h" #include "registerDictionaries.h" #include @@ -21,8 +22,6 @@ class Logger; namespace DB { -class Context; - /** Create dictionary according to its layout. */ class DictionaryFactory : private boost::noncopyable @@ -37,13 +36,13 @@ public: const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const Context & context, + ContextPtr context, bool check_source_config = false) const; /// Create dictionary from DDL-query DictionaryPtr create(const std::string & name, const ASTCreateQuery & ast, - const Context & context) const; + ContextPtr context) const; using Creator = std::function +#include #include #include #include #include +#include #include -#include "DictionaryStructure.h" +#include +#include +#include namespace DB { @@ -13,6 +18,202 @@ namespace DB namespace ErrorCodes { extern const int TYPE_MISMATCH; + extern const int BAD_ARGUMENTS; +} + +/** Simple helper for getting default. + * Initialized with default value and default values column. + * If default values column is not null default value is taken from column. + * If default value is null default value is taken from initializer. 
+ */ +class DefaultValueProvider final +{ +public: + explicit DefaultValueProvider(Field default_value_, ColumnPtr default_values_column_ = nullptr) + : default_value(std::move(default_value_)) + , default_values_column(default_values_column_) + { + } + + inline bool isConstant() const { return default_values_column == nullptr; } + + Field getDefaultValue(size_t row) const + { + if (default_values_column) + return (*default_values_column)[row]; + + return default_value; + } + +private: + Field default_value; + ColumnPtr default_values_column; +}; + +/** Support class for dictionary storages. + + The main idea is that during fetch we create all columns, but fill only columns that client requested. + + We need to create other columns during fetch, because in case of serialized storage we can skip + unnecessary columns serialized in cache with skipSerializedInArena method. + + When result is fetched from the storage client of storage can filterOnlyNecessaryColumns + and get only columns that match attributes_names_to_fetch. + */ +class DictionaryStorageFetchRequest +{ +public: + DictionaryStorageFetchRequest( + const DictionaryStructure & structure, + const Strings & attributes_names_to_fetch, + DataTypes attributes_to_fetch_result_types, + Columns attributes_default_values_columns) + : attributes_to_fetch_names_set(attributes_names_to_fetch.begin(), attributes_names_to_fetch.end()) + , attributes_to_fetch_filter(structure.attributes.size(), false) + { + assert(attributes_default_values_columns.size() == attributes_names_to_fetch.size()); + + if (attributes_to_fetch_names_set.size() != attributes_names_to_fetch.size()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Attribute names to fetch should be unique"); + + size_t attributes_size = structure.attributes.size(); + dictionary_attributes_types.reserve(attributes_size); + attributes_default_value_providers.reserve(attributes_to_fetch_names_set.size()); + + size_t attributes_to_fetch_index = 0; + for (size_t i = 0; i < attributes_size; ++i) + { + const auto & dictionary_attribute = structure.attributes[i]; + const auto & name = dictionary_attribute.name; + const auto & type = dictionary_attribute.type; + dictionary_attributes_types.emplace_back(type); + + if (attributes_to_fetch_names_set.find(name) != attributes_to_fetch_names_set.end()) + { + attributes_to_fetch_filter[i] = true; + auto & attribute_to_fetch_result_type = attributes_to_fetch_result_types[attributes_to_fetch_index]; + + if (!attribute_to_fetch_result_type->equals(*type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Attribute type does not match, expected ({}), found ({})", + attribute_to_fetch_result_type->getName(), + type->getName()); + + attributes_default_value_providers.emplace_back(dictionary_attribute.null_value, attributes_default_values_columns[attributes_to_fetch_index]); + ++attributes_to_fetch_index; + } + else + attributes_default_value_providers.emplace_back(dictionary_attribute.null_value); + } + } + + DictionaryStorageFetchRequest() = default; + + /// Check requested attributes size + ALWAYS_INLINE size_t attributesSize() const + { + return dictionary_attributes_types.size(); + } + + /// Check if attribute with attribute_name was requested to fetch + ALWAYS_INLINE bool containsAttribute(const String & attribute_name) const + { + return attributes_to_fetch_names_set.find(attribute_name) != attributes_to_fetch_names_set.end(); + } + + /// Check if attribute with attribute_index should be filled during fetch + ALWAYS_INLINE bool 
shouldFillResultColumnWithIndex(size_t attribute_index) const + { + return attributes_to_fetch_filter[attribute_index]; + } + + const DataTypePtr & dataTypeAtIndex(size_t attribute_index) const + { + return dictionary_attributes_types[attribute_index]; + } + + const DefaultValueProvider & defaultValueProviderAtIndex(size_t attribute_index) const + { + return attributes_default_value_providers[attribute_index]; + } + + /// Create columns for each of dictionary attributes + MutableColumns makeAttributesResultColumns() const + { + MutableColumns result; + result.reserve(dictionary_attributes_types.size()); + + for (const auto & type : dictionary_attributes_types) + result.emplace_back(type->createColumn()); + + return result; + } + + Columns makeAttributesResultColumnsNonMutable() const + { + Columns result; + result.reserve(dictionary_attributes_types.size()); + + for (const auto & type : dictionary_attributes_types) + result.emplace_back(type->createColumn()); + + return result; + } + + /// Filter only requested columns + Columns filterRequestedColumns(MutableColumns & fetched_mutable_columns) const + { + Columns result; + result.reserve(dictionary_attributes_types.size()); + + for (size_t fetch_request_index = 0; fetch_request_index < dictionary_attributes_types.size(); ++fetch_request_index) + if (shouldFillResultColumnWithIndex(fetch_request_index)) + result.emplace_back(std::move(fetched_mutable_columns[fetch_request_index])); + + return result; + } +private: + std::unordered_set attributes_to_fetch_names_set; + std::vector attributes_to_fetch_filter; + std::vector attributes_default_value_providers; + DataTypes dictionary_attributes_types; +}; + +static inline void insertDefaultValuesIntoColumns( + MutableColumns & columns, + const DictionaryStorageFetchRequest & fetch_request, + size_t row_index) +{ + size_t columns_size = columns.size(); + + for (size_t column_index = 0; column_index < columns_size; ++column_index) + { + const auto & column = columns[column_index]; + const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(column_index); + + if (fetch_request.shouldFillResultColumnWithIndex(column_index)) + column->insert(default_value_provider.getDefaultValue(row_index)); + } +} + +/// Deserialize column value and insert it in columns. +/// Skip unnecessary columns that were not requested from deserialization. 
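As a rough illustration of the fill-or-skip pattern that insertDefaultValuesIntoColumns and deserializeAndInsertIntoColumns implement, the following standalone sketch mimics a DictionaryStorageFetchRequest with plain standard-library types. The names FetchRequest, should_fill and default_values are illustrative stand-ins, not the ClickHouse API, and the toy storage map replaces the arena-serialized rows used by the real storages.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for DictionaryStorageFetchRequest: one entry per
// attribute of the dictionary structure, but only the requested attributes
// are marked to be filled; the rest are skipped.
struct FetchRequest
{
    std::vector<std::string> attribute_names;
    std::vector<bool> should_fill;
    std::vector<long long> default_values; // stand-in for DefaultValueProvider
};

int main()
{
    FetchRequest request{{"population", "area", "code"}, {true, false, true}, {0, 0, -1}};

    // Toy storage: key -> one value per attribute.
    std::unordered_map<unsigned long long, std::vector<long long>> storage{
        {1, {1000, 50, 7}},
        {2, {2500, 80, 9}},
    };

    std::vector<unsigned long long> requested_keys{1, 3, 2};

    // One result column per attribute is created, but only requested ones are filled,
    // mirroring makeAttributesResultColumns + shouldFillResultColumnWithIndex.
    std::vector<std::vector<long long>> result_columns(request.attribute_names.size());

    for (unsigned long long key : requested_keys)
    {
        auto it = storage.find(key);
        for (size_t attr = 0; attr < request.attribute_names.size(); ++attr)
        {
            if (!request.should_fill[attr])
                continue; // skipped, like skipSerializedInArena for serialized storages
            result_columns[attr].push_back(
                it != storage.end() ? it->second[attr] : request.default_values[attr]);
        }
    }

    // Only the requested columns are handed back, like filterRequestedColumns.
    for (size_t attr = 0; attr < request.attribute_names.size(); ++attr)
    {
        if (!request.should_fill[attr])
            continue;
        std::cout << request.attribute_names[attr] << ':';
        for (long long v : result_columns[attr])
            std::cout << ' ' << v;
        std::cout << '\n';
    }
    return 0;
}

The point of the should_fill bitmap is the same as in the helpers above: every attribute gets a slot in the result, but only the requested ones are materialized, so serialized rows in the storage can be skipped cheaply.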
+static inline void deserializeAndInsertIntoColumns( + MutableColumns & columns, + const DictionaryStorageFetchRequest & fetch_request, + const char * place_for_serialized_columns) +{ + size_t columns_size = columns.size(); + + for (size_t column_index = 0; column_index < columns_size; ++column_index) + { + const auto & column = columns[column_index]; + + if (fetch_request.shouldFillResultColumnWithIndex(column_index)) + place_for_serialized_columns = column->deserializeAndInsertFromArena(place_for_serialized_columns); + else + place_for_serialized_columns = column->skipSerializedInArena(place_for_serialized_columns); + } } /** @@ -50,7 +251,7 @@ public: else if constexpr (IsNumber) return ColumnType::create(size); else - throw Exception{"Unsupported attribute type.", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Unsupported attribute type."); } }; @@ -69,7 +270,7 @@ class DictionaryDefaultValueExtractor public: using DefaultValueType = DictionaryValueType; - DictionaryDefaultValueExtractor(DictionaryAttributeType attribute_default_value, ColumnPtr default_values_column_ = nullptr) + explicit DictionaryDefaultValueExtractor(DictionaryAttributeType attribute_default_value, ColumnPtr default_values_column_ = nullptr) : default_value(std::move(attribute_default_value)) { if (default_values_column_ == nullptr) @@ -87,7 +288,7 @@ public: use_default_value_from_column = false; } else - throw Exception{"Type of default column is not the same as dictionary attribute type.", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Type of default column is not the same as dictionary attribute type."); } } @@ -109,14 +310,235 @@ private: bool use_default_value_from_column = false; }; +template +class DictionaryKeysArenaHolder; + +template <> +class DictionaryKeysArenaHolder +{ +public: + static Arena * getComplexKeyArena() { return nullptr; } +}; + +template <> +class DictionaryKeysArenaHolder +{ +public: + + Arena * getComplexKeyArena() { return &complex_key_arena; } + +private: + Arena complex_key_arena; +}; + + +template +class DictionaryKeysExtractor +{ +public: + using KeyType = std::conditional_t; + static_assert(key_type != DictionaryKeyType::range, "Range key type is not supported by DictionaryKeysExtractor"); + + explicit DictionaryKeysExtractor(const Columns & key_columns_, Arena * complex_key_arena_) + : key_columns(key_columns_) + , complex_key_arena(complex_key_arena_) + { + assert(!key_columns.empty()); + + if constexpr (key_type == DictionaryKeyType::simple) + { + key_columns[0] = key_columns[0]->convertToFullColumnIfConst(); + + const auto * vector_col = checkAndGetColumn>(key_columns[0].get()); + if (!vector_col) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Column type mismatch for simple key expected UInt64"); + } + + keys_size = key_columns.front()->size(); + } + + inline size_t getKeysSize() const + { + return keys_size; + } + + inline size_t getCurrentKeyIndex() const + { + return current_key_index; + } + + inline KeyType extractCurrentKey() + { + assert(current_key_index < keys_size); + + if constexpr (key_type == DictionaryKeyType::simple) + { + const auto & column_vector = static_cast &>(*key_columns[0]); + const auto & data = column_vector.getData(); + + auto key = data[current_key_index]; + ++current_key_index; + return key; + } + else + { + size_t allocated_size_for_columns = 0; + const char * block_start = nullptr; + + for (const auto & column : key_columns) + { + StringRef serialized_data = 
column->serializeValueIntoArena(current_key_index, *complex_key_arena, block_start); + allocated_size_for_columns += serialized_data.size; + } + + ++current_key_index; + current_complex_key = StringRef{block_start, allocated_size_for_columns}; + return current_complex_key; + } + } + + void rollbackCurrentKey() const + { + if constexpr (key_type == DictionaryKeyType::complex) + complex_key_arena->rollback(current_complex_key.size); + } + + PaddedPODArray extractAllKeys() + { + PaddedPODArray result; + result.reserve(keys_size - current_key_index); + + for (; current_key_index < keys_size;) + { + auto value = extractCurrentKey(); + result.emplace_back(value); + } + + return result; + } + + void reset() + { + current_key_index = 0; + } +private: + Columns key_columns; + + size_t keys_size = 0; + size_t current_key_index = 0; + + KeyType current_complex_key {}; + Arena * complex_key_arena; +}; + +/** Merge block with blocks from stream. If there are duplicate keys in block they are filtered out. + * In result block_to_update will be merged with blocks from stream. + * Note: readPrefix readImpl readSuffix will be called on stream object during function execution. + */ +template +void mergeBlockWithStream( + size_t key_column_size [[maybe_unused]], + Block & block_to_update [[maybe_unused]], + BlockInputStreamPtr & stream [[maybe_unused]]) +{ + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by updatePreviousyLoadedBlockWithStream"); + + Columns saved_block_key_columns; + saved_block_key_columns.reserve(key_column_size); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < key_column_size; ++i) + saved_block_key_columns.emplace_back(block_to_update.safeGetByPosition(i).column); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor saved_keys_extractor(saved_block_key_columns, arena_holder.getComplexKeyArena()); + auto saved_keys_extracted_from_block = saved_keys_extractor.extractAllKeys(); + + IColumn::Filter filter(saved_keys_extracted_from_block.size(), true); + + HashMap saved_key_to_index; + saved_key_to_index.reserve(saved_keys_extracted_from_block.size()); + + size_t indexes_to_remove_count = 0; + + for (size_t i = 0; i < saved_keys_extracted_from_block.size(); ++i) + { + auto saved_key = saved_keys_extracted_from_block[i]; + auto [it, was_inserted] = saved_key_to_index.insert(makePairNoInit(saved_key, i)); + + if (!was_inserted) + { + size_t index_to_remove = it->getMapped(); + filter[index_to_remove] = false; + it->getMapped() = i; + ++indexes_to_remove_count; + } + } + + auto result_fetched_columns = block_to_update.cloneEmptyColumns(); + + stream->readPrefix(); + + while (Block block = stream->read()) + { + Columns block_key_columns; + block_key_columns.reserve(key_column_size); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < key_column_size; ++i) + block_key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysExtractor update_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + PaddedPODArray update_keys = update_keys_extractor.extractAllKeys(); + + for (auto update_key : update_keys) + { + const auto * it = saved_key_to_index.find(update_key); + if (it != nullptr) + { + size_t index_to_filter = it->getMapped(); + filter[index_to_filter] = false; + ++indexes_to_remove_count; + } + } + + size_t rows = block.rows(); + + for (size_t column_index = 0; column_index < block.columns(); 
++column_index) + { + const auto update_column = block.safeGetByPosition(column_index).column; + MutableColumnPtr & result_fetched_column = result_fetched_columns[column_index]; + + result_fetched_column->insertRangeFrom(*update_column, 0, rows); + } + } + + stream->readSuffix(); + + size_t result_fetched_rows = result_fetched_columns.front()->size(); + size_t filter_hint = filter.size() - indexes_to_remove_count; + + for (size_t column_index = 0; column_index < block_to_update.columns(); ++column_index) + { + auto & column = block_to_update.getByPosition(column_index).column; + column = column->filter(filter, filter_hint); + + MutableColumnPtr mutable_column = column->assumeMutable(); + const IColumn & fetched_column = *result_fetched_columns[column_index]; + mutable_column->insertRangeFrom(fetched_column, 0, result_fetched_rows); + } +} + /** * Returns ColumnVector data as PaddedPodArray. * If column is constant parameter backup_storage is used to store values. */ +/// TODO: Remove template static const PaddedPODArray & getColumnVectorData( - const IDictionaryBase * dictionary, + const IDictionary * dictionary, const ColumnPtr column, PaddedPODArray & backup_storage) { @@ -126,10 +548,10 @@ static const PaddedPODArray & getColumnVectorData( if (!vector_col) { - throw Exception{ErrorCodes::TYPE_MISMATCH, + throw Exception(ErrorCodes::TYPE_MISMATCH, "{}: type mismatch: column has wrong type expected {}", dictionary->getDictionaryID().getNameForLogs(), - TypeName::get()}; + TypeName::get()); } if (is_const_column) diff --git a/src/Dictionaries/DictionarySourceFactory.cpp b/src/Dictionaries/DictionarySourceFactory.cpp index af3552364ba..50ba6405074 100644 --- a/src/Dictionaries/DictionarySourceFactory.cpp +++ b/src/Dictionaries/DictionarySourceFactory.cpp @@ -71,7 +71,7 @@ DictionarySourceFactory::DictionarySourceFactory() : log(&Poco::Logger::get("Dic void DictionarySourceFactory::registerSource(const std::string & source_type, Creator create_source) { if (!registered_sources.emplace(source_type, std::move(create_source)).second) - throw Exception("DictionarySourceFactory: the source name '" + source_type + "' is not unique", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "DictionarySourceFactory: the source name '{}' is not unique", source_type); } DictionarySourcePtr DictionarySourceFactory::create( @@ -79,7 +79,7 @@ DictionarySourcePtr DictionarySourceFactory::create( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const DictionaryStructure & dict_struct, - const Context & context, + ContextPtr context, const std::string & default_database, bool check_config) const { @@ -87,8 +87,9 @@ DictionarySourcePtr DictionarySourceFactory::create( config.keys(config_prefix, keys); if (keys.empty() || keys.size() > 2) - throw Exception{name + ": element dictionary.source should have one or two child elements", - ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG}; + throw Exception(ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG, + "{}: element dictionary.source should have one or two child elements", + name); const std::string & source_type = keys.front() == "settings" ? 
keys.back() : keys.front(); @@ -100,7 +101,10 @@ DictionarySourcePtr DictionarySourceFactory::create( return create_source(dict_struct, config, config_prefix, sample_block, context, default_database, check_config); } - throw Exception{name + ": unknown dictionary source type: " + source_type, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG}; + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, + "{}: unknown dictionary source type: {}", + name, + source_type); } DictionarySourceFactory & DictionarySourceFactory::instance() diff --git a/src/Dictionaries/DictionarySourceFactory.h b/src/Dictionaries/DictionarySourceFactory.h index 1406660dfb4..bb583927ac4 100644 --- a/src/Dictionaries/DictionarySourceFactory.h +++ b/src/Dictionaries/DictionarySourceFactory.h @@ -2,6 +2,7 @@ #include "IDictionarySource.h" #include +#include #include @@ -17,7 +18,7 @@ class Logger; namespace DB { -class Context; + struct DictionaryStructure; /// creates IDictionarySource instance from config and DictionaryStructure @@ -34,7 +35,7 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr context, const std::string & default_database, bool check_config)>; @@ -47,7 +48,7 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const DictionaryStructure & dict_struct, - const Context & context, + ContextPtr context, const std::string & default_database, bool check_config) const; diff --git a/src/Dictionaries/DictionarySourceHelpers.cpp b/src/Dictionaries/DictionarySourceHelpers.cpp index 2a872672aff..e175c6c3eae 100644 --- a/src/Dictionaries/DictionarySourceHelpers.cpp +++ b/src/Dictionaries/DictionarySourceHelpers.cpp @@ -13,6 +13,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; +} + void formatBlock(BlockOutputStreamPtr & out, const Block & block) { out->writePrefix(); @@ -62,12 +67,12 @@ Block blockForKeys( return block; } -Context copyContextAndApplySettings( +ContextPtr copyContextAndApplySettings( const std::string & config_prefix, - const Context & context, + ContextPtr context, const Poco::Util::AbstractConfiguration & config) { - Context local_context(context); + auto local_context = Context::createCopy(context); if (config.has(config_prefix + ".settings")) { const auto prefix = config_prefix + ".settings"; @@ -83,9 +88,67 @@ Context copyContextAndApplySettings( changes.emplace_back(key, value); } - local_context.applySettingsChanges(changes); + local_context->applySettingsChanges(changes); } return local_context; } + +BlockInputStreamWithAdditionalColumns::BlockInputStreamWithAdditionalColumns( + Block block_to_add_, std::unique_ptr && stream_) + : block_to_add(std::move(block_to_add_)) + , stream(std::move(stream_)) +{ +} + +Block BlockInputStreamWithAdditionalColumns::getHeader() const +{ + auto header = stream->getHeader(); + + if (header) + { + for (Int64 i = static_cast(block_to_add.columns() - 1); i >= 0; --i) + header.insert(0, block_to_add.getByPosition(i).cloneEmpty()); + } + + return header; +} + +Block BlockInputStreamWithAdditionalColumns::readImpl() +{ + auto block = stream->read(); + + if (block) + { + auto block_rows = block.rows(); + + auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows); + + if (cut_block.rows() != block_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Number of rows in block to add after cut must equal to number of rows in block from inner 
stream"); + + for (Int64 i = static_cast(cut_block.columns() - 1); i >= 0; --i) + block.insert(0, cut_block.getByPosition(i)); + + current_range_index += block_rows; + } + + return block; +} + +void BlockInputStreamWithAdditionalColumns::readPrefix() +{ + stream->readPrefix(); +} + +void BlockInputStreamWithAdditionalColumns::readSuffix() +{ + stream->readSuffix(); +} + +String BlockInputStreamWithAdditionalColumns::getName() const +{ + return "BlockInputStreamWithAdditionalColumns"; +} } diff --git a/src/Dictionaries/DictionarySourceHelpers.h b/src/Dictionaries/DictionarySourceHelpers.h index cad5441c66e..1febf921e07 100644 --- a/src/Dictionaries/DictionarySourceHelpers.h +++ b/src/Dictionaries/DictionarySourceHelpers.h @@ -6,17 +6,18 @@ #include #include - +#include #include #include +#include namespace DB { + class IBlockOutputStream; using BlockOutputStreamPtr = std::shared_ptr; struct DictionaryStructure; -class Context; /// Write keys to block output stream. @@ -36,14 +37,39 @@ Block blockForKeys( const std::vector & requested_rows); /// Used for applying settings to copied context in some register[...]Source functions -Context copyContextAndApplySettings( +ContextPtr copyContextAndApplySettings( const std::string & config_prefix, - const Context & context, + ContextPtr context, const Poco::Util::AbstractConfiguration & config); void applySettingsToContext( const std::string & config_prefix, - Context & context, + ContextPtr context, const Poco::Util::AbstractConfiguration & config); +/** A stream, adds additional columns to each block that it will read from inner stream. + * + * block_to_add rows size must be equal to final sum rows size of all inner stream blocks. + */ +class BlockInputStreamWithAdditionalColumns final : public IBlockInputStream +{ +public: + BlockInputStreamWithAdditionalColumns(Block block_to_add_, std::unique_ptr && stream_); + + Block getHeader() const override; + + Block readImpl() override; + + void readPrefix() override; + + void readSuffix() override; + + String getName() const override; + +private: + Block block_to_add; + std::unique_ptr stream; + size_t current_range_index = 0; +}; + } diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 408e4803b1b..dd53e31041b 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -33,7 +33,7 @@ namespace const auto expression = config.getString(config_prefix + ".expression", ""); if (name.empty() && !expression.empty()) - throw Exception{"Element " + config_prefix + ".name is empty", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Element {}.name is empty"); const auto type_name = config.getString(config_prefix + ".type", default_type); return DictionaryTypedSpecialAttribute{std::move(name), std::move(expression), DataTypeFactory::instance().get(type_name)}; @@ -65,6 +65,7 @@ AttributeUnderlyingType getAttributeUnderlyingType(const DataTypePtr & type) case TypeIndex::Decimal32: return AttributeUnderlyingType::utDecimal32; case TypeIndex::Decimal64: return AttributeUnderlyingType::utDecimal64; case TypeIndex::Decimal128: return AttributeUnderlyingType::utDecimal128; + case TypeIndex::Decimal256: return AttributeUnderlyingType::utDecimal256; case TypeIndex::Date: return AttributeUnderlyingType::utUInt16; case TypeIndex::DateTime: return AttributeUnderlyingType::utUInt32; @@ -81,11 +82,11 @@ AttributeUnderlyingType getAttributeUnderlyingType(const DataTypePtr & type) default: break; } - throw 
Exception{"Unknown type for dictionary" + type->getName(), ErrorCodes::UNKNOWN_TYPE}; + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown type {} for dictionary attribute", type->getName()); } -std::string toString(const AttributeUnderlyingType type) +std::string toString(AttributeUnderlyingType type) { switch (type) { @@ -117,11 +118,13 @@ std::string toString(const AttributeUnderlyingType type) return "Decimal64"; case AttributeUnderlyingType::utDecimal128: return "Decimal128"; + case AttributeUnderlyingType::utDecimal256: + return "Decimal256"; case AttributeUnderlyingType::utString: return "String"; } - throw Exception{"Unknown attribute_type " + toString(static_cast(type)), ErrorCodes::ARGUMENT_OUT_OF_BOUND}; + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Unknown dictionary attribute type {}", toString(static_cast(type))); } @@ -129,7 +132,7 @@ DictionarySpecialAttribute::DictionarySpecialAttribute(const Poco::Util::Abstrac : name{config.getString(config_prefix + ".name", "")}, expression{config.getString(config_prefix + ".expression", "")} { if (name.empty() && !expression.empty()) - throw Exception{"Element " + config_prefix + ".name is empty", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Element {}.name is empty", config_prefix); } @@ -141,23 +144,23 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration const auto has_key = config.has(structure_prefix + ".key"); if (has_key && has_id) - throw Exception{"Only one of 'id' and 'key' should be specified", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one of 'id' and 'key' should be specified"); if (has_id) id.emplace(config, structure_prefix + ".id"); else if (has_key) { - key.emplace(getAttributes(config, structure_prefix + ".key", false, false)); + key.emplace(getAttributes(config, structure_prefix + ".key", true)); if (key->empty()) - throw Exception{"Empty 'key' supplied", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Empty 'key' supplied"); } else - throw Exception{"Dictionary structure should specify either 'id' or 'key'", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary structure should specify either 'id' or 'key'"); if (id) { if (id->name.empty()) - throw Exception{"'id' cannot be empty", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "'id' cannot be empty"); const char * range_default_type = "Date"; if (config.has(structure_prefix + ".range_min")) @@ -168,38 +171,57 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration if (range_min.has_value() != range_max.has_value()) { - throw Exception{"Dictionary structure should have both 'range_min' and 'range_max' either specified or not.", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure should have both 'range_min' and 'range_max' either specified or not."); } if (range_min && range_max && !range_min->type->equals(*range_max->type)) { - throw Exception{"Dictionary structure 'range_min' and 'range_max' should have same type, " - "'range_min' type: " - + range_min->type->getName() - + ", " - "'range_max' type: " - + range_max->type->getName(), - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure 'range_min' and 'range_max' should have same type, " + "'range_min' type: {}," + "'range_max' type: {}", + range_min->type->getName(), + range_max->type->getName()); } if (range_min) 
{ if (!range_min->type->isValueRepresentedByInteger()) - throw Exception{"Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." - " Actual 'range_min' and 'range_max' type is " - + range_min->type->getName(), - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." + " Actual 'range_min' and 'range_max' type is {}", + range_min->type->getName()); } if (!id->expression.empty() || (range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty())) has_expressions = true; } - attributes = getAttributes(config, structure_prefix); + attributes = getAttributes(config, structure_prefix, false); + + for (size_t i = 0; i < attributes.size(); ++i) + { + const auto & attribute = attributes[i]; + const auto & attribute_name = attribute.name; + attribute_name_to_index[attribute_name] = i; + + if (attribute.hierarchical) + { + if (id && attribute.underlying_type != AttributeUnderlyingType::utUInt64) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Hierarchical attribute type for dictionary with simple key must be UInt64. Actual {}", + toString(attribute.underlying_type)); + + else if (key) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy"); + + hierarchical_attribute_index = i; + } + } if (attributes.empty()) - throw Exception{"Dictionary has no attributes defined", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary has no attributes defined"); if (config.getBool(config_prefix + ".layout.ip_trie.access_to_key_from_attributes", false)) access_to_key_from_attributes = true; @@ -209,7 +231,7 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const { if (key_types.size() != key->size()) - throw Exception{"Key structure does not match, expected " + getKeyDescription(), ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Key structure does not match, expected {}", getKeyDescription()); for (const auto i : ext::range(0, key_types.size())) { @@ -217,40 +239,55 @@ void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const const auto & actual_type = key_types[i]; if (!areTypesEqual(expected_type, actual_type)) - throw Exception{"Key type at position " + std::to_string(i) + " does not match, expected " + expected_type->getName() + ", found " - + actual_type->getName(), - ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Key type at position {} does not match, expected {}, found {}", + std::to_string(i), + expected_type->getName(), + actual_type->getName()); } } -const DictionaryAttribute & DictionaryStructure::getAttribute(const String & attribute_name) const +const DictionaryAttribute & DictionaryStructure::getAttribute(const std::string & attribute_name) const { - auto find_iter - = std::find_if(attributes.begin(), attributes.end(), [&](const auto & attribute) { return attribute.name == attribute_name; }); - if (find_iter != attributes.end()) - return *find_iter; + auto it = attribute_name_to_index.find(attribute_name); - if (key && access_to_key_from_attributes) + if (it == attribute_name_to_index.end()) { - find_iter = std::find_if(key->begin(), key->end(), [&](const auto & attribute) { return attribute.name == attribute_name; }); - if (find_iter != 
key->end()) - return *find_iter; + if (!access_to_key_from_attributes) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such attribute '{}'", attribute_name); + + for (const auto & key_attribute : *key) + if (key_attribute.name == attribute_name) + return key_attribute; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such attribute '{}' in keys", attribute_name); } - throw Exception{"No such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; + size_t attribute_index = it->second; + return attributes[attribute_index]; } -const DictionaryAttribute & DictionaryStructure::getAttribute(const String & attribute_name, const DataTypePtr & type) const +const DictionaryAttribute & DictionaryStructure::getAttribute(const std::string & attribute_name, const DataTypePtr & type) const { const auto & attribute = getAttribute(attribute_name); if (!areTypesEqual(attribute.type, type)) - throw Exception{"Attribute type does not match, expected " + attribute.type->getName() + ", found " + type->getName(), - ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Attribute type does not match, expected {}, found {}", + attribute.type->getName(), + type->getName()); return attribute; } +size_t DictionaryStructure::getKeysSize() const +{ + if (id) + return 1; + else + return key->size(); +} + std::string DictionaryStructure::getKeyDescription() const { if (id) @@ -289,14 +326,6 @@ bool DictionaryStructure::isKeySizeFixed() const return true; } -size_t DictionaryStructure::getKeySize() const -{ - return std::accumulate(std::begin(*key), std::end(*key), size_t{}, [](const auto running_size, const auto & key_i) - { - return running_size + key_i.type->getSizeOfValueInMemory(); - }); -} - Strings DictionaryStructure::getKeysNames() const { if (id) @@ -321,21 +350,24 @@ static void checkAttributeKeys(const Poco::Util::AbstractConfiguration::Keys & k for (const auto & key : keys) { if (valid_keys.find(key) == valid_keys.end()) - throw Exception{"Unknown key '" + key + "' inside attribute section", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown key '{}' inside attribute section", key); } } - std::vector DictionaryStructure::getAttributes( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const bool hierarchy_allowed, - const bool allow_null_values) + bool complex_key_attributes) { + /// If we request complex key attributes they does not support hierarchy and does not allow null values + const bool hierarchy_allowed = !complex_key_attributes; + const bool allow_null_values = !complex_key_attributes; + Poco::Util::AbstractConfiguration::Keys config_elems; config.keys(config_prefix, config_elems); auto has_hierarchy = false; + std::unordered_set attribute_names; std::vector res_attributes; const FormatSettings format_settings; @@ -358,6 +390,13 @@ std::vector DictionaryStructure::getAttributes( if ((range_min && name == range_min->name) || (range_max && name == range_max->name)) continue; + auto insert_result = attribute_names.insert(name); + bool inserted = insert_result.second; + + if (!inserted) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary attributes names must be unique. 
Attribute name ({}) is not unique", + name); const auto type_string = config.getString(prefix + "type"); const auto initial_type = DataTypeFactory::instance().get(type_string); @@ -391,7 +430,7 @@ std::vector DictionaryStructure::getAttributes( { ReadBufferFromString null_value_buffer{null_value_string}; auto column_with_null_value = type->createColumn(); - type->deserializeAsTextEscaped(*column_with_null_value, null_value_buffer, format_settings); + type->getDefaultSerialization()->deserializeTextEscaped(*column_with_null_value, null_value_buffer, format_settings); null_value = (*column_with_null_value)[0]; } } @@ -408,13 +447,13 @@ std::vector DictionaryStructure::getAttributes( const auto injective = config.getBool(prefix + "injective", false); const auto is_object_id = config.getBool(prefix + "is_object_id", false); if (name.empty()) - throw Exception{"Properties 'name' and 'type' of an attribute cannot be empty", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Properties 'name' and 'type' of an attribute cannot be empty"); if (has_hierarchy && !hierarchy_allowed) - throw Exception{"Hierarchy not allowed in '" + prefix, ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Hierarchy not allowed in '{}'", prefix); if (has_hierarchy && hierarchical) - throw Exception{"Only one hierarchical attribute supported", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one hierarchical attribute supported"); has_hierarchy = has_hierarchy || hierarchical; @@ -422,6 +461,7 @@ std::vector DictionaryStructure::getAttributes( name, underlying_type, initial_type, + initial_type->getDefaultSerialization(), type, expression, null_value, diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 0ff50868e26..ce5dd3dd422 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -1,17 +1,18 @@ #pragma once -#include -#include -#include -#include -#include - #include #include #include #include +#include + +#include +#include +#include +#include + namespace DB { @@ -31,13 +32,14 @@ enum class AttributeUnderlyingType utDecimal32, utDecimal64, utDecimal128, + utDecimal256, utString }; AttributeUnderlyingType getAttributeUnderlyingType(const std::string & type); -std::string toString(const AttributeUnderlyingType type); +std::string toString(AttributeUnderlyingType type); /// Min and max lifetimes for a dictionary or it's entry using DictionaryLifetime = ExternalLoadableLifetime; @@ -45,6 +47,7 @@ using DictionaryLifetime = ExternalLoadableLifetime; /** Holds the description of a single dictionary attribute: * - name, used for lookup into dictionary and source; * - type, used in conjunction with DataTypeFactory and getAttributeUnderlyingTypeByname; +* - nested_type, contains nested type of complex type like Nullable, Array * - null_value, used as a default value for non-existent entries in the dictionary, * decimal representation for numeric attributes; * - hierarchical, whether this attribute defines a hierarchy; @@ -56,6 +59,7 @@ struct DictionaryAttribute final const std::string name; const AttributeUnderlyingType underlying_type; const DataTypePtr type; + const SerializationPtr serialization; const DataTypePtr nested_type; const std::string expression; const Field null_value; @@ -122,6 +126,9 @@ void callOnDictionaryAttributeType(AttributeUnderlyingType type, F&& func) case AttributeUnderlyingType::utDecimal128: func(DictionaryAttributeType()); break; + 
case AttributeUnderlyingType::utDecimal256: + func(DictionaryAttributeType()); + break; } }; @@ -147,28 +154,33 @@ struct DictionaryStructure final std::optional id; std::optional> key; std::vector attributes; + std::unordered_map attribute_name_to_index; std::optional range_min; std::optional range_max; + std::optional hierarchical_attribute_index; + bool has_expressions = false; bool access_to_key_from_attributes = false; DictionaryStructure(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); void validateKeyTypes(const DataTypes & key_types) const; - const DictionaryAttribute & getAttribute(const String & attribute_name) const; - const DictionaryAttribute & getAttribute(const String & attribute_name, const DataTypePtr & type) const; + + const DictionaryAttribute & getAttribute(const std::string & attribute_name) const; + const DictionaryAttribute & getAttribute(const std::string & attribute_name, const DataTypePtr & type) const; + + Strings getKeysNames() const; + size_t getKeysSize() const; + std::string getKeyDescription() const; bool isKeySizeFixed() const; - size_t getKeySize() const; - Strings getKeysNames() const; private: /// range_min and range_max have to be parsed before this function call std::vector getAttributes( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const bool hierarchy_allowed = true, - const bool allow_null_values = true); + bool complex_key_attributes); }; } diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index b61f256b0cc..2038704414e 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -1,430 +1,338 @@ #include "DirectDictionary.h" -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" + #include -#include -#include +#include +#include #include -#include +#include + +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; - extern const int BAD_ARGUMENTS; extern const int UNSUPPORTED_METHOD; + extern const int BAD_ARGUMENTS; } - -DirectDictionary::DirectDictionary( +template +DirectDictionary::DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_) + DictionarySourcePtr source_ptr_) : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} - , saved_block{std::move(saved_block_)} { - if (!this->source_ptr->supportsSelectiveLoad()) - throw Exception{full_name + ": source cannot be used with DirectDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - createAttributes(); + if (!source_ptr->supportsSelectiveLoad()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "{}: source cannot be used with DirectDictionary", full_name); } - -void DirectDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const +template +Columns DirectDictionary::getColumns( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types [[maybe_unused]], + const Columns & default_values_columns) const { - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); - getItemsImpl( - *hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value, bool) { out[row] = value; }, - 
extractor); -} + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + const auto requested_keys = extractor.extractAllKeys(); + DictionaryStorageFetchRequest request(dict_struct, attribute_names, result_types, default_values_columns); -static inline DirectDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline DirectDictionary::Key getAt(const DirectDictionary::Key & value, const size_t) -{ - return value; -} + HashMap key_to_fetched_index; + key_to_fetched_index.reserve(requested_keys.size()); -DirectDictionary::Key DirectDictionary::getValueOrNullByKey(const Key & to_find) const -{ - std::vector required_key = {to_find}; - - auto stream = source_ptr->loadIds(required_key); - stream->readPrefix(); - - bool is_found = false; - Key result = std::get(hierarchical_attribute->null_values); - while (const auto block = stream->read()) + auto fetched_columns_from_storage = request.makeAttributesResultColumns(); + for (size_t attribute_index = 0; attribute_index < request.attributesSize(); ++attribute_index) { - const IColumn & id_column = *block.safeGetByPosition(0).column; + if (!request.shouldFillResultColumnWithIndex(attribute_index)) + continue; - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - if (is_found) - break; - - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - - for (const auto row_idx : ext::range(0, id_column.size())) - { - const auto key = id_column[row_idx].get(); - - if (key == to_find && hierarchical_attribute->name == attribute_name_by_index.at(attribute_idx)) - { - result = attribute_column[row_idx].get(); - is_found = true; - break; - } - } - } + auto & fetched_column_from_storage = fetched_columns_from_storage[attribute_index]; + fetched_column_from_storage->reserve(requested_keys.size()); } - stream->readSuffix(); + size_t fetched_key_index = 0; - return result; -} + Columns block_key_columns; + size_t dictionary_keys_size = dict_struct.getKeysNames().size(); + block_key_columns.reserve(dictionary_keys_size); -template -void DirectDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto rows = out.size(); + BlockInputStreamPtr stream = getSourceBlockInputStream(key_columns, requested_keys); - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = getValueOrNullByKey(id); - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -void DirectDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void DirectDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void DirectDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); -} - -ColumnPtr DirectDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - 
const Columns & key_columns, - const DataTypes &, - const ColumnPtr default_values_column) const -{ - ColumnPtr result; - - PaddedPODArray backup_storage; - const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - const auto & attribute = getAttribute(attribute_name); - - auto keys_size = ids.size(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to = nullptr; - if (attribute.is_nullable) - { - col_null_map_to = ColumnUInt8::create(keys_size, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - using ValueType = DictionaryValueType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto attribute_null_value = std::get(attribute.null_values); - AttributeType null_value = static_cast(attribute_null_value); - DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - - getItemsImpl( - attribute, - ids, - [&](const size_t row, const String value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - const auto ref = StringRef{value}; - out->insertData(ref.data, ref.size); - }, - default_value_extractor); - } - else - { - auto & out = column->getData(); - - getItemsImpl( - attribute, - ids, - [&](const size_t row, const auto value, bool is_null) - { - if (attribute.is_nullable) - (*vec_null_map_to)[row] = is_null; - - out[row] = value; - }, - default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - if (attribute.is_nullable) - { - result = ColumnNullable::create(result, std::move(col_null_map_to)); - } - - return result; -} - -ColumnUInt8::Ptr DirectDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - - const auto rows = ext::size(ids); - - HashMap has_key; - for (const auto row : ext::range(0, rows)) - has_key[ids[row]] = 0; - - std::vector to_load; - to_load.reserve(has_key.size()); - for (auto it = has_key.begin(); it != has_key.end(); ++it) - to_load.emplace_back(static_cast(it->getKey())); - - auto stream = source_ptr->loadIds(to_load); stream->readPrefix(); while (const auto block = stream->read()) { - const IColumn & id_column = *block.safeGetByPosition(0).column; + /// Split into keys columns and attribute columns + for (size_t i = 0; i < dictionary_keys_size; ++i) + block_key_columns.emplace_back(block.safeGetByPosition(i).column); - for (const auto row_idx : ext::range(0, id_column.size())) + DictionaryKeysExtractor block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + auto block_keys = block_keys_extractor.extractAllKeys(); + + for (size_t attribute_index = 0; attribute_index < request.attributesSize(); ++attribute_index) { - const auto key = id_column[row_idx].get(); - has_key[key] = 1; + if (!request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + const auto & block_column = 
block.safeGetByPosition(dictionary_keys_size + attribute_index).column; + fetched_columns_from_storage[attribute_index]->insertRangeFrom(*block_column, 0, block_keys.size()); } + + for (size_t block_key_index = 0; block_key_index < block_keys.size(); ++block_key_index) + { + auto block_key = block_keys[block_key_index]; + key_to_fetched_index[block_key] = fetched_key_index; + ++fetched_key_index; + } + + block_key_columns.clear(); } stream->readSuffix(); - for (const auto row : ext::range(0, rows)) - out[row] = has_key[ids[row]]; + Field value_to_insert; - query_count.fetch_add(rows, std::memory_order_relaxed); + size_t requested_keys_size = requested_keys.size(); + + auto result_columns = request.makeAttributesResultColumns(); + + for (size_t attribute_index = 0; attribute_index < result_columns.size(); ++attribute_index) + { + if (!request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + auto & result_column = result_columns[attribute_index]; + + const auto & fetched_column_from_storage = fetched_columns_from_storage[attribute_index]; + const auto & default_value_provider = request.defaultValueProviderAtIndex(attribute_index); + + result_column->reserve(requested_keys_size); + + for (size_t requested_key_index = 0; requested_key_index < requested_keys_size; ++requested_key_index) + { + const auto requested_key = requested_keys[requested_key_index]; + const auto * it = key_to_fetched_index.find(requested_key); + + if (it) + fetched_column_from_storage->get(it->getMapped(), value_to_insert); + else + value_to_insert = default_value_provider.getDefaultValue(requested_key_index); + + result_column->insert(value_to_insert); + } + } + + query_count.fetch_add(requested_keys_size, std::memory_order_relaxed); + + return request.filterRequestedColumns(result_columns); +} + +template +ColumnPtr DirectDictionary::getColumn( + const std::string & attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnPtr & default_values_column) const +{ + return getColumns({ attribute_name }, { result_type }, key_columns, key_types, { default_values_column }).front(); +} + +template +ColumnUInt8::Ptr DirectDictionary::hasKeys( + const Columns & key_columns, + const DataTypes & key_types [[maybe_unused]]) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor requested_keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + auto requested_keys = requested_keys_extractor.extractAllKeys(); + size_t requested_keys_size = requested_keys.size(); + + HashMap requested_key_to_index; + requested_key_to_index.reserve(requested_keys_size); + + for (size_t i = 0; i < requested_keys.size(); ++i) + { + auto requested_key = requested_keys[i]; + requested_key_to_index[requested_key] = i; + } + + auto result = ColumnUInt8::create(requested_keys_size, false); + auto & result_data = result->getData(); + + Columns block_key_columns; + size_t dictionary_keys_size = dict_struct.getKeysNames().size(); + block_key_columns.reserve(dictionary_keys_size); + + BlockInputStreamPtr stream = getSourceBlockInputStream(key_columns, requested_keys); + + stream->readPrefix(); + + while (const auto block = stream->read()) + { + /// Split into keys columns and attribute columns + for (size_t i = 0; i < dictionary_keys_size; ++i) + block_key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysExtractor 
block_keys_extractor(block_key_columns, arena_holder.getComplexKeyArena()); + size_t block_keys_size = block_keys_extractor.getKeysSize(); + + for (size_t i = 0; i < block_keys_size; ++i) + { + auto block_key = block_keys_extractor.extractCurrentKey(); + + const auto * it = requested_key_to_index.find(block_key); + assert(it); + + size_t result_data_found_index = it->getMapped(); + result_data[result_data_found_index] = true; + + block_keys_extractor.rollbackCurrentKey(); + } + + block_key_columns.clear(); + } + + stream->readSuffix(); + + query_count.fetch_add(requested_keys_size, std::memory_order_relaxed); return result; } -void DirectDictionary::createAttributes() +template +ColumnPtr DirectDictionary::getHierarchy( + ColumnPtr key_column, + const DataTypePtr & key_type) const { - const auto size = dict_struct.attributes.size(); - attributes.reserve(size); - - for (const auto & attribute : dict_struct.attributes) + if (dictionary_key_type == DictionaryKeyType::simple) { - attribute_index_by_name.emplace(attribute.name, attributes.size()); - attribute_name_by_index.emplace(attributes.size(), attribute.name); - attributes.push_back(createAttribute(attribute, attribute.null_value, attribute.name)); - - if (attribute.hierarchical) - { - hierarchical_attribute = &attributes.back(); - - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } + auto result = getKeysHierarchyDefaultImplementation(this, key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; } + else + return nullptr; } - -template -void DirectDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) +template +ColumnUInt8::Ptr DirectDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const { - attribute.null_values = T(null_value.get>()); -} - -template <> -void DirectDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); -} - - -DirectDictionary::Attribute DirectDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value, const std::string & attr_name) -{ - Attribute attr{attribute.underlying_type, attribute.is_nullable, {}, {}, attr_name}; - - auto type_call = [&](const auto &dictionary_attribute_type) + if (dictionary_key_type == DictionaryKeyType::simple) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; + auto result = getKeysIsInHierarchyDefaultImplementation(this, key_column, in_key_column, key_type); + query_count.fetch_add(key_column->size(), std::memory_order_relaxed); + return result; + } + else + return nullptr; } - -template -void DirectDictionary::getItemsImpl( - const Attribute & attribute, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const +template +BlockInputStreamPtr DirectDictionary::getSourceBlockInputStream( + const Columns & key_columns [[maybe_unused]], + const PaddedPODArray & requested_keys [[maybe_unused]]) 
const { - const auto rows = ext::size(ids); + size_t requested_keys_size = requested_keys.size(); - HashMap value_by_key; - HashSet value_is_null; + BlockInputStreamPtr stream; - for (const auto row : ext::range(0, rows)) + if constexpr (dictionary_key_type == DictionaryKeyType::simple) { - auto key = ids[row]; - value_by_key[key] = static_cast(default_value_extractor[row]); + std::vector ids; + ids.reserve(requested_keys_size); + + for (auto key : requested_keys) + ids.emplace_back(key); + + stream = source_ptr->loadIds(ids); + } + else + { + std::vector requested_rows; + requested_rows.reserve(requested_keys_size); + for (size_t i = 0; i < requested_keys_size; ++i) + requested_rows.emplace_back(i); + + stream = source_ptr->loadKeys(key_columns, requested_rows); } - std::vector to_load; - to_load.reserve(value_by_key.size()); - for (auto it = value_by_key.begin(); it != value_by_key.end(); ++it) - to_load.emplace_back(static_cast(it->getKey())); - - auto stream = source_ptr->loadIds(to_load); - stream->readPrefix(); - - const auto it = attribute_index_by_name.find(attribute.name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute.name + "'", ErrorCodes::BAD_ARGUMENTS}; - - auto attribute_index = it->second; - - while (const auto block = stream->read()) - { - const IColumn & id_column = *block.safeGetByPosition(0).column; - - const IColumn & attribute_column = *block.safeGetByPosition(attribute_index + 1).column; - - for (const auto row_idx : ext::range(0, id_column.size())) - { - const auto key = id_column[row_idx].get(); - - if (value_by_key.find(key) != value_by_key.end()) - { - auto value = attribute_column[row_idx]; - - if (value.isNull()) - value_is_null.insert(key); - else - value_by_key[key] = static_cast(value.get>()); - } - } - } - - stream->readSuffix(); - - for (const auto row : ext::range(0, rows)) - { - auto key = ids[row]; - set_value(row, value_by_key[key], value_is_null.find(key) != nullptr); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); + return stream; } -const DirectDictionary::Attribute & DirectDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - - -BlockInputStreamPtr DirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const +template +BlockInputStreamPtr DirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const { return source_ptr->loadAll(); } - -void registerDictionaryDirect(DictionaryFactory & factory) +namespace { - auto create_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr + template + DictionaryPtr createDirectDictionary( + const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'direct'", ErrorCodes::UNSUPPORTED_METHOD}; + const auto * layout_name = dictionary_key_type == DictionaryKeyType::simple ? 
"direct" : "complex_key_direct"; + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + if (dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "'key' is not supported for dictionary of layout '{}'", + layout_name); + } + else + { + if (dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "'id' is not supported for dictionary of layout '{}'", + layout_name); + } if (dict_struct.range_min || dict_struct.range_max) - throw Exception{full_name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + full_name); const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); if (config.has(config_prefix + ".lifetime.min") || config.has(config_prefix + ".lifetime.max")) - throw Exception{"'lifetime' parameter is redundant for the dictionary' of layout 'direct'", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "'lifetime' parameter is redundant for the dictionary' of layout '{}'", + layout_name); + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr)); + } +} - return std::make_unique(dict_id, dict_struct, std::move(source_ptr)); - }; - factory.registerLayout("direct", create_layout, false); +template class DirectDictionary; +template class DirectDictionary; + +void registerDictionaryDirect(DictionaryFactory & factory) +{ + factory.registerLayout("direct", createDirectDictionary, false); + factory.registerLayout("complex_key_direct", createDirectDictionary, true); } diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index c6f4c15556b..e547e10433c 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -18,16 +18,25 @@ namespace DB { +template class DirectDictionary final : public IDictionary { public: + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by direct dictionary"); + using KeyType = std::conditional_t; + DirectDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - BlockPtr saved_block_ = nullptr); + DictionarySourcePtr source_ptr_); - std::string getTypeName() const override { return "Direct"; } + std::string getTypeName() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return "Direct"; + else + return "ComplexKeyDirect"; + } size_t getBytesAllocated() const override { return 0; } @@ -41,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), saved_block); + return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone()); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -52,98 +61,49 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } + DictionaryKeyType getKeyType() const override { return dictionary_key_type; } - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; 
- - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + Columns getColumns( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) const override; ColumnPtr getColumn( const std::string& attribute_name, const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: - struct Attribute final - { - AttributeUnderlyingType type; - bool is_nullable; - std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - StringRef> - null_values; - std::unique_ptr string_arena; - std::string name; - }; - - void createAttributes(); - - template - void addAttributeSize(const Attribute & attribute); - - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value, const std::string & name); - - template - void getItemsImpl( - const Attribute & attribute, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template - void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); - - void setAttributeValue(Attribute & attribute, const Key id, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - Key getValueOrNullByKey(const Key & to_find) const; - - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; + BlockInputStreamPtr getSourceBlockInputStream(const Columns & key_columns, const PaddedPODArray & requested_keys) const; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; - std::map attribute_index_by_name; - std::map attribute_name_by_index; - std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; - mutable std::atomic query_count{0}; - - BlockPtr saved_block; }; +extern template class DirectDictionary; +extern template class DirectDictionary; + } diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp index 37dde600adf..13feab2071a 100644 --- a/src/Dictionaries/ExecutableDictionarySource.cpp +++ 
b/src/Dictionaries/ExecutableDictionarySource.cpp @@ -27,7 +27,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int DICTIONARY_ACCESS_DENIED; extern const int UNSUPPORTED_METHOD; - extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; } namespace @@ -58,13 +57,12 @@ namespace } - ExecutableDictionarySource::ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block_, - const Context & context_) + ContextPtr context_) : log(&Poco::Logger::get("ExecutableDictionarySource")) , dict_struct{dict_struct_} , implicit_key{config.getBool(config_prefix + ".implicit_key", false)} @@ -99,25 +97,25 @@ ExecutableDictionarySource::ExecutableDictionarySource(const ExecutableDictionar , update_field{other.update_field} , format{other.format} , sample_block{other.sample_block} - , context(other.context) + , context(Context::createCopy(other.context)) { } BlockInputStreamPtr ExecutableDictionarySource::loadAll() { if (implicit_key) - throw Exception("ExecutableDictionarySource with implicit_key does not support loadAll method", ErrorCodes::UNSUPPORTED_METHOD); + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutableDictionarySource with implicit_key does not support loadAll method"); LOG_TRACE(log, "loadAll {}", toString()); auto process = ShellCommand::execute(command); - auto input_stream = context.getInputFormat(format, process->out, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, process->out, sample_block, max_block_size); return std::make_shared(log, input_stream, std::move(process)); } BlockInputStreamPtr ExecutableDictionarySource::loadUpdatedAll() { if (implicit_key) - throw Exception("ExecutableDictionarySource with implicit_key does not support loadUpdatedAll method", ErrorCodes::UNSUPPORTED_METHOD); + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutableDictionarySource with implicit_key does not support loadUpdatedAll method"); time_t new_update_time = time(nullptr); SCOPE_EXIT(update_time = new_update_time); @@ -128,7 +126,7 @@ BlockInputStreamPtr ExecutableDictionarySource::loadUpdatedAll() LOG_TRACE(log, "loadUpdatedAll {}", command_with_update_field); auto process = ShellCommand::execute(command_with_update_field); - auto input_stream = context.getInputFormat(format, process->out, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, process->out, sample_block, max_block_size); return std::make_shared(log, input_stream, std::move(process)); } @@ -141,7 +139,7 @@ namespace { public: BlockInputStreamWithBackgroundThread( - const Context & context, + ContextPtr context, const std::string & format, const Block & sample_block, const std::string & command_str, @@ -152,7 +150,7 @@ namespace send_data(std::move(send_data_)), thread([this] { send_data(command->in); }) { - stream = context.getInputFormat(format, command->out, sample_block, max_block_size); + stream = context->getInputFormat(format, command->out, sample_block, max_block_size); } ~BlockInputStreamWithBackgroundThread() override @@ -200,101 +198,14 @@ namespace std::function send_data; ThreadFromGlobalPool thread; }; - - /** A stream, adds additional columns to each block that it will read from inner stream. - * - * block_to_add rows size must be equal to final sum rows size of all inner stream blocks. 
- */ - class BlockInputStreamWithAdditionalColumns final: public IBlockInputStream - { - public: - BlockInputStreamWithAdditionalColumns( - Block block_to_add_, - std::unique_ptr&& stream_) - : block_to_add(std::move(block_to_add_)) - , stream(std::move(stream_)) - { - } - - Block getHeader() const override - { - auto header = stream->getHeader(); - - if (header) - { - for (Int64 i = static_cast(block_to_add.columns() - 1); i >= 0; --i) - header.insert(0, block_to_add.getByPosition(i).cloneEmpty()); - } - - return header; - } - - Block readImpl() override - { - auto block = stream->read(); - - if (block) - { - auto block_rows = block.rows(); - - auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows); - - if (cut_block.rows() != block_rows) - throw Exception( - "Number of rows in block to add after cut must equal to number of rows in block from inner stream", - ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); - - for (Int64 i = static_cast(cut_block.columns() - 1); i >= 0; --i) - block.insert(0, cut_block.getByPosition(i)); - - current_range_index += block_rows; - } - - return block; - } - - void readPrefix() override - { - stream->readPrefix(); - } - - void readSuffix() override - { - stream->readSuffix(); - } - - String getName() const override { return "BlockInputStreamWithAdditionalColumns"; } - - private: - Block block_to_add; - std::unique_ptr stream; - size_t current_range_index = 0; - }; - } - BlockInputStreamPtr ExecutableDictionarySource::loadIds(const std::vector & ids) { LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size()); auto block = blockForIds(dict_struct, ids); - - auto stream = std::make_unique( - context, format, sample_block, command, log, - [block, this](WriteBufferFromFile & out) mutable - { - auto output_stream = context.getOutputStream(format, out, block.cloneEmpty()); - formatBlock(output_stream, block); - out.close(); - }); - - if (implicit_key) - { - return std::make_shared(block, std::move(stream)); - } - else - return std::shared_ptr(stream.release()); + return getStreamForBlock(block); } BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) @@ -302,12 +213,16 @@ BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_col LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size()); auto block = blockForKeys(dict_struct, key_columns, requested_rows); + return getStreamForBlock(block); +} +BlockInputStreamPtr ExecutableDictionarySource::getStreamForBlock(const Block & block) +{ auto stream = std::make_unique( context, format, sample_block, command, log, [block, this](WriteBufferFromFile & out) mutable { - auto output_stream = context.getOutputStream(format, out, block.cloneEmpty()); + auto output_stream = context->getOutputStream(format, out, block.cloneEmpty()); formatBlock(output_stream, block); out.close(); }); @@ -349,20 +264,20 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr context, const std::string & /* default_database */, bool check_config) -> DictionarySourcePtr { if (dict_struct.has_expressions) - throw Exception{"Dictionary source of type `executable` does not support attribute expressions", ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable` does not support attribute 
expressions"); /// Executable dictionaries may execute arbitrary commands. /// It's OK for dictionaries created by administrator from xml-file, but /// maybe dangerous for dictionaries created from DDL-queries. if (check_config) - throw Exception("Dictionaries with Executable dictionary source is not allowed", ErrorCodes::DICTIONARY_ACCESS_DENIED); + throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable dictionary source are not allowed to be created from DDL query"); - Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config); + auto context_local_copy = copyContextAndApplySettings(config_prefix, context, config); return std::make_unique( dict_struct, config, config_prefix + ".executable", diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index 7aa203f267b..878cb086873 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -20,7 +20,7 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block_, - const Context & context_); + ContextPtr context_); ExecutableDictionarySource(const ExecutableDictionarySource & other); ExecutableDictionarySource & operator=(const ExecutableDictionarySource &) = delete; @@ -47,6 +47,8 @@ public: std::string toString() const override; + BlockInputStreamPtr getStreamForBlock(const Block & block); + private: Poco::Logger * log; time_t update_time = 0; @@ -56,7 +58,7 @@ private: const std::string update_field; const std::string format; Block sample_block; - Context context; + ContextPtr context; }; } diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp new file mode 100644 index 00000000000..e920b8392d6 --- /dev/null +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -0,0 +1,323 @@ +#include "ExecutablePoolDictionarySource.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "DictionarySourceFactory.h" +#include "DictionarySourceHelpers.h" +#include "DictionaryStructure.h" +#include "registerDictionaries.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int DICTIONARY_ACCESS_DENIED; + extern const int UNSUPPORTED_METHOD; + extern const int TIMEOUT_EXCEEDED; +} + +ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( + const DictionaryStructure & dict_struct_, + const Configuration & configuration_, + Block & sample_block_, + ContextPtr context_) + : log(&Poco::Logger::get("ExecutablePoolDictionarySource")) + , dict_struct{dict_struct_} + , configuration{configuration_} + , sample_block{sample_block_} + , context{context_} + /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. + , process_pool{std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size)} +{ + /// Remove keys from sample_block for implicit_key dictionary because + /// these columns will not be returned from source + /// Implicit key means that the source script will return only values, + /// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. 
+ if (configuration.implicit_key) + { + auto keys_names = dict_struct.getKeysNames(); + + for (auto & key_name : keys_names) + { + size_t key_column_position_in_block = sample_block.getPositionByName(key_name); + sample_block.erase(key_column_position_in_block); + } + } +} + +ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other) + : log(&Poco::Logger::get("ExecutablePoolDictionarySource")) + , update_time{other.update_time} + , dict_struct{other.dict_struct} + , configuration{other.configuration} + , sample_block{other.sample_block} + , context{Context::createCopy(other.context)} + , process_pool{std::make_shared(configuration.pool_size)} +{ +} + +BlockInputStreamPtr ExecutablePoolDictionarySource::loadAll() +{ + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutablePoolDictionarySource with implicit_key does not support loadAll method"); +} + +BlockInputStreamPtr ExecutablePoolDictionarySource::loadUpdatedAll() +{ + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutablePoolDictionarySource with implicit_key does not support loadUpdatedAll method"); +} + +namespace +{ + /** A stream, that runs child process and sends data to its stdin in background thread, + * and receives data from its stdout. + */ + class PoolBlockInputStreamWithBackgroundThread final : public IBlockInputStream + { + public: + PoolBlockInputStreamWithBackgroundThread( + std::shared_ptr process_pool_, + std::unique_ptr && command_, + BlockInputStreamPtr && stream_, + size_t read_rows_, + Poco::Logger * log_, + std::function && send_data_) + : process_pool(process_pool_) + , command(std::move(command_)) + , stream(std::move(stream_)) + , rows_to_read(read_rows_) + , log(log_) + , send_data(std::move(send_data_)) + , thread([this] + { + try + { + send_data(command->in); + } + catch (...) + { + std::lock_guard lck(exception_during_read_lock); + exception_during_read = std::current_exception(); + } + }) + {} + + ~PoolBlockInputStreamWithBackgroundThread() override + { + if (thread.joinable()) + thread.join(); + + if (command) + process_pool->returnObject(std::move(command)); + } + + Block getHeader() const override + { + return stream->getHeader(); + } + + private: + Block readImpl() override + { + rethrowExceptionDuringReadIfNeeded(); + + if (current_read_rows == rows_to_read) + return Block(); + + Block block; + + try + { + block = stream->read(); + current_read_rows += block.rows(); + } + catch (...) 
+ { + tryLogCurrentException(log); + command = nullptr; + throw; + } + + return block; + } + + void readPrefix() override + { + rethrowExceptionDuringReadIfNeeded(); + stream->readPrefix(); + } + + void readSuffix() override + { + if (thread.joinable()) + thread.join(); + + rethrowExceptionDuringReadIfNeeded(); + stream->readSuffix(); + } + + void rethrowExceptionDuringReadIfNeeded() + { + std::lock_guard lck(exception_during_read_lock); + if (exception_during_read) + { + command = nullptr; + std::rethrow_exception(exception_during_read); + } + } + + String getName() const override { return "PoolWithBackgroundThread"; } + + std::shared_ptr process_pool; + std::unique_ptr command; + BlockInputStreamPtr stream; + size_t rows_to_read; + Poco::Logger * log; + std::function send_data; + ThreadFromGlobalPool thread; + size_t current_read_rows = 0; + std::mutex exception_during_read_lock; + std::exception_ptr exception_during_read; + }; + +} + +BlockInputStreamPtr ExecutablePoolDictionarySource::loadIds(const std::vector & ids) +{ + LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size()); + + auto block = blockForIds(dict_struct, ids); + return getStreamForBlock(block); +} + +BlockInputStreamPtr ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) +{ + LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size()); + + auto block = blockForKeys(dict_struct, key_columns, requested_rows); + return getStreamForBlock(block); +} + +BlockInputStreamPtr ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) +{ + std::unique_ptr process; + bool result = process_pool->tryBorrowObject(process, [this]() + { + bool terminate_in_destructor = true; + ShellCommandDestructorStrategy strategy { terminate_in_destructor, configuration.command_termination_timeout }; + auto shell_command = ShellCommand::execute(configuration.command, false, strategy); + return shell_command; + }, configuration.max_command_execution_time * 10000); + + if (!result) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded ({}) seconds", + configuration.max_command_execution_time); + + size_t rows_to_read = block.rows(); + auto read_stream = context->getInputFormat(configuration.format, process->out, sample_block, rows_to_read); + + auto stream = std::make_unique( + process_pool, std::move(process), std::move(read_stream), rows_to_read, log, + [block, this](WriteBufferFromFile & out) mutable + { + auto output_stream = context->getOutputStream(configuration.format, out, block.cloneEmpty()); + formatBlock(output_stream, block); + }); + + if (configuration.implicit_key) + return std::make_shared(block, std::move(stream)); + else + return std::shared_ptr(stream.release()); +} + +bool ExecutablePoolDictionarySource::isModified() const +{ + return true; +} + +bool ExecutablePoolDictionarySource::supportsSelectiveLoad() const +{ + return true; +} + +bool ExecutablePoolDictionarySource::hasUpdateField() const +{ + return !configuration.update_field.empty(); +} + +DictionarySourcePtr ExecutablePoolDictionarySource::clone() const +{ + return std::make_unique(*this); +} + +std::string ExecutablePoolDictionarySource::toString() const +{ + return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command; +} + +void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) +{ + auto create_table_source = [=](const DictionaryStructure & 
dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + Block & sample_block, + ContextPtr context, + const std::string & /* default_database */, + bool check_config) -> DictionarySourcePtr + { + if (dict_struct.has_expressions) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable_pool` does not support attribute expressions"); + + /// Executable dictionaries may execute arbitrary commands. + /// It's OK for dictionaries created by administrator from xml-file, but + /// maybe dangerous for dictionaries created from DDL-queries. + if (check_config) + throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable pool dictionary source are not allowed to be created from DDL query"); + + auto context_local_copy = copyContextAndApplySettings(config_prefix, context, config); + + /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, + * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. + */ + auto settings_no_parallel_parsing = context_local_copy->getSettings(); + settings_no_parallel_parsing.input_format_parallel_parsing = false; + context_local_copy->setSettings(settings_no_parallel_parsing); + + String configuration_config_prefix = config_prefix + ".executable_pool"; + + size_t max_command_execution_time = config.getUInt64(configuration_config_prefix + ".max_command_execution_time", 10); + + size_t max_execution_time_seconds = static_cast(context->getSettings().max_execution_time.totalSeconds()); + if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) + max_command_execution_time = max_execution_time_seconds; + + ExecutablePoolDictionarySource::Configuration configuration + { + .command = config.getString(configuration_config_prefix + ".command"), + .format = config.getString(configuration_config_prefix + ".format"), + .pool_size = config.getUInt64(configuration_config_prefix + ".size"), + .update_field = config.getString(configuration_config_prefix + ".update_field", ""), + .implicit_key = config.getBool(configuration_config_prefix + ".implicit_key", false), + .command_termination_timeout = config.getUInt64(configuration_config_prefix + ".command_termination_timeout", 10), + .max_command_execution_time = max_command_execution_time + }; + + return std::make_unique(dict_struct, configuration, sample_block, context_local_copy); + }; + + factory.registerSource("executable_pool", create_table_source); +} + +} diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h new file mode 100644 index 00000000000..7a0b8681a21 --- /dev/null +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -0,0 +1,85 @@ +#pragma once + +#include + +#include +#include + +#include "IDictionarySource.h" +#include "DictionaryStructure.h" + +namespace Poco { class Logger; } + + +namespace DB +{ + +using ProcessPool = BorrowedObjectPool>; + +/** ExecutablePoolDictionarySource allows loading data from pool of processes. + * When client requests ids or keys source get process from ProcessPool + * and create stream based on source format from process stdout. + * It is important that stream format will expect only rows that were requested. + * When stream is finished process is returned back to the ProcessPool. 
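 * Borrowing is implemented with BorrowedObjectPool::tryBorrowObject (see getStreamForBlock): a new
 * ShellCommand is spawned on demand, and if no process becomes available within the configured
 * max_command_execution_time a TIMEOUT_EXCEEDED exception is thrown.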
+ * If there are no processes in pool during request client will be blocked + * until some process will be retunred to pool. + */ +class ExecutablePoolDictionarySource final : public IDictionarySource +{ +public: + struct Configuration + { + const String command; + const String format; + const size_t pool_size; + const String update_field; + const bool implicit_key; + const size_t command_termination_timeout; + const size_t max_command_execution_time; + }; + + ExecutablePoolDictionarySource( + const DictionaryStructure & dict_struct_, + const Configuration & configuration_, + Block & sample_block_, + ContextPtr context_); + + ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other); + ExecutablePoolDictionarySource & operator=(const ExecutablePoolDictionarySource &) = delete; + + BlockInputStreamPtr loadAll() override; + + /** The logic of this method is flawed, absolutely incorrect and ignorant. + * It may lead to skipping some values due to clock sync or timezone changes. + * The intended usage of "update_field" is totally different. + */ + BlockInputStreamPtr loadUpdatedAll() override; + + BlockInputStreamPtr loadIds(const std::vector & ids) override; + + BlockInputStreamPtr loadKeys(const Columns & key_columns, const std::vector & requested_rows) override; + + bool isModified() const override; + + bool supportsSelectiveLoad() const override; + + bool hasUpdateField() const override; + + DictionarySourcePtr clone() const override; + + std::string toString() const override; + + BlockInputStreamPtr getStreamForBlock(const Block & block); + +private: + Poco::Logger * log; + time_t update_time = 0; + const DictionaryStructure dict_struct; + const Configuration configuration; + + Block sample_block; + ContextPtr context; + std::shared_ptr process_pool; +}; + +} diff --git a/src/Dictionaries/ExternalQueryBuilder.cpp b/src/Dictionaries/ExternalQueryBuilder.cpp index e8d71b1fd85..8c6e9f60afb 100644 --- a/src/Dictionaries/ExternalQueryBuilder.cpp +++ b/src/Dictionaries/ExternalQueryBuilder.cpp @@ -173,7 +173,7 @@ std::string ExternalQueryBuilder::composeUpdateQuery(const std::string & update_ std::string ExternalQueryBuilder::composeLoadIdsQuery(const std::vector & ids) { if (!dict_struct.id) - throw Exception{"Simple key required for method", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Simple key required for method"); WriteBufferFromOwnString out; writeString("SELECT ", out); @@ -244,10 +244,10 @@ std::string ExternalQueryBuilder::composeLoadKeysQuery( const Columns & key_columns, const std::vector & requested_rows, LoadKeysMethod method, size_t partition_key_prefix) { if (!dict_struct.key) - throw Exception{"Composite key required for method", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Composite key required for method"); if (key_columns.size() != dict_struct.key->size()) - throw Exception{"The size of key_columns does not equal to the size of dictionary key", ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of key_columns does not equal to the size of dictionary key"); WriteBufferFromOwnString out; writeString("SELECT ", out); @@ -358,7 +358,7 @@ void ExternalQueryBuilder::composeKeyCondition(const Columns & key_columns, cons /// key_i=value_i writeQuoted(key_description.name, out); writeString("=", out); - key_description.type->serializeAsTextQuoted(*key_columns[i], row, out, format_settings); + 
key_description.serialization->serializeTextQuoted(*key_columns[i], row, out, format_settings); } } @@ -386,7 +386,7 @@ void ExternalQueryBuilder::composeInWithTuples(const Columns & key_columns, cons void ExternalQueryBuilder::composeKeyTupleDefinition(WriteBuffer & out, size_t beg, size_t end) const { if (!dict_struct.key) - throw Exception{"Composite key required for method", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Composite key required for method"); writeChar('(', out); @@ -415,7 +415,7 @@ void ExternalQueryBuilder::composeKeyTuple(const Columns & key_columns, const si writeString(", ", out); first = false; - (*dict_struct.key)[i].type->serializeAsTextQuoted(*key_columns[i], row, out, format_settings); + (*dict_struct.key)[i].serialization->serializeTextQuoted(*key_columns[i], row, out, format_settings); } writeString(")", out); diff --git a/src/Dictionaries/FileDictionarySource.cpp b/src/Dictionaries/FileDictionarySource.cpp index addc2adef02..378c6f11857 100644 --- a/src/Dictionaries/FileDictionarySource.cpp +++ b/src/Dictionaries/FileDictionarySource.cpp @@ -1,8 +1,11 @@ #include "FileDictionarySource.h" + +#include +#include + #include #include #include -#include #include #include #include "DictionarySourceFactory.h" @@ -10,7 +13,6 @@ #include "registerDictionaries.h" #include "DictionarySourceHelpers.h" - namespace DB { static const UInt64 max_block_size = 8192; @@ -24,7 +26,7 @@ namespace ErrorCodes FileDictionarySource::FileDictionarySource( const std::string & filepath_, const std::string & format_, - Block & sample_block_, const Context & context_, bool check_config) + Block & sample_block_, ContextPtr context_, bool check_config) : filepath{filepath_} , format{format_} , sample_block{sample_block_} @@ -32,9 +34,19 @@ FileDictionarySource::FileDictionarySource( { if (check_config) { - const String user_files_path = context.getUserFilesPath(); - if (!startsWith(filepath, user_files_path)) - throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "File path {} is not inside {}", filepath, user_files_path); + auto source_file_path = std::filesystem::path(filepath); + auto source_file_absolute_path = std::filesystem::canonical(source_file_path); + + String user_files_path_string_value = context->getUserFilesPath(); + auto user_files_path = std::filesystem::path(user_files_path_string_value); + auto user_files_absolute_path = std::filesystem::canonical(user_files_path); + + auto [_, user_files_absolute_path_mismatch_it] = std::mismatch(source_file_absolute_path.begin(), source_file_absolute_path.end(), user_files_absolute_path.begin(), user_files_absolute_path.end()); + + bool user_files_absolute_path_include_source_file_absolute_path = user_files_absolute_path_mismatch_it == user_files_absolute_path.end(); + + if (!user_files_absolute_path_include_source_file_absolute_path) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "File path {} is not inside {}", filepath, user_files_path_string_value); } } @@ -43,7 +55,7 @@ FileDictionarySource::FileDictionarySource(const FileDictionarySource & other) : filepath{other.filepath} , format{other.format} , sample_block{other.sample_block} - , context(other.context) + , context(Context::createCopy(other.context)) , last_modification{other.last_modification} { } @@ -53,7 +65,7 @@ BlockInputStreamPtr FileDictionarySource::loadAll() { LOG_TRACE(&Poco::Logger::get("FileDictionary"), "loadAll {}", toString()); auto in_ptr = std::make_unique(filepath); - auto stream = context.getInputFormat(format, *in_ptr, 
sample_block, max_block_size); + auto stream = context->getInputFormat(format, *in_ptr, sample_block, max_block_size); last_modification = getLastModification(); return std::make_shared>(stream, std::move(in_ptr)); @@ -77,17 +89,17 @@ void registerDictionarySourceFile(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr context, const std::string & /* default_database */, bool check_config) -> DictionarySourcePtr { if (dict_struct.has_expressions) - throw Exception{"Dictionary source of type `file` does not support attribute expressions", ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `file` does not support attribute expressions"); const auto filepath = config.getString(config_prefix + ".file.path"); const auto format = config.getString(config_prefix + ".file.format"); - Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config); + auto context_local_copy = copyContextAndApplySettings(config_prefix, context, config); return std::make_unique(filepath, format, sample_block, context_local_copy, check_config); }; diff --git a/src/Dictionaries/FileDictionarySource.h b/src/Dictionaries/FileDictionarySource.h index fa47b280911..6559503cccd 100644 --- a/src/Dictionaries/FileDictionarySource.h +++ b/src/Dictionaries/FileDictionarySource.h @@ -17,7 +17,7 @@ class FileDictionarySource final : public IDictionarySource { public: FileDictionarySource(const std::string & filepath_, const std::string & format_, - Block & sample_block_, const Context & context_, bool check_config); + Block & sample_block_, ContextPtr context_, bool check_config); FileDictionarySource(const FileDictionarySource & other); @@ -25,17 +25,17 @@ public: BlockInputStreamPtr loadUpdatedAll() override { - throw Exception{"Method loadUpdatedAll is unsupported for FileDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadUpdatedAll is unsupported for FileDictionarySource"); } BlockInputStreamPtr loadIds(const std::vector & /*ids*/) override { - throw Exception{"Method loadIds is unsupported for FileDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadIds is unsupported for FileDictionarySource"); } BlockInputStreamPtr loadKeys(const Columns & /*key_columns*/, const std::vector & /*requested_rows*/) override { - throw Exception{"Method loadKeys is unsupported for FileDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadKeys is unsupported for FileDictionarySource"); } bool isModified() const override @@ -61,7 +61,7 @@ private: const std::string filepath; const std::string format; Block sample_block; - const Context context; + ContextPtr context; Poco::Timestamp last_modification; }; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index f4f50a69598..1540a3a876b 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -1,131 +1,69 @@ #include "FlatDictionary.h" #include +#include +#include + #include #include #include #include #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" +#include +#include +#include namespace DB { namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; extern const 
int DICTIONARY_IS_EMPTY; extern const int UNSUPPORTED_METHOD; } -static const auto initial_array_size = 1024; -static const auto max_array_size = 500000; - - FlatDictionary::FlatDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_) + Configuration configuration_, + BlockPtr previously_loaded_block_) : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) - , require_nonempty(require_nonempty_) - , loaded_ids(initial_array_size, false) - , saved_block{std::move(saved_block_)} + , configuration(configuration_) + , loaded_keys(configuration.initial_array_size, false) + , previously_loaded_block(std::move(previously_loaded_block_)) { createAttributes(); loadData(); calculateBytesAllocated(); } - -void FlatDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - *hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. -static inline FlatDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline FlatDictionary::Key getAt(const FlatDictionary::Key & value, const size_t) -{ - return value; -} - -template -void FlatDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto & attr = std::get>(hierarchical_attribute->arrays); - const auto rows = out.size(); - - size_t loaded_size = attr.size(); - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id < loaded_size && id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - id = attr[id]; - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - -void FlatDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void FlatDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void FlatDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); -} - ColumnPtr FlatDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, const DataTypes &, - const ColumnPtr default_values_column) const + const ColumnPtr & default_values_column) const { ColumnPtr result; - PaddedPODArray backup_storage; + PaddedPODArray backup_storage; const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); auto size = ids.size(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - auto type_call = [&](const auto &dictionary_attribute_type) + size_t attribute_index = 
dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute = attributes[attribute_index]; + + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; @@ -167,7 +105,7 @@ ColumnPtr FlatDictionary::getColumn( if (attribute.nullable_set) { ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size, false); - ColumnUInt8::Container& vec_null_map_to = col_null_map_to->getData(); + ColumnUInt8::Container & vec_null_map_to = col_null_map_to->getData(); for (size_t row = 0; row < ids.size(); ++row) { @@ -183,24 +121,111 @@ ColumnPtr FlatDictionary::getColumn( return result; } - ColumnUInt8::Ptr FlatDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const { - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); + PaddedPODArray backup_storage; + const auto & keys = getColumnVectorData(this, key_columns.front(), backup_storage); + size_t keys_size = keys.size(); - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); + auto result = ColumnUInt8::create(keys_size); + auto & out = result->getData(); - const auto ids_count = ext::size(ids); - - for (const auto i : ext::range(0, ids_count)) + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - const auto id = ids[i]; - out[i] = id < loaded_ids.size() && loaded_ids[id]; + const auto key = keys[key_index]; + out[key_index] = key < loaded_keys.size() && loaded_keys[key]; } - query_count.fetch_add(ids_count, std::memory_order_relaxed); + query_count.fetch_add(keys_size, std::memory_order_relaxed); + + return result; +} + +ColumnPtr FlatDictionary::getHierarchy(ColumnPtr key_column, const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&, this](auto & key) { return key < loaded_keys.size() && loaded_keys[key]; }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + bool is_key_valid = hierarchy_key < loaded_keys.size() && loaded_keys[hierarchy_key]; + std::optional result = is_key_valid ? 
std::make_optional(parent_keys[hierarchy_key]) : std::nullopt; + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr FlatDictionary::isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr &) const +{ + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = std::get(hierarchical_attribute.null_values); + const ContainerType & parent_keys = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&, this](auto & key) { return key < loaded_keys.size() && loaded_keys[key]; }; + + auto get_parent_key_func = [&, this](auto & hierarchy_key) + { + bool is_key_valid = hierarchy_key < loaded_keys.size() && loaded_keys[hierarchy_key]; + std::optional result = is_key_valid ? std::make_optional(parent_keys[hierarchy_key]) : std::nullopt; + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_key_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; +} + +ColumnPtr FlatDictionary::getDescendants( + ColumnPtr key_column, + const DataTypePtr &, + size_t level) const +{ + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const ContainerType & parent_keys = std::get>(hierarchical_attribute.container); + + HashMap> parent_to_child; + + for (size_t i = 0; i < parent_keys.size(); ++i) + { + auto parent_key = parent_keys[i]; + + if (loaded_keys[i]) + parent_to_child[parent_key].emplace_back(static_cast(i)); + } + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); return result; } @@ -211,38 +236,44 @@ void FlatDictionary::createAttributes() attributes.reserve(size); for (const auto & attribute : dict_struct.attributes) - { - attribute_index_by_name.emplace(attribute.name, attributes.size()); attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) - { - hierarchical_attribute = &attributes.back(); - - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } - } } void FlatDictionary::blockToAttributes(const Block & block) { - const IColumn & id_column = *block.safeGetByPosition(0).column; - element_count += id_column.size(); + const auto keys_column = block.safeGetByPosition(0).column; - for (const size_t attribute_idx : ext::range(0, attributes.size())) + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor keys_extractor({ keys_column }, arena_holder.getComplexKeyArena()); + auto keys = keys_extractor.extractAllKeys(); + + HashSet already_processed_keys; + + size_t key_offset = 
1; + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - Attribute & attribute = attributes[attribute_idx]; + const IColumn & attribute_column = *block.safeGetByPosition(attribute_index + key_offset).column; + Attribute & attribute = attributes[attribute_index]; - for (const auto row_idx : ext::range(0, id_column.size())) - setAttributeValue(attribute, id_column[row_idx].get(), attribute_column[row_idx]); + for (size_t i = 0; i < keys.size(); ++i) + { + auto key = keys[i]; + + if (already_processed_keys.find(key) != nullptr) + continue; + already_processed_keys.insert(key); + + setAttributeValue(attribute, key, attribute_column[i]); + ++element_count; + } + + already_processed_keys.clear(); } } void FlatDictionary::updateData() { - if (!saved_block || saved_block->rows() == 0) + if (!previously_loaded_block || previously_loaded_block->rows() == 0) { auto stream = source_ptr->loadUpdatedAll(); stream->readPrefix(); @@ -250,12 +281,13 @@ void FlatDictionary::updateData() while (const auto block = stream->read()) { /// We are using this to keep saved data if input stream consists of multiple blocks - if (!saved_block) - saved_block = std::make_shared(block.cloneEmpty()); - for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) + if (!previously_loaded_block) + previously_loaded_block = std::make_shared(block.cloneEmpty()); + + for (size_t column_index = 0; column_index < block.columns(); ++column_index) { - const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); - MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable(); + const IColumn & update_column = *block.getByPosition(column_index).column.get(); + MutableColumnPtr saved_column = previously_loaded_block->getByPosition(column_index).column->assumeMutable(); saved_column->insertRangeFrom(update_column, 0, update_column.size()); } } @@ -264,51 +296,14 @@ void FlatDictionary::updateData() else { auto stream = source_ptr->loadUpdatedAll(); - stream->readPrefix(); - - while (Block block = stream->read()) - { - const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; - const auto & update_id_column = *block.safeGetByPosition(0).column; - - std::unordered_map> update_ids; - for (size_t row = 0; row < update_id_column.size(); ++row) - { - const auto id = update_id_column.get64(row); - update_ids[id].push_back(row); - } - - const size_t saved_rows = saved_id_column.size(); - IColumn::Filter filter(saved_rows); - std::unordered_map>::iterator it; - - for (size_t row = 0; row < saved_id_column.size(); ++row) - { - auto id = saved_id_column.get64(row); - it = update_ids.find(id); - - if (it != update_ids.end()) - filter[row] = 0; - else - filter[row] = 1; - } - - auto block_columns = block.mutateColumns(); - for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) - { - auto & column = saved_block->safeGetByPosition(attribute_idx).column; - const auto & filtered_column = column->filter(filter, -1); - - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); - } - - saved_block->setColumns(std::move(block_columns)); - } - stream->readSuffix(); + mergeBlockWithStream( + dict_struct.getKeysSize(), + *previously_loaded_block, + stream); } - if (saved_block) - blockToAttributes(*saved_block.get()); + if (previously_loaded_block) + 
blockToAttributes(*previously_loaded_block.get()); } void FlatDictionary::loadData() @@ -326,26 +321,8 @@ void FlatDictionary::loadData() else updateData(); - if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; -} - - -template -void FlatDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & array_ref = std::get>(attribute.arrays); - bytes_allocated += sizeof(PaddedPODArray) + array_ref.allocated_bytes(); - bucket_count = array_ref.capacity(); -} - -template <> -void FlatDictionary::addAttributeSize(const Attribute & attribute) -{ - const auto & array_ref = std::get>(attribute.arrays); - bytes_allocated += sizeof(PaddedPODArray) + array_ref.allocated_bytes(); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); - bucket_count = array_ref.capacity(); + if (configuration.require_nonempty && 0 == element_count) + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: dictionary source is empty and 'require_nonempty' property is set.", full_name); } void FlatDictionary::calculateBytesAllocated() @@ -358,103 +335,108 @@ void FlatDictionary::calculateBytesAllocated() { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - addAttributeSize(attribute); + const auto & container = std::get>(attribute.container); + bytes_allocated += sizeof(PaddedPODArray) + container.allocated_bytes(); + bucket_count = container.capacity(); + + if constexpr (std::is_same_v) + bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); } } - -template -void FlatDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) +FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute, const Field & null_value) { - attribute.null_values = T(null_value.get>()); - const auto & null_value_ref = std::get(attribute.null_values); - attribute.arrays.emplace>(initial_array_size, null_value_ref); -} + auto nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; + Attribute attribute{dictionary_attribute.underlying_type, std::move(nullable_set), {}, {}, {}}; -template <> -void FlatDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - attribute.arrays.emplace>(initial_array_size, StringRef(string_in_arena, string.size())); -} - - -FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? 
std::make_optional() : std::optional{}; - Attribute attr{attribute.underlying_type, std::move(nullable_set), {}, {}, {}}; - - auto type_call = [&](const auto &dictionary_attribute_type) + auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - createAttributeImpl(attr, null_value); + if constexpr (std::is_same_v) + { + attribute.string_arena = std::make_unique(); + const String & string = null_value.get(); + const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); + attribute.null_values.emplace(string_in_arena, string.size()); + } + else + attribute.null_values = ValueType(null_value.get>()); + + const auto & null_value_ref = std::get(attribute.null_values); + attribute.container.emplace>(configuration.initial_array_size, null_value_ref); }; - callOnDictionaryAttributeType(attribute.underlying_type, type_call); + callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call); - return attr; + return attribute; } - template void FlatDictionary::getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & keys, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { - const auto & attr = std::get>(attribute.arrays); - const auto rows = ext::size(ids); + const auto & container = std::get>(attribute.container); + const auto rows = keys.size(); - for (const auto row : ext::range(0, rows)) + for (size_t row = 0; row < rows; ++row) { - const auto id = ids[row]; - set_value(row, id < ext::size(attr) && loaded_ids[id] ? static_cast(attr[id]) : default_value_extractor[row]); + const auto key = keys[row]; + + if (key < loaded_keys.size() && loaded_keys[key]) + set_value(row, static_cast(container[key])); + else + set_value(row, default_value_extractor[row]); } query_count.fetch_add(rows, std::memory_order_relaxed); } template -void FlatDictionary::resize(Attribute & attribute, const Key id) +void FlatDictionary::resize(Attribute & attribute, UInt64 key) { - if (id >= max_array_size) - throw Exception{full_name + ": identifier should be less than " + toString(max_array_size), ErrorCodes::ARGUMENT_OUT_OF_BOUND}; + if (key >= configuration.max_array_size) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "{}: identifier should be less than {}", + full_name, + toString(configuration.max_array_size)); - auto & array = std::get>(attribute.arrays); - if (id >= array.size()) + auto & container = std::get>(attribute.container); + + if (key >= container.size()) { - const size_t elements_count = id + 1; //id=0 -> elements_count=1 - loaded_ids.resize(elements_count, false); - array.resize_fill(elements_count, std::get(attribute.null_values)); + const size_t elements_count = key + 1; //id=0 -> elements_count=1 + loaded_keys.resize(elements_count, false); + container.resize_fill(elements_count, std::get(attribute.null_values)); } } template -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T & value) +void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const T & value) { - auto & array = std::get>(attribute.arrays); - array[id] = value; - loaded_ids[id] = true; + auto & array = std::get>(attribute.container); + array[key] = value; + loaded_keys[key] = true; } template <> -void FlatDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String & value) +void 
FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const String & value) { const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); + setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); } -void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) +void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 key, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -462,55 +444,36 @@ void FlatDictionary::setAttributeValue(Attribute & attribute, const Key id, cons using AttributeType = typename Type::AttributeType; using ResizeType = std::conditional_t, StringRef, AttributeType>; - resize(attribute, id); + resize(attribute, key); if (attribute.nullable_set) { if (value.isNull()) { - attribute.nullable_set->insert(id); - loaded_ids[id] = true; + attribute.nullable_set->insert(key); + loaded_keys[key] = true; return; } - else - { - attribute.nullable_set->erase(id); - } } - setAttributeValueImpl(attribute, id, value.get>()); + setAttributeValueImpl(attribute, key, value.get()); }; callOnDictionaryAttributeType(attribute.type, type_call); } - -const FlatDictionary::Attribute & FlatDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -PaddedPODArray FlatDictionary::getIds() const -{ - const auto ids_count = ext::size(loaded_ids); - - PaddedPODArray ids; - ids.reserve(ids_count); - - for (auto idx : ext::range(0, ids_count)) - if (loaded_ids[idx]) - ids.push_back(idx); - return ids; -} - BlockInputStreamPtr FlatDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + const auto keys_count = loaded_keys.size(); + + PaddedPODArray keys; + keys.reserve(keys_count); + + for (size_t key_index = 0; key_index < keys_count; ++key_index) + if (loaded_keys[key_index]) + keys.push_back(key_index); + + return std::make_shared(shared_from_this(), max_block_size, std::move(keys), column_names); } void registerDictionaryFlat(DictionaryFactory & factory) @@ -522,19 +485,32 @@ void registerDictionaryFlat(DictionaryFactory & factory) DictionarySourcePtr source_ptr) -> DictionaryPtr { if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'flat'", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'flat'"); if (dict_struct.range_min || dict_struct.range_max) - throw Exception{full_name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + full_name); + + static constexpr size_t default_initial_array_size = 1024; + static constexpr size_t default_max_array_size = 500000; + + String dictionary_layout_prefix = config_prefix 
+ ".layout" + ".flat"; + + FlatDictionary::Configuration configuration + { + .initial_array_size = config.getUInt64(dictionary_layout_prefix + ".initial_array_size", default_initial_array_size), + .max_array_size = config.getUInt64(dictionary_layout_prefix + ".max_array_size", default_max_array_size), + .require_nonempty = config.getBool(config_prefix + ".require_nonempty", false) + }; const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + + return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, std::move(configuration)); }; + factory.registerLayout("flat", create_layout, false); } diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index 23bfa3d37b5..0a5f88f270a 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -26,13 +26,20 @@ namespace DB class FlatDictionary final : public IDictionary { public: + struct Configuration + { + size_t initial_array_size; + size_t max_array_size; + bool require_nonempty; + }; + FlatDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, - bool require_nonempty_, - BlockPtr saved_block_ = nullptr); + Configuration configuration_, + BlockPtr previously_loaded_block_ = nullptr); std::string getTypeName() const override { return "Flat"; } @@ -48,7 +55,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); + return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, configuration, previously_loaded_block); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -59,18 +66,9 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } ColumnPtr getColumn( @@ -78,17 +76,31 @@ public: const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + bool hasHierarchy() const override { return dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & key_type) const override; + + ColumnUInt8::Ptr 
isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; + BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template using ContainerType = PaddedPODArray; - using NullableSet = HashSet>; + using NullableSet = HashSet>; struct Attribute final { @@ -108,6 +120,7 @@ private: Decimal32, Decimal64, Decimal128, + Decimal256, Float32, Float64, StringRef> @@ -125,10 +138,11 @@ private: ContainerType, ContainerType, ContainerType, + ContainerType, ContainerType, ContainerType, ContainerType> - arrays; + container; std::unique_ptr string_arena; }; @@ -138,54 +152,39 @@ private: void updateData(); void loadData(); - template - void addAttributeSize(const Attribute & attribute); - void calculateBytesAllocated(); - template - static void createAttributeImpl(Attribute & attribute, const Field & null_value); - - static Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value); + Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value); template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + const PaddedPODArray & keys, ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; template - void resize(Attribute & attribute, const Key id); + void resize(Attribute & attribute, UInt64 key); template - void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); + void setAttributeValueImpl(Attribute & attribute, UInt64 key, const T & value); - void setAttributeValue(Attribute & attribute, const Key id, const Field & value); - - const Attribute & getAttribute(const std::string & attribute_name) const; - - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - - PaddedPODArray getIds() const; + void setAttributeValue(Attribute & attribute, UInt64 key, const Field & value); const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; - const bool require_nonempty; + const Configuration configuration; - std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; - std::vector loaded_ids; + std::vector loaded_keys; size_t bytes_allocated = 0; size_t element_count = 0; size_t bucket_count = 0; mutable std::atomic query_count{0}; - BlockPtr saved_block; + BlockPtr previously_loaded_block; }; } diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp index ddcac117e58..b674d593444 100644 --- a/src/Dictionaries/HTTPDictionarySource.cpp +++ b/src/Dictionaries/HTTPDictionarySource.cpp @@ -30,7 +30,7 @@ HTTPDictionarySource::HTTPDictionarySource( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block_, - const Context & context_, + ContextPtr context_, bool check_config) : log(&Poco::Logger::get("HTTPDictionarySource")) , update_time{std::chrono::system_clock::from_time_t(0)} @@ -44,7 +44,7 @@ HTTPDictionarySource::HTTPDictionarySource( { if (check_config) - context.getRemoteHostFilter().checkURL(Poco::URI(url)); + context->getRemoteHostFilter().checkURL(Poco::URI(url)); const auto & credentials_prefix = config_prefix + ".credentials"; @@ -80,7 +80,7 @@ 
HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) , update_field{other.update_field} , format{other.format} , sample_block{other.sample_block} - , context(other.context) + , context(Context::createCopy(other.context)) , timeouts(ConnectionTimeouts::getHTTPTimeouts(context)) { credentials.setUsername(other.credentials.getUsername()); @@ -111,7 +111,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadAll() auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); - auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -123,7 +123,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadUpdatedAll() auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); - auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -136,7 +136,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector & id ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = [block, this](std::ostream & ostr) { WriteBufferFromOStream out_buffer(ostr); - auto output_stream = context.getOutputStream(format, out_buffer, sample_block); + auto output_stream = context->getOutputStreamParallelIfPossible(format, out_buffer, sample_block); formatBlock(output_stream, block); }; @@ -144,7 +144,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector & id auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); - auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -157,7 +157,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = [block, this](std::ostream & ostr) { WriteBufferFromOStream out_buffer(ostr); - auto output_stream = context.getOutputStream(format, out_buffer, sample_block); + auto output_stream = context->getOutputStreamParallelIfPossible(format, out_buffer, sample_block); formatBlock(output_stream, block); }; @@ -165,7 +165,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns, auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); - auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); + auto input_stream = context->getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -201,14 +201,14 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr 
context, const std::string & /* default_database */, bool check_config) -> DictionarySourcePtr { if (dict_struct.has_expressions) - throw Exception{"Dictionary source of type `http` does not support attribute expressions", ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `http` does not support attribute expressions"); - Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config); + auto context_local_copy = copyContextAndApplySettings(config_prefix, context, config); return std::make_unique( dict_struct, config, config_prefix + ".http", diff --git a/src/Dictionaries/HTTPDictionarySource.h b/src/Dictionaries/HTTPDictionarySource.h index e7920132e83..c42c67ec8c9 100644 --- a/src/Dictionaries/HTTPDictionarySource.h +++ b/src/Dictionaries/HTTPDictionarySource.h @@ -26,7 +26,7 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block_, - const Context & context_, + ContextPtr context_, bool check_config); HTTPDictionarySource(const HTTPDictionarySource & other); @@ -65,7 +65,7 @@ private: std::string update_field; const std::string format; Block sample_block; - Context context; + ContextPtr context; ConnectionTimeouts timeouts; }; diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index b51f2414142..2a403554a80 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -1,13 +1,14 @@ #include "HashedDictionary.h" -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include "ClickHouseDictionarySource.h" + #include -#include +#include #include #include -#include +#include + +#include +#include +#include namespace { @@ -15,136 +16,74 @@ namespace /// NOTE: Trailing return type is explicitly specified for SFINAE. 
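
/// Aside: a standalone, simplified sketch of the overload-selection idea used by the
/// cell accessors below (std::map, Cell and CellIterator are illustrative stand-ins,
/// not ClickHouse types). The trailing return type makes an overload drop out of
/// overload resolution (SFINAE) when the expression inside decltype is ill-formed, so
/// one generic loop can read key/value pairs from iterators exposing either
/// first/second or getKey()/getMapped().
#include <iostream>
#include <map>

template <typename It> auto keyOf(const It & it) -> decltype(it->first) { return it->first; }
template <typename It> auto valueOf(const It & it) -> decltype(it->second) { return it->second; }
template <typename It> auto keyOf(const It & it) -> decltype(it->getKey()) { return it->getKey(); }
template <typename It> auto valueOf(const It & it) -> decltype(it->getMapped()) { return it->getMapped(); }

struct Cell
{
    int key = 2;
    int mapped = 20;
    int getKey() const { return key; }
    int getMapped() const { return mapped; }
};

struct CellIterator
{
    const Cell * cell;
    const Cell * operator->() const { return cell; }
};

int main()
{
    std::map<int, int> map{{1, 10}};
    Cell cell;
    CellIterator it{&cell};
    std::cout << keyOf(map.begin()) << '=' << valueOf(map.begin()) << ' '
              << keyOf(it) << '=' << valueOf(it) << '\n';   /// prints "1=10 2=20"
}
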
/// google::sparse_hash_map -template auto first(const T & value) -> decltype(value.first) { return value.first; } // NOLINT -template auto second(const T & value) -> decltype(value.second) { return value.second; } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->first) { return value->first; } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->second) { return value->second; } // NOLINT /// HashMap -template auto first(const T & value) -> decltype(value.getKey()) { return value.getKey(); } // NOLINT -template auto second(const T & value) -> decltype(value.getMapped()) { return value.getMapped(); } // NOLINT +template auto getKeyFromCell(const T & value) -> decltype(value->getKey()) { return value->getKey(); } // NOLINT +template auto getValueFromCell(const T & value) -> decltype(value->getMapped()) { return value->getMapped(); } // NOLINT } namespace DB { + namespace ErrorCodes { - extern const int TYPE_MISMATCH; extern const int BAD_ARGUMENTS; extern const int DICTIONARY_IS_EMPTY; extern const int UNSUPPORTED_METHOD; } - -HashedDictionary::HashedDictionary( +template +HashedDictionary::HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, - BlockPtr saved_block_) + BlockPtr previously_loaded_block_) : IDictionary(dict_id_) , dict_struct(dict_struct_) - , source_ptr{std::move(source_ptr_)} + , source_ptr(std::move(source_ptr_)) , dict_lifetime(dict_lifetime_) , require_nonempty(require_nonempty_) - , sparse(sparse_) - , saved_block{std::move(saved_block_)} + , previously_loaded_block(std::move(previously_loaded_block_)) { createAttributes(); loadData(); calculateBytesAllocated(); } - -void HashedDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - DictionaryDefaultValueExtractor extractor(null_value); - - getItemsImpl( - *hierarchical_attribute, - ids, - [&](const size_t row, const UInt64 value) { out[row] = value; }, - extractor); -} - - -/// Allow to use single value in same way as array. 
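
/// Aside: a standalone, simplified sketch of the getAt() trick used by the removed
/// isIn* methods (the vector types and the plain equality check are illustrative only;
/// the real code walks parent links instead of comparing directly). Overloading getAt()
/// for "array of ids" and "single constant id" lets one template body serve the
/// vector/vector, vector/constant and constant/vector variants.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using Key = uint64_t;

static inline Key getAt(const std::vector<Key> & arr, size_t idx) { return arr[idx]; }
static inline Key getAt(const Key & value, size_t) { return value; }

template <typename ChildType, typename AncestorType>
void markEqual(const ChildType & child_ids, const AncestorType & ancestor_ids, std::vector<uint8_t> & out)
{
    for (size_t row = 0; row < out.size(); ++row)
        out[row] = getAt(child_ids, row) == getAt(ancestor_ids, row);
}

int main()
{
    std::vector<Key> child_ids{1, 2, 3};
    std::vector<uint8_t> out(child_ids.size());
    markEqual(child_ids, Key{2}, out);            /// single constant "ancestor"
    for (auto flag : out)
        std::cout << int(flag) << ' ';            /// prints "0 1 0"
    std::cout << '\n';
}
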
-static inline HashedDictionary::Key getAt(const PaddedPODArray & arr, const size_t idx) -{ - return arr[idx]; -} -static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, const size_t) -{ - return value; -} - -template -void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - const auto null_value = std::get(hierarchical_attribute->null_values); - const auto rows = out.size(); - - for (const auto row : ext::range(0, rows)) - { - auto id = getAt(child_ids, row); - const auto ancestor_id = getAt(ancestor_ids, row); - - for (size_t i = 0; id != null_value && id != ancestor_id && i < DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH; ++i) - { - auto it = attr.find(id); - if (it != std::end(attr)) - id = second(*it); - else - break; - } - - out[row] = id != null_value && id == ancestor_id; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} -template -void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const -{ - if (!sparse) - return isInAttrImpl(*std::get>(hierarchical_attribute->maps), child_ids, ancestor_ids, out); - return isInAttrImpl(*std::get>(hierarchical_attribute->sparse_maps), child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_ids, out); -} - -void HashedDictionary::isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const -{ - isInImpl(child_ids, ancestor_id, out); -} - -void HashedDictionary::isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const -{ - isInImpl(child_id, ancestor_ids, out); -} - -ColumnPtr HashedDictionary::getColumn( +template +ColumnPtr HashedDictionary::getColumn( const std::string & attribute_name, const DataTypePtr & result_type, const Columns & key_columns, - const DataTypes &, - const ColumnPtr default_values_column) const + const DataTypes & key_types [[maybe_unused]], + const ColumnPtr & default_values_column) const { + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + ColumnPtr result; - PaddedPODArray backup_storage; - const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); - auto size = ids.size(); + const size_t size = extractor.getKeysSize(); - const auto & attribute = getAttribute(attribute_name); const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + auto & attribute = attributes[attribute_index]; + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (attribute.is_nullable_set) + { + col_null_map_to = ColumnUInt8::create(size, false); + vec_null_map_to = &col_null_map_to->getData(); + } auto type_call = [&](const auto & dictionary_attribute_type) { @@ -159,24 +98,34 @@ ColumnPtr HashedDictionary::getColumn( auto column = ColumnProvider::getColumn(dictionary_attribute, size); - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { auto * out = column.get(); - getItemsImpl( + getItemsImpl( attribute, - ids, + 
extractor, [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, + [&](const size_t row) + { + out->insertDefault(); + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } else { auto & out = column->getData(); - getItemsImpl( + getItemsImpl( attribute, - ids, + extractor, [&](const size_t row, const auto value) { return out[row] = value; }, + [&](const size_t row) + { + out[row] = ValueType(); + (*vec_null_map_to)[row] = true; + }, default_value_extractor); } @@ -185,89 +134,216 @@ ColumnPtr HashedDictionary::getColumn( callOnDictionaryAttributeType(attribute.type, type_call); - if (attribute.nullable_set) - { - ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size, false); - ColumnUInt8::Container& vec_null_map_to = col_null_map_to->getData(); - - for (size_t row = 0; row < ids.size(); ++row) - { - auto id = ids[row]; - - if (attribute.nullable_set->find(id) != nullptr) - vec_null_map_to[row] = true; - } - + if (attribute.is_nullable_set) result = ColumnNullable::create(result, std::move(col_null_map_to)); + + return result; +} + +template +ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const +{ + if (dictionary_key_type == DictionaryKeyType::complex) + dict_struct.validateKeyTypes(key_types); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor extractor(key_columns, arena_holder.getComplexKeyArena()); + + size_t keys_size = extractor.getKeysSize(); + + auto result = ColumnUInt8::create(keys_size, false); + auto & out = result->getData(); + + if (attributes.empty()) + { + query_count.fetch_add(keys_size, std::memory_order_relaxed); + return result; } - return result; -} - -ColumnUInt8::Ptr HashedDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - size_t ids_count = ext::size(ids); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - const auto & attribute = attributes.front(); + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(0, [&](const auto & container) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - has(attribute, ids, out); - }; + for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index) + { + auto requested_key = extractor.extractCurrentKey(); - callOnDictionaryAttributeType(attribute.type, type_call); + out[requested_key_index] = container.find(requested_key) != container.end(); - query_count.fetch_add(ids_count, std::memory_order_relaxed); + if (is_attribute_nullable && !out[requested_key_index]) + out[requested_key_index] = attribute.is_nullable_set->find(requested_key) != nullptr; + + extractor.rollbackCurrentKey(); + } + }); + + query_count.fetch_add(keys_size, std::memory_order_relaxed); return result; } -void HashedDictionary::createAttributes() +template +ColumnPtr HashedDictionary::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = 
dict_struct.attributes[hierarchical_attribute_index]; + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.template get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return dictionary_hierarchy_array; + } + else + return nullptr; +} + +template +ColumnUInt8::Ptr HashedDictionary::isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr &) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup_storage; + const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage); + + PaddedPODArray keys_in_backup_storage; + const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index]; + auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + + const UInt64 null_value = dictionary_attribute.null_value.template get(); + const CollectionType & parent_keys_map = std::get>(hierarchical_attribute.container); + + auto is_key_valid_func = [&](auto & key) { return parent_keys_map.find(key) != parent_keys_map.end(); }; + + auto get_parent_func = [&](auto & hierarchy_key) + { + std::optional result; + + auto it = parent_keys_map.find(hierarchy_key); + + if (it != parent_keys_map.end()) + result = getValueFromCell(it); + + return result; + }; + + auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_func); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +ColumnPtr HashedDictionary::getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr &, + size_t level [[maybe_unused]]) const +{ + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + PaddedPODArray keys_backup; + const auto & keys = getColumnVectorData(this, key_column, keys_backup); + + size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index; + + const auto & hierarchical_attribute = attributes[hierarchical_attribute_index]; + const CollectionType & parent_keys = std::get>(hierarchical_attribute.container); + + HashMap> parent_to_child; + + for (const auto & [key, value] : parent_keys) + parent_to_child[value].emplace_back(key); + + auto result = getKeysDescendantsArray(keys, parent_to_child, level); + + query_count.fetch_add(keys.size(), std::memory_order_relaxed); + + return result; + } + else + return nullptr; +} + +template +void HashedDictionary::createAttributes() { const auto size = dict_struct.attributes.size(); attributes.reserve(size); - for (const auto & attribute : dict_struct.attributes) + for (const auto & dictionary_attribute : dict_struct.attributes) { - 
attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttribute(attribute, attribute.null_value)); - - if (attribute.hierarchical) + auto type_call = [&, this](const auto & dictionary_attribute_type) { - hierarchical_attribute = &attributes.back(); + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - if (hierarchical_attribute->type != AttributeUnderlyingType::utUInt64) - throw Exception{full_name + ": hierarchical attribute must be UInt64.", ErrorCodes::TYPE_MISMATCH}; - } + auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; + std::unique_ptr string_arena = std::is_same_v ? std::make_unique() : nullptr; + + ValueType default_value; + + if constexpr (std::is_same_v) + { + string_arena = std::make_unique(); + + const auto & string_null_value = dictionary_attribute.null_value.template get(); + const size_t string_null_value_size = string_null_value.size(); + + const char * string_in_arena = string_arena->insert(string_null_value.data(), string_null_value_size); + default_value = {string_in_arena, string_null_value_size}; + } + else + default_value = dictionary_attribute.null_value.template get>(); + + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), default_value, CollectionType(), std::move(string_arena)}; + attributes.emplace_back(std::move(attribute)); + }; + + callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call); } } -void HashedDictionary::blockToAttributes(const Block & block) +template +void HashedDictionary::updateData() { - const auto & id_column = *block.safeGetByPosition(0).column; - - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - const IColumn & attribute_column = *block.safeGetByPosition(attribute_idx + 1).column; - auto & attribute = attributes[attribute_idx]; - - for (const auto row_idx : ext::range(0, id_column.size())) - if (setAttributeValue(attribute, id_column[row_idx].get(), attribute_column[row_idx])) - ++element_count; - } -} - -void HashedDictionary::updateData() -{ - if (!saved_block || saved_block->rows() == 0) + if (!previously_loaded_block || previously_loaded_block->rows() == 0) { auto stream = source_ptr->loadUpdatedAll(); stream->readPrefix(); @@ -275,12 +351,13 @@ void HashedDictionary::updateData() while (const auto block = stream->read()) { /// We are using this to keep saved data if input stream consists of multiple blocks - if (!saved_block) - saved_block = std::make_shared(block.cloneEmpty()); + if (!previously_loaded_block) + previously_loaded_block = std::make_shared(block.cloneEmpty()); + for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) { const IColumn & update_column = *block.getByPosition(attribute_idx).column.get(); - MutableColumnPtr saved_column = saved_block->getByPosition(attribute_idx).column->assumeMutable(); + MutableColumnPtr saved_column = previously_loaded_block->getByPosition(attribute_idx).column->assumeMutable(); saved_column->insertRangeFrom(update_column, 0, update_column.size()); } } @@ -289,137 +366,177 @@ void HashedDictionary::updateData() else { auto stream = source_ptr->loadUpdatedAll(); - stream->readPrefix(); + mergeBlockWithStream( + dict_struct.getKeysSize(), + *previously_loaded_block, + stream); + } - while (Block block = stream->read()) + if (previously_loaded_block) + { + resize(previously_loaded_block->rows()); + 
blockToAttributes(*previously_loaded_block.get()); + } +} + +template +void HashedDictionary::blockToAttributes(const Block & block [[maybe_unused]]) +{ + size_t skip_keys_size_offset = dict_struct.getKeysSize(); + + Columns key_columns; + key_columns.reserve(skip_keys_size_offset); + + /// Split into keys columns and attribute columns + for (size_t i = 0; i < skip_keys_size_offset; ++i) + key_columns.emplace_back(block.safeGetByPosition(i).column); + + DictionaryKeysArenaHolder arena_holder; + DictionaryKeysExtractor keys_extractor(key_columns, arena_holder.getComplexKeyArena()); + const size_t keys_size = keys_extractor.getKeysSize(); + + Field column_value_to_insert; + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + const IColumn & attribute_column = *block.safeGetByPosition(skip_keys_size_offset + attribute_index).column; + auto & attribute = attributes[attribute_index]; + bool attribute_is_nullable = attribute.is_nullable_set.has_value(); + + getAttributeContainer(attribute_index, [&](auto & container) { - const auto & saved_id_column = *saved_block->safeGetByPosition(0).column; - const auto & update_id_column = *block.safeGetByPosition(0).column; + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; - std::unordered_map> update_ids; - for (size_t row = 0; row < update_id_column.size(); ++row) + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - const auto id = update_id_column.get64(row); - update_ids[id].push_back(row); - } + auto key = keys_extractor.extractCurrentKey(); - const size_t saved_rows = saved_id_column.size(); - IColumn::Filter filter(saved_rows); - std::unordered_map>::iterator it; + auto it = container.find(key); + bool key_is_nullable_and_already_exists = attribute_is_nullable && attribute.is_nullable_set->find(key) != nullptr; - for (size_t row = 0; row < saved_id_column.size(); ++row) - { - auto id = saved_id_column.get64(row); - it = update_ids.find(id); + if (key_is_nullable_and_already_exists || it != container.end()) + { + keys_extractor.rollbackCurrentKey(); + continue; + } - if (it != update_ids.end()) - filter[row] = 0; + if constexpr (std::is_same_v) + key = copyKeyInArena(key); + + attribute_column.get(key_index, column_value_to_insert); + + if (attribute.is_nullable_set && column_value_to_insert.isNull()) + { + attribute.is_nullable_set->insert(key); + keys_extractor.rollbackCurrentKey(); + continue; + } + + if constexpr (std::is_same_v) + { + String & value_to_insert = column_value_to_insert.get(); + size_t value_to_insert_size = value_to_insert.size(); + + const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); + + StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; + container.insert({key, string_in_arena_reference}); + } else - filter[row] = 1; + { + auto value_to_insert = column_value_to_insert.get>(); + container.insert({key, value_to_insert}); + } + + ++element_count; + + keys_extractor.rollbackCurrentKey(); } - auto block_columns = block.mutateColumns(); - for (const auto attribute_idx : ext::range(0, attributes.size() + 1)) - { - auto & column = saved_block->safeGetByPosition(attribute_idx).column; - const auto & filtered_column = column->filter(filter, -1); - - block_columns[attribute_idx]->insertRangeFrom(*filtered_column.get(), 0, filtered_column->size()); - } - - saved_block->setColumns(std::move(block_columns)); - } - stream->readSuffix(); - 
} - - if (saved_block) - { - resize(saved_block->rows()); - blockToAttributes(*saved_block.get()); + keys_extractor.reset(); + }); } } -template -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) +template +void HashedDictionary::resize(size_t added_rows) { - if (!sparse) - { - const auto & map_ref = std::get>(attribute.maps); - added_rows += map_ref->size(); - map_ref->reserve(added_rows); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - added_rows += map_ref->size(); - map_ref->resize(added_rows); - } -} - -template <> -void HashedDictionary::resize(Attribute & attribute, size_t added_rows) -{ - resize(attribute, added_rows); -} - -void HashedDictionary::resize(size_t added_rows) -{ - if (!added_rows) + if (unlikely(!added_rows)) return; - for (auto & attribute : attributes) + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(attribute_index, [added_rows](auto & attribute_map) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - resize(attribute, added_rows); - }; + size_t reserve_size = added_rows + attribute_map.size(); - callOnDictionaryAttributeType(attribute.type, type_call); + if constexpr (sparse) + attribute_map.resize(reserve_size); + else + attribute_map.reserve(reserve_size); + }); } } -void HashedDictionary::loadData() +template +template +void HashedDictionary::getItemsImpl( + const Attribute & attribute, + DictionaryKeysExtractor & keys_extractor, + ValueSetter && set_value [[maybe_unused]], + NullableValueSetter && set_nullable_value [[maybe_unused]], + DefaultValueExtractor & default_value_extractor) const +{ + const auto & attribute_container = std::get>(attribute.container); + const size_t keys_size = keys_extractor.getKeysSize(); + + bool is_attribute_nullable = attribute.is_nullable_set.has_value(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys_extractor.extractCurrentKey(); + + const auto it = attribute_container.find(key); + + if (it != attribute_container.end()) + set_value(key_index, getValueFromCell(it)); + else + { + if (is_attribute_nullable && attribute.is_nullable_set->find(key) != nullptr) + set_nullable_value(key_index); + else + set_value(key_index, default_value_extractor[key_index]); + } + + keys_extractor.rollbackCurrentKey(); + } + + query_count.fetch_add(keys_size, std::memory_order_relaxed); +} + +template +StringRef HashedDictionary::copyKeyInArena(StringRef key) +{ + size_t key_size = key.size; + char * place_for_key = complex_key_arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + StringRef updated_key{place_for_key, key_size}; + return updated_key; +} + +template +void HashedDictionary::loadData() { if (!source_ptr->hasUpdateField()) { - /// atomic since progress callbac called in parallel - std::atomic new_size = 0; auto stream = source_ptr->loadAll(); - /// preallocation can be used only when we know number of rows, for this we need: - /// - source clickhouse - /// - no filtering (i.e. lack of ), since filtering can filter - /// too much rows and eventually it may allocate memory that will - /// never be used. 
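
/// Aside: a standalone, simplified sketch of the preallocation pattern described in the
/// comment above (Block, loadBlocks and the callback plumbing are illustrative stand-ins,
/// not the ClickHouse stream API). The idea: if the source can report how many rows it is
/// about to send, reserve the table once up front; otherwise grow it per block.
#include <atomic>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

struct Block { std::vector<std::pair<uint64_t, uint64_t>> rows; };

void loadBlocks(std::unordered_map<uint64_t, uint64_t> & table,
                const std::vector<Block> & blocks,
                std::atomic<size_t> & estimated_rows)   /// filled by a progress callback
{
    for (const auto & block : blocks)
    {
        /// Consume the estimate atomically; fall back to the block size if none arrived.
        if (size_t estimate = estimated_rows.exchange(0))
            table.reserve(table.size() + estimate);
        else
            table.reserve(table.size() + block.rows.size());

        for (const auto & [key, value] : block.rows)
            table.emplace(key, value);
    }
}

int main()
{
    std::unordered_map<uint64_t, uint64_t> table;
    std::atomic<size_t> estimated_rows{3};   /// pretend the source announced 3 rows
    loadBlocks(table, {Block{{{1, 10}, {2, 20}, {3, 30}}}}, estimated_rows);
    std::cout << table.size() << " rows, " << table.bucket_count() << " buckets\n";
}
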
- bool preallocate = false; - if (const auto & clickhouse_source = dynamic_cast(source_ptr.get())) - { - if (!clickhouse_source->hasWhere()) - preallocate = true; - } - - if (preallocate) - { - stream->setProgressCallback([&new_size](const Progress & progress) - { - new_size += progress.total_rows_to_read; - }); - } - stream->readPrefix(); while (const auto block = stream->read()) { - if (new_size) - { - size_t current_new_size = new_size.exchange(0); - if (current_new_size) - resize(current_new_size); - } - else - resize(block.rows()); + resize(block.rows()); blockToAttributes(block); } @@ -429,263 +546,120 @@ void HashedDictionary::loadData() updateData(); if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, + "{}: dictionary source is empty and 'require_nonempty' property is set.", + full_name); } -template -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - if (!sparse) - { - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); - bucket_count = map_ref->getBufferSizeInCells(); - } - else - { - const auto & map_ref = std::get>(attribute.sparse_maps); - bucket_count = map_ref->bucket_count(); - - /** TODO: more accurate calculation */ - bytes_allocated += sizeof(SparseCollectionType); - bytes_allocated += bucket_count; - bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T)); - } -} - -template <> -void HashedDictionary::addAttributeSize(const Attribute & attribute) -{ - addAttributeSize(attribute); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); -} - -void HashedDictionary::calculateBytesAllocated() +template +void HashedDictionary::calculateBytesAllocated() { bytes_allocated += attributes.size() * sizeof(attributes.front()); - for (const auto & attribute : attributes) + for (size_t i = 0; i < attributes.size(); ++i) { - auto type_call = [&](const auto & dictionary_attribute_type) + getAttributeContainer(i, [&](const auto & container) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - addAttributeSize(attribute); - }; + using ContainerType = std::decay_t; + using AttributeValueType = typename ContainerType::mapped_type; - callOnDictionaryAttributeType(attribute.type, type_call); - } -} + bytes_allocated += sizeof(container); -template -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.null_values = T(null_value.get>()); - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -template <> -void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) -{ - attribute.string_arena = std::make_unique(); - const String & string = null_value.get(); - const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - attribute.null_values.emplace(string_in_arena, string.size()); - - if (!sparse) - attribute.maps = std::make_unique>(); - else - attribute.sparse_maps = std::make_unique>(); -} - -HashedDictionary::Attribute HashedDictionary::createAttribute(const DictionaryAttribute& attribute, const Field & null_value) -{ - auto nullable_set = attribute.is_nullable ? 
std::make_optional() : std::optional{}; - Attribute attr{attribute.underlying_type, std::move(nullable_set), {}, {}, {}, {}}; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - createAttributeImpl(attr, null_value); - }; - - callOnDictionaryAttributeType(attribute.underlying_type, type_call); - - return attr; -} - - -template -void HashedDictionary::getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? static_cast(second(*it)) : default_value_extractor[i]); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - -template -void HashedDictionary::getItemsImpl( - const Attribute & attribute, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - if (!sparse) - return getItemsAttrImpl(*std::get>(attribute.maps), ids, set_value, default_value_extractor); - return getItemsAttrImpl(*std::get>(attribute.sparse_maps), ids, set_value, default_value_extractor); -} - - -template -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value) -{ - if (!sparse) - { - auto & map = *std::get>(attribute.maps); - return map.insert({id, value}).second; - } - else - { - auto & map = *std::get>(attribute.sparse_maps); - return map.insert({id, value}).second; - } -} - -template <> -bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const String value) -{ - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - return setAttributeValueImpl(attribute, id, StringRef{string_in_arena, value.size()}); -} - -bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) -{ - bool result = false; - - auto type_call = [&, this](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - - if (attribute.nullable_set) - { - if (value.isNull()) + if constexpr (sparse || std::is_same_v) { - result = attribute.nullable_set->insert(id).second; - return; + /// bucket_count() - Returns table size, that includes empty and deleted + /// size() - Returns table size, w/o empty and deleted + /// and since this is sparsehash, empty cells should not be significant, + /// and since items cannot be removed from the dictionary, deleted is also not important. 
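
/// Aside: a standalone sketch of the payload-based size estimate applied in the
/// accounting below (std::unordered_map is a stand-in for the sparse map; per-bucket
/// and node overhead is deliberately ignored, matching the rough original estimate).
/// Since empty cells are cheap in sparsehash and dictionary entries are never erased,
/// size() rather than bucket_count() drives the byte estimate.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

template <typename Key, typename Value>
size_t approximateBytes(const std::unordered_map<Key, Value> & container)
{
    /// Count only the stored key/value payload.
    return container.size() * (sizeof(Key) + sizeof(Value));
}

int main()
{
    std::unordered_map<uint64_t, uint64_t> map{{1, 10}, {2, 20}, {3, 30}};
    std::cout << approximateBytes(map) << " payload bytes across "
              << map.bucket_count() << " buckets\n";
}
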
+ bytes_allocated += container.size() * (sizeof(KeyType) + sizeof(AttributeValueType)); + bucket_count = container.bucket_count(); } else { - attribute.nullable_set->erase(id); + bytes_allocated += container.getBufferSizeInBytes(); + bucket_count = container.getBufferSizeInCells(); } - } + }); - result = setAttributeValueImpl(attribute, id, value.get>()); - }; - - callOnDictionaryAttributeType(attribute.type, type_call); - - return result; -} - -const HashedDictionary::Attribute & HashedDictionary::getAttribute(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; - - return attributes[it->second]; -} - -template -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const -{ - const auto & attr = *std::get>(attribute.maps); - const auto rows = ext::size(ids); - - for (const auto i : ext::range(0, rows)) - { - out[i] = attr.find(ids[i]) != nullptr; - - if (attribute.nullable_set && !out[i]) - out[i] = attribute.nullable_set->find(ids[i]) != nullptr; + if (attributes[i].string_arena) + bytes_allocated += attributes[i].string_arena->size(); } + + bytes_allocated += complex_key_arena.size(); } -template <> -void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const +template +BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - has(attribute, ids, out); + PaddedPODArray keys; + + if (!attributes.empty()) + { + const auto & attribute = attributes.front(); + + getAttributeContainer(0, [&](auto & container) + { + keys.reserve(container.size()); + + for (const auto & [key, value] : container) + { + (void)(value); + keys.emplace_back(key); + } + + if (attribute.is_nullable_set) + { + const auto & is_nullable_set = *attribute.is_nullable_set; + keys.reserve(is_nullable_set.size()); + + for (auto & node : is_nullable_set) + keys.emplace_back(node.getKey()); + } + }); + } + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return std::make_shared(shared_from_this(), max_block_size, std::move(keys), column_names); + else + return std::make_shared(shared_from_this(), max_block_size, keys, column_names); } -template -PaddedPODArray HashedDictionary::getIdsAttrImpl(const AttrType & attr) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) { - PaddedPODArray ids; - ids.reserve(attr.size()); - for (const auto & value : attr) - ids.push_back(first(value)); + assert(attribute_index < attributes.size()); - return ids; -} -template -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - if (!sparse) - return getIdsAttrImpl(*std::get>(attribute.maps)); - return getIdsAttrImpl(*std::get>(attribute.sparse_maps)); -} - -template <> -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const -{ - return getIds(attribute); -} - -PaddedPODArray HashedDictionary::getIds() const -{ - const auto & attribute = attributes.front(); - PaddedPODArray result; + auto & attribute = attributes[attribute_index]; auto type_call = [&](const auto & dictionary_attribute_type) { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; - /// TODO: Check if order is satisfied - result = 
getIds(attribute); + using ValueType = DictionaryValueType; - if (attribute.nullable_set) - { - for (const auto& value: *attribute.nullable_set) - result.push_back(value.getKey()); - } + auto & attribute_container = std::get>(attribute.container); + std::forward(get_container_func)(attribute_container); }; callOnDictionaryAttributeType(attribute.type, type_call); - - return result; } -BlockInputStreamPtr HashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const +template +template +void HashedDictionary::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const { - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, getIds(), column_names); + const_cast *>(this)->getAttributeContainer(attribute_index, [&](auto & attribute_container) + { + std::forward(get_container_func)(attribute_container); + }); } +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; +template class HashedDictionary; + void registerDictionaryHashed(DictionaryFactory & factory) { auto create_layout = [](const std::string & full_name, @@ -693,27 +667,51 @@ void registerDictionaryHashed(DictionaryFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, DictionarySourcePtr source_ptr, + DictionaryKeyType dictionary_key_type, bool sparse) -> DictionaryPtr { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'hashed'", ErrorCodes::UNSUPPORTED_METHOD}; + if (dictionary_key_type == DictionaryKeyType::simple && dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for simple key hashed dictionary"); + else if (dictionary_key_type == DictionaryKeyType::complex && dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for complex key hashed dictionary"); if (dict_struct.range_min || dict_struct.range_max) - throw Exception{full_name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + full_name); const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse); + + if (dictionary_key_type == DictionaryKeyType::simple) + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } + else + { + if (sparse) + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + else + return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } }; + using namespace std::placeholders; + factory.registerLayout("hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ false); }, false); 
+ [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ false); }, false); factory.registerLayout("sparse_hashed", - [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), /* sparse = */ true); }, false); + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::simple, /* sparse = */ true); }, false); + factory.registerLayout("complex_key_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ false); }, true); + factory.registerLayout("complex_key_sparse_hashed", + [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::complex, /* sparse = */ true); }, true); + } } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 97b329a8b25..0d0ab8dcb43 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -4,17 +4,21 @@ #include #include #include -#include -#include -#include -#include -#include + #include #include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include /** This dictionary stores all content in a hash table in memory * (a separate Key -> Value map for each attribute) @@ -24,19 +28,32 @@ namespace DB { +template class HashedDictionary final : public IDictionary { public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary"); + HashedDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, - bool sparse_, - BlockPtr saved_block_ = nullptr); + BlockPtr previously_loaded_block_ = nullptr); - std::string getTypeName() const override { return sparse ? 
"SparseHashed" : "Hashed"; } + std::string getTypeName() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple && sparse) + return "SparseHashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::simple && !sparse) + return "Hashed"; + else if constexpr (dictionary_key_type == DictionaryKeyType::complex && sparse) + return "ComplexKeySpareseHashed"; + else + return "ComplexKeyHashed"; + } size_t getBytesAllocated() const override { return bytes_allocated; } @@ -50,7 +67,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block); + return std::make_shared>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, previously_loaded_block); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -61,54 +78,66 @@ public: bool isInjective(const std::string & attribute_name) const override { - return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + return dict_struct.getAttribute(attribute_name).injective; } - bool hasHierarchy() const override { return hierarchical_attribute; } - - void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const override; - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + DictionaryKeyType getKeyType() const override { return dictionary_key_type; } ColumnPtr getColumn( const std::string& attribute_name, const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - void isInVectorVector( - const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; - void isInVectorConstant(const PaddedPODArray & child_ids, const Key ancestor_id, PaddedPODArray & out) const override; - void isInConstantVector(const Key child_id, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const override; + bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && dict_struct.hierarchical_attribute_index.has_value(); } + + ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override; + + ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) const override; + + ColumnPtr getDescendants( + ColumnPtr key_column, + const DataTypePtr & key_type, + size_t level) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: template - using CollectionType = HashMap; - template - using CollectionPtrType = std::unique_ptr>; + using CollectionTypeNonSparse = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + HashMap, + HashMapWithSavedHash>>; #if !defined(ARCADIA_BUILD) - template - using SparseCollectionType = google::sparse_hash_map>; + template + using SparseHashMap = google::sparse_hash_map>; #else - template - using SparseCollectionType = google::sparsehash::sparse_hash_map>; + template + using SparseHashMap = google::sparsehash::sparse_hash_map>; #endif template - using SparseCollectionPtrType = std::unique_ptr>; + using CollectionTypeSparse = 
std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SparseHashMap, + SparseHashMap>; - using NullableSet = HashSet>; + template + using CollectionType = std::conditional_t, CollectionTypeNonSparse>; + + using NullableSet = HashSet>; struct Attribute final { AttributeUnderlyingType type; - std::optional nullable_set; + std::optional is_nullable_set; std::variant< UInt8, @@ -123,44 +152,31 @@ private: Decimal32, Decimal64, Decimal128, + Decimal256, Float32, Float64, StringRef> null_values; + std::variant< - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType, - CollectionPtrType> - maps; - std::variant< - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType, - SparseCollectionPtrType> - sparse_maps; + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType, + CollectionType> + container; + std::unique_ptr string_arena; }; @@ -172,76 +188,46 @@ private: void loadData(); - template - void addAttributeSize(const Attribute & attribute); - void calculateBytesAllocated(); - template - void createAttributeImpl(Attribute & attribute, const Field & null_value); - - Attribute createAttribute(const DictionaryAttribute& attribute, const Field & null_value); - - template - void getItemsAttrImpl( - const MapType & attr, - const PaddedPODArray & ids, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const; - - template + template void getItemsImpl( const Attribute & attribute, - const PaddedPODArray & ids, + DictionaryKeysExtractor & keys_extractor, ValueSetter && set_value, + NullableValueSetter && set_nullable_value, DefaultValueExtractor & default_value_extractor) const; - template - bool setAttributeValueImpl(Attribute & attribute, const Key id, const T value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func); - bool setAttributeValue(Attribute & attribute, const Key id, const Field & value); + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const; - const Attribute & getAttribute(const std::string & attribute_name) const; - - template - void has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const; - - template - PaddedPODArray getIdsAttrImpl(const AttrType & attr) const; - template - PaddedPODArray getIds(const Attribute & attribute) const; - - PaddedPODArray getIds() const; - - /// Preallocates the hashtable based on query progress - /// (Only while loading all data). 
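HashedDictionary.h above chooses the per-attribute container at compile time: std::conditional_t switches between the dense HashMap and google's sparse_hash_map, and the key type between UInt64 and StringRef, depending on the dictionary_key_type and sparse template parameters. A rough sketch of that selection mechanism, with std::unordered_map and std::map standing in for the dense and sparse containers and std::string for the serialized complex key (assumptions, not the actual ClickHouse types):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <type_traits>
#include <unordered_map>

enum class KeyKind { simple, complex };

// std::unordered_map plays the dense HashMap, std::map plays the memory-frugal
// sparse_hash_map; neither is the container ClickHouse actually uses.
template <KeyKind kind, bool sparse>
struct AttributeStorage
{
    // Simple-key dictionaries use a 64-bit id, complex-key ones a serialized key (here: a string).
    using Key = std::conditional_t<kind == KeyKind::simple, uint64_t, std::string>;

    template <typename Value>
    using Dense = std::unordered_map<Key, Value>;

    template <typename Value>
    using Sparse = std::map<Key, Value>;

    // The same std::conditional_t selection as CollectionType in the header above.
    template <typename Value>
    using Container = std::conditional_t<sparse, Sparse<Value>, Dense<Value>>;
};

int main()
{
    AttributeStorage<KeyKind::simple, false>::Container<double> dense_simple;
    AttributeStorage<KeyKind::complex, true>::Container<double> sparse_complex;

    dense_simple[42] = 1.5;               // keyed by a UInt64-like id
    sparse_complex["region|city"] = 2.5;  // keyed by a serialized complex key

    std::cout << dense_simple[42] << ' ' << sparse_complex["region|city"] << '\n';
}
```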
- /// - /// @see preallocate - template - void resize(Attribute & attribute, size_t added_rows); void resize(size_t added_rows); - template - void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; - template - void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; + StringRef copyKeyInArena(StringRef key); const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; const bool require_nonempty; - const bool sparse; - std::map attribute_index_by_name; std::vector attributes; - const Attribute * hierarchical_attribute = nullptr; size_t bytes_allocated = 0; size_t element_count = 0; size_t bucket_count = 0; mutable std::atomic query_count{0}; - BlockPtr saved_block; + BlockPtr previously_loaded_block; + Arena complex_key_arena; }; +extern template class HashedDictionary; +extern template class HashedDictionary; + +extern template class HashedDictionary; +extern template class HashedDictionary; + } diff --git a/src/Dictionaries/HierarchyDictionariesUtils.cpp b/src/Dictionaries/HierarchyDictionariesUtils.cpp new file mode 100644 index 00000000000..5bca6a5ac1a --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.cpp @@ -0,0 +1,156 @@ +#include "HierarchyDictionariesUtils.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; +} + +namespace +{ + /** In case of cache or direct dictionary we does not have structure with child to parent representation. + * This function build such structure calling getColumn for initial keys to request and for next keys in hierarchy, + * until all keys are requested or result key is null value. + * To distinguish null value key and key that is not present in dictionary, we use special default value column + * with max UInt64 value, if result column key has such value we assume that current key is not presented in dictionary storage. + */ + HashMap getChildToParentHierarchyMapImpl( + const IDictionary * dictionary, + const DictionaryAttribute & hierarchical_attribute, + const PaddedPODArray & initial_keys_to_request, + const DataTypePtr & key_type) + { + UInt64 null_value = hierarchical_attribute.null_value.get(); + + ColumnPtr key_to_request_column = ColumnVector::create(); + auto * key_to_request_column_typed = static_cast *>(key_to_request_column->assumeMutable().get()); + + UInt64 key_not_in_storage_value = std::numeric_limits::max(); + ColumnPtr key_not_in_storage_default_value_column = ColumnVector::create(initial_keys_to_request.size(), key_not_in_storage_value); + + PaddedPODArray & keys_to_request = key_to_request_column_typed->getData(); + keys_to_request.assign(initial_keys_to_request); + + PaddedPODArray next_keys_to_request; + HashSet already_requested_keys; + + HashMap child_to_parent_key; + + while (!keys_to_request.empty()) + { + child_to_parent_key.reserve(child_to_parent_key.size() + keys_to_request.size()); + + auto parent_key_column = dictionary->getColumn( + hierarchical_attribute.name, + hierarchical_attribute.type, + {key_to_request_column}, + {key_type}, + key_not_in_storage_default_value_column); + + const auto * parent_key_column_typed = checkAndGetColumn>(*parent_key_column); + if (!parent_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Parent key column should be UInt64. 
Actual {}", + hierarchical_attribute.type->getName()); + + const auto & parent_keys = parent_key_column_typed->getData(); + next_keys_to_request.clear(); + + for (size_t i = 0; i < keys_to_request.size(); ++i) + { + auto key = keys_to_request[i]; + auto parent_key = parent_keys[i]; + + if (parent_key == key_not_in_storage_value) + continue; + + child_to_parent_key[key] = parent_key; + + if (parent_key == null_value || + already_requested_keys.find(parent_key) != nullptr) + continue; + + already_requested_keys.insert(parent_key); + next_keys_to_request.emplace_back(parent_key); + } + + keys_to_request.clear(); + keys_to_request.assign(next_keys_to_request); + } + + return child_to_parent_key; + } +} + +ColumnPtr getKeysHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + + auto dictionary_hierarchy_array = getKeysHierarchyArray(requested_keys, null_value, is_key_valid_func, get_parent_key_func); + return dictionary_hierarchy_array; +} + +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type) +{ + key_column = key_column->convertToFullColumnIfConst(); + in_key_column = in_key_column->convertToFullColumnIfConst(); + + const auto * key_column_typed = checkAndGetColumn>(*key_column); + if (!key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto * in_key_column_typed = checkAndGetColumn>(*in_key_column); + if (!in_key_column_typed) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Key column should be UInt64"); + + const auto & dictionary_structure = dictionary->getStructure(); + size_t hierarchical_attribute_index = *dictionary_structure.hierarchical_attribute_index; + const auto & hierarchical_attribute = dictionary_structure.attributes[hierarchical_attribute_index]; + + const PaddedPODArray & requested_keys = key_column_typed->getData(); + HashMap key_to_parent_key = getChildToParentHierarchyMapImpl(dictionary, hierarchical_attribute, requested_keys, key_type); + + auto is_key_valid_func = [&](auto & key) { return key_to_parent_key.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto & key) + { + auto it = key_to_parent_key.find(key); + std::optional result = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return result; + }; + + UInt64 null_value = hierarchical_attribute.null_value.get(); + const auto & in_keys = in_key_column_typed->getData(); + + auto result = getKeysIsInHierarchyColumn(requested_keys, in_keys, null_value, is_key_valid_func, get_parent_key_func); + return result; +} + +} diff --git a/src/Dictionaries/HierarchyDictionariesUtils.h b/src/Dictionaries/HierarchyDictionariesUtils.h new file mode 100644 index 00000000000..8b2fe6ef08e --- /dev/null +++ b/src/Dictionaries/HierarchyDictionariesUtils.h @@ -0,0 +1,467 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +namespace detail +{ + template + struct ElementsAndOffsets + { + PaddedPODArray elements; + PaddedPODArray offsets; + }; + + template + struct IsKeyValidFuncInterface + { + bool operator()(T key [[maybe_unused]]) { return false; } + }; + + template + struct GetParentKeyFuncInterface + { + std::optional operator()(T key [[maybe_unused]]) { return {}; } + }; + + /** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client. + * Hierarchy iteration is stopped if key equals null value, get_parent_key_func returns null optional, or hierarchy depth + * greater or equal than DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH. + * IsKeyValidFunc used for each input hierarchy key, if it returns false result hierarchy for that key will have size 0. + * Hierarchy result is ElementsAndOffsets structure, for each element there is hierarchy array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * If hierarchy_null_value will be 0. Requested keys [1, 2, 3, 4, 5]. + * Result: [1], [2, 1], [3, 1], [4, 2, 1], [] + * Elements: [1, 2, 1, 3, 1, 4, 2, 1] + * Offsets: [1, 3, 5, 8, 8] + */ + template + ElementsAndOffsets getHierarchy( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_key_func) + { + size_t hierarchy_keys_size = keys.size(); + + PaddedPODArray elements; + elements.reserve(hierarchy_keys_size); + + PaddedPODArray offsets; + offsets.reserve(hierarchy_keys_size); + + struct OffsetInArray + { + size_t offset_index; + size_t array_element_offset; + }; + + HashMap already_processes_keys_to_offset; + already_processes_keys_to_offset.reserve(hierarchy_keys_size); + + for (size_t i = 0; i < hierarchy_keys_size; ++i) + { + auto hierarchy_key = keys[i]; + size_t current_hierarchy_depth = 0; + + bool is_key_valid = std::forward(is_key_valid_func)(hierarchy_key); + + if (!is_key_valid) + { + offsets.emplace_back(elements.size()); + continue; + } + + while (true) + { + const auto * it = already_processes_keys_to_offset.find(hierarchy_key); + + if (it) + { + const auto & index = it->getMapped(); + + size_t offset = index.offset_index; + + bool is_loop = (offset == offsets.size()); + + if (unlikely(is_loop)) + break; + + size_t array_element_offset = index.array_element_offset; + + size_t previous_offset_size = offset > 0 ? 
offsets[offset - 1] : 0; + size_t start_index = previous_offset_size + array_element_offset; + size_t end_index = offsets[offset]; + + elements.insertFromItself(elements.begin() + start_index, elements.begin() + end_index); + break; + } + + if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + break; + + already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth}; + elements.emplace_back(hierarchy_key); + ++current_hierarchy_depth; + + std::optional parent_key = std::forward(get_parent_key_func)(hierarchy_key); + + if (!parent_key.has_value()) + break; + + hierarchy_key = *parent_key; + } + + offsets.emplace_back(elements.size()); + } + + ElementsAndOffsets result = {std::move(elements), std::move(offsets)}; + + return result; + } + + /** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column. + * If value in result array is 1 that means key from in_keys array is in hierarchy of key from + * keys array with same index, 0 therwise. + * For getting hierarchy implementation uses getKeysHierarchy function. + * + * Not: keys size must be equal to in_keys_size. + */ + template + PaddedPODArray getIsInHierarchy( + const PaddedPODArray & keys, + const PaddedPODArray & in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) + { + assert(keys.size() == in_keys.size()); + + PaddedPODArray result; + result.resize_fill(keys.size()); + + detail::ElementsAndOffsets hierarchy = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + auto & offsets = hierarchy.offsets; + auto & elements = hierarchy.elements; + + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t i_elements_start = i > 0 ? offsets[i - 1] : 0; + size_t i_elements_end = offsets[i]; + + auto & key_to_find = in_keys[i]; + + const auto * begin = elements.begin() + i_elements_start; + const auto * end = elements.begin() + i_elements_end; + + const auto * it = std::find(begin, end, key_to_find); + + bool contains_key = (it != end); + result[i] = contains_key; + } + + return result; + } + + struct GetAllDescendantsStrategy { size_t level = 0; }; + struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; }; + + /** Get descendants for keys iterating the hierarchy from parent to child using parent_to_child hash map provided by client. + * GetAllDescendantsStrategy get all descendants for key + * GetDescendantsAtSpecificLevelStrategy get descendants only for specific hierarchy level. + * Hierarchy result is ElementsAndOffsets structure, for each element there is descendants array, + * with size offset[element_index] - (element_index > 0 ? offset[element_index - 1] : 0). + * + * Example: + * id parent_id + * 1 0 + * 2 1 + * 3 1 + * 4 2 + * + * Example. Strategy GetAllDescendantsStrategy. + * Requested keys [0, 1, 2, 3, 4]. + * Result: [1, 2, 3, 4], [2, 2, 4], [4], [], [] + * Elements: [1, 2, 3, 4, 2, 3, 4, 4] + * Offsets: [4, 7, 8, 8, 8] + * + * Example. Strategy GetDescendantsAtSpecificLevelStrategy with level 1. + * Requested keys [0, 1, 2, 3, 4]. 
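The doc comment above specifies getHierarchy's output as a flat elements array plus an offsets array, with a worked example (parents 1→0, 2→1, 3→1, 4→2, null value 0, requested keys [1, 2, 3, 4, 5] giving elements [1, 2, 1, 3, 1, 4, 2, 1] and offsets [1, 3, 5, 8, 8]), and getIsInHierarchy as a scan of each key's slice. A compact sketch of both, assuming a plain child-to-parent map of the kind getChildToParentHierarchyMapImpl builds; loop protection here is a simple visited set rather than the offset-based memoization used in the real function:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct ElementsAndOffsets
{
    std::vector<uint64_t> elements;
    std::vector<uint64_t> offsets;  // offsets[i] = end of the i-th key's hierarchy slice in elements
};

ElementsAndOffsets getHierarchy(
    const std::vector<uint64_t> & keys,
    const std::unordered_map<uint64_t, uint64_t> & child_to_parent,
    uint64_t null_value)
{
    ElementsAndOffsets result;
    for (uint64_t key : keys)
    {
        std::unordered_set<uint64_t> seen;  // guards against loops in broken data
        while (key != null_value && child_to_parent.count(key) && seen.insert(key).second)
        {
            result.elements.push_back(key);
            key = child_to_parent.at(key);
        }
        result.offsets.push_back(result.elements.size());
    }
    return result;
}

// "Is in_key an ancestor of (or equal to) the i-th requested key?" -- scan that key's slice.
bool isInHierarchy(const ElementsAndOffsets & h, size_t i, uint64_t in_key)
{
    size_t begin = i > 0 ? h.offsets[i - 1] : 0;
    for (size_t pos = begin; pos < h.offsets[i]; ++pos)
        if (h.elements[pos] == in_key)
            return true;
    return false;
}

int main()
{
    // The example from the comment block: 1->0, 2->1, 3->1, 4->2, null value 0.
    std::unordered_map<uint64_t, uint64_t> child_to_parent = {{1, 0}, {2, 1}, {3, 1}, {4, 2}};
    std::vector<uint64_t> keys = {1, 2, 3, 4, 5};

    auto h = getHierarchy(keys, child_to_parent, 0);
    for (auto e : h.elements) std::cout << e << ' ';   // 1 2 1 3 1 4 2 1
    std::cout << "| ";
    for (auto o : h.offsets) std::cout << o << ' ';    // 1 3 5 8 8
    std::cout << '\n';

    std::cout << std::boolalpha << isInHierarchy(h, 3, 1) << '\n';  // key 4, ancestor 1 -> true
}
```

Run on this data it reproduces the documented elements/offsets pair and reports that key 1 is in the hierarchy of key 4.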
+ * Result: [1], [2, 3], [4], [], []; + * Offsets: [1, 3, 4, 4, 4]; + */ + template + ElementsAndOffsets getDescendants( + const PaddedPODArray & keys, + const HashMap> & parent_to_child, + Strategy strategy) + { + /// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants. + /// If strategy is GetDescendantsAtSpecificLevelStrategy we does not use cache strategy. + size_t keys_size = keys.size(); + + PaddedPODArray descendants; + descendants.reserve(keys_size); + + PaddedPODArray descendants_offsets; + descendants_offsets.reserve(keys_size); + + struct Range + { + size_t start_index; + size_t end_index; + }; + + static constexpr Int64 key_range_requires_update = -1; + HashMap already_processed_keys_to_range [[maybe_unused]]; + + if constexpr (std::is_same_v) + already_processed_keys_to_range.reserve(keys_size); + + struct KeyAndDepth + { + KeyType key; + Int64 depth; + }; + + HashSet already_processed_keys_during_loop; + already_processed_keys_during_loop.reserve(keys_size); + + PaddedPODArray next_keys_to_process_stack; + next_keys_to_process_stack.reserve(keys_size); + + Int64 level = static_cast(strategy.level); + + for (size_t i = 0; i < keys_size; ++i) + { + const KeyType & requested_key = keys[i]; + + if (parent_to_child.find(requested_key) == nullptr) + { + descendants_offsets.emplace_back(descendants.size()); + continue; + } + + next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0}); + + /** To cache range for key without recursive function calls and custom stack we put special + * signaling value on stack key_range_requires_update. + * When we pop such value from stack that means processing descendants for key is finished + * and we can update range with end_index. + */ + while (!next_keys_to_process_stack.empty()) + { + KeyAndDepth key_to_process = next_keys_to_process_stack.back(); + + KeyType key = key_to_process.key; + Int64 depth = key_to_process.depth; + next_keys_to_process_stack.pop_back(); + + if constexpr (std::is_same_v) + { + /// Update end_index for key + if (depth == key_range_requires_update) + { + auto * it = already_processed_keys_to_range.find(key); + assert(it); + + auto & range_to_update = it->getMapped(); + range_to_update.end_index = descendants.size(); + continue; + } + } + + if (unlikely(already_processed_keys_during_loop.find(key) != nullptr)) + { + next_keys_to_process_stack.clear(); + break; + } + + if constexpr (std::is_same_v) + { + const auto * already_processed_it = already_processed_keys_to_range.find(key); + + if (already_processed_it) + { + Range range = already_processed_it->getMapped(); + + if (unlikely(range.start_index > range.end_index)) + { + /// Broken range because there was loop + already_processed_keys_to_range.erase(key); + } + else + { + auto insert_start_iterator = descendants.begin() + range.start_index; + auto insert_end_iterator = descendants.begin() + range.end_index; + descendants.insertFromItself(insert_start_iterator, insert_end_iterator); + continue; + } + } + } + + const auto * it = parent_to_child.find(key); + + if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH) + continue; + + if constexpr (std::is_same_v) + { + if (depth > level) + continue; + } + + if constexpr (std::is_same_v) + { + /// Put special signaling value on stack and update cache with range start + size_t range_start_index = descendants.size(); + already_processed_keys_to_range[key].start_index = range_start_index; + next_keys_to_process_stack.emplace_back(KeyAndDepth{key, 
key_range_requires_update}); + } + + already_processed_keys_during_loop.insert(key); + + ++depth; + + const auto & children = it->getMapped(); + + for (auto child_key : children) + { + /// In case of GetAllDescendantsStrategy we add any descendant to result array + /// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level + if (std::is_same_v || depth == level) + descendants.emplace_back(child_key); + + next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth}); + } + } + + already_processed_keys_during_loop.clear(); + + descendants_offsets.emplace_back(descendants.size()); + } + + ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)}; + return result; + } + + /// Converts ElementAndOffsets structure into ArrayColumn + template + ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets) + { + auto elements_column = ColumnVector::create(); + elements_column->getData() = std::move(elements_and_offsets.elements); + + auto offsets_column = ColumnVector::create(); + offsets_column->getData() = std::move(elements_and_offsets.offsets); + + auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column)); + + return column_array; + } +} + +/// Returns hierarchy array column for keys +template +ColumnPtr getKeysHierarchyArray( + const PaddedPODArray & keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto elements_and_offsets = detail::getHierarchy( + keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); +} + +/// Returns is in hierarchy column for keys +template +ColumnUInt8::Ptr getKeysIsInHierarchyColumn( + const PaddedPODArray & hierarchy_keys, + const PaddedPODArray & hierarchy_in_keys, + const KeyType & hierarchy_null_value, + IsKeyValidFunc && is_key_valid_func, + GetParentKeyFunc && get_parent_func) +{ + auto is_in_hierarchy_data = detail::getIsInHierarchy( + hierarchy_keys, + hierarchy_in_keys, + hierarchy_null_value, + std::forward(is_key_valid_func), + std::forward(get_parent_func)); + + auto result = ColumnUInt8::create(); + result->getData() = std::move(is_in_hierarchy_data); + + return result; +} + +/// Returns descendants array column for keys +template +ColumnPtr getKeysDescendantsArray( + const PaddedPODArray & requested_keys, + const HashMap> & parent_to_child, + size_t level) +{ + if (level == 0) + { + detail::GetAllDescendantsStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } + else + { + detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level }; + auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy); + return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets)); + } +} + +/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns ColumnArray with hierarchy arrays for keys from key_column. 
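getDescendants above walks the hierarchy from parent to child with an explicit stack, caching ranges for GetAllDescendantsStrategy and filtering by depth for GetDescendantsAtSpecificLevelStrategy, while getKeysDescendantsArray maps level == 0 to the former and any other level to the latter. A small recursive sketch of the same two behaviours over the documented example data (0→[1], 1→[2, 3], 2→[4]); the range cache and the signalling stack entries of the real implementation are deliberately omitted:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using ParentToChild = std::unordered_map<uint64_t, std::vector<uint64_t>>;

// Collect descendants of `key`. level == 0 means "all descendants";
// level > 0 means "only descendants exactly that many steps below key".
void collectDescendants(
    const ParentToChild & parent_to_child,
    uint64_t key,
    size_t level,
    size_t depth,
    std::unordered_set<uint64_t> & visited,   // protects against loops in broken data
    std::vector<uint64_t> & out)
{
    auto it = parent_to_child.find(key);
    if (it == parent_to_child.end() || !visited.insert(key).second)
        return;

    // Emit the children of this node first, then descend -- this keeps the output
    // in the same order as the worked example above.
    for (uint64_t child : it->second)
        if (level == 0 || depth + 1 == level)
            out.push_back(child);

    for (uint64_t child : it->second)
        if (level == 0 || depth + 1 < level)
            collectDescendants(parent_to_child, child, level, depth + 1, visited, out);
}

int main()
{
    // Same data as the comment block: 0 has child 1, 1 has children 2 and 3, 2 has child 4.
    ParentToChild parent_to_child = {{0, {1}}, {1, {2, 3}}, {2, {4}}};

    for (size_t level : {0, 1})
    {
        std::cout << "level " << level << ":\n";
        for (uint64_t key : {0, 1, 2, 3, 4})
        {
            std::vector<uint64_t> out;
            std::unordered_set<uint64_t> visited;
            collectDescendants(parent_to_child, key, level, 0, visited, out);

            std::cout << "  " << key << " -> [";
            for (size_t i = 0; i < out.size(); ++i)
                std::cout << (i ? ", " : "") << out[i];
            std::cout << "]\n";
        }
    }
}
```

For level 1 this prints [1], [2, 3], [4], [], [], matching the GetDescendantsAtSpecificLevelStrategy example in the comment.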
+ */ +ColumnPtr getKeysHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + const DataTypePtr & key_type); + +/** Default isInHierarchy implementation for dictionaries that does not have structure with child to parent representation. + * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure. + * Returns UInt8 column if key from in_key_column is in key hierarchy from key_column. + */ +ColumnUInt8::Ptr getKeysIsInHierarchyDefaultImplementation( + const IDictionary * dictionary, + ColumnPtr key_column, + ColumnPtr in_key_column, + const DataTypePtr & key_type); + +} diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h new file mode 100644 index 00000000000..72b3ef76f11 --- /dev/null +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +struct KeyState +{ + enum State: uint8_t + { + not_found = 0, + expired = 1, + found = 2, + }; + + KeyState(State state_, size_t fetched_column_index_) + : state(state_) + , fetched_column_index(fetched_column_index_) + {} + + KeyState(State state_) + : state(state_) + {} + + inline bool isFound() const { return state == State::found; } + inline bool isExpired() const { return state == State::expired; } + inline bool isNotFound() const { return state == State::not_found; } + inline bool isDefault() const { return is_default; } + inline void setDefault() { is_default = true; } + inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } + /// Valid only if keyState is found or expired + inline size_t getFetchedColumnIndex() const { return fetched_column_index; } + inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } +private: + State state = not_found; + size_t fetched_column_index = 0; + bool is_default = false; +}; + +/// Result of fetch from CacheDictionaryStorage +template +struct KeysStorageFetchResult +{ + /// Fetched column values + MutableColumns fetched_columns; + + PaddedPODArray key_index_to_state; + + size_t expired_keys_size = 0; + + size_t found_keys_size = 0; + + size_t not_found_keys_size = 0; + + size_t default_keys_size = 0; + +}; + +using SimpleKeysStorageFetchResult = KeysStorageFetchResult; +using ComplexKeysStorageFetchResult = KeysStorageFetchResult; + +class ICacheDictionaryStorage +{ +public: + + virtual ~ICacheDictionaryStorage() = default; + + /// Necessary if all keys are found we can return result to client without additional aggregation + virtual bool returnsFetchedColumnsInOrderOfRequestedKeys() const = 0; + + /// Name of storage + virtual String getName() const = 0; + + /// Does storage support simple keys + virtual bool supportsSimpleKeys() const = 0; + + /// Fetch columns for keys, this method is not write thread safe + virtual SimpleKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) = 0; + + /// Fetch columns for keys, this method is not write thread safe + virtual void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) = 0; + + /// Insert default keys + virtual void insertDefaultKeys(const PaddedPODArray & keys) = 0; + + /// Return cached simple keys + virtual PaddedPODArray getCachedSimpleKeys() const = 0; + + /// Does storage support complex keys + virtual bool supportsComplexKeys() const = 0; + + /// Fetch 
columns for keys, this method is not write thread safe + virtual ComplexKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & column_fetch_requests) = 0; + + /// Fetch columns for keys, this method is not write thread safe + virtual void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) = 0; + + /// Insert default keys + virtual void insertDefaultKeys(const PaddedPODArray & keys) = 0; + + /// Return cached simple keys + virtual PaddedPODArray getCachedComplexKeys() const = 0; + + /// Return size of keys in storage + virtual size_t getSize() const = 0; + + /// Returns storage load factor + virtual double getLoadFactor() const = 0; + + /// Return bytes allocated in storage + virtual size_t getBytesAllocated() const = 0; + +}; + +using CacheDictionaryStoragePtr = std::shared_ptr; + +} diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index e0e4c7eb880..a7445312409 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -24,8 +24,8 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -struct IDictionaryBase; -using DictionaryPtr = std::unique_ptr; +struct IDictionary; +using DictionaryPtr = std::unique_ptr; /** DictionaryKeyType provides IDictionary client information about * which key type is supported by dictionary. @@ -47,13 +47,11 @@ enum class DictionaryKeyType /** * Base class for Dictionaries implementation. */ -struct IDictionaryBase : public IExternalLoadable +struct IDictionary : public IExternalLoadable { - using Key = UInt64; - - IDictionaryBase(const StorageID & dict_id_) - : dict_id(dict_id_) - , full_name(dict_id.getInternalDictionaryName()) + explicit IDictionary(const StorageID & dictionary_id_) + : dictionary_id(dictionary_id_) + , full_name(dictionary_id.getInternalDictionaryName()) { } @@ -61,14 +59,14 @@ struct IDictionaryBase : public IExternalLoadable StorageID getDictionaryID() const { std::lock_guard lock{name_mutex}; - return dict_id; + return dictionary_id; } void updateDictionaryName(const StorageID & new_name) const { std::lock_guard lock{name_mutex}; - assert(new_name.uuid == dict_id.uuid && dict_id.uuid != UUIDHelpers::Nil); - dict_id = new_name; + assert(new_name.uuid == dictionary_id.uuid && dictionary_id.uuid != UUIDHelpers::Nil); + dictionary_id = new_name; } const std::string & getLoadableName() const override final { return getFullName(); } @@ -80,8 +78,9 @@ struct IDictionaryBase : public IExternalLoadable std::string getDatabaseOrNoDatabaseTag() const { - if (!dict_id.database_name.empty()) - return dict_id.database_name; + if (!dictionary_id.database_name.empty()) + return dictionary_id.database_name; + return NO_DATABASE_TAG; } @@ -120,7 +119,36 @@ struct IDictionaryBase : public IExternalLoadable const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const = 0; + const ColumnPtr & default_values_column) const = 0; + + /** Get multiple columns from dictionary. + * + * Default implementation just calls getColumn multiple times. + * Subclasses can provide custom more efficient implementation. 
+ */ + virtual Columns getColumns( + const Strings & attribute_names, + const DataTypes & result_types, + const Columns & key_columns, + const DataTypes & key_types, + const Columns & default_values_columns) const + { + size_t attribute_names_size = attribute_names.size(); + + Columns result; + result.reserve(attribute_names_size); + + for (size_t i = 0; i < attribute_names_size; ++i) + { + const auto & attribute_name = attribute_names[i]; + const auto & result_type = result_types[i]; + const auto & default_values_column = default_values_columns[i]; + + result.emplace_back(getColumn(attribute_name, result_type, key_columns, key_types, default_values_column)); + } + + return result; + } /** Subclass must validate key columns and key types and return ColumnUInt8 that * is bitmask representation of is key in dictionary or not. @@ -130,74 +158,65 @@ struct IDictionaryBase : public IExternalLoadable const Columns & key_columns, const DataTypes & key_types) const = 0; + virtual bool hasHierarchy() const { return false; } + + virtual ColumnPtr getHierarchy( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnUInt8::Ptr isInHierarchy( + ColumnPtr key_column [[maybe_unused]], + ColumnPtr in_key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method isInHierarchy is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + + virtual ColumnPtr getDescendants( + ColumnPtr key_column [[maybe_unused]], + const DataTypePtr & key_type [[maybe_unused]], + size_t level [[maybe_unused]]) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Method getDescendants is not supported for {} dictionary.", + getDictionaryID().getNameForLogs()); + } + virtual BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const = 0; bool supportUpdates() const override { return true; } bool isModified() const override { - auto source = getSource(); + const auto * source = getSource(); return source && source->isModified(); } virtual std::exception_ptr getLastException() const { return {}; } - std::shared_ptr shared_from_this() + std::shared_ptr shared_from_this() { - return std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } - std::shared_ptr shared_from_this() const + std::shared_ptr shared_from_this() const { - return std::static_pointer_cast(IExternalLoadable::shared_from_this()); + return std::static_pointer_cast(IExternalLoadable::shared_from_this()); } private: mutable std::mutex name_mutex; - mutable StorageID dict_id; + mutable StorageID dictionary_id; protected: const String full_name; }; -struct IDictionary : IDictionaryBase -{ - IDictionary(const StorageID & dict_id_) : IDictionaryBase(dict_id_) {} - - virtual bool hasHierarchy() const = 0; - - virtual void toParent(const PaddedPODArray & ids, PaddedPODArray & out) const = 0; - - /// TODO: Rewrite - /// Methods for hierarchy. 
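A few hunks above, IDictionary gains a getColumns batch getter whose default implementation simply calls getColumn once per requested attribute, leaving subclasses free to override it with a genuinely batched lookup. A toy sketch of that interface shape; the IDict/IdentityDict names and the vector-based "columns" are invented for illustration only:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy model: a "column" is just a vector of values, an attribute is addressed by name.
using Column = std::vector<int64_t>;

struct IDict
{
    virtual ~IDict() = default;

    // Single-attribute lookup: every dictionary must provide it.
    virtual Column getColumn(const std::string & attribute, const std::vector<uint64_t> & keys) const = 0;

    // Multi-attribute lookup: the default just loops, subclasses may batch it.
    virtual std::vector<Column> getColumns(const std::vector<std::string> & attributes,
                                           const std::vector<uint64_t> & keys) const
    {
        std::vector<Column> result;
        result.reserve(attributes.size());
        for (const auto & attribute : attributes)
            result.push_back(getColumn(attribute, keys));
        return result;
    }
};

struct IdentityDict final : IDict
{
    // Returns key + length of the attribute name, just to have something observable.
    Column getColumn(const std::string & attribute, const std::vector<uint64_t> & keys) const override
    {
        Column column;
        for (auto key : keys)
            column.push_back(static_cast<int64_t>(key + attribute.size()));
        return column;
    }
};

int main()
{
    IdentityDict dict;
    auto columns = dict.getColumns({"value", "region_id"}, {1, 2, 3});
    for (const auto & column : columns)
    {
        for (auto v : column) std::cout << v << ' ';
        std::cout << '\n';
    }
}
```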
- - virtual void isInVectorVector( - const PaddedPODArray & /*child_ids*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInVectorConstant(const PaddedPODArray & /*child_ids*/, const Key /*ancestor_id*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - virtual void - isInConstantVector(const Key /*child_id*/, const PaddedPODArray & /*ancestor_ids*/, PaddedPODArray & /*out*/) const - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Hierarchy is not supported for {} dictionary.", getDictionaryID().getNameForLogs()); - } - - void isInConstantConstant(const Key child_id, const Key ancestor_id, UInt8 & out) const - { - PaddedPODArray out_arr(1); - isInVectorConstant(PaddedPODArray(1, child_id), ancestor_id, out_arr); - out = out_arr[0]; - } -}; - } diff --git a/src/Dictionaries/IDictionarySource.h b/src/Dictionaries/IDictionarySource.h index 145b2e03dd2..90f8b7f3a55 100644 --- a/src/Dictionaries/IDictionarySource.h +++ b/src/Dictionaries/IDictionarySource.h @@ -10,6 +10,7 @@ namespace DB { class IDictionarySource; using DictionarySourcePtr = std::unique_ptr; +using SharedDictionarySourcePtr = std::shared_ptr; /** Data-provider interface for external dictionaries, * abstracts out the data source (file, MySQL, ClickHouse, external program, network request et cetera) diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index 6447c76ee73..5c5f71e0b1d 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -4,19 +4,17 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include #include #include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" +#include +#include #include namespace DB @@ -126,20 +124,20 @@ static size_t formatIPWithPrefix(const unsigned char * src, UInt8 prefix_len, bo static void validateKeyTypes(const DataTypes & key_types) { if (key_types.empty() || key_types.size() > 2) - throw Exception{"Expected a single IP address or IP with mask", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a single IP address or IP with mask"); const auto * key_ipv4type = typeid_cast(key_types[0].get()); const auto * key_ipv6type = typeid_cast(key_types[0].get()); if (key_ipv4type == nullptr && (key_ipv6type == nullptr || key_ipv6type->getN() != 16)) - throw Exception{"Key does not match, expected either `IPv4` (`UInt32`) or `IPv6` (`FixedString(16)`)", - ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Key does not match, expected either `IPv4` (`UInt32`) or `IPv6` (`FixedString(16)`)"); if (key_types.size() > 1) { const auto * mask_col_type = typeid_cast(key_types[1].get()); if (mask_col_type == nullptr) - throw Exception{"Mask do not match, expected UInt8", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Mask do not match, expected UInt8"); } } @@ -191,64 +189,13 @@ inline static void mapIPv4ToIPv6(UInt32 addr, uint8_t * buf) memcpy(&buf[12], &addr, 4); } -static bool matchIPv4Subnet(UInt32 target, UInt32 addr, UInt8 prefix) -{ - UInt32 mask = (prefix >= 32) ? 
0xffffffffu : ~(0xffffffffu >> prefix); - return (target & mask) == addr; -} - -#if defined(__SSE2__) -#include - -static bool matchIPv6Subnet(const uint8_t * target, const uint8_t * addr, UInt8 prefix) -{ - uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8( - _mm_loadu_si128(reinterpret_cast(target)), - _mm_loadu_si128(reinterpret_cast(addr)))); - mask = ~mask; - - if (mask) - { - auto offset = __builtin_ctz(mask); - - if (prefix / 8 != offset) - return prefix / 8 < offset; - - auto cmpmask = ~(0xff >> (prefix % 8)); - return (target[offset] & cmpmask) == addr[offset]; - } - return true; -} - -# else - -static bool matchIPv6Subnet(const uint8_t * target, const uint8_t * addr, UInt8 prefix) -{ - if (prefix > IPV6_BINARY_LENGTH * 8U) - prefix = IPV6_BINARY_LENGTH * 8U; - - size_t i = 0; - for (; prefix >= 8; ++i, prefix -= 8) - { - if (target[i] != addr[i]) - return false; - } - if (prefix == 0) - return true; - - auto mask = ~(0xff >> prefix); - return (target[i] & mask) == addr[i]; -} - -#endif // __SSE2__ - IPAddressDictionary::IPAddressDictionary( const StorageID & dict_id_, const DictionaryStructure & dict_struct_, DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -267,7 +214,7 @@ ColumnPtr IPAddressDictionary::getColumn( const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const + const ColumnPtr & default_values_column) const { validateKeyTypes(key_types); @@ -290,7 +237,6 @@ ColumnPtr IPAddressDictionary::getColumn( auto column = ColumnProvider::getColumn(dictionary_attribute, size); - if constexpr (std::is_same_v) { auto * out = column.get(); @@ -347,7 +293,7 @@ ColumnUInt8::Ptr IPAddressDictionary::hasKeys(const Columns & key_columns, const { auto addr = first_column->getDataAt(i); if (unlikely(addr.size != IPV6_BINARY_LENGTH)) - throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected key to be FixedString(16)"); auto found = tryLookupIPv6(reinterpret_cast(addr.data)); out[i] = (found != ipNotFound()); @@ -370,8 +316,10 @@ void IPAddressDictionary::createAttributes() attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); if (attribute.hierarchical) - throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, + "{}: hierarchical attributes not supported for dictionary of type {}", + full_name, + getTypeName()); } }; @@ -546,7 +494,7 @@ void IPAddressDictionary::loadData() LOG_TRACE(logger, "{} ip records are read", ip_records.size()); if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY}; + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: dictionary source is empty and 'require_nonempty' property is set.", full_name); } template @@ -596,7 +544,7 @@ void IPAddressDictionary::calculateBytesAllocated() template void IPAddressDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { - attribute.null_values = null_value.isNull() ? T{} : T(null_value.get>()); + attribute.null_values = null_value.isNull() ? 
T{} : T(null_value.get()); attribute.maps.emplace>(); } @@ -645,7 +593,7 @@ void IPAddressDictionary::getItemsByTwoKeyColumnsImpl( { const auto * key_ip_column_ptr = typeid_cast *>(&*key_columns.front()); if (key_ip_column_ptr == nullptr) - throw Exception{"Expected a UInt32 IP column", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a UInt32 IP column"); const auto & key_mask_column = assert_cast &>(*key_columns.back()); @@ -679,7 +627,7 @@ void IPAddressDictionary::getItemsByTwoKeyColumnsImpl( const auto * key_ip_column_ptr = typeid_cast(&*key_columns.front()); if (key_ip_column_ptr == nullptr || key_ip_column_ptr->getN() != IPV6_BINARY_LENGTH) - throw Exception{"Expected a FixedString(16) IP column", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a FixedString(16) IP column"); const auto & key_mask_column = assert_cast &>(*key_columns.back()); @@ -752,7 +700,7 @@ void IPAddressDictionary::getItemsImpl( { auto addr = first_column->getDataAt(i); if (addr.size != IPV6_BINARY_LENGTH) - throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected key to be FixedString(16)"); auto found = tryLookupIPv6(reinterpret_cast(addr.data)); if (found != ipNotFound()) @@ -787,7 +735,7 @@ void IPAddressDictionary::setAttributeValue(Attribute & attribute, const Field & } else { - setAttributeValueImpl(attribute, value.get>()); + setAttributeValueImpl(attribute, value.get()); } }; @@ -798,7 +746,7 @@ const IPAddressDictionary::Attribute & IPAddressDictionary::getAttribute(const s { const auto it = attribute_index_by_name.find(attribute_name); if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "{}: no such attribute '{}'", full_name, attribute_name); return attributes[it->second]; } @@ -858,9 +806,6 @@ static auto keyViewGetter() BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; - - const bool is_ipv4 = std::get_if(&ip_column) != nullptr; auto get_keys = [is_ipv4](const Columns & columns, const std::vector & dict_attributes) @@ -881,12 +826,12 @@ BlockInputStreamPtr IPAddressDictionary::getBlockInputStream(const Names & colum if (is_ipv4) { auto get_view = keyViewGetter, true>(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } auto get_view = keyViewGetter(); - return std::make_shared( + return std::make_shared( shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view)); } @@ -978,7 +923,7 @@ void registerDictionaryTrie(DictionaryFactory & factory) DictionarySourcePtr source_ptr) -> DictionaryPtr { if (!dict_struct.key || dict_struct.key->size() != 1) - throw Exception{"Dictionary of layout 'ip_trie' has to have one 'key'", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary of layout 'ip_trie' has to have one 'key'"); const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index 6c5cfa765e8..619d1579e4e 
100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -20,7 +20,7 @@ namespace DB { -class IPAddressDictionary final : public IDictionaryBase +class IPAddressDictionary final : public IDictionary { public: IPAddressDictionary( @@ -67,7 +67,7 @@ public: const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; @@ -101,6 +101,7 @@ private: Decimal32, Decimal64, Decimal128, + Decimal256, Float32, Float64, String> @@ -118,6 +119,7 @@ private: ContainerType, ContainerType, ContainerType, + ContainerType, ContainerType, ContainerType, ContainerType> diff --git a/src/Dictionaries/LibraryDictionarySource.cpp b/src/Dictionaries/LibraryDictionarySource.cpp index 6d763444b54..a971ba4b1be 100644 --- a/src/Dictionaries/LibraryDictionarySource.cpp +++ b/src/Dictionaries/LibraryDictionarySource.cpp @@ -1,4 +1,5 @@ #include "LibraryDictionarySource.h" + #include #include #include @@ -6,304 +7,185 @@ #include #include #include -#include #include "DictionarySourceFactory.h" +#include "DictionarySourceHelpers.h" #include "DictionaryStructure.h" -#include "LibraryDictionarySourceExternal.h" #include "registerDictionaries.h" +#include +#include + namespace DB { + namespace ErrorCodes { - extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int FILE_DOESNT_EXIST; extern const int EXTERNAL_LIBRARY_ERROR; extern const int PATH_ACCESS_DENIED; } -class CStringsHolder -{ -public: - using Container = std::vector; - explicit CStringsHolder(const Container & strings_pass) - { - strings_holder = strings_pass; - strings.size = strings_holder.size(); - ptr_holder = std::make_unique(strings.size); - strings.data = ptr_holder.get(); - size_t i = 0; - for (auto & str : strings_holder) - { - strings.data[i] = str.c_str(); - ++i; - } - } - - ClickHouseLibrary::CStrings strings; // will pass pointer to lib - -private: - std::unique_ptr ptr_holder = nullptr; - Container strings_holder; -}; - - -namespace -{ - constexpr auto lib_config_settings = ".settings"; - - - CStringsHolder getLibSettings(const Poco::Util::AbstractConfiguration & config, const std::string & config_root) - { - Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(config_root, config_keys); - CStringsHolder::Container strings; - for (const auto & key : config_keys) - { - std::string key_name = key; - auto bracket_pos = key.find('['); - if (bracket_pos != std::string::npos && bracket_pos > 0) - key_name = key.substr(0, bracket_pos); - strings.emplace_back(key_name); - strings.emplace_back(config.getString(config_root + "." + key)); - } - return CStringsHolder(strings); - } - - - Block dataToBlock(const Block & sample_block, const void * data) - { - if (!data) - throw Exception("LibraryDictionarySource: No data returned", ErrorCodes::EXTERNAL_LIBRARY_ERROR); - - const auto * columns_received = static_cast(data); - if (columns_received->error_code) - throw Exception( - "LibraryDictionarySource: Returned error: " + std::to_string(columns_received->error_code) + " " - + (columns_received->error_string ? 
columns_received->error_string : ""), - ErrorCodes::EXTERNAL_LIBRARY_ERROR); - - MutableColumns columns(sample_block.columns()); - for (const auto i : ext::range(0, columns.size())) - columns[i] = sample_block.getByPosition(i).column->cloneEmpty(); - - for (size_t col_n = 0; col_n < columns_received->size; ++col_n) - { - if (columns.size() != columns_received->data[col_n].size) - throw Exception( - "LibraryDictionarySource: Returned unexpected number of columns: " + std::to_string(columns_received->data[col_n].size) - + ", must be " + std::to_string(columns.size()), - ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); - - for (size_t row_n = 0; row_n < columns_received->data[col_n].size; ++row_n) - { - const auto & field = columns_received->data[col_n].data[row_n]; - if (!field.data) - { - /// sample_block contains null_value (from config) inside corresponding column - const auto & col = sample_block.getByPosition(row_n); - columns[row_n]->insertFrom(*(col.column), 0); - } - else - { - const auto & size = field.size; - columns[row_n]->insertData(static_cast(field.data), size); - } - } - } - - return sample_block.cloneWithColumns(std::move(columns)); - } -} - - LibraryDictionarySource::LibraryDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix_, Block & sample_block_, - const Context & context, + ContextPtr context_, bool check_config) : log(&Poco::Logger::get("LibraryDictionarySource")) , dict_struct{dict_struct_} , config_prefix{config_prefix_} , path{config.getString(config_prefix + ".path", "")} + , dictionary_id(getDictID()) , sample_block{sample_block_} + , context(Context::createCopy(context_)) { if (check_config) { - const String dictionaries_lib_path = context.getDictionariesLibPath(); + const String dictionaries_lib_path = context->getDictionariesLibPath(); if (!startsWith(path, dictionaries_lib_path)) - throw Exception("LibraryDictionarySource: Library path " + path + " is not inside " + dictionaries_lib_path, ErrorCodes::PATH_ACCESS_DENIED); + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "LibraryDictionarySource: Library path {} is not inside {}", path, dictionaries_lib_path); } if (!Poco::File(path).exists()) - throw Exception( - "LibraryDictionarySource: Can't load library " + Poco::File(path).path() + ": file doesn't exist", - ErrorCodes::FILE_DOESNT_EXIST); + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "LibraryDictionarySource: Can't load library {}: file doesn't exist", Poco::File(path).path()); description.init(sample_block); - library = std::make_shared(path, RTLD_LAZY -#if defined(RTLD_DEEPBIND) && !defined(ADDRESS_SANITIZER) // Does not exists in FreeBSD. Cannot work with Address Sanitizer. 
- | RTLD_DEEPBIND -#endif - ); - settings = std::make_shared(getLibSettings(config, config_prefix + lib_config_settings)); - if (auto lib_new = library->tryGetstrings), decltype(&ClickHouseLibrary::log))>( - "ClickHouseDictionary_v3_libNew")) - lib_data = lib_new(&settings->strings, ClickHouseLibrary::log); + bridge_helper = std::make_shared(context, description.sample_block, dictionary_id); + auto res = bridge_helper->initLibrary(path, getLibrarySettingsString(config, config_prefix + ".settings"), getDictAttributesString()); + + if (!res) + throw Exception(ErrorCodes::EXTERNAL_LIBRARY_ERROR, "Failed to create shared library from path: {}", path); } + +LibraryDictionarySource::~LibraryDictionarySource() +{ + bridge_helper->removeLibrary(); +} + + LibraryDictionarySource::LibraryDictionarySource(const LibraryDictionarySource & other) : log(&Poco::Logger::get("LibraryDictionarySource")) , dict_struct{other.dict_struct} , config_prefix{other.config_prefix} , path{other.path} + , dictionary_id{getDictID()} , sample_block{other.sample_block} - , library{other.library} + , context(other.context) , description{other.description} - , settings{other.settings} { - if (auto lib_clone = library->tryGet("ClickHouseDictionary_v3_libClone")) - lib_data = lib_clone(other.lib_data); - else if ( - auto lib_new = library->tryGetstrings), decltype(&ClickHouseLibrary::log))>( - "ClickHouseDictionary_v3_libNew")) - lib_data = lib_new(&settings->strings, ClickHouseLibrary::log); + bridge_helper = std::make_shared(context, description.sample_block, dictionary_id); + bridge_helper->cloneLibrary(other.dictionary_id); } -LibraryDictionarySource::~LibraryDictionarySource() + +bool LibraryDictionarySource::isModified() const { - if (auto lib_delete = library->tryGet("ClickHouseDictionary_v3_libDelete")) - lib_delete(lib_data); + return bridge_helper->isModified(); } + +bool LibraryDictionarySource::supportsSelectiveLoad() const +{ + return bridge_helper->supportsSelectiveLoad(); +} + + BlockInputStreamPtr LibraryDictionarySource::loadAll() { LOG_TRACE(log, "loadAll {}", toString()); - - auto columns_holder = std::make_unique(dict_struct.attributes.size()); - ClickHouseLibrary::CStrings columns{static_cast(columns_holder.get()), - dict_struct.attributes.size()}; - size_t i = 0; - for (const auto & a : dict_struct.attributes) - { - columns.data[i] = a.name.c_str(); - ++i; - } - void * data_ptr = nullptr; - - /// Get function pointer before dataNew call because library->get may throw. 
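The removed code in this hunk loaded the dictionary library straight into the server process: a SharedLibrary opened with RTLD_LAZY (plus RTLD_DEEPBIND where available) and ClickHouseDictionary_v3_* entry points resolved by name, which the new bridge_helper calls replace with requests to the external library bridge. For reference, a generic POSIX dlopen/dlsym sketch of that in-process mechanism; the library path and symbol name are invented, this is not the real ClickHouse library ABI, and the program needs -ldl to link:

```cpp
#include <dlfcn.h>
#include <cstdio>

int main()
{
    // "./libplugin.so" and "plugin_entry" are invented names used only for illustration.
    void * handle = dlopen("./libplugin.so", RTLD_LAZY);
    if (!handle)
    {
        std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    // Resolve the entry point before doing any work that would need cleanup,
    // for the same reason the removed code fetched function pointers before dataNew.
    using EntryFn = int (*)();
    auto entry = reinterpret_cast<EntryFn>(dlsym(handle, "plugin_entry"));
    if (!entry)
    {
        std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
        dlclose(handle);
        return 1;
    }

    int rc = entry();
    std::printf("plugin returned %d\n", rc);

    dlclose(handle);
    return rc;
}
```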
- auto func_load_all - = library->getstrings), decltype(&columns))>("ClickHouseDictionary_v3_loadAll"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_all(data_ptr, &settings->strings, &columns); - auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); - return std::make_shared(block); + return bridge_helper->loadAll(); } + BlockInputStreamPtr LibraryDictionarySource::loadIds(const std::vector & ids) { LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size()); - - const ClickHouseLibrary::VectorUInt64 ids_data{ext::bit_cast(ids.data()), ids.size()}; - auto columns_holder = std::make_unique(dict_struct.attributes.size()); - ClickHouseLibrary::CStrings columns_pass{static_cast(columns_holder.get()), - dict_struct.attributes.size()}; - size_t i = 0; - for (const auto & a : dict_struct.attributes) - { - columns_pass.data[i] = a.name.c_str(); - ++i; - } - void * data_ptr = nullptr; - - /// Get function pointer before dataNew call because library->get may throw. - auto func_load_ids - = library->getstrings), decltype(&columns_pass), decltype(&ids_data))>( - "ClickHouseDictionary_v3_loadIds"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_ids(data_ptr, &settings->strings, &columns_pass, &ids_data); - auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); - return std::make_shared(block); + return bridge_helper->loadIds(getDictIdsString(ids)); } + BlockInputStreamPtr LibraryDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size()); - - auto holder = std::make_unique(key_columns.size()); - std::vector> column_data_holders; - for (size_t i = 0; i < key_columns.size(); ++i) - { - auto cell_holder = std::make_unique(requested_rows.size()); - for (size_t j = 0; j < requested_rows.size(); ++j) - { - auto data_ref = key_columns[i]->getDataAt(requested_rows[j]); - cell_holder[j] = ClickHouseLibrary::Field{.data = static_cast(data_ref.data), .size = data_ref.size}; - } - holder[i] - = ClickHouseLibrary::Row{.data = static_cast(cell_holder.get()), .size = requested_rows.size()}; - - column_data_holders.push_back(std::move(cell_holder)); - } - - ClickHouseLibrary::Table request_cols{.data = static_cast(holder.get()), .size = key_columns.size()}; - - void * data_ptr = nullptr; - /// Get function pointer before dataNew call because library->get may throw. 
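/// loadIds() above no longer marshals raw pointers across a C ABI: the requested ids are serialised
/// into a single string (getDictIdsString, defined further down in this file) and sent to the bridge.
/// A minimal round-trip sketch using ClickHouse's paired IO helpers; the function names are
/// illustrative, and the deserialising side is an assumption about the bridge, not shown in this patch.
#include <cstdint>
#include <string>
#include <vector>

#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>

std::string serializeIds(const std::vector<uint64_t> & ids)
{
    DB::WriteBufferFromOwnString out;
    DB::writeVectorBinary(ids, out);   /// size prefix followed by the elements
    return out.str();
}

std::vector<uint64_t> deserializeIds(const std::string & payload)
{
    DB::ReadBufferFromString in(payload);
    std::vector<uint64_t> ids;
    DB::readVectorBinary(ids, in);
    return ids;
}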
- auto func_load_keys = library->getstrings), decltype(&request_cols))>( - "ClickHouseDictionary_v3_loadKeys"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_keys(data_ptr, &settings->strings, &request_cols); - auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); - return std::make_shared(block); + auto block = blockForKeys(dict_struct, key_columns, requested_rows); + return bridge_helper->loadKeys(block); } -bool LibraryDictionarySource::isModified() const -{ - if (auto func_is_modified - = library->tryGetstrings))>("ClickHouseDictionary_v3_isModified")) - return func_is_modified(lib_data, &settings->strings); - return true; -} - -bool LibraryDictionarySource::supportsSelectiveLoad() const -{ - if (auto func_supports_selective_load - = library->tryGetstrings))>("ClickHouseDictionary_v3_supportsSelectiveLoad")) - return func_supports_selective_load(lib_data, &settings->strings); - return true; -} DictionarySourcePtr LibraryDictionarySource::clone() const { return std::make_unique(*this); } + std::string LibraryDictionarySource::toString() const { return path; } + +String LibraryDictionarySource::getLibrarySettingsString(const Poco::Util::AbstractConfiguration & config, const std::string & config_root) +{ + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_root, config_keys); + WriteBufferFromOwnString out; + std::vector settings; + + for (const auto & key : config_keys) + { + std::string key_name = key; + auto bracket_pos = key.find('['); + + if (bracket_pos != std::string::npos && bracket_pos > 0) + key_name = key.substr(0, bracket_pos); + + settings.push_back(key_name); + settings.push_back(config.getString(config_root + "." 
+ key)); + } + + writeVectorBinary(settings, out); + return out.str(); +} + + +String LibraryDictionarySource::getDictIdsString(const std::vector & ids) +{ + WriteBufferFromOwnString out; + writeVectorBinary(ids, out); + return out.str(); +} + + +String LibraryDictionarySource::getDictAttributesString() +{ + std::vector attributes_names(dict_struct.attributes.size()); + for (size_t i = 0; i < dict_struct.attributes.size(); ++i) + attributes_names[i] = dict_struct.attributes[i].name; + WriteBufferFromOwnString out; + writeVectorBinary(attributes_names, out); + return out.str(); +} + + void registerDictionarySourceLibrary(DictionarySourceFactory & factory) { auto create_table_source = [=](const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr context, const std::string & /* default_database */, bool check_config) -> DictionarySourcePtr { return std::make_unique(dict_struct, config, config_prefix + ".library", sample_block, context, check_config); }; + factory.registerSource("library", create_table_source); } + } diff --git a/src/Dictionaries/LibraryDictionarySource.h b/src/Dictionaries/LibraryDictionarySource.h index 4d73b3f97d4..1ab47c5a06f 100644 --- a/src/Dictionaries/LibraryDictionarySource.h +++ b/src/Dictionaries/LibraryDictionarySource.h @@ -1,10 +1,13 @@ #pragma once #include +#include #include +#include #include "DictionaryStructure.h" #include #include "IDictionarySource.h" +#include namespace Poco @@ -17,18 +20,17 @@ namespace Util } } - namespace DB { + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; } -class CStringsHolder; -/// Allows loading dictionaries from dynamic libraries (.so) -/// Experimental version -/// Example: tests/external_dictionaries/dictionary_library/dictionary_library.cpp +class CStringsHolder; +using LibraryBridgeHelperPtr = std::shared_ptr; + class LibraryDictionarySource final : public IDictionarySource { public: @@ -37,7 +39,7 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix_, Block & sample_block_, - const Context & context, + ContextPtr context_, bool check_config); LibraryDictionarySource(const LibraryDictionarySource & other); @@ -49,7 +51,7 @@ public: BlockInputStreamPtr loadUpdatedAll() override { - throw Exception{"Method loadUpdatedAll is unsupported for LibraryDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadUpdatedAll is unsupported for LibraryDictionarySource"); } BlockInputStreamPtr loadIds(const std::vector & ids) override; @@ -68,18 +70,26 @@ public: std::string toString() const override; private: - Poco::Logger * log; + static String getDictIdsString(const std::vector & ids); - LocalDateTime getLastModification() const; + String getDictAttributesString(); + + static String getLibrarySettingsString(const Poco::Util::AbstractConfiguration & config, const std::string & config_root); + + static Field getDictID() { return UUIDHelpers::generateV4(); } + + Poco::Logger * log; const DictionaryStructure dict_struct; const std::string config_prefix; const std::string path; + const Field dictionary_id; + Block sample_block; - SharedLibraryPtr library; + ContextPtr context; + + LibraryBridgeHelperPtr bridge_helper; ExternalResultDescription description; - std::shared_ptr settings; - void * lib_data = nullptr; }; } diff --git a/src/Dictionaries/LibraryDictionarySourceExternal.h 
b/src/Dictionaries/LibraryDictionarySourceExternal.h deleted file mode 100644 index 7a031cdb315..00000000000 --- a/src/Dictionaries/LibraryDictionarySourceExternal.h +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once - -#include - -#define CLICKHOUSE_DICTIONARY_LIBRARY_API 1 - -namespace ClickHouseLibrary -{ -using CString = const char *; -using ColumnName = CString; -using ColumnNames = ColumnName[]; - -struct CStrings -{ - CString * data = nullptr; - uint64_t size = 0; -}; - -struct VectorUInt64 -{ - const uint64_t * data = nullptr; - uint64_t size = 0; -}; - -struct ColumnsUInt64 -{ - VectorUInt64 * data = nullptr; - uint64_t size = 0; -}; - -struct Field -{ - const void * data = nullptr; - uint64_t size = 0; -}; - -struct Row -{ - const Field * data = nullptr; - uint64_t size = 0; -}; - -struct Table -{ - const Row * data = nullptr; - uint64_t size = 0; - uint64_t error_code = 0; // 0 = ok; !0 = error, with message in error_string - const char * error_string = nullptr; -}; - -enum LogLevel -{ - FATAL = 1, - CRITICAL, - ERROR, - WARNING, - NOTICE, - INFORMATION, - DEBUG, - TRACE, -}; - -void log(LogLevel level, CString msg); -} diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index 5b5d0c4d20f..0ab45dc4593 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -13,7 +13,7 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & root_config_prefix, Block & sample_block, - const Context &, + ContextPtr, const std::string & /* default_database */, bool /* check_config */) { @@ -126,7 +126,7 @@ MongoDBDictionarySource::MongoDBDictionarySource( #if POCO_VERSION >= 0x01070800 Poco::MongoDB::Database poco_db(db); if (!poco_db.authenticate(*connection, user, password, method.empty() ? 
Poco::MongoDB::Database::AUTH_SCRAM_SHA1 : method)) - throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE); + throw Exception(ErrorCodes::MONGODB_CANNOT_AUTHENTICATE, "Cannot authenticate in MongoDB, incorrect user or password"); #else authenticate(*connection, db, user, password); #endif @@ -151,7 +151,7 @@ BlockInputStreamPtr MongoDBDictionarySource::loadAll() BlockInputStreamPtr MongoDBDictionarySource::loadIds(const std::vector & ids) { if (!dict_struct.id) - throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is required for selective loading"); auto cursor = createCursor(db, collection, sample_block); @@ -172,7 +172,7 @@ BlockInputStreamPtr MongoDBDictionarySource::loadIds(const std::vector & BlockInputStreamPtr MongoDBDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { if (!dict_struct.key) - throw Exception{"'key' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is required for selective loading"); auto cursor = createCursor(db, collection, sample_block); @@ -198,6 +198,7 @@ BlockInputStreamPtr MongoDBDictionarySource::loadKeys(const Columns & key_column case AttributeUnderlyingType::utDecimal32: case AttributeUnderlyingType::utDecimal64: case AttributeUnderlyingType::utDecimal128: + case AttributeUnderlyingType::utDecimal256: key.add(attr.second.name, Int32(key_columns[attr.first]->get64(row_idx))); break; diff --git a/src/Dictionaries/MongoDBDictionarySource.h b/src/Dictionaries/MongoDBDictionarySource.h index ad7b66fe1a7..fef5749190f 100644 --- a/src/Dictionaries/MongoDBDictionarySource.h +++ b/src/Dictionaries/MongoDBDictionarySource.h @@ -50,7 +50,7 @@ public: BlockInputStreamPtr loadUpdatedAll() override { - throw Exception{"Method loadUpdatedAll is unsupported for MongoDBDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadUpdatedAll is unsupported for MongoDBDictionarySource"); } bool supportsSelectiveLoad() const override { return true; } diff --git a/src/Dictionaries/MySQLDictionarySource.cpp b/src/Dictionaries/MySQLDictionarySource.cpp index a21b1bd50fc..676863ae588 100644 --- a/src/Dictionaries/MySQLDictionarySource.cpp +++ b/src/Dictionaries/MySQLDictionarySource.cpp @@ -4,9 +4,15 @@ #include "DictionarySourceFactory.h" #include "DictionaryStructure.h" #include "registerDictionaries.h" +#include +#include namespace DB { + +[[maybe_unused]] +static const size_t default_num_tries_on_connection_loss = 3; + namespace ErrorCodes { extern const int SUPPORT_IS_DISABLED; @@ -14,22 +20,22 @@ namespace ErrorCodes void registerDictionarySourceMysql(DictionarySourceFactory & factory) { - auto create_table_source = [=](const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - Block & sample_block, - const Context & /* context */, + auto create_table_source = [=]([[maybe_unused]] const DictionaryStructure & dict_struct, + [[maybe_unused]] const Poco::Util::AbstractConfiguration & config, + [[maybe_unused]] const std::string & config_prefix, + [[maybe_unused]] Block & sample_block, + [[maybe_unused]] ContextPtr context, const std::string & /* default_database */, bool /* check_config */) -> DictionarySourcePtr { #if USE_MYSQL - return std::make_unique(dict_struct, config, 
config_prefix + ".mysql", sample_block); + StreamSettings mysql_input_stream_settings(context->getSettingsRef() + , config.getBool(config_prefix + ".mysql.close_connection", false) || config.getBool(config_prefix + ".mysql.share_connection", false) + , false + , config.getBool(config_prefix + ".mysql.fail_on_connection_loss", false) ? 1 : default_num_tries_on_connection_loss); + return std::make_unique(dict_struct, config, config_prefix + ".mysql", sample_block, mysql_input_stream_settings); #else - (void)dict_struct; - (void)config; - (void)config_prefix; - (void)sample_block; - throw Exception{"Dictionary source of type `mysql` is disabled because ClickHouse was built without mysql support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Dictionary source of type `mysql` is disabled because ClickHouse was built without mysql support."); #endif }; factory.registerSource("mysql", create_table_source); @@ -45,20 +51,21 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) # include # include # include -# include # include "readInvalidateQuery.h" +# include # include +# include namespace DB { -static const UInt64 max_block_size = 8192; MySQLDictionarySource::MySQLDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const Block & sample_block_) + const Block & sample_block_, + const StreamSettings & settings_) : log(&Poco::Logger::get("MySQLDictionarySource")) , update_time{std::chrono::system_clock::from_time_t(0)} , dict_struct{dict_struct_} @@ -68,11 +75,11 @@ MySQLDictionarySource::MySQLDictionarySource( , update_field{config.getString(config_prefix + ".update_field", "")} , dont_check_update_time{config.getBool(config_prefix + ".dont_check_update_time", false)} , sample_block{sample_block_} - , pool{mysqlxx::PoolFactory::instance().get(config, config_prefix)} + , pool{std::make_shared(mysqlxx::PoolFactory::instance().get(config, config_prefix))} , query_builder{dict_struct, db, "", table, where, IdentifierQuotingStyle::Backticks} , load_all_query{query_builder.composeLoadAllQuery()} , invalidate_query{config.getString(config_prefix + ".invalidate_query", "")} - , close_connection{config.getBool(config_prefix + ".close_connection", false) || config.getBool(config_prefix + ".share_connection", false)} + , settings(settings_) { } @@ -93,7 +100,7 @@ MySQLDictionarySource::MySQLDictionarySource(const MySQLDictionarySource & other , last_modification{other.last_modification} , invalidate_query{other.invalidate_query} , invalidate_query_response{other.invalidate_query_response} - , close_connection{other.close_connection} + , settings(other.settings) { } @@ -101,10 +108,9 @@ std::string MySQLDictionarySource::getUpdateFieldAndDate() { if (update_time != std::chrono::system_clock::from_time_t(0)) { - auto tmp_time = update_time; + time_t hr_time = std::chrono::system_clock::to_time_t(update_time) - 1; + std::string str_time = DateLUT::instance().timeToString(hr_time); update_time = std::chrono::system_clock::now(); - time_t hr_time = std::chrono::system_clock::to_time_t(tmp_time) - 1; - std::string str_time = std::to_string(LocalDateTime(hr_time)); return query_builder.composeUpdateQuery(update_field, str_time); } else @@ -114,39 +120,43 @@ std::string MySQLDictionarySource::getUpdateFieldAndDate() } } +BlockInputStreamPtr MySQLDictionarySource::loadFromQuery(const String & query) +{ + return std::make_shared( + pool, query, sample_block, 
settings); +} + BlockInputStreamPtr MySQLDictionarySource::loadAll() { - auto connection = pool.get(); + auto connection = pool->get(); last_modification = getLastModification(connection, false); LOG_TRACE(log, load_all_query); - return std::make_shared(connection, load_all_query, sample_block, max_block_size, close_connection); + return loadFromQuery(load_all_query); } BlockInputStreamPtr MySQLDictionarySource::loadUpdatedAll() { - auto connection = pool.get(); + auto connection = pool->get(); last_modification = getLastModification(connection, false); std::string load_update_query = getUpdateFieldAndDate(); LOG_TRACE(log, load_update_query); - return std::make_shared(connection, load_update_query, sample_block, max_block_size, close_connection); + return loadFromQuery(load_update_query); } BlockInputStreamPtr MySQLDictionarySource::loadIds(const std::vector & ids) { /// We do not log in here and do not update the modification time, as the request can be large, and often called. - const auto query = query_builder.composeLoadIdsQuery(ids); - return std::make_shared(pool.get(), query, sample_block, max_block_size, close_connection); + return loadFromQuery(query); } BlockInputStreamPtr MySQLDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { /// We do not log in here and do not update the modification time, as the request can be large, and often called. - const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN); - return std::make_shared(pool.get(), query, sample_block, max_block_size, close_connection); + return loadFromQuery(query); } bool MySQLDictionarySource::isModified() const @@ -162,7 +172,7 @@ bool MySQLDictionarySource::isModified() const if (dont_check_update_time) return true; - auto connection = pool.get(); + auto connection = pool->get(); return getLastModification(connection, true) > last_modification; } @@ -228,7 +238,7 @@ LocalDateTime MySQLDictionarySource::getLastModification(mysqlxx::Pool::Entry & if (!update_time_value.isNull()) { modification_time = update_time_value.getDateTime(); - LOG_TRACE(log, "Got modification time: {}", modification_time); + LOG_TRACE(log, "Got modification time: {}", update_time_value.getString()); } /// fetch remaining rows to avoid "commands out of sync" error @@ -236,7 +246,7 @@ LocalDateTime MySQLDictionarySource::getLastModification(mysqlxx::Pool::Entry & ++fetched_rows; } - if (close_connection && allow_connection_closure) + if (settings.auto_close && allow_connection_closure) { connection.disconnect(); } @@ -260,7 +270,7 @@ std::string MySQLDictionarySource::doInvalidateQuery(const std::string & request Block invalidate_sample_block; ColumnPtr column(ColumnString::create()); invalidate_sample_block.insert(ColumnWithTypeAndName(column, std::make_shared(), "Sample Block")); - MySQLBlockInputStream block_input_stream(pool.get(), request, invalidate_sample_block, 1, close_connection); + MySQLBlockInputStream block_input_stream(pool->get(), request, invalidate_sample_block, settings); return readInvalidateQuery(block_input_stream); } diff --git a/src/Dictionaries/MySQLDictionarySource.h b/src/Dictionaries/MySQLDictionarySource.h index 34f784cdfeb..ef1d81b862f 100644 --- a/src/Dictionaries/MySQLDictionarySource.h +++ b/src/Dictionaries/MySQLDictionarySource.h @@ -12,7 +12,7 @@ # include "DictionaryStructure.h" # include "ExternalQueryBuilder.h" # include "IDictionarySource.h" - +# include namespace Poco { @@ -34,8 +34,9 @@ public: 
MySQLDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - const Block & sample_block_); + const String & config_prefix, + const Block & sample_block_, + const StreamSettings & settings_); /// copy-constructor is provided in order to support cloneability MySQLDictionarySource(const MySQLDictionarySource & other); @@ -60,6 +61,8 @@ public: std::string toString() const override; private: + BlockInputStreamPtr loadFromQuery(const String & query); + std::string getUpdateFieldAndDate(); static std::string quoteForLike(const std::string s); @@ -79,13 +82,13 @@ private: const std::string update_field; const bool dont_check_update_time; Block sample_block; - mutable mysqlxx::PoolWithFailover pool; + mutable mysqlxx::PoolWithFailoverPtr pool; ExternalQueryBuilder query_builder; const std::string load_all_query; LocalDateTime last_modification; std::string invalidate_query; mutable std::string invalidate_query_response; - const bool close_connection; + const StreamSettings settings; }; } diff --git a/src/Dictionaries/PolygonDictionary.cpp b/src/Dictionaries/PolygonDictionary.cpp index e0d0fa0a0e6..64d435cf632 100644 --- a/src/Dictionaries/PolygonDictionary.cpp +++ b/src/Dictionaries/PolygonDictionary.cpp @@ -1,4 +1,8 @@ #include "PolygonDictionary.h" + +#include +#include + #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" @@ -8,8 +12,6 @@ #include #include -#include - namespace DB { @@ -28,70 +30,16 @@ IPolygonDictionary::IPolygonDictionary( const DictionaryLifetime dict_lifetime_, InputType input_type_, PointType point_type_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr(std::move(source_ptr_)) , dict_lifetime(dict_lifetime_) , input_type(input_type_) , point_type(point_type_) { - createAttributes(); + setup(); loadData(); -} - -std::string IPolygonDictionary::getTypeName() const -{ - return "Polygon"; -} - -std::string IPolygonDictionary::getKeyDescription() const -{ - return dict_struct.getKeyDescription(); -} - -size_t IPolygonDictionary::getBytesAllocated() const -{ - return bytes_allocated; -} - -size_t IPolygonDictionary::getQueryCount() const -{ - return query_count.load(std::memory_order_relaxed); -} - -double IPolygonDictionary::getHitRate() const -{ - return 1.0; -} - -size_t IPolygonDictionary::getElementCount() const -{ - return element_count; -} - -double IPolygonDictionary::getLoadFactor() const -{ - return 1.0; -} - -const IDictionarySource * IPolygonDictionary::getSource() const -{ - return source_ptr.get(); -} - -const DictionaryLifetime & IPolygonDictionary::getLifetime() const -{ - return dict_lifetime; -} - -const DictionaryStructure & IPolygonDictionary::getStructure() const -{ - return dict_struct; -} - -bool IPolygonDictionary::isInjective(const std::string &) const -{ - return false; + calculateBytesAllocated(); } ColumnPtr IPolygonDictionary::getColumn( @@ -99,52 +47,102 @@ ColumnPtr IPolygonDictionary::getColumn( const DataTypePtr & result_type, const Columns & key_columns, const DataTypes &, - const ColumnPtr default_values_column) const + const ColumnPtr & default_values_column) const { - ColumnPtr result; + const auto requested_key_points = extractPoints(key_columns); - const auto index = getAttributeIndex(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); + const auto & attribute = dict_struct.getAttribute(attribute_name, result_type); + bool 
complex_attribute = attribute.is_nullable || attribute.is_array; + DefaultValueProvider default_value_provider(attribute.null_value, default_values_column); - auto keys_size = key_columns.front()->size(); + size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second; + const auto & attribute_values_column = attributes[attribute_index]; - auto type_call = [&](const auto &dictionary_attribute_type) + auto result = attribute_values_column->cloneEmpty(); + result->reserve(requested_key_points.size()); + + Field row_value_to_insert; + size_t polygon_index = 0; + + if (unlikely(complex_attribute)) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto & null_value = std::get(null_values[index]); - DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) + for (size_t requested_key_index = 0; requested_key_index < requested_key_points.size(); ++requested_key_index) { - auto column_string = ColumnString::create(); - auto * out = column.get(); + const auto found = find(requested_key_points[requested_key_index], polygon_index); - getItemsImpl( - index, - key_columns, - [&](const size_t, const StringRef & value) { out->insertData(value.data, value.size); }, - default_value_extractor); + if (found) + { + size_t attribute_values_index = polygon_index_to_attribute_value_index[polygon_index]; + attribute_values_column->get(attribute_values_index, row_value_to_insert); + } + else + row_value_to_insert = default_value_provider.getDefaultValue(requested_key_index); + + result->insert(row_value_to_insert); } - else + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) { - auto & out = column->getData(); + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = std::conditional_t< + std::is_same_v, + ColumnString, + std::conditional_t, ColumnDecimal, ColumnVector>>; - getItemsImpl( - index, - key_columns, - [&](const size_t row, const auto value) { return out[row] = value; }, - default_value_extractor); - } + const auto attribute_values_column_typed = typeid_cast(attribute_values_column.get()); + if (!attribute_values_column_typed) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "An attribute type should be same as dictionary type"); - result = std::move(column); - }; + ColumnType & result_column_typed = static_cast(*result); - callOnDictionaryAttributeType(dict_struct.attributes[index].underlying_type, type_call); + if constexpr (std::is_same_v) + { + for (size_t requested_key_index = 0; requested_key_index < requested_key_points.size(); ++requested_key_index) + { + const auto found = find(requested_key_points[requested_key_index], polygon_index); + + if (found) + { + size_t attribute_values_index = polygon_index_to_attribute_value_index[polygon_index]; + auto data_to_insert = attribute_values_column->getDataAt(attribute_values_index); + result_column_typed.insertData(data_to_insert.data, data_to_insert.size); + } + else + result_column_typed.insert(default_value_provider.getDefaultValue(requested_key_index)); + } + } + else + { + auto & attribute_data = attribute_values_column_typed->getData(); + auto & result_data = result_column_typed.getData(); + + for (size_t requested_key_index = 0; requested_key_index < 
requested_key_points.size(); ++requested_key_index) + { + const auto found = find(requested_key_points[requested_key_index], polygon_index); + + if (found) + { + size_t attribute_values_index = polygon_index_to_attribute_value_index[polygon_index]; + auto & item = attribute_data[attribute_values_index]; + result_data.emplace_back(item); + } + else + { + row_value_to_insert = default_value_provider.getDefaultValue(requested_key_index); + result_data.emplace_back(row_value_to_insert.template get>()); + } + } + } + }; + + callOnDictionaryAttributeType(attribute.underlying_type, type_call); + } + + query_count.fetch_add(requested_key_points.size(), std::memory_order_relaxed); return result; } @@ -153,103 +151,42 @@ BlockInputStreamPtr IPolygonDictionary::getBlockInputStream(const Names &, size_ { // TODO: In order for this to work one would first have to support retrieving arrays from dictionaries. // I believe this is a separate task done by some other people. - throw Exception{"Reading the dictionary is not allowed", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Reading the dictionary is not allowed"); } -template -void IPolygonDictionary::appendNullValueImpl(const Field & null_value) +void IPolygonDictionary::setup() { - null_values.emplace_back(T(null_value.get>())); -} + attributes.reserve(dict_struct.attributes.size()); -void IPolygonDictionary::appendNullValue(AttributeUnderlyingType type, const Field & null_value) -{ - switch (type) + for (const auto & attribute : dict_struct.attributes) { - case AttributeUnderlyingType::utUInt8: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utUInt16: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utUInt32: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utUInt64: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utUInt128: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utInt8: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utInt16: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utInt32: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utInt64: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utFloat32: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utFloat64: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utDecimal32: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utDecimal64: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utDecimal128: - appendNullValueImpl(null_value); - break; - case AttributeUnderlyingType::utString: - appendNullValueImpl(null_value); - break; - } -} + auto column = attribute.type->createColumn(); + attributes.emplace_back(std::move(column)); -void IPolygonDictionary::createAttributes() -{ - attributes.resize(dict_struct.attributes.size()); - for (size_t i = 0; i < dict_struct.attributes.size(); ++i) - { - const auto & attr = dict_struct.attributes[i]; - attribute_index_by_name.emplace(attr.name, i); - - appendNullValue(attr.underlying_type, attr.null_value); - - if (attr.hierarchical) - throw Exception{ErrorCodes::TYPE_MISMATCH, + if (attribute.hierarchical) + throw Exception(ErrorCodes::TYPE_MISMATCH, "{}: hierarchical attributes not supported for dictionary of polygonal type", - getDictionaryID().getNameForLogs()}; + 
getDictionaryID().getNameForLogs()); } } void IPolygonDictionary::blockToAttributes(const DB::Block & block) { const auto rows = block.rows(); - element_count += rows; + + size_t skip_key_column_offset = 1; for (size_t i = 0; i < attributes.size(); ++i) { - const auto & column = block.safeGetByPosition(i + 1); - if (attributes[i]) - { - MutableColumnPtr mutated = IColumn::mutate(std::move(attributes[i])); - mutated->insertRangeFrom(*column.column, 0, column.column->size()); - attributes[i] = std::move(mutated); - } - else - attributes[i] = column.column; + const auto & block_column = block.safeGetByPosition(i + skip_key_column_offset); + const auto & column = block_column.column; + + attributes[i]->assumeMutable()->insertRangeFrom(*column, 0, column->size()); } + /** Multi-polygons could cause bigger sizes, but this is better than nothing. */ polygons.reserve(polygons.size() + rows); - ids.reserve(ids.size() + rows); + polygon_index_to_attribute_value_index.reserve(polygon_index_to_attribute_value_index.size() + rows); + const auto & key = block.safeGetByPosition(0).column; extractPolygons(key); } @@ -262,114 +199,104 @@ void IPolygonDictionary::loadData() blockToAttributes(block); stream->readSuffix(); - std::vector areas; - areas.reserve(polygons.size()); + + /// Correct and sort polygons by area and update polygon_index_to_attribute_value_index after sort + PaddedPODArray areas; + areas.resize_fill(polygons.size()); std::vector> polygon_ids; polygon_ids.reserve(polygons.size()); + for (size_t i = 0; i < polygons.size(); ++i) { auto & polygon = polygons[i]; bg::correct(polygon); - areas.push_back(bg::area(polygon)); + + areas[i] = bg::area(polygon); polygon_ids.emplace_back(polygon, i); } - sort(polygon_ids.begin(), polygon_ids.end(), [& areas](const auto & lhs, const auto & rhs) + + std::sort(polygon_ids.begin(), polygon_ids.end(), [& areas](const auto & lhs, const auto & rhs) { return areas[lhs.second] < areas[rhs.second]; }); + std::vector correct_ids; correct_ids.reserve(polygon_ids.size()); + for (size_t i = 0; i < polygon_ids.size(); ++i) { auto & polygon = polygon_ids[i]; - correct_ids.emplace_back(ids[polygon.second]); + correct_ids.emplace_back(polygon_index_to_attribute_value_index[polygon.second]); polygons[i] = polygon.first; } - ids = correct_ids; + + polygon_index_to_attribute_value_index = std::move(correct_ids); } void IPolygonDictionary::calculateBytesAllocated() { - // TODO:: Account for key. 
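/// loadData() above now sorts the corrected polygons by ascending area and rebuilds
/// polygon_index_to_attribute_value_index, so find() can prefer the smallest containing polygon
/// while getColumn() still reaches the correct attribute row. A standalone sketch of that
/// remapping step (names are illustrative, not part of the patch):
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

/// Given per-polygon areas and the current polygon -> attribute-row mapping, return the mapping
/// that corresponds to the polygons re-ordered by ascending area.
std::vector<size_t> remapByAscendingArea(const std::vector<double> & areas, const std::vector<size_t> & polygon_to_row)
{
    std::vector<size_t> order(areas.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](size_t lhs, size_t rhs) { return areas[lhs] < areas[rhs]; });

    std::vector<size_t> remapped;
    remapped.reserve(order.size());
    for (size_t old_polygon_index : order)
        remapped.push_back(polygon_to_row[old_polygon_index]);   /// e.g. areas {5, 1, 3} give order {1, 2, 0}
    return remapped;
}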
+ /// Index allocated by subclass not counted because it take a small part in relation to attributes and polygons + for (const auto & column : attributes) bytes_allocated += column->allocatedBytes(); + + for (auto & polygon : polygons) + bytes_allocated += bg::num_points(polygon) * sizeof(Point); } std::vector IPolygonDictionary::extractPoints(const Columns & key_columns) { if (key_columns.size() != 2) - throw Exception{"Expected two columns of coordinates", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected two columns of coordinates with type Float64"); + const auto * column_x = typeid_cast*>(key_columns[0].get()); const auto * column_y = typeid_cast*>(key_columns[1].get()); + if (!column_x || !column_y) - throw Exception{"Expected columns of Float64", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected columns of Float64"); + const auto rows = key_columns.front()->size(); + std::vector result; result.reserve(rows); + for (const auto row : ext::range(0, rows)) - result.emplace_back(column_x->getElement(row), column_y->getElement(row)); + { + auto x = column_x->getElement(row); + auto y = column_y->getElement(row); + + if (isNaN(x) || isNaN(y)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "PolygonDictionary input point component must not be NaN"); + + if (std::isinf(x) || std::isinf(y)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "PolygonDictionary input point component must not be infinite"); + + result.emplace_back(x, y); + } + return result; } ColumnUInt8::Ptr IPolygonDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const { - auto size = key_columns.front()->size(); - auto result = ColumnUInt8::create(size); + std::vector points = extractPoints(key_columns); + + auto result = ColumnUInt8::create(points.size()); auto& out = result->getData(); - size_t row = 0; - for (const auto & pt : extractPoints(key_columns)) + for (size_t i = 0; i < points.size(); ++i) { - size_t trash = 0; - out[row] = find(pt, trash); - ++row; - } - - query_count.fetch_add(row, std::memory_order_relaxed); - - return result; -} - -size_t IPolygonDictionary::getAttributeIndex(const std::string & attribute_name) const -{ - const auto it = attribute_index_by_name.find(attribute_name); - if (it == attribute_index_by_name.end()) - throw Exception{"No such attribute: " + attribute_name, ErrorCodes::BAD_ARGUMENTS}; - return it->second; -} - -template -void IPolygonDictionary::getItemsImpl( - size_t attribute_ind, - const Columns & key_columns, - ValueSetter && set_value, - DefaultValueExtractor & default_value_extractor) const -{ - const auto points = extractPoints(key_columns); - - using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; - using ColType = std::conditional_t::value, ColumnString, ColVecType>; - const auto column = typeid_cast(attributes[attribute_ind].get()); - if (!column) - throw Exception{"An attribute should be a column of its type", ErrorCodes::BAD_ARGUMENTS}; - for (const auto i : ext::range(0, points.size())) - { - size_t id = 0; - const auto found = find(points[i], id); - id = ids[id]; - if (!found) - { - set_value(i, static_cast(default_value_extractor[i])); - continue; - } - if constexpr (std::is_same::value) - set_value(i, static_cast(column->getDataAt(id))); - else - set_value(i, static_cast(column->getElement(id))); + size_t unused_find_result = 0; + auto & point = points[i]; + out[i] = find(point, unused_find_result); } query_count.fetch_add(points.size(), std::memory_order_relaxed); + + return 
result; } namespace @@ -465,17 +392,17 @@ const IColumn * unrollMultiPolygons(const ColumnPtr & column, Offset & offset) { const auto * ptr_multi_polygons = typeid_cast(column.get()); if (!ptr_multi_polygons) - throw Exception{"Expected a column containing arrays of polygons", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a column containing arrays of polygons"); offset.multi_polygon_offsets.assign(ptr_multi_polygons->getOffsets()); const auto * ptr_polygons = typeid_cast(&ptr_multi_polygons->getData()); if (!ptr_polygons) - throw Exception{"Expected a column containing arrays of rings when reading polygons", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a column containing arrays of rings when reading polygons"); offset.polygon_offsets.assign(ptr_polygons->getOffsets()); const auto * ptr_rings = typeid_cast(&ptr_polygons->getData()); if (!ptr_rings) - throw Exception{"Expected a column containing arrays of points when reading rings", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a column containing arrays of points when reading rings"); offset.ring_offsets.assign(ptr_rings->getOffsets()); return ptr_rings->getDataPtr().get(); @@ -485,7 +412,7 @@ const IColumn * unrollSimplePolygons(const ColumnPtr & column, Offset & offset) { const auto * ptr_polygons = typeid_cast(column.get()); if (!ptr_polygons) - throw Exception{"Expected a column containing arrays of points", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a column containing arrays of points"); offset.ring_offsets.assign(ptr_polygons->getOffsets()); std::iota(offset.polygon_offsets.begin(), offset.polygon_offsets.end(), 1); offset.multi_polygon_offsets.assign(offset.polygon_offsets); @@ -498,13 +425,13 @@ void handlePointsReprByArrays(const IColumn * column, Data & data, Offset & offs const auto * ptr_points = typeid_cast(column); const auto * ptr_coord = typeid_cast*>(&ptr_points->getData()); if (!ptr_coord) - throw Exception{"Expected coordinates to be of type Float64", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected coordinates to be of type Float64"); const auto & offsets = ptr_points->getOffsets(); IColumn::Offset prev_offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { if (offsets[i] - prev_offset != 2) - throw Exception{"All points should be two-dimensional", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "All points should be two-dimensional"); prev_offset = offsets[i]; addNewPoint(ptr_coord->getElement(2 * i), ptr_coord->getElement(2 * i + 1), data, offset); } @@ -514,13 +441,13 @@ void handlePointsReprByTuples(const IColumn * column, Data & data, Offset & offs { const auto * ptr_points = typeid_cast(column); if (!ptr_points) - throw Exception{"Expected a column of tuples representing points", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected a column of tuples representing points"); if (ptr_points->tupleSize() != 2) - throw Exception{"Points should be two-dimensional", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Points should be two-dimensional"); const auto * column_x = typeid_cast*>(&ptr_points->getColumn(0)); const auto * column_y = typeid_cast*>(&ptr_points->getColumn(1)); if (!column_x || !column_y) - throw Exception{"Expected coordinates to be of type Float64", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected 
coordinates to be of type Float64"); for (size_t i = 0; i < column_x->size(); ++i) { addNewPoint(column_x->getElement(i), column_y->getElement(i), data, offset); @@ -531,7 +458,7 @@ void handlePointsReprByTuples(const IColumn * column, Data & data, Offset & offs void IPolygonDictionary::extractPolygons(const ColumnPtr & column) { - Data data = {polygons, ids}; + Data data = {polygons, polygon_index_to_attribute_value_index}; Offset offset; const IColumn * points_collection = nullptr; @@ -546,8 +473,8 @@ void IPolygonDictionary::extractPolygons(const ColumnPtr & column) } if (!offset.allRingsHaveAPositiveArea()) - throw Exception{"Every ring included in a polygon or excluded from it should contain at least 3 points", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Every ring included in a polygon or excluded from it should contain at least 3 points"); /** Adding the first empty polygon */ data.addPolygon(true); diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index a0ea189c10a..5974e6461a7 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -24,7 +24,7 @@ namespace bg = boost::geometry; * An implementation should inherit from this base class and preprocess the data upon construction if needed. * It must override the find method of this class which retrieves the polygon containing a single point. */ -class IPolygonDictionary : public IDictionaryBase +class IPolygonDictionary : public IDictionary { public: /** Controls the different types of polygons allowed as input. @@ -57,27 +57,25 @@ public: InputType input_type_, PointType point_type_); - std::string getTypeName() const override; + std::string getTypeName() const override { return "Polygon"; } - std::string getKeyDescription() const; + size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getBytesAllocated() const override; + size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } - size_t getQueryCount() const override; + double getHitRate() const override { return 1.0; } - double getHitRate() const override; + size_t getElementCount() const override { return attributes.empty() ? 
0 : attributes.front()->size(); } - size_t getElementCount() const override; + double getLoadFactor() const override { return 1.0; } - double getLoadFactor() const override; + const IDictionarySource * getSource() const override { return source_ptr.get(); } - const IDictionarySource * getSource() const override; + const DictionaryStructure & getStructure() const override { return dict_struct; } - const DictionaryStructure & getStructure() const override; + const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - const DictionaryLifetime & getLifetime() const override; - - bool isInjective(const std::string & attribute_name) const override; + bool isInjective(const std::string & attribute_name) const override { return dict_struct.getAttribute(attribute_name).injective; } DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } @@ -86,7 +84,7 @@ public: const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; @@ -106,13 +104,9 @@ protected: * If true id is set to the index of a polygon containing the given point. * Overridden in different implementations of this interface. */ - virtual bool find(const Point & point, size_t & id) const = 0; + virtual bool find(const Point & point, size_t & polygon_index) const = 0; std::vector polygons; - /** Since the original data may have been in the form of multi-polygons, an id is stored for each single polygon - * corresponding to the row in which any other attributes for this entry are located. - */ - std::vector ids; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; @@ -126,7 +120,7 @@ private: * The polygons serving as keys are extracted into boost types. * All other values are stored in one column per attribute. */ - void createAttributes(); + void setup(); void blockToAttributes(const Block & block); void loadData(); @@ -135,13 +129,6 @@ private: /** Checks whether a given attribute exists and returns its index */ size_t getAttributeIndex(const std::string & attribute_name) const; - /** Helper functions to retrieve and instantiate the provided null value of an attribute. - * Since a null value is obligatory for every attribute they are simply appended to null_values defined below. - */ - template - void appendNullValueImpl(const Field & null_value); - void appendNullValue(AttributeUnderlyingType type, const Field & value); - /** Helper function for retrieving the value of an attribute by key. */ template void getItemsImpl( @@ -150,32 +137,16 @@ private: ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; - /** A mapping from the names of the attributes to their index in the two vectors defined below. */ - std::map attribute_index_by_name; - /** A vector of columns storing the values of each attribute. */ Columns attributes; - /** A vector of null values corresponding to each attribute. */ - std::vector> null_values; size_t bytes_allocated = 0; - size_t element_count = 0; mutable std::atomic query_count{0}; + /** Since the original data may have been in the form of multi-polygons, an id is stored for each single polygon + * corresponding to the row in which any other attributes for this entry are located. 
+ */ + std::vector polygon_index_to_attribute_value_index; + /** Extracts a list of polygons from a column according to input_type and point_type. * The polygons are appended to the dictionary with the corresponding ids. */ diff --git a/src/Dictionaries/PolygonDictionaryImplementations.cpp b/src/Dictionaries/PolygonDictionaryImplementations.cpp index 6570b112853..fb4fb605465 100644 --- a/src/Dictionaries/PolygonDictionaryImplementations.cpp +++ b/src/Dictionaries/PolygonDictionaryImplementations.cpp @@ -39,14 +39,14 @@ std::shared_ptr PolygonDictionarySimple::clone() const this->point_type); } -bool PolygonDictionarySimple::find(const Point & point, size_t & id) const +bool PolygonDictionarySimple::find(const Point & point, size_t & polygon_index) const { bool found = false; for (size_t i = 0; i < polygons.size(); ++i) { if (bg::covered_by(point, polygons[i])) { - id = i; + polygon_index = i; found = true; break; } @@ -90,7 +90,7 @@ std::shared_ptr PolygonDictionaryIndexEach::clone() con this->max_depth); } -bool PolygonDictionaryIndexEach::find(const Point & point, size_t & id) const +bool PolygonDictionaryIndexEach::find(const Point & point, size_t & polygon_index) const { const auto * cell = grid.find(point.x(), point.y()); if (cell) @@ -100,13 +100,13 @@ bool PolygonDictionaryIndexEach::find(const Point & point, size_t & id) const size_t unused; if (buckets[candidate].find(point, unused)) { - id = candidate; + polygon_index = candidate; return true; } } if (cell->first_covered != FinalCell::kNone) { - id = cell->first_covered; + polygon_index = cell->first_covered; return true; } } @@ -142,19 +142,19 @@ std::shared_ptr PolygonDictionaryIndexCell::clone() con this->max_depth); } -bool PolygonDictionaryIndexCell::find(const Point & point, size_t & id) const +bool PolygonDictionaryIndexCell::find(const Point & point, size_t & polygon_index) const { const auto * cell = index.find(point.x(), point.y()); if (cell) { - if (!(cell->corresponding_ids).empty() && cell->index.find(point, id)) + if (!(cell->corresponding_ids).empty() && cell->index.find(point, polygon_index)) { - id = cell->corresponding_ids[id]; + polygon_index = cell->corresponding_ids[polygon_index]; return true; } if (cell->first_covered != FinalCellWithSlabs::kNone) { - id = cell->first_covered; + polygon_index = cell->first_covered; return true; } } @@ -172,10 +172,10 @@ DictionaryPtr createLayout(const std::string & , const String name = config.getString(config_prefix + ".name"); if (!dict_struct.key) - throw Exception{"'key' is required for a polygon dictionary", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "'key' is required for a polygon dictionary"); if (dict_struct.key->size() != 1) - throw Exception{"The 'key' should consist of a single attribute for a polygon dictionary", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The 'key' should consist of a single attribute for a polygon dictionary"); IPolygonDictionary::InputType input_type; IPolygonDictionary::PointType point_type; @@ -206,19 +206,19 @@ DictionaryPtr createLayout(const std::string & , point_type = IPolygonDictionary::PointType::Tuple; } else - throw Exception{"The key type " + key_type->getName() + - " is not one of the following allowed types for a polygon dictionary: " + - multi_polygon_array.getName() + " " + - multi_polygon_tuple.getName() + " " + - simple_polygon_array.getName() + " " + - simple_polygon_tuple.getName() + " ", - ErrorCodes::BAD_ARGUMENTS}; + throw 
Exception(ErrorCodes::BAD_ARGUMENTS, + "The key type {} is not one of the following allowed types for a polygon dictionary: {} {} {} {} ", + key_type->getName(), + multi_polygon_array.getName(), + multi_polygon_tuple.getName(), + simple_polygon_array.getName(), + simple_polygon_tuple.getName()); if (dict_struct.range_min || dict_struct.range_max) - throw Exception{name - + ": elements range_min and range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements range_min and range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + name); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; diff --git a/src/Dictionaries/PolygonDictionaryImplementations.h b/src/Dictionaries/PolygonDictionaryImplementations.h index 24910c23430..b49bf3f4d6c 100644 --- a/src/Dictionaries/PolygonDictionaryImplementations.h +++ b/src/Dictionaries/PolygonDictionaryImplementations.h @@ -27,7 +27,7 @@ public: std::shared_ptr clone() const override; private: - bool find(const Point & point, size_t & id) const override; + bool find(const Point & point, size_t & polygon_index) const override; }; /** A polygon dictionary which generates a recursive grid in order to efficiently cut the number @@ -55,7 +55,7 @@ public: static constexpr size_t kMaxDepthDefault = 5; private: - bool find(const Point & point, size_t & id) const override; + bool find(const Point & point, size_t & polygon_index) const override; std::vector buckets; GridRoot grid; @@ -84,7 +84,7 @@ public: static constexpr size_t kMaxDepthDefault = 5; private: - bool find(const Point & point, size_t & id) const override; + bool find(const Point & point, size_t & polygon_index) const override; GridRoot index; diff --git a/src/Dictionaries/PolygonDictionaryUtils.cpp b/src/Dictionaries/PolygonDictionaryUtils.cpp index e35016aaef2..2575affbb56 100644 --- a/src/Dictionaries/PolygonDictionaryUtils.cpp +++ b/src/Dictionaries/PolygonDictionaryUtils.cpp @@ -90,7 +90,6 @@ std::vector SlabsPolygonIndex::uniqueX(const std::vector & polyg std::sort(all_x.begin(), all_x.end()); all_x.erase(std::unique(all_x.begin(), all_x.end()), all_x.end()); - LOG_TRACE(log, "Found {} unique x coordinates", all_x.size()); return all_x; } @@ -112,8 +111,6 @@ void SlabsPolygonIndex::indexBuild(const std::vector & polygons) /** Total number of edges */ size_t m = all_edges.size(); - LOG_TRACE(log, "Just sorted {} edges from all {} polygons", all_edges.size(), polygons.size()); - /** Using custom comparator for fetching edges in right_point order, like in scanline */ auto cmp = [](const Edge & a, const Edge & b) { @@ -180,8 +177,6 @@ void SlabsPolygonIndex::indexBuild(const std::vector & polygons) } } } - - LOG_TRACE(log, "Polygon index is built, total_index_edges = {}", total_index_edges); } void SlabsPolygonIndex::indexAddRing(const Ring & ring, size_t polygon_id) diff --git a/src/Dictionaries/PolygonDictionaryUtils.h b/src/Dictionaries/PolygonDictionaryUtils.h index cd99717f98a..865d78a7cad 100644 --- a/src/Dictionaries/PolygonDictionaryUtils.h +++ b/src/Dictionaries/PolygonDictionaryUtils.h @@ -73,7 +73,7 @@ public: private: /** Returns unique x coordinates among all points */ - std::vector uniqueX(const std::vector & polygons); + static std::vector uniqueX(const std::vector & polygons); /** Builds index described above */ void indexBuild(const std::vector & polygons); diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp 
b/src/Dictionaries/PostgreSQLDictionarySource.cpp index aa852404750..54022dfd5cb 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include "readInvalidateQuery.h" #endif @@ -29,11 +28,10 @@ PostgreSQLDictionarySource::PostgreSQLDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config_, const std::string & config_prefix, - PostgreSQLConnectionPtr connection_, const Block & sample_block_) : dict_struct{dict_struct_} , sample_block(sample_block_) - , connection(std::move(connection_)) + , connection(std::make_shared(config_, config_prefix)) , log(&Poco::Logger::get("PostgreSQLDictionarySource")) , db(config_.getString(fmt::format("{}.db", config_prefix), "")) , table(config_.getString(fmt::format("{}.table", config_prefix), "")) @@ -50,7 +48,7 @@ PostgreSQLDictionarySource::PostgreSQLDictionarySource( PostgreSQLDictionarySource::PostgreSQLDictionarySource(const PostgreSQLDictionarySource & other) : dict_struct(other.dict_struct) , sample_block(other.sample_block) - , connection(std::make_shared(other.connection->conn_str())) + , connection(other.connection) , log(&Poco::Logger::get("PostgreSQLDictionarySource")) , db(other.db) , table(other.table) @@ -68,8 +66,7 @@ PostgreSQLDictionarySource::PostgreSQLDictionarySource(const PostgreSQLDictionar BlockInputStreamPtr PostgreSQLDictionarySource::loadAll() { LOG_TRACE(log, load_all_query); - return std::make_shared( - connection->conn(), load_all_query, sample_block, max_block_size); + return loadBase(load_all_query); } @@ -77,23 +74,28 @@ BlockInputStreamPtr PostgreSQLDictionarySource::loadUpdatedAll() { auto load_update_query = getUpdateFieldAndDate(); LOG_TRACE(log, load_update_query); - return std::make_shared(connection->conn(), load_update_query, sample_block, max_block_size); + return loadBase(load_update_query); } BlockInputStreamPtr PostgreSQLDictionarySource::loadIds(const std::vector & ids) { const auto query = query_builder.composeLoadIdsQuery(ids); - return std::make_shared(connection->conn(), query, sample_block, max_block_size); + return loadBase(query); } BlockInputStreamPtr PostgreSQLDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN); - return std::make_shared(connection->conn(), query, sample_block, max_block_size); + return loadBase(query); } +BlockInputStreamPtr PostgreSQLDictionarySource::loadBase(const String & query) +{ + return std::make_shared(connection->get(), query, sample_block, max_block_size); +} + bool PostgreSQLDictionarySource::isModified() const { if (!invalidate_query.empty()) @@ -112,7 +114,7 @@ std::string PostgreSQLDictionarySource::doInvalidateQuery(const std::string & re Block invalidate_sample_block; ColumnPtr column(ColumnString::create()); invalidate_sample_block.insert(ColumnWithTypeAndName(column, std::make_shared(), "Sample Block")); - PostgreSQLBlockInputStream block_input_stream(connection->conn(), request, invalidate_sample_block, 1); + PostgreSQLBlockInputStream block_input_stream(connection->get(), request, invalidate_sample_block, 1); return readInvalidateQuery(block_input_stream); } @@ -127,10 +129,9 @@ std::string PostgreSQLDictionarySource::getUpdateFieldAndDate() { if (update_time != std::chrono::system_clock::from_time_t(0)) { - auto tmp_time = update_time; + 
time_t hr_time = std::chrono::system_clock::to_time_t(update_time) - 1; + std::string str_time = DateLUT::instance().timeToString(hr_time); update_time = std::chrono::system_clock::now(); - time_t hr_time = std::chrono::system_clock::to_time_t(tmp_time) - 1; - std::string str_time = std::to_string(LocalDateTime(hr_time)); return query_builder.composeUpdateQuery(update_field, str_time); } else @@ -166,28 +167,21 @@ void registerDictionarySourcePostgreSQL(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const std::string & root_config_prefix, Block & sample_block, - const Context & /* context */, + ContextPtr /* context */, const std::string & /* default_database */, bool /* check_config */) -> DictionarySourcePtr { #if USE_LIBPQXX const auto config_prefix = root_config_prefix + ".postgresql"; - auto connection = std::make_shared( - config.getString(fmt::format("{}.db", config_prefix), ""), - config.getString(fmt::format("{}.host", config_prefix), ""), - config.getUInt(fmt::format("{}.port", config_prefix), 0), - config.getString(fmt::format("{}.user", config_prefix), ""), - config.getString(fmt::format("{}.password", config_prefix), "")); - return std::make_unique( - dict_struct, config, config_prefix, connection, sample_block); + dict_struct, config, config_prefix, sample_block); #else (void)dict_struct; (void)config; (void)root_config_prefix; (void)sample_block; - throw Exception{"Dictionary source of type `postgresql` is disabled because ClickHouse was built without postgresql support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Dictionary source of type `postgresql` is disabled because ClickHouse was built without postgresql support."); #endif }; factory.registerSource("postgresql", create_table_source); diff --git a/src/Dictionaries/PostgreSQLDictionarySource.h b/src/Dictionaries/PostgreSQLDictionarySource.h index a826ff15f4f..f1520a37a79 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.h +++ b/src/Dictionaries/PostgreSQLDictionarySource.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include @@ -26,7 +26,6 @@ public: const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config_, const std::string & config_prefix, - PostgreSQLConnectionPtr connection_, const Block & sample_block_); /// copy-constructor is provided in order to support cloneability @@ -48,10 +47,11 @@ public: private: std::string getUpdateFieldAndDate(); std::string doInvalidateQuery(const std::string & request) const; + BlockInputStreamPtr loadBase(const String & query); const DictionaryStructure dict_struct; Block sample_block; - PostgreSQLConnectionPtr connection; + postgres::PoolWithFailoverPtr connection; Poco::Logger * log; const std::string db; diff --git a/src/Dictionaries/RangeDictionaryBlockInputStream.h b/src/Dictionaries/RangeDictionaryBlockInputStream.h index ccd77d49e0f..499eea7152f 100644 --- a/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -17,14 +17,14 @@ namespace DB * BlockInputStream implementation for external dictionaries * read() returns single block consisting of the in-memory contents of the dictionaries */ -template +template class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: - using DictionaryPtr = std::shared_ptr; + using Key = UInt64; RangeDictionaryBlockInputStream( - DictionaryPtr dictionary, + std::shared_ptr dictionary, size_t max_block_size, const 
Names & column_names, PaddedPODArray && ids_to_fill, @@ -40,35 +40,26 @@ private: template ColumnPtr getColumnFromPODArray(const PaddedPODArray & array) const; - template - void addSpecialColumn( - const std::optional & attribute, - DataTypePtr type, - const std::string & default_name, - const std::unordered_set & column_names_set, - const PaddedPODArray & values, - ColumnsWithTypeAndName & columns, - bool force = false) const; - Block fillBlock( const PaddedPODArray & ids_to_fill, const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - PaddedPODArray - makeDateKey(const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; + PaddedPODArray makeDateKey( + const PaddedPODArray & block_start_dates, + const PaddedPODArray & block_end_dates) const; - DictionaryPtr dictionary; - Names column_names; + std::shared_ptr dictionary; + NameSet column_names; PaddedPODArray ids; PaddedPODArray start_dates; PaddedPODArray end_dates; }; -template -RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( - DictionaryPtr dictionary_, +template +RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( + std::shared_ptr dictionary_, size_t max_block_size_, const Names & column_names_, PaddedPODArray && ids_, @@ -76,15 +67,15 @@ RangeDictionaryBlockInputStream::RangeDictionary PaddedPODArray && block_end_dates) : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) , dictionary(dictionary_) - , column_names(column_names_) + , column_names(column_names_.begin(), column_names_.end()) , ids(std::move(ids_)) , start_dates(std::move(block_start_dates)) , end_dates(std::move(block_end_dates)) { } -template -Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const +template +Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const { PaddedPODArray block_ids; PaddedPODArray block_start_dates; @@ -103,38 +94,19 @@ Block RangeDictionaryBlockInputStream::getBlock( return fillBlock(block_ids, block_start_dates, block_end_dates); } -template +template template -ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const +ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const { auto column_vector = ColumnVector::create(); column_vector->getData().reserve(array.size()); - for (T value : array) - column_vector->insertValue(value); + column_vector->getData().insert(array.begin(), array.end()); + return column_vector; } -template -template -void RangeDictionaryBlockInputStream::addSpecialColumn( - const std::optional & attribute, - DataTypePtr type, - const std::string & default_name, - const std::unordered_set & column_names_set, - const PaddedPODArray & values, - ColumnsWithTypeAndName & columns, - bool force) const -{ - std::string name = default_name; - if (attribute) - name = attribute->name; - - if (force || column_names_set.find(name) != column_names_set.end()) - columns.emplace_back(getColumnFromPODArray(values), type, name); -} - -template -PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( +template +PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const { PaddedPODArray key(block_start_dates.size()); @@ -150,8 +122,8 @@ PaddedPODArray RangeDictionaryBlockInputStream -Block RangeDictionaryBlockInputStream::fillBlock( +template +Block RangeDictionaryBlockInputStream::fillBlock( const 
PaddedPODArray & ids_to_fill, const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const @@ -159,20 +131,32 @@ Block RangeDictionaryBlockInputStream::fillBlock ColumnsWithTypeAndName columns; const DictionaryStructure & structure = dictionary->getStructure(); - std::unordered_set names(column_names.begin(), column_names.end()); - - addSpecialColumn(structure.id, std::make_shared(), "ID", names, ids_to_fill, columns, true); - auto ids_column = columns.back().column; - addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start", names, block_start_dates, columns); - addSpecialColumn(structure.range_max, structure.range_max->type, "Range End", names, block_end_dates, columns); + auto ids_column = getColumnFromPODArray(ids_to_fill); + const std::string & id_column_name = structure.id->name; + if (column_names.find(id_column_name) != column_names.end()) + columns.emplace_back(ids_column, std::make_shared(), id_column_name); auto date_key = makeDateKey(block_start_dates, block_end_dates); auto date_column = getColumnFromPODArray(date_key); + const std::string & range_min_column_name = structure.range_min->name; + if (column_names.find(range_min_column_name) != column_names.end()) + { + auto range_min_column = getColumnFromPODArray(block_start_dates); + columns.emplace_back(range_min_column, structure.range_max->type, range_min_column_name); + } + + const std::string & range_max_column_name = structure.range_max->name; + if (column_names.find(range_max_column_name) != column_names.end()) + { + auto range_max_column = getColumnFromPODArray(block_end_dates); + columns.emplace_back(range_max_column, structure.range_max->type, range_max_column_name); + } + for (const auto idx : ext::range(0, structure.attributes.size())) { const DictionaryAttribute & attribute = structure.attributes[idx]; - if (names.find(attribute.name) != names.end()) + if (column_names.find(attribute.name) != column_names.end()) { ColumnPtr column = dictionary->getColumn( attribute.name, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index 9fb1a57a381..1bdcd942b4a 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -52,7 +52,6 @@ namespace ErrorCodes extern const int DICTIONARY_IS_EMPTY; extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; - extern const int NOT_IMPLEMENTED; } bool RangeHashedDictionary::Range::isCorrectDate(const RangeStorageType & date) @@ -77,7 +76,7 @@ RangeHashedDictionary::RangeHashedDictionary( DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_) - : IDictionaryBase(dict_id_) + : IDictionary(dict_id_) , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) @@ -93,7 +92,7 @@ ColumnPtr RangeHashedDictionary::getColumn( const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const + const ColumnPtr & default_values_column) const { ColumnPtr result; @@ -178,10 +177,76 @@ ColumnPtr RangeHashedDictionary::getColumn( return result; } -ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns &, const DataTypes &) const +ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Has not supported", getDictionaryID().getNameForLogs()); + auto range_storage_column = 
key_columns[1]; + ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""}; + + auto range_column_storage_type = std::make_shared(); + auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type); + + PaddedPODArray key_backup_storage; + PaddedPODArray range_backup_storage; + + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & dates = getColumnVectorData(this, range_column_updated, range_backup_storage); + + const auto & attribute = attributes.front(); + + ColumnUInt8::Ptr result; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + result = hasKeysImpl(attribute, ids, dates); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + query_count.fetch_add(ids.size(), std::memory_order_relaxed); + + return result; +} + +template +ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl( + const Attribute & attribute, + const PaddedPODArray & ids, + const PaddedPODArray & dates) const +{ + auto result = ColumnUInt8::create(ids.size()); + auto& out = result->getData(); + + const auto & attr = *std::get>(attribute.maps); + + for (const auto row : ext::range(0, ids.size())) + { + const auto it = attr.find(ids[row]); + + if (it) + { + const auto date = dates[row]; + const auto & ranges_and_values = it->getMapped(); + const auto val_it = std::find_if( + std::begin(ranges_and_values), + std::end(ranges_and_values), + [date](const Value & v) + { + return v.range.contains(date); + }); + + if (val_it != std::end(ranges_and_values)) + out[row] = true; + else + out[row] = false; + } + else + out[row] = false; + } + + return result; } void RangeHashedDictionary::createAttributes() @@ -195,8 +260,8 @@ void RangeHashedDictionary::createAttributes() attributes.push_back(createAttribute(attribute, attribute.null_value)); if (attribute.hierarchical) - throw Exception{ErrorCodes::BAD_ARGUMENTS, "Hierarchical attributes not supported by {} dictionary.", - getDictionaryID().getNameForLogs()}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Hierarchical attributes not supported by {} dictionary.", + getDictionaryID().getNameForLogs()); } } @@ -246,8 +311,8 @@ void RangeHashedDictionary::loadData() stream->readSuffix(); if (require_nonempty && 0 == element_count) - throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", - ErrorCodes::DICTIONARY_IS_EMPTY}; + throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, + "{}: dictionary source is empty and 'require_nonempty' property is set."); } template @@ -285,7 +350,7 @@ void RangeHashedDictionary::calculateBytesAllocated() template void RangeHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { - attribute.null_values = T(null_value.get>()); + attribute.null_values = T(null_value.get()); attribute.maps = std::make_unique>(); } @@ -323,10 +388,10 @@ void RangeHashedDictionary::getItemsImpl( ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const { - PaddedPODArray key_backup_storage; + PaddedPODArray key_backup_storage; PaddedPODArray range_backup_storage; - const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); const PaddedPODArray & dates = 
getColumnVectorData(this, key_columns[1], range_backup_storage); const auto & attr = *std::get>(attribute.maps); @@ -371,7 +436,7 @@ void RangeHashedDictionary::getItemsImpl( template -void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { using ValueType = std::conditional_t, StringRef, T>; auto & map = *std::get>(attribute.maps); @@ -393,7 +458,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K } else { - value_to_insert = Value{ range, { value.get>() }}; + value_to_insert = Value{ range, { value.get() }}; } } @@ -415,7 +480,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K map.insert({id, Values{std::move(value_to_insert)}}); } -void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value) +void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value) { auto type_call = [&](const auto &dictionary_attribute_type) { @@ -432,7 +497,7 @@ const RangeHashedDictionary::Attribute & RangeHashedDictionary::getAttribute(con { const auto it = attribute_index_by_name.find(attribute_name); if (it == std::end(attribute_index_by_name)) - throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "{}: no such attribute '{}'", full_name, attribute_name); return attributes[it->second]; } @@ -442,15 +507,18 @@ RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, { const auto & attribute = getAttribute(attribute_name); if (attribute.type != type) - throw Exception{attribute_name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), - ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "attribute {} has type {}", + attribute_name, + toString(attribute.type)); return attribute; } template void RangeHashedDictionary::getIdsAndDates( - PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const + PaddedPODArray & ids, + PaddedPODArray & start_dates, + PaddedPODArray & end_dates) const { const auto & attribute = attributes.front(); @@ -458,11 +526,9 @@ void RangeHashedDictionary::getIdsAndDates( { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - if constexpr (std::is_same_v) - getIdsAndDates(attribute, ids, start_dates, end_dates); - else - getIdsAndDates(attribute, ids, start_dates, end_dates); + getIdsAndDates(attribute, ids, start_dates, end_dates); }; callOnDictionaryAttributeType(attribute.type, type_call); @@ -471,7 +537,7 @@ void RangeHashedDictionary::getIdsAndDates( template void RangeHashedDictionary::getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const { @@ -491,8 +557,9 @@ void RangeHashedDictionary::getIdsAndDates( start_dates.push_back(value.range.left); end_dates.push_back(value.range.right); - if (is_date && static_cast(end_dates.back()) > DATE_LUT_MAX_DAY_NUM) - end_dates.back() = 0; + if constexpr (std::numeric_limits::max() > DATE_LUT_MAX_DAY_NUM) /// Avoid warning about tautological comparison in next line. 
+ if (is_date && static_cast(end_dates.back()) > DATE_LUT_MAX_DAY_NUM) + end_dates.back() = 0; } } } @@ -501,18 +568,25 @@ void RangeHashedDictionary::getIdsAndDates( template BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const { - PaddedPODArray ids; + PaddedPODArray ids; PaddedPODArray start_dates; PaddedPODArray end_dates; getIdsAndDates(ids, start_dates, end_dates); - using BlockInputStreamType = RangeDictionaryBlockInputStream; - auto dict_ptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared( - dict_ptr, max_block_size, column_names, std::move(ids), std::move(start_dates), std::move(end_dates)); + using BlockInputStreamType = RangeDictionaryBlockInputStream; + + auto stream = std::make_shared( + shared_from_this(), + max_block_size, + column_names, + std::move(ids), + std::move(start_dates), + std::move(end_dates)); + + return stream; } -struct RangeHashedDIctionaryCallGetBlockInputStreamImpl +struct RangeHashedDictionaryCallGetBlockInputStreamImpl { BlockInputStreamPtr stream; const RangeHashedDictionary * dict; @@ -532,7 +606,7 @@ BlockInputStreamPtr RangeHashedDictionary::getBlockInputStream(const Names & col { using ListType = TypeList; - RangeHashedDIctionaryCallGetBlockInputStreamImpl callable; + RangeHashedDictionaryCallGetBlockInputStreamImpl callable; callable.dict = this; callable.column_names = &column_names; callable.max_block_size = max_block_size; @@ -540,8 +614,9 @@ BlockInputStreamPtr RangeHashedDictionary::getBlockInputStream(const Names & col ListType::forEach(callable); if (!callable.stream) - throw Exception( - "Unexpected range type for RangeHashed dictionary: " + dict_struct.range_min->type->getName(), ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected range type for RangeHashed dictionary: {}", + dict_struct.range_min->type->getName()); return callable.stream; } @@ -556,11 +631,12 @@ void registerDictionaryRangeHashed(DictionaryFactory & factory) DictionarySourcePtr source_ptr) -> DictionaryPtr { if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'range_hashed'", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'range_hashed'"); if (!dict_struct.range_min || !dict_struct.range_max) - throw Exception{full_name + ": dictionary of layout 'range_hashed' requires .structure.range_min and .structure.range_max", - ErrorCodes::BAD_ARGUMENTS}; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: dictionary of layout 'range_hashed' requires .structure.range_min and .structure.range_max", + full_name); const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 80cf47eb93b..8a286f530ba 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -16,7 +16,7 @@ namespace DB { -class RangeHashedDictionary final : public IDictionaryBase +class RangeHashedDictionary final : public IDictionary { public: RangeHashedDictionary( @@ -61,7 +61,7 @@ public: const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnPtr default_values_column) const override; + const ColumnPtr & default_values_column) const override; ColumnUInt8::Ptr hasKeys(const Columns & 
key_columns, const DataTypes & key_types) const override; @@ -93,8 +93,6 @@ private: template using Ptr = std::unique_ptr>; - using NullableSet = HashSet>; - struct Attribute final { public: @@ -114,6 +112,7 @@ private: Decimal32, Decimal64, Decimal128, + Decimal256, Float32, Float64, StringRef> @@ -131,6 +130,7 @@ private: Ptr, Ptr, Ptr, + Ptr, Ptr, Ptr, Ptr> @@ -159,29 +159,35 @@ private: ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; - template - static void setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value); + template + ColumnUInt8::Ptr hasKeysImpl( + const Attribute & attribute, + const PaddedPODArray & ids, + const PaddedPODArray & dates) const; - static void setAttributeValue(Attribute & attribute, const Key id, const Range & range, const Field & value); + template + static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); + + static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const; template - void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + void getIdsAndDates(PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; template void getIdsAndDates( const Attribute & attribute, - PaddedPODArray & ids, + PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; template BlockInputStreamPtr getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const; - friend struct RangeHashedDIctionaryCallGetBlockInputStreamImpl; + friend struct RangeHashedDictionaryCallGetBlockInputStreamImpl; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; diff --git a/src/Dictionaries/RedisBlockInputStream.cpp b/src/Dictionaries/RedisBlockInputStream.cpp index a5514d14155..f74ac348a99 100644 --- a/src/Dictionaries/RedisBlockInputStream.cpp +++ b/src/Dictionaries/RedisBlockInputStream.cpp @@ -57,7 +57,7 @@ namespace DB void insertValue(IColumn & column, const ValueType type, const Poco::Redis::BulkString & bulk_string) { if (bulk_string.isNull()) - throw Exception{"Type mismatch, expected not Null String", ErrorCodes::TYPE_MISMATCH}; + throw Exception(ErrorCodes::TYPE_MISMATCH, "Type mismatch, expected not Null String"); const String & string_value = bulk_string.value(); switch (type) @@ -99,13 +99,22 @@ namespace DB assert_cast(column).insertValue(parse(string_value).getDayNum()); break; case ValueType::vtDateTime: - assert_cast(column).insertValue(static_cast(parse(string_value))); + { + ReadBufferFromString in(string_value); + time_t time = 0; + readDateTimeText(time, in); + if (time < 0) + time = 0; + assert_cast(column).insertValue(time); break; + } case ValueType::vtUUID: assert_cast(column).insertValue(parse(string_value)); break; default: - throw Exception("Value of unsupported type:" + column.getName(), ErrorCodes::UNKNOWN_TYPE); + throw Exception(ErrorCodes::UNKNOWN_TYPE, + "Value of unsupported type: {}", + column.getName()); } } } @@ -145,8 +154,9 @@ namespace DB const auto & keys_array = keys.get(cursor); if (keys_array.size() < 2) { - throw Exception{"Too low keys in request to source: " + DB::toString(keys_array.size()) - + ", expected 2 or more", 
ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Too low keys in request to source: {}, expected 2 or more", + DB::toString(keys_array.size())); } if (num_rows + keys_array.size() - 1 > max_block_size) @@ -159,8 +169,8 @@ namespace DB auto values = client->execute(command_for_values); if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys - throw Exception{"Inconsistent sizes of keys and values in Redis request", - ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; + throw Exception(ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH, + "Inconsistent sizes of keys and values in Redis request"); const auto & primary_key = keys_array.get(0); for (size_t i = 0; i < values.size(); ++i) @@ -189,7 +199,8 @@ namespace DB auto values = client->execute(command_for_values); if (values.size() != need_values) - throw Exception{"Inconsistent sizes of keys and values in Redis request", ErrorCodes::INTERNAL_REDIS_ERROR}; + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, + "Inconsistent sizes of keys and values in Redis request"); for (size_t i = 0; i < values.size(); ++i) { diff --git a/src/Dictionaries/RedisDictionarySource.cpp b/src/Dictionaries/RedisDictionarySource.cpp index f1d0c0c5d3c..8144b37e63d 100644 --- a/src/Dictionaries/RedisDictionarySource.cpp +++ b/src/Dictionaries/RedisDictionarySource.cpp @@ -12,7 +12,7 @@ void registerDictionarySourceRedis(DictionarySourceFactory & factory) const Poco::Util::AbstractConfiguration & config, const String & config_prefix, Block & sample_block, - const Context & /* context */, + ContextPtr /* context */, const std::string & /* default_database */, bool /* check_config */) -> DictionarySourcePtr { return std::make_unique(dict_struct, config, config_prefix + ".redis", sample_block); @@ -66,25 +66,27 @@ namespace DB , client{std::make_shared(host, port)} { if (dict_struct.attributes.size() != 1) - throw Exception{"Invalid number of non key columns for Redis source: " + - DB::toString(dict_struct.attributes.size()) + ", expected 1", - ErrorCodes::INVALID_CONFIG_PARAMETER}; + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, + "Invalid number of non key columns for Redis source: {}, expected 1", + DB::toString(dict_struct.attributes.size())); if (storage_type == RedisStorageType::HASH_MAP) { if (!dict_struct.key) - throw Exception{"Redis source with storage type \'hash_map\' must have key", - ErrorCodes::INVALID_CONFIG_PARAMETER}; + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, + "Redis source with storage type \'hash_map\' must have key"); if (dict_struct.key->size() != 2) - throw Exception{"Redis source with storage type \'hash_map\' requires 2 keys", - ErrorCodes::INVALID_CONFIG_PARAMETER}; + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, + "Redis source with storage type \'hash_map\' requires 2 keys"); // suppose key[0] is primary key, key[1] is secondary key for (const auto & key : *dict_struct.key) if (!isInteger(key.type) && !isString(key.type)) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, - "Redis source supports only integer or string key, but key '{}' of type {} given", key.name, key.type->getName()); + "Redis source supports only integer or string key, but key '{}' of type {} given", + key.name, + key.type->getName()); } if (!password.empty()) @@ -93,8 +95,9 @@ namespace DB command << password; String reply = client->execute(command); if (reply != "OK") - throw Exception{"Authentication failed with reason " - + reply, ErrorCodes::INTERNAL_REDIS_ERROR}; + throw 
Exception(ErrorCodes::INTERNAL_REDIS_ERROR, + "Authentication failed with reason {}", + reply); } if (db_index != 0) @@ -103,8 +106,10 @@ namespace DB command << std::to_string(db_index); String reply = client->execute(command); if (reply != "OK") - throw Exception{"Selecting database with index " + DB::toString(db_index) - + " failed with reason " + reply, ErrorCodes::INTERNAL_REDIS_ERROR}; + throw Exception(ErrorCodes::INTERNAL_REDIS_ERROR, + "Selecting database with index {} failed with reason {}", + DB::toString(db_index), + reply); } } @@ -215,10 +220,10 @@ namespace DB client->connect(host, port); if (storage_type == RedisStorageType::HASH_MAP) - throw Exception{"Cannot use loadIds with 'hash_map' storage type", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot use loadIds with 'hash_map' storage type"); if (!dict_struct.id) - throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is required for selective loading"); RedisArray keys; @@ -234,7 +239,7 @@ namespace DB client->connect(host, port); if (key_columns.size() != dict_struct.key->size()) - throw Exception{"The size of key_columns does not equal to the size of dictionary key", ErrorCodes::LOGICAL_ERROR}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "The size of key_columns does not equal to the size of dictionary key"); RedisArray keys; for (auto row : requested_rows) @@ -268,7 +273,7 @@ namespace DB if (storage_type_str == "hash_map") return RedisStorageType::HASH_MAP; else if (!storage_type_str.empty() && storage_type_str != "simple") - throw Exception("Unknown storage type " + storage_type_str + " for Redis dictionary", ErrorCodes::INVALID_CONFIG_PARAMETER); + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Unknown storage type {} for Redis dictionary", storage_type_str); return RedisStorageType::SIMPLE; } diff --git a/src/Dictionaries/RedisDictionarySource.h b/src/Dictionaries/RedisDictionarySource.h index 71d2a7e724a..b2c5859decd 100644 --- a/src/Dictionaries/RedisDictionarySource.h +++ b/src/Dictionaries/RedisDictionarySource.h @@ -63,7 +63,7 @@ namespace ErrorCodes BlockInputStreamPtr loadUpdatedAll() override { - throw Exception{"Method loadUpdatedAll is unsupported for RedisDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method loadUpdatedAll is unsupported for RedisDictionarySource"); } bool supportsSelectiveLoad() const override { return true; } diff --git a/src/Dictionaries/SSDCacheDictionary.cpp b/src/Dictionaries/SSDCacheDictionary.cpp deleted file mode 100644 index cbeea39decb..00000000000 --- a/src/Dictionaries/SSDCacheDictionary.cpp +++ /dev/null @@ -1,1667 +0,0 @@ -#if defined(OS_LINUX) || defined(__FreeBSD__) - -#include "SSDCacheDictionary.h" - -#include -#include -#include -#include -#include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ProfileEvents -{ - extern const Event DictCacheKeysRequested; - extern const Event DictCacheKeysRequestedMiss; - extern const Event DictCacheKeysRequestedFound; - extern const Event DictCacheKeysExpired; - extern const Event DictCacheKeysNotFound; - extern const Event DictCacheKeysHit; - extern const Event DictCacheRequestTimeNs; - extern const Event DictCacheRequests; - extern const Event 
DictCacheLockWriteNs; - extern const Event DictCacheLockReadNs; - extern const Event FileOpen; - extern const Event WriteBufferAIOWrite; - extern const Event WriteBufferAIOWriteBytes; -} - -namespace CurrentMetrics -{ - extern const Metric DictCacheRequests; - extern const Metric Write; -} - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int AIO_READ_ERROR; - extern const int AIO_WRITE_ERROR; - extern const int BAD_ARGUMENTS; - extern const int CACHE_DICTIONARY_UPDATE_FAIL; - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int CANNOT_CREATE_DIRECTORY; - extern const int CANNOT_FSYNC; - extern const int CANNOT_IO_GETEVENTS; - extern const int CANNOT_IO_SUBMIT; - extern const int CANNOT_OPEN_FILE; - extern const int CORRUPTED_DATA; - extern const int FILE_DOESNT_EXIST; - extern const int LOGICAL_ERROR; - extern const int TYPE_MISMATCH; - extern const int UNSUPPORTED_METHOD; -} - -namespace -{ - constexpr size_t DEFAULT_SSD_BLOCK_SIZE_BYTES = DEFAULT_AIO_FILE_BLOCK_SIZE; - constexpr size_t DEFAULT_FILE_SIZE_BYTES = 4 * 1024 * 1024 * 1024ULL; - constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; - constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; - constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; - - constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000; - - constexpr size_t BUFFER_ALIGNMENT = DEFAULT_AIO_FILE_BLOCK_SIZE; - constexpr size_t BLOCK_CHECKSUM_SIZE_BYTES = 8; - constexpr size_t BLOCK_SPECIAL_FIELDS_SIZE_BYTES = 4; - - constexpr UInt64 KEY_METADATA_EXPIRES_AT_MASK = std::numeric_limits::max(); - constexpr UInt64 KEY_METADATA_IS_DEFAULT_MASK = ~KEY_METADATA_EXPIRES_AT_MASK; - - constexpr size_t KEY_IN_MEMORY_BIT = 63; - constexpr size_t KEY_IN_MEMORY = (1ULL << KEY_IN_MEMORY_BIT); - constexpr size_t BLOCK_INDEX_BITS = 32; - constexpr size_t INDEX_IN_BLOCK_BITS = 16; - constexpr size_t INDEX_IN_BLOCK_MASK = (1ULL << INDEX_IN_BLOCK_BITS) - 1; - constexpr size_t BLOCK_INDEX_MASK = ((1ULL << (BLOCK_INDEX_BITS + INDEX_IN_BLOCK_BITS)) - 1) ^ INDEX_IN_BLOCK_MASK; - - constexpr size_t NOT_EXISTS = -1; - - constexpr UInt8 HAS_NOT_FOUND = 2; - - const std::string BIN_FILE_EXT = ".bin"; - - int preallocateDiskSpace(int fd, size_t len) - { - #if defined(__FreeBSD__) - return posix_fallocate(fd, 0, len); - #else - return fallocate(fd, 0, 0, len); - #endif - } -} - -SSDCachePartition::Metadata::time_point_t SSDCachePartition::Metadata::expiresAt() const -{ - return ext::safe_bit_cast(data & KEY_METADATA_EXPIRES_AT_MASK); -} - -void SSDCachePartition::Metadata::setExpiresAt(const time_point_t & t) -{ - data = ext::safe_bit_cast(t); -} - -bool SSDCachePartition::Metadata::isDefault() const -{ - return (data & KEY_METADATA_IS_DEFAULT_MASK) == KEY_METADATA_IS_DEFAULT_MASK; -} -void SSDCachePartition::Metadata::setDefault() -{ - data |= KEY_METADATA_IS_DEFAULT_MASK; -} - -bool SSDCachePartition::Index::inMemory() const -{ - return (index & KEY_IN_MEMORY) == KEY_IN_MEMORY; -} - -bool SSDCachePartition::Index::exists() const -{ - return index != NOT_EXISTS; -} - -void SSDCachePartition::Index::setNotExists() -{ - index = NOT_EXISTS; -} - -void SSDCachePartition::Index::setInMemory(const bool in_memory) -{ - index = (index & ~KEY_IN_MEMORY) | (static_cast(in_memory) << KEY_IN_MEMORY_BIT); -} - -size_t SSDCachePartition::Index::getAddressInBlock() const -{ - return index & INDEX_IN_BLOCK_MASK; -} - -void SSDCachePartition::Index::setAddressInBlock(const size_t address_in_block) -{ - index = (index & ~INDEX_IN_BLOCK_MASK) | 
address_in_block; -} - -size_t SSDCachePartition::Index::getBlockId() const -{ - return (index & BLOCK_INDEX_MASK) >> INDEX_IN_BLOCK_BITS; -} - -void SSDCachePartition::Index::setBlockId(const size_t block_id) -{ - index = (index & ~BLOCK_INDEX_MASK) | (block_id << INDEX_IN_BLOCK_BITS); -} - -SSDCachePartition::SSDCachePartition( - const AttributeUnderlyingType & /* key_structure */, - const std::vector & attributes_structure_, - const std::string & dir_path, - const size_t file_id_, - const size_t max_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : file_id(file_id_) - , max_size(max_size_) - , block_size(block_size_) - , read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , path(dir_path + "/" + std::to_string(file_id)) - , key_to_index(max_stored_keys) - , attributes_structure(attributes_structure_) -{ - keys_buffer.type = AttributeUnderlyingType::utUInt64; - keys_buffer.values = SSDCachePartition::Attribute::Container(); - - if (!std::filesystem::create_directories(std::filesystem::path{dir_path})) - { - if (std::filesystem::exists(std::filesystem::path{dir_path})) - LOG_INFO(&Poco::Logger::get("SSDCachePartition::Constructor"), "Using existing directory '{}' for cache-partition", dir_path); - else - throw Exception{"Failed to create directories.", ErrorCodes::CANNOT_CREATE_DIRECTORY}; - } - - { - ProfileEvents::increment(ProfileEvents::FileOpen); - - const std::string filename = path + BIN_FILE_EXT; - fd = ::open(filename.c_str(), O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0666); - if (fd == -1) - { - auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE; - throwFromErrnoWithPath("Cannot open file " + filename, filename, error_code); - } - - if (preallocateDiskSpace(fd, max_size * block_size) < 0) - { - throwFromErrnoWithPath("Cannot preallocate space for the file " + filename, filename, ErrorCodes::CANNOT_ALLOCATE_MEMORY); - } - } -} - -SSDCachePartition::~SSDCachePartition() -{ - std::unique_lock lock(rw_lock); - ::close(fd); -} - -size_t SSDCachePartition::appendDefaults( - const Attribute & new_keys, const PaddedPODArray & metadata, const size_t begin) -{ - return appendBlock(new_keys, Attributes{}, metadata, begin); -} - -size_t SSDCachePartition::appendBlock( - const Attribute & new_keys, const Attributes & new_attributes, const PaddedPODArray & metadata, const size_t begin) -{ - std::unique_lock lock(rw_lock); - if (!new_attributes.empty() && new_attributes.size() != attributes_structure.size()) - throw Exception{"Wrong columns number in block.", ErrorCodes::BAD_ARGUMENTS}; - - const auto & ids = std::get>(new_keys.values); - auto & ids_buffer = std::get>(keys_buffer.values); - - if (!memory) - memory.emplace(block_size * write_buffer_size, BUFFER_ALIGNMENT); - - auto init_write_buffer = [&]() - { - write_buffer.emplace(memory->data() + current_memory_block_id * block_size, block_size); - uint64_t tmp = 0; - write_buffer->write(reinterpret_cast(&tmp), BLOCK_CHECKSUM_SIZE_BYTES); - write_buffer->write(reinterpret_cast(&tmp), BLOCK_SPECIAL_FIELDS_SIZE_BYTES); - keys_in_block = 0; - }; - - if (!write_buffer) - init_write_buffer(); - - bool flushed = false; - auto finish_block = [&]() - { - write_buffer.reset(); - std::memcpy(memory->data() + block_size * current_memory_block_id + BLOCK_CHECKSUM_SIZE_BYTES, &keys_in_block, sizeof(keys_in_block)); // set count - uint64_t checksum = 
CityHash_v1_0_2::CityHash64(memory->data() + block_size * current_memory_block_id + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); // checksum - std::memcpy(memory->data() + block_size * current_memory_block_id, &checksum, sizeof(checksum)); - if (++current_memory_block_id == write_buffer_size) - flush(); - flushed = true; - }; - - for (size_t index = begin; index < ids.size();) - { - Index cache_index; - cache_index.setInMemory(true); - cache_index.setBlockId(current_memory_block_id); - if (current_memory_block_id >= write_buffer_size) - throw DB::Exception("lel " + std::to_string(current_memory_block_id) + " " + - std::to_string(write_buffer_size) + " " + std::to_string(index), ErrorCodes::LOGICAL_ERROR); - - cache_index.setAddressInBlock(write_buffer->offset()); - - flushed = false; - if (2 * sizeof(UInt64) > write_buffer->available()) // place for key and metadata - { - finish_block(); - } - else - { - writeBinary(ids[index], *write_buffer); - writeBinary(metadata[index].data, *write_buffer); - } - - for (const auto & attribute : new_attributes) - { - if (flushed) - break; - switch (attribute.type) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - { \ - if (sizeof(TYPE) > write_buffer->available()) \ - { \ - finish_block(); \ - continue; \ - } \ - else \ - { \ - const auto & values = std::get>(attribute.values); /* NOLINT */ \ - writeBinary(values[index], *write_buffer); \ - } \ - } \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - const auto & value = std::get>(attribute.values)[index]; - if (sizeof(UInt64) + value.size() > write_buffer->available()) - { - finish_block(); - continue; - } - else - { - writeStringBinary(value, *write_buffer); - } - } - break; - } - } - - if (!flushed) - { - key_to_index.set(ids[index], cache_index); - ids_buffer.push_back(ids[index]); - ++index; - ++keys_in_block; - } - else // next block in write buffer or flushed to ssd - { - init_write_buffer(); - } - } - return ids.size() - begin; -} - -void SSDCachePartition::flush() -{ - if (current_file_block_id >= max_size) - clearOldestBlocks(); - - const auto & ids = std::get>(keys_buffer.values); - if (ids.empty()) - return; - LOG_INFO(&Poco::Logger::get("SSDCachePartition::flush()"), "Flushing to Disk."); - - AIOContext aio_context{1}; - - iocb write_request{}; - iocb * write_request_ptr{&write_request}; - -#if defined(__FreeBSD__) - write_request.aio.aio_lio_opcode = LIO_WRITE; - write_request.aio.aio_fildes = fd; - write_request.aio.aio_buf = reinterpret_cast(memory->data()); - write_request.aio.aio_nbytes = block_size * write_buffer_size; - write_request.aio.aio_offset = (current_file_block_id % max_size) * block_size; -#else - write_request.aio_lio_opcode = IOCB_CMD_PWRITE; - write_request.aio_fildes = fd; - write_request.aio_buf = reinterpret_cast(memory->data()); - write_request.aio_nbytes = block_size * write_buffer_size; - write_request.aio_offset = (current_file_block_id % max_size) * block_size; -#endif - - while (io_submit(aio_context.ctx, 1, &write_request_ptr) < 0) - { - if (errno != EINTR) - throw Exception("Cannot submit request for asynchronous IO on file " + path + BIN_FILE_EXT, ErrorCodes::CANNOT_IO_SUBMIT); - } - - CurrentMetrics::Increment 
metric_increment_write{CurrentMetrics::Write}; - - io_event event; - while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) < 0) - { - if (errno != EINTR) - throw Exception("Failed to wait for asynchronous IO completion on file " + path + BIN_FILE_EXT, ErrorCodes::CANNOT_IO_GETEVENTS); - } - - // Unpoison the memory returned from an uninstrumented system function. - __msan_unpoison(&event, sizeof(event)); - - ssize_t bytes_written; -#if defined(__FreeBSD__) - bytes_written = aio_return(reinterpret_cast(event.udata)); -#else - bytes_written = event.res; -#endif - - ProfileEvents::increment(ProfileEvents::WriteBufferAIOWrite); - ProfileEvents::increment(ProfileEvents::WriteBufferAIOWriteBytes, bytes_written); - - if (bytes_written != static_cast(block_size * write_buffer_size)) - throw Exception("Not all data was written for asynchronous IO on file " + path + BIN_FILE_EXT + ". returned: " + std::to_string(bytes_written), ErrorCodes::AIO_WRITE_ERROR); - - if (::fsync(fd) < 0) - throwFromErrnoWithPath("Cannot fsync " + path + BIN_FILE_EXT, path + BIN_FILE_EXT, ErrorCodes::CANNOT_FSYNC); - - /// commit changes in index - for (const auto & id : ids) - { - Index index; - if (key_to_index.get(id, index)) - { - if (index.inMemory()) // Row can be inserted in the buffer twice, so we need to move to ssd only the last index. - { - index.setInMemory(false); - index.setBlockId((current_file_block_id % max_size) + index.getBlockId()); - } - key_to_index.set(id, index); - } - } - - current_file_block_id += write_buffer_size; - current_memory_block_id = 0; - - /// clear buffer - std::visit([](auto & attr) { attr.clear(); }, keys_buffer.values); -} - -template -void SSDCachePartition::getValue(const size_t attribute_index, const PaddedPODArray & ids, - ResultArrayType & out, std::vector & found, GetDefault & default_value_extractor, - std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - buf.ignore(sizeof(Key)); // key - Metadata metadata; - readBinary(metadata.data, buf); - if (metadata.expiresAt() > now) - { - if (metadata.isDefault()) - out[index] = default_value_extractor[index]; - else - { - ignoreFromBufferToAttributeIndex(attribute_index, buf); - readBinary(out[index], buf); - } - found[index] = true; - } - }; - - getImpl(ids, set_value, found); -} - -void SSDCachePartition::getString(const size_t attribute_index, const PaddedPODArray & ids, - StringRefs & refs, ArenaWithFreeLists & arena, std::vector & found, std::vector & default_ids, - std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - buf.ignore(sizeof(Key)); // key - Metadata metadata; - readBinary(metadata.data, buf); - - if (metadata.expiresAt() > now) - { - if (metadata.isDefault()) - default_ids.push_back(index); - else - { - ignoreFromBufferToAttributeIndex(attribute_index, buf); - size_t size = 0; - readVarUInt(size, buf); - char * string_ptr = arena.alloc(size); - memcpy(string_ptr, buf.position(), size); - refs[index].data = string_ptr; - refs[index].size = size; - } - found[index] = true; - } - }; - - getImpl(ids, set_value, found); -} - -void SSDCachePartition::has(const PaddedPODArray & ids, ResultArrayType & out, - std::vector & found, std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - buf.ignore(sizeof(Key)); // key - Metadata metadata; - readBinary(metadata.data, buf); - - if (metadata.expiresAt() > now) - out[index] = 
!metadata.isDefault(); - }; - - getImpl(ids, set_value, found); -} - -template -void SSDCachePartition::getImpl(const PaddedPODArray & ids, SetFunc & set, - std::vector & found) const -{ - std::shared_lock lock(rw_lock); - PaddedPODArray indices(ids.size()); - for (size_t i = 0; i < ids.size(); ++i) - { - Index index; - if (found[i]) - indices[i].setNotExists(); - else if (key_to_index.get(ids[i], index)) - { - indices[i] = index; - } - else - indices[i].setNotExists(); - } - - getValueFromMemory(indices, set); - getValueFromStorage(indices, set); -} - -template -void SSDCachePartition::getValueFromMemory(const PaddedPODArray & indices, SetFunc & set) const -{ - // Do not check checksum while reading from memory. - for (size_t i = 0; i < indices.size(); ++i) - { - const auto & index = indices[i]; - if (index.exists() && index.inMemory()) - { - const size_t offset = index.getBlockId() * block_size + index.getAddressInBlock(); - - ReadBufferFromMemory read_buffer(memory->data() + offset, block_size * write_buffer_size - offset); - set(i, read_buffer); - } - } -} - -template -void SSDCachePartition::getValueFromStorage(const PaddedPODArray & indices, SetFunc & set) const -{ - std::vector> index_to_out; - for (size_t i = 0; i < indices.size(); ++i) - { - const auto & index = indices[i]; - if (index.exists() && !index.inMemory()) - index_to_out.emplace_back(index, i); - } - if (index_to_out.empty()) - return; - - /// sort by (block_id, offset_in_block) - std::sort(std::begin(index_to_out), std::end(index_to_out)); - - Memory read_buffer(block_size * read_buffer_size, BUFFER_ALIGNMENT); - - std::vector requests; - std::vector pointers; - std::vector> blocks_to_indices; - requests.reserve(index_to_out.size()); - pointers.reserve(index_to_out.size()); - blocks_to_indices.reserve(index_to_out.size()); - for (size_t i = 0; i < index_to_out.size(); ++i) - { - #if defined(__FreeBSD__) - const size_t back_offset = requests.empty() ? -1 : static_cast(requests.back().aio.aio_offset); - #else - const size_t back_offset = requests.empty() ? 
-1 : static_cast(requests.back().aio_offset); - #endif - - if (!requests.empty() && back_offset == index_to_out[i].first.getBlockId() * block_size) - { - blocks_to_indices.back().push_back(i); - continue; - } - - iocb request{}; -#if defined(__FreeBSD__) - request.aio.aio_lio_opcode = LIO_READ; - request.aio.aio_fildes = fd; - request.aio.aio_buf = reinterpret_cast( - reinterpret_cast(read_buffer.data()) + block_size * (requests.size() % read_buffer_size)); - request.aio.aio_nbytes = block_size; - request.aio.aio_offset = index_to_out[i].first.getBlockId() * block_size; - request.aio_data = requests.size(); -#else - request.aio_lio_opcode = IOCB_CMD_PREAD; - request.aio_fildes = fd; - request.aio_buf = reinterpret_cast(read_buffer.data()) + block_size * (requests.size() % read_buffer_size); - request.aio_nbytes = block_size; - request.aio_offset = index_to_out[i].first.getBlockId() * block_size; - request.aio_data = requests.size(); -#endif - requests.push_back(request); - pointers.push_back(&requests.back()); - blocks_to_indices.emplace_back(); - blocks_to_indices.back().push_back(i); - } - - AIOContext aio_context(read_buffer_size); - - std::vector processed(requests.size(), false); - std::vector events(requests.size()); - #if defined(__linux__) - for (auto & event : events) - event.res = -1; - #endif - - size_t to_push = 0; - size_t to_pop = 0; - while (to_pop < requests.size()) - { - int popped = 0; - while (to_pop < to_push && (popped = io_getevents(aio_context.ctx, to_push - to_pop, to_push - to_pop, &events[to_pop], nullptr)) <= 0) - { - if (errno != EINTR) - throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); - } - - for (size_t i = to_pop; i < to_pop + popped; ++i) - { - const auto request_id = events[i].data; - const auto & request = requests[request_id]; - - #if defined(__FreeBSD__) - const auto bytes_written = aio_return(reinterpret_cast(events[i].udata)); - #else - const auto bytes_written = events[i].res; - #endif - - if (bytes_written != static_cast(block_size)) - { - #if defined(__FreeBSD__) - throw Exception("AIO failed to read file " + path + BIN_FILE_EXT + ".", ErrorCodes::AIO_READ_ERROR); - #else - throw Exception("AIO failed to read file " + path + BIN_FILE_EXT + ". " + - "request_id= " + std::to_string(request.aio_data) + "/ " + std::to_string(requests.size()) + - ", aio_nbytes=" + std::to_string(request.aio_nbytes) + ", aio_offset=" + std::to_string(request.aio_offset) + - ", returned=" + std::to_string(events[i].res) + ", errno=" + std::to_string(errno), ErrorCodes::AIO_READ_ERROR); - #endif - } - #if defined(__FreeBSD__) - const char* buf_ptr = reinterpret_cast(reinterpret_cast(request.aio.aio_buf)); - #else - const auto* buf_ptr = reinterpret_cast(request.aio_buf); - #endif - __msan_unpoison(buf_ptr, block_size); - uint64_t checksum = 0; - ReadBufferFromMemory buf_special(buf_ptr, block_size); - readBinary(checksum, buf_special); - uint64_t calculated_checksum = CityHash_v1_0_2::CityHash64(buf_ptr + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); - if (checksum != calculated_checksum) - { - throw Exception("Cache data corrupted. 
From block = " + std::to_string(checksum) + " calculated = " + std::to_string(calculated_checksum) + ".", ErrorCodes::CORRUPTED_DATA); - } - - for (const size_t idx : blocks_to_indices[request_id]) - { - const auto & [file_index, out_index] = index_to_out[idx]; - ReadBufferFromMemory buf( - buf_ptr + file_index.getAddressInBlock(), - block_size - file_index.getAddressInBlock()); - set(out_index, buf); - } - - processed[request_id] = true; - } - - while (to_pop < requests.size() && processed[to_pop]) - ++to_pop; - - /// add new io tasks - const int new_tasks_count = std::min(read_buffer_size - (to_push - to_pop), requests.size() - to_push); - - int pushed = 0; - while (new_tasks_count > 0 && (pushed = io_submit(aio_context.ctx, new_tasks_count, &pointers[to_push])) <= 0) - { - if (errno != EINTR) - throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); - } - to_push += pushed; - } -} - -void SSDCachePartition::clearOldestBlocks() -{ - // write_buffer_size, because we need to erase the whole buffer. - Memory read_buffer_memory(block_size * write_buffer_size, BUFFER_ALIGNMENT); - - iocb request{}; -#if defined(__FreeBSD__) - request.aio.aio_lio_opcode = LIO_READ; - request.aio.aio_fildes = fd; - request.aio.aio_buf = reinterpret_cast(reinterpret_cast(read_buffer_memory.data())); - request.aio.aio_nbytes = block_size * write_buffer_size; - request.aio.aio_offset = (current_file_block_id % max_size) * block_size; - request.aio_data = 0; -#else - request.aio_lio_opcode = IOCB_CMD_PREAD; - request.aio_fildes = fd; - request.aio_buf = reinterpret_cast(read_buffer_memory.data()); - request.aio_nbytes = block_size * write_buffer_size; - request.aio_offset = (current_file_block_id % max_size) * block_size; - request.aio_data = 0; -#endif - - { - iocb* request_ptr = &request; - io_event event{}; - AIOContext aio_context(1); - - while (io_submit(aio_context.ctx, 1, &request_ptr) != 1) - { - if (errno != EINTR) - throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); - } - - while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) != 1) - { - if (errno != EINTR) - throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); - } - -#if defined(__FreeBSD__) - if (aio_return(reinterpret_cast(event.udata)) != static_cast(request.aio.aio_nbytes)) - throw Exception("GC: AIO failed to read file " + path + BIN_FILE_EXT + ".", ErrorCodes::AIO_READ_ERROR); -#else - if (event.res != static_cast(request.aio_nbytes)) - throw Exception("GC: AIO failed to read file " + path + BIN_FILE_EXT + ". " + - "aio_nbytes=" + std::to_string(request.aio_nbytes) + - ", returned=" + std::to_string(event.res) + ".", ErrorCodes::AIO_READ_ERROR); -#endif - __msan_unpoison(read_buffer_memory.data(), read_buffer_memory.size()); - } - - std::vector keys; - keys.reserve(write_buffer_size); - - for (size_t i = 0; i < write_buffer_size; ++i) - { - ReadBufferFromMemory read_buffer(read_buffer_memory.data() + i * block_size, block_size); - - uint64_t checksum = 0; - readBinary(checksum, read_buffer); - uint64_t calculated_checksum = CityHash_v1_0_2::CityHash64(read_buffer_memory.data() + i * block_size + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); - if (checksum != calculated_checksum) - { - throw Exception("Cache data corrupted. 
From block = " + std::to_string(checksum) + " calculated = " + std::to_string(calculated_checksum) + ".", ErrorCodes::CORRUPTED_DATA); - } - - uint32_t keys_in_current_block = 0; - readBinary(keys_in_current_block, read_buffer); - - for (uint32_t j = 0; j < keys_in_current_block; ++j) - { - keys.emplace_back(); - readBinary(keys.back(), read_buffer); - Metadata metadata; - readBinary(metadata.data, read_buffer); - - if (!metadata.isDefault()) - { - for (const auto & attribute : attributes_structure) - { - switch (attribute) - { - #define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - read_buffer.ignore(sizeof(TYPE)); \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) - #undef DISPATCH - - case AttributeUnderlyingType::utString: - { - size_t size = 0; - readVarUInt(size, read_buffer); - read_buffer.ignore(size); - } - break; - } - } - } - } - } - - const size_t start_block = current_file_block_id % max_size; - const size_t finish_block = start_block + write_buffer_size; - for (const auto & key : keys) - { - Index index; - if (key_to_index.get(key, index)) - { - size_t block_id = index.getBlockId(); - if (start_block <= block_id && block_id < finish_block) - key_to_index.erase(key); - } - } -} - -void SSDCachePartition::ignoreFromBufferToAttributeIndex(const size_t attribute_index, ReadBuffer & buf) const -{ - for (size_t i = 0; i < attribute_index; ++i) - { - switch (attributes_structure[i]) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - buf.ignore(sizeof(TYPE)); \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - size_t size = 0; - readVarUInt(size, buf); - buf.ignore(size); - } - break; - } - } -} - -size_t SSDCachePartition::getId() const -{ - return file_id; -} - -double SSDCachePartition::getLoadFactor() const -{ - std::shared_lock lock(rw_lock); - return static_cast(current_file_block_id) / max_size; -} - -size_t SSDCachePartition::getElementCount() const -{ - std::shared_lock lock(rw_lock); - return key_to_index.size(); -} - -size_t SSDCachePartition::getBytesAllocated() const -{ - std::shared_lock lock(rw_lock); - return 16.5 * key_to_index.capacity() + (memory ? memory->size() : 0); -} - -PaddedPODArray SSDCachePartition::getCachedIds(const std::chrono::system_clock::time_point /* now */) const -{ - std::unique_lock lock(rw_lock); // Begin and end iterators can be changed. 
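Illustrative aside (not part of the diff): the removed SSDCachePartition code above reads one serialized record per key and skips over earlier attribute payloads to reach the requested one (see ignoreFromBufferToAttributeIndex). Below is a minimal, hedged sketch of that record-skipping idea; AttrKind, skipToAttribute and the fixed 8-byte string length are hypothetical simplifications (the real code reads a varint length with readVarUInt and supports many more types).

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical, simplified record layout: [key][metadata][attr0][attr1]...
// Fixed-width attributes are skipped by sizeof; strings are length-prefixed.
enum class AttrKind { UInt64, Float64, String };

// Advance `pos` past attributes [0, attribute_index) of one record so the
// reader is positioned at the attribute it actually needs.
size_t skipToAttribute(const char * data, size_t pos,
                       const std::vector<AttrKind> & structure, size_t attribute_index)
{
    for (size_t i = 0; i < attribute_index; ++i)
    {
        switch (structure[i])
        {
            case AttrKind::UInt64:  pos += sizeof(uint64_t); break;
            case AttrKind::Float64: pos += sizeof(double);   break;
            case AttrKind::String:
            {
                uint64_t size = 0;                      // simplified: fixed-width length
                std::memcpy(&size, data + pos, sizeof(size));
                pos += sizeof(size) + size;             // skip length prefix and bytes
                break;
            }
        }
    }
    return pos;
}
```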
- PaddedPODArray array; - for (const auto & key : key_to_index.keys()) - array.push_back(key); - return array; -} - -void SSDCachePartition::remove() -{ - std::unique_lock lock(rw_lock); - std::filesystem::remove(std::filesystem::path(path + BIN_FILE_EXT)); -} - -SSDCacheStorage::SSDCacheStorage( - const AttributeTypes & attributes_structure_, - const std::string & path_, - const size_t max_partitions_count_, - const size_t file_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : attributes_structure(attributes_structure_) - , path(path_) - , max_partitions_count(max_partitions_count_) - , file_size(file_size_) - , block_size(block_size_) - , read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , log(&Poco::Logger::get("SSDCacheStorage")) -{ -} - -SSDCacheStorage::~SSDCacheStorage() -{ - std::unique_lock lock(rw_lock); - partition_delete_queue.splice(std::end(partition_delete_queue), partitions); - collectGarbage(); -} - -template -void SSDCacheStorage::getValue(const size_t attribute_index, const PaddedPODArray & ids, - ResultArrayType & out, std::unordered_map> & not_found, - GetDefault & default_value_extractor, std::chrono::system_clock::time_point now) const -{ - std::vector found(ids.size(), false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->getValue(attribute_index, ids, out, found, default_value_extractor, now); - } - - for (size_t i = 0; i < ids.size(); ++i) - if (!found[i]) - not_found[ids[i]].push_back(i); - - query_count.fetch_add(ids.size(), std::memory_order_relaxed); - hit_count.fetch_add(ids.size() - not_found.size(), std::memory_order_release); -} - -void SSDCacheStorage::getString(const size_t attribute_index, const PaddedPODArray & ids, - StringRefs & refs, ArenaWithFreeLists & arena, std::unordered_map> & not_found, - std::vector & default_ids, std::chrono::system_clock::time_point now) const -{ - std::vector found(ids.size(), false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->getString(attribute_index, ids, refs, arena, found, default_ids, now); - } - - for (size_t i = 0; i < ids.size(); ++i) - if (!found[i]) - not_found[ids[i]].push_back(i); - - query_count.fetch_add(ids.size(), std::memory_order_relaxed); - hit_count.fetch_add(ids.size() - not_found.size(), std::memory_order_release); -} - -void SSDCacheStorage::has(const PaddedPODArray & ids, ResultArrayType & out, - std::unordered_map> & not_found, std::chrono::system_clock::time_point now) const -{ - for (size_t i = 0; i < ids.size(); ++i) - out[i] = HAS_NOT_FOUND; - std::vector found(ids.size(), false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->has(ids, out, found, now); - - for (size_t i = 0; i < ids.size(); ++i) - if (out[i] == HAS_NOT_FOUND) - not_found[ids[i]].push_back(i); - } - - query_count.fetch_add(ids.size(), std::memory_order_relaxed); - hit_count.fetch_add(ids.size() - not_found.size(), std::memory_order_release); -} - -namespace -{ -SSDCachePartition::Attributes createAttributesFromBlock( - const Block & block, const size_t begin_column, const std::vector & structure) -{ - SSDCachePartition::Attributes attributes; - - const auto columns = block.getColumns(); - for (size_t i = 0; i < structure.size(); ++i) - { - const auto & column = columns[i + begin_column]; - switch (structure[i]) - { 
-#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - { \ - SSDCachePartition::Attribute::Container values(column->size()); \ - memcpy(&values[0], column->getRawData().data, sizeof(TYPE) * values.size()); \ - attributes.emplace_back(); \ - attributes.back().type = structure[i]; \ - attributes.back().values = std::move(values); \ - } \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - attributes.emplace_back(); - SSDCachePartition::Attribute::Container values(column->size()); - for (size_t j = 0; j < column->size(); ++j) - { - const auto ref = column->getDataAt(j); - values[j].resize(ref.size); - memcpy(values[j].data(), ref.data, ref.size); - } - attributes.back().type = structure[i]; - attributes.back().values = std::move(values); - } - break; - } - } - - return attributes; -} -} - -template -void SSDCacheStorage::update(DictionarySourcePtr & source_ptr, const std::vector & requested_ids, - PresentIdHandler && on_updated, AbsentIdHandler && on_id_not_found, - const DictionaryLifetime lifetime) -{ - auto append_block = [this](const SSDCachePartition::Attribute & new_keys, - const SSDCachePartition::Attributes & new_attributes, const PaddedPODArray & metadata) - { - size_t inserted = 0; - while (inserted < metadata.size()) - { - if (!partitions.empty()) - inserted += partitions.front()->appendBlock(new_keys, new_attributes, metadata, inserted); - if (inserted < metadata.size()) - { - partitions.emplace_front(std::make_unique( - AttributeUnderlyingType::utUInt64, attributes_structure, path, - (partitions.empty() ? 0 : partitions.front()->getId() + 1), - file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys)); - } - } - - collectGarbage(); - }; - - CurrentMetrics::Increment metric_increment{CurrentMetrics::DictCacheRequests}; - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, requested_ids.size()); - - std::unordered_map remaining_ids{requested_ids.size()}; - for (const auto id : requested_ids) - remaining_ids.insert({id, 0}); - - const auto now = std::chrono::system_clock::now(); - - { - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - - if (now > backoff_end_time) - { - try - { - if (update_error_count) - { - /// Recover after error: we have to clone the source here because - /// it could keep connections which should be reset after error. 
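The removed code leans on a preprocessor `DISPATCH` list to expand one `case` per underlying numeric type whenever it serializes, skips, or copies attribute values. As a minimal, self-contained illustration of that pattern (enum, types, and the two-argument macro are invented for the example; the original uses a one-argument list over its own type aliases):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

enum class AttributeType { UInt8, UInt32, UInt64, Float64 };

/// One macro entry per supported type expands into one `case`,
/// which keeps the long type lists consistent across the dictionary code.
size_t fixedSizeOf(AttributeType type)
{
    switch (type)
    {
#define DISPATCH(ENUM, CPP_TYPE) \
    case AttributeType::ENUM: \
        return sizeof(CPP_TYPE);

        DISPATCH(UInt8, uint8_t)
        DISPATCH(UInt32, uint32_t)
        DISPATCH(UInt64, uint64_t)
        DISPATCH(Float64, double)
#undef DISPATCH
    }
    return 0;
}

int main()
{
    std::printf("%zu\n", fixedSizeOf(AttributeType::UInt64)); /// prints 8
}
```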
- source_ptr = source_ptr->clone(); - } - - Stopwatch watch; - auto stream = source_ptr->loadIds(requested_ids); - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto new_keys = std::move(createAttributesFromBlock(block, 0, { AttributeUnderlyingType::utUInt64 }).front()); - const auto new_attributes = createAttributesFromBlock(block, 1, attributes_structure); - - const auto & ids = std::get>(new_keys.values); - - PaddedPODArray metadata(ids.size()); - - for (const auto i : ext::range(0, ids.size())) - { - std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - metadata[i].setExpiresAt(now + std::chrono::seconds(distribution(rnd_engine))); - /// mark corresponding id as found - on_updated(ids[i], i, new_attributes); - remaining_ids[ids[i]] = 1; - } - - append_block(new_keys, new_attributes, metadata); - } - - stream->readSuffix(); - - update_error_count = 0; - last_update_exception = std::exception_ptr{}; - backoff_end_time = std::chrono::system_clock::time_point{}; - - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); - } - catch (...) - { - ++update_error_count; - last_update_exception = std::current_exception(); - backoff_end_time = now + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, update_error_count)); - - tryLogException(last_update_exception, log, - "Could not update ssd cache dictionary, next update is scheduled at " + ext::to_string(backoff_end_time)); - } - } - } - - auto append_defaults = [this](const SSDCachePartition::Attribute & new_keys, const PaddedPODArray & metadata) - { - size_t inserted = 0; - while (inserted < metadata.size()) - { - if (!partitions.empty()) - inserted += partitions.front()->appendDefaults(new_keys, metadata, inserted); - if (inserted < metadata.size()) - { - partitions.emplace_front(std::make_unique( - AttributeUnderlyingType::utUInt64, attributes_structure, path, - (partitions.empty() ? 0 : partitions.front()->getId() + 1), - file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys)); - } - } - - collectGarbage(); - }; - - size_t not_found_num = 0, found_num = 0; - /// Check which ids have not been found and require setting null_value - SSDCachePartition::Attribute new_keys; - new_keys.type = AttributeUnderlyingType::utUInt64; - new_keys.values = SSDCachePartition::Attribute::Container(); - - PaddedPODArray metadata; - { - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - - for (const auto & id_found_pair : remaining_ids) - { - if (id_found_pair.second) - { - ++found_num; - continue; - } - ++not_found_num; - - const auto id = id_found_pair.first; - - if (update_error_count) - { - /// TODO: use old values - - // We don't have expired data for that `id` so all we can do is - // to rethrow `last_exception`. We might have to throw the same - // exception for different callers of dictGet() in different - // threads, which might then modify the exception object, so we - // have to throw a copy. - try - { - std::rethrow_exception(last_update_exception); - } - catch (...) 
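The update path above retries the source with exponential backoff: each consecutive failure increments an error counter and pushes `backoff_end_time` further into the future, and updates are skipped until that deadline passes. A hedged sketch of the control flow; the concrete duration formula here is invented for illustration (the real one lives in `calculateDurationWithBackoff`):

```cpp
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <random>

using Clock = std::chrono::system_clock;

struct BackoffState
{
    size_t error_count = 0;
    Clock::time_point backoff_end_time{};
};

/// An update attempt is allowed only after the backoff deadline has passed.
bool updateAllowed(const BackoffState & state, Clock::time_point now)
{
    return now > state.backoff_end_time;
}

/// On failure, schedule the next attempt further out: exponential, capped, with jitter.
void onUpdateFailure(BackoffState & state, Clock::time_point now, std::mt19937_64 & rng)
{
    ++state.error_count;
    const uint64_t capped = std::min<uint64_t>(state.error_count, 6);
    std::uniform_int_distribution<uint64_t> jitter(0, 1ULL << capped);
    state.backoff_end_time = now + std::chrono::seconds((1ULL << capped) + jitter(rng));
}

/// On success, reset the counter and clear the deadline.
void onUpdateSuccess(BackoffState & state)
{
    state.error_count = 0;
    state.backoff_end_time = {};
}
```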
- { - throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, - "Update failed for dictionary '{}': {}", - getPath(), - getCurrentExceptionMessage(true /*with stack trace*/, - true /*check embedded stack trace*/)); - } - } - - /// Set key - std::get>(new_keys.values).push_back(id); - - std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - metadata.emplace_back(); - metadata.back().setExpiresAt(now + std::chrono::seconds(distribution(rnd_engine))); - metadata.back().setDefault(); - - /// Inform caller that the cell has not been found - on_id_not_found(id); - } - - if (not_found_num) - append_defaults(new_keys, metadata); - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheRequests); -} - -PaddedPODArray SSDCacheStorage::getCachedIds() const -{ - PaddedPODArray array; - - const auto now = std::chrono::system_clock::now(); - - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - { - const auto cached_in_partition = partition->getCachedIds(now); - array.insert(std::begin(cached_in_partition), std::end(cached_in_partition)); - } - - return array; -} - -double SSDCacheStorage::getLoadFactor() const -{ - double result = 0; - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - result += partition->getLoadFactor(); - return result / partitions.size(); -} - -size_t SSDCacheStorage::getElementCount() const -{ - size_t result = 0; - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - result += partition->getElementCount(); - return result; -} - -size_t SSDCacheStorage::getBytesAllocated() const -{ - size_t result = 0; - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - result += partition->getBytesAllocated(); - return result; -} - -void SSDCacheStorage::collectGarbage() -{ - // add partitions to queue - while (partitions.size() > max_partitions_count) - partition_delete_queue.splice(std::end(partition_delete_queue), partitions, std::prev(std::end(partitions))); - - // drop unused partitions - while (!partition_delete_queue.empty() && partition_delete_queue.front().use_count() == 1) - { - partition_delete_queue.front()->remove(); - partition_delete_queue.pop_front(); - } -} - -SSDCacheDictionary::SSDCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - const std::string & path_, - const size_t max_partitions_count_, - const size_t file_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : IDictionary(dict_id_) - , dict_struct(dict_struct_) - , source_ptr(std::move(source_ptr_)) - , dict_lifetime(dict_lifetime_) - , path(path_) - , max_partitions_count(max_partitions_count_) - , file_size(file_size_) - , block_size(block_size_) - , read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , storage(ext::map(dict_struct.attributes, [](const auto & attribute) { return attribute.underlying_type; }), - path, max_partitions_count, file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys) - , log(&Poco::Logger::get("SSDCacheDictionary")) -{ - LOG_INFO(log, "Using storage path '{}'.", path); - if 
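`collectGarbage` in the removed storage rotates partitions: anything beyond `max_partitions_count` is moved to a delete queue, and queued partitions are physically removed only once no reader still holds a reference. A simplified sketch of that ownership-based cleanup (`Partition` here is a stand-in type, not the real class):

```cpp
#include <cstddef>
#include <iterator>
#include <list>
#include <memory>

struct Partition
{
    void removeFile() { /* in the real class: unlink the backing .bin file */ }
};

void collectGarbage(
    std::list<std::shared_ptr<Partition>> & partitions,
    std::list<std::shared_ptr<Partition>> & delete_queue,
    size_t max_partitions_count)
{
    /// Move the oldest partitions (the list tail) into the delete queue.
    while (partitions.size() > max_partitions_count)
        delete_queue.splice(delete_queue.end(), partitions, std::prev(partitions.end()));

    /// Physically drop a queued partition only when the queue holds the last reference,
    /// so readers that still share a partition keep it alive until they finish.
    while (!delete_queue.empty() && delete_queue.front().use_count() == 1)
    {
        delete_queue.front()->removeFile();
        delete_queue.pop_front();
    }
}
```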
(!this->source_ptr->supportsSelectiveLoad()) - throw Exception{name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - createAttributes(); -} - -ColumnPtr SSDCacheDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes &, - const ColumnPtr default_values_column) const -{ - ColumnPtr result; - - PaddedPODArray backup_storage; - const auto & ids = getColumnVectorData(this, key_columns.front(), backup_storage); - auto keys_size = ids.size(); - - const auto index = getAttributeIndex(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto & null_value = std::get(null_values[index]); - DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - getItemsStringImpl(index, ids, column.get(), default_value_extractor); - } - else - { - auto & out = column->getData(); - getItemsNumberImpl(index, ids, out, default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(dict_struct.attributes[index].underlying_type, type_call); - - return result; -} - -template -void SSDCacheDictionary::getItemsNumberImpl( - const size_t attribute_index, - const PaddedPODArray & ids, - ResultArrayType & out, - DefaultGetter & default_value_extractor) const -{ - const auto now = std::chrono::system_clock::now(); - - std::unordered_map> not_found_ids; - storage.getValue(attribute_index, ids, out, not_found_ids, default_value_extractor, now); - if (not_found_ids.empty()) - return; - - std::vector required_ids(not_found_ids.size()); - std::transform(std::begin(not_found_ids), std::end(not_found_ids), std::begin(required_ids), [](const auto & pair) { return pair.first; }); - - storage.update( - source_ptr, - required_ids, - [&](const auto id, const auto row, const auto & new_attributes) - { - for (const size_t out_row : not_found_ids[id]) - out[out_row] = std::get>(new_attributes[attribute_index].values)[row]; - }, - [&](const size_t id) - { - for (const size_t row : not_found_ids[id]) - out[row] = default_value_extractor[row]; - }, - getLifetime()); -} - -template -void SSDCacheDictionary::getItemsStringImpl( - const size_t attribute_index, - const PaddedPODArray & ids, - ColumnString * out, - DefaultGetter & default_value_extractor) const -{ - const auto now = std::chrono::system_clock::now(); - - std::unordered_map> not_found_ids; - - StringRefs refs(ids.size()); - ArenaWithFreeLists string_arena; - std::vector default_rows; - storage.getString(attribute_index, ids, refs, string_arena, not_found_ids, default_rows, now); - std::sort(std::begin(default_rows), std::end(default_rows)); - - if (not_found_ids.empty()) - { - size_t default_index = 0; - for (size_t row = 0; row < ids.size(); ++row) - { - if (unlikely(default_index != default_rows.size() && default_rows[default_index] == row)) - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - ++default_index; - } - else - out->insertData(refs[row].data, refs[row].size); - } - return; - } - - std::vector required_ids(not_found_ids.size()); - 
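`getItemsNumberImpl` and `getItemsStringImpl` follow the same cache-aside shape: read what the storage can serve, gather the ids it could not, then ask the source for just those ids and patch the output rows from the update callbacks. A condensed sketch of that flow under simplified container and callback types (names are hypothetical):

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

using Id = uint64_t;

/// Hypothetical condensed version of the lookup-then-update flow.
void getCachedOrUpdate(
    const std::vector<Id> & ids,
    std::vector<int64_t> & out,
    const std::function<void(const std::vector<Id> &, std::vector<int64_t> &,
                             std::unordered_map<Id, std::vector<size_t>> &)> & read_cache,
    const std::function<void(const std::vector<Id> &,
                             const std::function<void(Id, int64_t)> &)> & update_from_source)
{
    std::unordered_map<Id, std::vector<size_t>> not_found; /// id -> output rows that asked for it
    read_cache(ids, out, not_found);

    if (not_found.empty())
        return;

    std::vector<Id> required_ids;
    required_ids.reserve(not_found.size());
    for (const auto & pair : not_found)
        required_ids.push_back(pair.first);

    /// For every id the source returned, patch all output rows that requested it.
    update_from_source(required_ids, [&](Id id, int64_t value)
    {
        for (size_t row : not_found[id])
            out[row] = value;
    });
}
```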
std::transform(std::begin(not_found_ids), std::end(not_found_ids), std::begin(required_ids), [](const auto & pair) { return pair.first; }); - - std::unordered_map update_result; - - storage.update( - source_ptr, - required_ids, - [&](const auto id, const auto row, const auto & new_attributes) - { - update_result[id] = std::get>(new_attributes[attribute_index].values)[row]; - }, - [&](const size_t) {}, - getLifetime()); - - size_t default_index = 0; - for (size_t row = 0; row < ids.size(); ++row) - { - const auto & id = ids[row]; - if (unlikely(default_index != default_rows.size() && default_rows[default_index] == row)) - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - ++default_index; - } - else if (auto it = not_found_ids.find(id); it == std::end(not_found_ids)) - { - out->insertData(refs[row].data, refs[row].size); - } - else if (auto it_update = update_result.find(id); it_update != std::end(update_result)) - { - out->insertData(it_update->second.data(), it_update->second.size()); - } - else - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - } - } -} - -ColumnUInt8::Ptr SSDCacheDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const -{ - PaddedPODArray backup_storage; - const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); - - auto result = ColumnUInt8::create(ext::size(ids)); - auto& out = result->getData(); - - const auto rows = ext::size(ids); - for (const auto row : ext::range(0, rows)) - out[row] = false; - - const auto now = std::chrono::system_clock::now(); - - std::unordered_map> not_found_ids; - storage.has(ids, out, not_found_ids, now); - if (not_found_ids.empty()) - return result; - - std::vector required_ids(not_found_ids.size()); - std::transform(std::begin(not_found_ids), std::end(not_found_ids), std::begin(required_ids), [](const auto & pair) { return pair.first; }); - - storage.update( - source_ptr, - required_ids, - [&](const auto id, const auto, const auto &) - { - for (const size_t out_row : not_found_ids[id]) - out[out_row] = true; - }, - [&](const size_t id) - { - for (const size_t row : not_found_ids[id]) - out[row] = false; - }, - getLifetime()); - - return result; -} - -BlockInputStreamPtr SSDCacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const -{ - using BlockInputStreamType = DictionaryBlockInputStream; - return std::make_shared(shared_from_this(), max_block_size, storage.getCachedIds(), column_names); -} - -size_t SSDCacheDictionary::getAttributeIndex(const std::string & attr_name) const -{ - auto it = attribute_index_by_name.find(attr_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{"Attribute `" + name + "` does not exist.", ErrorCodes::BAD_ARGUMENTS}; - return it->second; -} - -template -AttributeValueVariant SSDCacheDictionary::createAttributeNullValueWithTypeImpl(const Field & null_value) -{ - AttributeValueVariant var_null_value = static_cast(null_value.get>()); - bytes_allocated += sizeof(T); - return var_null_value; -} - -template <> -AttributeValueVariant SSDCacheDictionary::createAttributeNullValueWithTypeImpl(const Field & null_value) -{ - AttributeValueVariant var_null_value = null_value.get(); - bytes_allocated += sizeof(StringRef); - return var_null_value; -} - -AttributeValueVariant SSDCacheDictionary::createAttributeNullValueWithType(const AttributeUnderlyingType type, const Field & null_value) -{ - switch (type) - { -#define 
DISPATCH(TYPE) \ -case AttributeUnderlyingType::ut##TYPE: \ - return createAttributeNullValueWithTypeImpl(null_value); - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) - DISPATCH(String) -#undef DISPATCH - } - throw Exception{"Unknown attribute type: " + std::to_string(static_cast(type)), ErrorCodes::TYPE_MISMATCH}; -} - -void SSDCacheDictionary::createAttributes() -{ - null_values.reserve(dict_struct.attributes.size()); - for (size_t i = 0; i < dict_struct.attributes.size(); ++i) - { - const auto & attribute = dict_struct.attributes[i]; - - attribute_index_by_name.emplace(attribute.name, i); - null_values.push_back(createAttributeNullValueWithType(attribute.underlying_type, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -void registerDictionarySSDCache(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string & name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - if (dict_struct.key) - throw Exception{"'key' is not supported for dictionary of layout 'cache'", ErrorCodes::UNSUPPORTED_METHOD}; - - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - - if (dict_struct.range_min || dict_struct.range_max) - throw Exception{name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; - const auto & layout_prefix = config_prefix + ".layout"; - - const auto max_partitions_count = config.getInt(layout_prefix + ".ssd_cache.max_partitions_count", DEFAULT_PARTITIONS_COUNT); - if (max_partitions_count <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have 0 (or less) max_partitions_count", ErrorCodes::BAD_ARGUMENTS}; - - const auto block_size = config.getInt(layout_prefix + ".ssd_cache.block_size", DEFAULT_SSD_BLOCK_SIZE_BYTES); - if (block_size <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have 0 (or less) block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto file_size = config.getInt64(layout_prefix + ".ssd_cache.file_size", DEFAULT_FILE_SIZE_BYTES); - if (file_size <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have 0 (or less) file_size", ErrorCodes::BAD_ARGUMENTS}; - if (file_size % block_size != 0) - throw Exception{name + ": file_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto read_buffer_size = config.getInt64(layout_prefix + ".ssd_cache.read_buffer_size", DEFAULT_READ_BUFFER_SIZE_BYTES); - if (read_buffer_size <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have 0 (or less) read_buffer_size", ErrorCodes::BAD_ARGUMENTS}; - if (read_buffer_size % block_size != 0) - throw Exception{name + ": read_buffer_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto write_buffer_size = config.getInt64(layout_prefix + ".ssd_cache.write_buffer_size", DEFAULT_WRITE_BUFFER_SIZE_BYTES); - if (write_buffer_size <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' 
cannot have 0 (or less) write_buffer_size", ErrorCodes::BAD_ARGUMENTS}; - if (write_buffer_size % block_size != 0) - throw Exception{name + ": write_buffer_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - auto path = config.getString(layout_prefix + ".ssd_cache.path"); - if (path.empty()) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have empty path", - ErrorCodes::BAD_ARGUMENTS}; - if (path.at(0) != '/') - path = std::filesystem::path{config.getString("path")}.concat(path).string(); - - const auto max_stored_keys = config.getInt64(layout_prefix + ".ssd_cache.max_stored_keys", DEFAULT_MAX_STORED_KEYS); - if (max_stored_keys <= 0) - throw Exception{name + ": dictionary of layout 'ssd_cache' cannot have 0 (or less) max_stored_keys", ErrorCodes::BAD_ARGUMENTS}; - - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - return std::make_unique( - dict_id, dict_struct, std::move(source_ptr), dict_lifetime, path, - max_partitions_count, file_size / block_size, block_size, - read_buffer_size / block_size, write_buffer_size / block_size, - max_stored_keys); - }; - factory.registerLayout("ssd_cache", create_layout, false); -} - -} - -#endif diff --git a/src/Dictionaries/SSDCacheDictionary.h b/src/Dictionaries/SSDCacheDictionary.h deleted file mode 100644 index 4d4d3befa22..00000000000 --- a/src/Dictionaries/SSDCacheDictionary.h +++ /dev/null @@ -1,418 +0,0 @@ -#pragma once - -#if defined(__linux__) || defined(__FreeBSD__) - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" - -namespace DB -{ - -using AttributeValueVariant = std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - String>; - - -/* - Class for operations with cache file and index. - Supports GET/SET operations. 
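Note that the registration code validates every byte-denominated layout setting as a positive multiple of `block_size` and then passes block counts, not bytes, to the dictionary constructor (`file_size / block_size`, and so on). A small sketch of that conversion, with invented names and standard exceptions standing in for the real `Exception` machinery:

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

struct SsdCacheGeometry
{
    size_t block_size_bytes;
    size_t file_size_blocks;
    size_t read_buffer_blocks;
    size_t write_buffer_blocks;
};

/// Hypothetical helper mirroring the checks above: sizes arrive in bytes
/// from the layout config and are converted to block counts.
SsdCacheGeometry makeGeometry(size_t block_size, size_t file_size, size_t read_buffer_size, size_t write_buffer_size)
{
    auto check_multiple = [&](size_t value, const std::string & name)
    {
        if (value == 0 || value % block_size != 0)
            throw std::invalid_argument(name + " must be a positive multiple of block_size");
    };

    check_multiple(file_size, "file_size");
    check_multiple(read_buffer_size, "read_buffer_size");
    check_multiple(write_buffer_size, "write_buffer_size");

    return {block_size, file_size / block_size, read_buffer_size / block_size, write_buffer_size / block_size};
}
```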
-*/ -class SSDCachePartition -{ -public: - struct Index final - { - bool inMemory() const; - void setInMemory(bool in_memory); - - bool exists() const; - void setNotExists(); - - size_t getAddressInBlock() const; - void setAddressInBlock(size_t address_in_block); - - size_t getBlockId() const; - void setBlockId(size_t block_id); - - bool operator< (const Index & rhs) const { return index < rhs.index; } - - /// Stores `is_in_memory` flag, block id, address in uncompressed block - uint64_t index = 0; - }; - - struct Metadata final - { - using time_point_t = std::chrono::system_clock::time_point; - using time_point_rep_t = time_point_t::rep; - using time_point_urep_t = make_unsigned_t; - - time_point_t expiresAt() const; - void setExpiresAt(const time_point_t & t); - - bool isDefault() const; - void setDefault(); - - /// Stores both expiration time and `is_default` flag in the most significant bit - time_point_urep_t data = 0; - }; - - using Offset = size_t; - using Offsets = std::vector; - using Key = IDictionary::Key; - - SSDCachePartition( - const AttributeUnderlyingType & key_structure, - const std::vector & attributes_structure, - const std::string & dir_path, - size_t file_id, - size_t max_size, - size_t block_size, - size_t read_buffer_size, - size_t write_buffer_size, - size_t max_stored_keys); - - ~SSDCachePartition(); - - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; - - template - void getValue(size_t attribute_index, const PaddedPODArray & ids, - ResultArrayType & out, std::vector & found, GetDefault & default_value_extractor, - std::chrono::system_clock::time_point now) const; - - void getString(size_t attribute_index, const PaddedPODArray & ids, - StringRefs & refs, ArenaWithFreeLists & arena, std::vector & found, - std::vector & default_ids, std::chrono::system_clock::time_point now) const; - - void has(const PaddedPODArray & ids, ResultArrayType & out, - std::vector & found, std::chrono::system_clock::time_point now) const; - - struct Attribute - { - template - using Container = std::vector; - - AttributeUnderlyingType type; - std::variant< - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container> values; - }; - using Attributes = std::vector; - - size_t appendBlock(const Attribute & new_keys, const Attributes & new_attributes, - const PaddedPODArray & metadata, size_t begin); - - size_t appendDefaults(const Attribute & new_keys, const PaddedPODArray & metadata, size_t begin); - - void flush(); - - void remove(); - - size_t getId() const; - - PaddedPODArray getCachedIds(std::chrono::system_clock::time_point now) const; - - double getLoadFactor() const; - - size_t getElementCount() const; - - size_t getBytesAllocated() const; - -private: - void clearOldestBlocks(); - - template - void getImpl(const PaddedPODArray & ids, SetFunc & set, std::vector & found) const; - - template - void getValueFromMemory(const PaddedPODArray & indices, SetFunc & set) const; - - template - void getValueFromStorage(const PaddedPODArray & indices, SetFunc & set) const; - - void ignoreFromBufferToAttributeIndex(size_t attribute_index, ReadBuffer & buf) const; - - const size_t file_id; - const size_t max_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - const std::string path; - - mutable std::shared_mutex rw_lock; - - int fd = -1; - - 
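The removed `Metadata` struct packs the expiration timestamp and the `is_default` flag into a single unsigned 64-bit word, with the flag in the most significant bit. A sketch of that packing as described by the comment (not the exact original implementation; the timestamp is represented here as raw ticks):

```cpp
#include <cstdint>

struct PackedMetadata
{
    static constexpr uint64_t is_default_mask = 1ULL << 63;

    uint64_t data = 0;

    /// Keep the flag bit, replace the timestamp bits.
    void setExpiresAt(uint64_t time_point_ticks)
    {
        data = (data & is_default_mask) | (time_point_ticks & ~is_default_mask);
    }

    uint64_t expiresAt() const { return data & ~is_default_mask; }

    void setDefault() { data |= is_default_mask; }
    bool isDefault() const { return (data & is_default_mask) != 0; }
};
```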
mutable BucketCacheIndex key_to_index; - - Attribute keys_buffer; - const std::vector attributes_structure; - - std::optional> memory; - std::optional write_buffer; - uint32_t keys_in_block = 0; - - size_t current_memory_block_id = 0; - size_t current_file_block_id = 0; -}; - -using SSDCachePartitionPtr = std::shared_ptr; - - -/* - Class for managing SSDCachePartition and getting data from source. -*/ -class SSDCacheStorage -{ -public: - using AttributeTypes = std::vector; - using Key = SSDCachePartition::Key; - - SSDCacheStorage( - const AttributeTypes & attributes_structure, - const std::string & path, - size_t max_partitions_count, - size_t file_size, - size_t block_size, - size_t read_buffer_size, - size_t write_buffer_size, - size_t max_stored_keys); - - ~SSDCacheStorage(); - - template - using ResultArrayType = SSDCachePartition::ResultArrayType; - - template - void getValue(size_t attribute_index, const PaddedPODArray & ids, - ResultArrayType & out, std::unordered_map> & not_found, - GetDefault & default_value_extractor, std::chrono::system_clock::time_point now) const; - - void getString(size_t attribute_index, const PaddedPODArray & ids, - StringRefs & refs, ArenaWithFreeLists & arena, std::unordered_map> & not_found, - std::vector & default_ids, std::chrono::system_clock::time_point now) const; - - void has(const PaddedPODArray & ids, ResultArrayType & out, - std::unordered_map> & not_found, std::chrono::system_clock::time_point now) const; - - template - void update(DictionarySourcePtr & source_ptr, const std::vector & requested_ids, - PresentIdHandler && on_updated, AbsentIdHandler && on_id_not_found, - DictionaryLifetime lifetime); - - PaddedPODArray getCachedIds() const; - - std::exception_ptr getLastException() const { return last_update_exception; } - - const std::string & getPath() const { return path; } - - size_t getQueryCount() const { return query_count.load(std::memory_order_relaxed); } - - size_t getHitCount() const { return hit_count.load(std::memory_order_acquire); } - - size_t getElementCount() const; - - double getLoadFactor() const; - - size_t getBytesAllocated() const; - -private: - void collectGarbage(); - - const AttributeTypes attributes_structure; - - const std::string path; - const size_t max_partitions_count; - const size_t file_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - - mutable std::shared_mutex rw_lock; - std::list partitions; - std::list partition_delete_queue; - - Poco::Logger * const log; - - mutable pcg64 rnd_engine; - - mutable std::exception_ptr last_update_exception; - mutable size_t update_error_count = 0; - mutable std::chrono::system_clock::time_point backoff_end_time; - - mutable std::atomic hit_count{0}; - mutable std::atomic query_count{0}; -}; - - -/* - Dictionary interface -*/ -class SSDCacheDictionary final : public IDictionary -{ -public: - SSDCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - DictionaryLifetime dict_lifetime_, - const std::string & path, - size_t max_partitions_count_, - size_t file_size_, - size_t block_size_, - size_t read_buffer_size_, - size_t write_buffer_size_, - size_t max_stored_keys_); - - std::string getTypeName() const override { return "SSDCache"; } - - size_t getBytesAllocated() const override { return storage.getBytesAllocated(); } - - size_t getQueryCount() const override { return storage.getQueryCount(); } - - double getHitRate() const 
override - { - return static_cast(storage.getHitCount()) / storage.getQueryCount(); - } - - size_t getElementCount() const override { return storage.getElementCount(); } - - double getLoadFactor() const override { return storage.getLoadFactor(); } - - bool supportUpdates() const override { return false; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, - path, max_partitions_count, file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[getAttributeIndex(attribute_name)].injective; - } - - bool hasHierarchy() const override { return false; } - - void toParent(const PaddedPODArray &, PaddedPODArray &) const override { } - - std::exception_ptr getLastException() const override { return storage.getLastException(); } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - template - using ResultArrayType = SSDCacheStorage::ResultArrayType; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - size_t getAttributeIndex(const std::string & attr_name) const; - - template - AttributeValueVariant createAttributeNullValueWithTypeImpl(const Field & null_value); - AttributeValueVariant createAttributeNullValueWithType(AttributeUnderlyingType type, const Field & null_value); - void createAttributes(); - - template - void getItemsNumberImpl( - size_t attribute_index, - const PaddedPODArray & ids, - ResultArrayType & out, - DefaultGetter & default_value_extractor) const; - - template - void getItemsStringImpl( - size_t attribute_index, - const PaddedPODArray & ids, - ColumnString * out, - DefaultGetter & default_value_extractor) const; - - const std::string name; - const DictionaryStructure dict_struct; - mutable DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - - const std::string path; - const size_t max_partitions_count; - const size_t file_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - - std::map attribute_index_by_name; - std::vector null_values; - mutable SSDCacheStorage storage; - Poco::Logger * const log; - - mutable size_t bytes_allocated = 0; -}; - -} - -#endif diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h new file mode 100644 index 00000000000..7d72beca35e --- /dev/null +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -0,0 +1,1390 @@ +#pragma once + +#if defined(__linux__) || defined(__FreeBSD__) + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const 
Event FileOpen; + extern const Event WriteBufferAIOWrite; + extern const Event WriteBufferAIOWriteBytes; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int AIO_READ_ERROR; + extern const int AIO_WRITE_ERROR; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int CANNOT_CREATE_DIRECTORY; + extern const int CANNOT_FSYNC; + extern const int CANNOT_IO_GETEVENTS; + extern const int CANNOT_IO_SUBMIT; + extern const int CANNOT_OPEN_FILE; + extern const int CORRUPTED_DATA; + extern const int FILE_DOESNT_EXIST; + extern const int UNSUPPORTED_METHOD; + extern const int NOT_IMPLEMENTED; +} + +struct SSDCacheDictionaryStorageConfiguration +{ + const size_t strict_max_lifetime_seconds; + const DictionaryLifetime lifetime; + + const std::string file_path; + const size_t max_partitions_count; + const size_t block_size; + const size_t file_blocks_size; + const size_t read_buffer_blocks_size; + const size_t write_buffer_blocks_size; +}; + + +/** Simple Key is serialized in block with following structure + key | data_size | data + 8 bytes | 8 bytes | data_size bytes + + Complex Key is serialized in block with following structure + key_size | key_data | data_size | data + 8 bytes | key_size bytes | 8 bytes | data_size bytes +*/ +template +struct SSDCacheKey final +{ + using KeyType = TKeyType; + + SSDCacheKey(KeyType key_, size_t size_, const char * data_) + : key(key_) + , size(size_) + , data(data_) + {} + + KeyType key; + size_t size; + const char * data; +}; + +using SSDCacheSimpleKey = SSDCacheKey; +using SSDCacheComplexKey = SSDCacheKey; + +/** Block is serialized with following structure + check_sum | keys_size | [keys] + 8 bytes | 8 bytes | +*/ +class SSDCacheBlock final +{ + static constexpr size_t block_header_check_sum_size = sizeof(size_t); + static constexpr size_t block_header_keys_size = sizeof(size_t); +public: + + /// Block header size + static constexpr size_t block_header_size = block_header_check_sum_size + block_header_keys_size; + + explicit SSDCacheBlock(size_t block_size_) + : block_size(block_size_) + {} + + /// Checks if simple key can be written in empty block with block_size + static bool canBeWrittenInEmptyBlock(SSDCacheSimpleKey & simple_key, size_t block_size) + { + static constexpr size_t simple_key_size = sizeof(simple_key.key); + + return (block_header_size + simple_key_size + sizeof(simple_key.size) + simple_key.size) <= block_size; + } + + /// Checks if complex key can be written in empty block with block_size + static bool canBeWrittenInEmptyBlock(SSDCacheComplexKey & complex_key, size_t block_size) + { + StringRef & key = complex_key.key; + size_t complex_key_size = sizeof(key.size) + key.size; + + return (block_header_size + complex_key_size + sizeof(complex_key.size) + complex_key.size) <= block_size; + } + + /// Reset block with new block_data + /// block_data must be filled with zeroes if it is new block + inline void reset(char * new_block_data) + { + block_data = new_block_data; + current_block_offset = block_header_size; + keys_size = unalignedLoad(new_block_data + block_header_check_sum_size); + } + + /// Check if it is enough place to write key in block + inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const + { + return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size; + } + + /// Check if it is enough place to write key in block + inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const + { + const StringRef & key = cache_key.key; + 
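The layout comments above fix the per-entry overhead: every block starts with an 8-byte checksum and an 8-byte key count, and a simple-key entry costs 8 bytes of key plus 8 bytes of serialized-columns size before the payload. A quick standalone arithmetic check of the "fits in an empty block" condition (this is not the class itself, just the same computation):

```cpp
#include <cstddef>
#include <cstdint>

/// Mirrors the layout described above for a simple (UInt64) key:
///   block header: check_sum (8) + keys_size (8)
///   entry:        key (8) + data_size (8) + data_size bytes of payload
bool simpleKeyFitsInEmptyBlock(size_t payload_size, size_t block_size)
{
    constexpr size_t block_header_size = sizeof(uint64_t) + sizeof(uint64_t); /// 16 bytes
    constexpr size_t key_size = sizeof(uint64_t);                             /// 8 bytes
    constexpr size_t payload_size_field = sizeof(uint64_t);                   /// 8 bytes

    return block_header_size + key_size + payload_size_field + payload_size <= block_size;
}

/// Example: with a 4096-byte block, a single simple-key entry can carry
/// at most 4096 - 16 - 8 - 8 = 4064 bytes of serialized column data.
```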
size_t complex_key_size = sizeof(key.size) + key.size; + + return (current_block_offset + (complex_key_size + sizeof(cache_key.size) + cache_key.size)) <= block_size; + } + + /// Write key and returns offset in ssd cache block where data is written + /// It is client responsibility to check if there is enough place in block to write key + /// Returns true if key was written and false if there was not enough place to write key + inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) + { + assert(cache_key.size > 0); + + if (!enoughtPlaceToWriteKey(cache_key)) + return false; + + char * current_block_offset_data = block_data + current_block_offset; + + /// Write simple key + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(&cache_key.key), sizeof(cache_key.key)); + current_block_offset_data += sizeof(cache_key.key); + current_block_offset += sizeof(cache_key.key); + + /// Write serialized columns size + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(&cache_key.size), sizeof(cache_key.size)); + current_block_offset_data += sizeof(cache_key.size); + current_block_offset += sizeof(cache_key.size); + + offset_in_block = current_block_offset; + + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(cache_key.data), cache_key.size); + current_block_offset += cache_key.size; + + ++keys_size; + + return true; + } + + inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) + { + assert(cache_key.size > 0); + + if (!enoughtPlaceToWriteKey(cache_key)) + return false; + + char * current_block_offset_data = block_data + current_block_offset; + + const StringRef & key = cache_key.key; + + /// Write complex key + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(&key.size), sizeof(key.size)); + current_block_offset_data += sizeof(key.size); + current_block_offset += sizeof(key.size); + + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(key.data), key.size); + current_block_offset_data += key.size; + current_block_offset += key.size; + + /// Write serialized columns size + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(&cache_key.size), sizeof(cache_key.size)); + current_block_offset_data += sizeof(cache_key.size); + current_block_offset += sizeof(cache_key.size); + + offset_in_block = current_block_offset; + + memcpy(reinterpret_cast(current_block_offset_data), reinterpret_cast(cache_key.data), cache_key.size); + current_block_offset += cache_key.size; + + ++keys_size; + + return true; + } + + inline size_t getKeysSize() const { return keys_size; } + + /// Write keys size into block header + inline void writeKeysSize() + { + char * keys_size_offset_data = block_data + block_header_check_sum_size; + std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t)); + } + + /// Get check sum from block header + inline size_t getCheckSum() const { return unalignedLoad(block_data); } + + /// Calculate check sum in block + inline size_t calculateCheckSum() const + { + size_t calculated_check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); + + return calculated_check_sum; + } + + /// Check if check sum from block header matched calculated check sum in block + inline bool checkCheckSum() const + { + size_t calculated_check_sum = calculateCheckSum(); + size_t check_sum = getCheckSum(); + + return calculated_check_sum == check_sum; + } + + /// Write check sum in 
block header + inline void writeCheckSum() + { + size_t check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); + std::memcpy(block_data, &check_sum, sizeof(size_t)); + } + + inline size_t getBlockSize() const { return block_size; } + + /// Returns block data + inline char * getBlockData() const { return block_data; } + + /// Read keys that were serialized in block + /// It is client responsibility to ensure that simple or complex keys were written in block + void readSimpleKeys(PaddedPODArray & simple_keys) const + { + char * block_start = block_data + block_header_size; + char * block_end = block_data + block_size; + + static constexpr size_t key_prefix_size = sizeof(UInt64) + sizeof(size_t); + + while (block_start + key_prefix_size < block_end) + { + UInt64 key = unalignedLoad(block_start); + block_start += sizeof(UInt64); + + size_t allocated_size = unalignedLoad(block_start); + block_start += sizeof(size_t); + + /// If we read empty allocated size that means it is end of block + if (allocated_size == 0) + break; + + simple_keys.emplace_back(key); + block_start += allocated_size; + } + } + + void readComplexKeys(PaddedPODArray & complex_keys) const + { + char * block_start = block_data + block_header_size; + char * block_end = block_data + block_size; + + static constexpr size_t key_prefix_size = sizeof(size_t) + sizeof(size_t); + + while (block_start + key_prefix_size < block_end) + { + size_t key_size = unalignedLoad(block_start); + block_start += sizeof(key_size); + + StringRef complex_key (block_start, key_size); + + block_start += key_size; + + size_t allocated_size = unalignedLoad(block_start); + block_start += sizeof(size_t); + + /// If we read empty allocated size that means it is end of block + if (allocated_size == 0) + break; + + complex_keys.emplace_back(complex_key); + block_start += allocated_size; + } + } + +private: + size_t block_size; + char * block_data = nullptr; + + size_t current_block_offset = block_header_size; + size_t keys_size = 0; +}; + +struct SSDCacheIndex +{ + SSDCacheIndex(size_t block_index_, size_t offset_in_block_) + : block_index(block_index_) + , offset_in_block(offset_in_block_) + {} + + SSDCacheIndex() = default; + + size_t block_index = 0; + size_t offset_in_block = 0; +}; + +inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs) +{ + return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block; +} + +/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block. + * If key cannot be written into current_write_block, current block keys size and check summ is written + * and buffer increase index of current_write_block_index. + * If current_write_block_index == memory_buffer_blocks_size write key will always returns true. + * If reset is called current_write_block_index is set to 0. 
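The checksum discipline in the block code above is straightforward: the first 8 bytes of each block store a hash of the remaining `block_size - 8` bytes, written when the block is sealed and re-verified on every read. A self-contained sketch of the same discipline, using `std::hash` purely as a stand-in for the `CityHash_v1_0_2::CityHash64` call in the real code:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string_view>

constexpr size_t checksum_field_size = sizeof(uint64_t);

/// Stand-in for CityHash64 over everything in the block after the checksum field.
uint64_t hashPayload(const char * block_data, size_t block_size)
{
    return std::hash<std::string_view>{}(
        std::string_view(block_data + checksum_field_size, block_size - checksum_field_size));
}

/// Seal the block: store the hash of the payload in the first 8 bytes.
void sealBlock(char * block_data, size_t block_size)
{
    const uint64_t check_sum = hashPayload(block_data, block_size);
    std::memcpy(block_data, &check_sum, sizeof(check_sum));
}

/// Verify on read: recompute and compare with the stored value.
bool verifyBlock(const char * block_data, size_t block_size)
{
    uint64_t stored = 0;
    std::memcpy(&stored, block_data, sizeof(stored));
    return stored == hashPayload(block_data, block_size);
}
```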
+ */ +template +class SSDCacheMemoryBuffer +{ +public: + using KeyType = typename SSDCacheKeyType::KeyType; + + explicit SSDCacheMemoryBuffer(size_t block_size_, size_t memory_buffer_blocks_size_) + : block_size(block_size_) + , partition_blocks_size(memory_buffer_blocks_size_) + , buffer(block_size * partition_blocks_size, 4096) + , current_write_block(block_size) + { + current_write_block.reset(buffer.m_data); + } + + bool writeKey(const SSDCacheKeyType & key, SSDCacheIndex & index) + { + if (current_block_index == partition_blocks_size) + return false; + + size_t block_offset = 0; + bool write_in_current_block = current_write_block.writeKey(key, block_offset); + + if (write_in_current_block) + { + index.block_index = current_block_index; + index.offset_in_block = block_offset; + return true; + } + + current_write_block.writeKeysSize(); + current_write_block.writeCheckSum(); + + ++current_block_index; + + if (current_block_index == partition_blocks_size) + return false; + + current_write_block.reset(buffer.m_data + (block_size * current_block_index)); + + write_in_current_block = current_write_block.writeKey(key, block_offset); + assert(write_in_current_block); + + index.block_index = current_block_index; + index.offset_in_block = block_offset; + + return write_in_current_block; + } + + void writeKeysSizeAndCheckSumForCurrentWriteBlock() + { + current_write_block.writeKeysSize(); + current_write_block.writeCheckSum(); + } + + inline char * getPlace(SSDCacheIndex index) const + { + return buffer.m_data + index.block_index * block_size + index.offset_in_block; + } + + inline size_t getCurrentBlockIndex() const { return current_block_index; } + + inline const char * getData() const { return buffer.m_data; } + + inline size_t getSizeInBytes() const { return block_size * partition_blocks_size; } + + void readKeys(PaddedPODArray & keys) const + { + SSDCacheBlock block(block_size); + + for (size_t block_index = 0; block_index < partition_blocks_size; ++block_index) + { + block.reset(buffer.m_data + (block_index * block_size)); + + if constexpr (std::is_same_v) + block.readSimpleKeys(keys); + else + block.readComplexKeys(keys); + } + } + + inline void reset() + { + current_block_index = 0; + current_write_block.reset(buffer.m_data); + } + + const size_t block_size; + + const size_t partition_blocks_size; + +private: + Memory> buffer; + + SSDCacheBlock current_write_block; + + size_t current_block_index = 0; +}; + +/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system +template +class SSDCacheFileBuffer : private boost::noncopyable +{ + static constexpr auto BIN_FILE_EXT = ".bin"; + +public: + + using KeyType = typename SSDCacheKeyType::KeyType; + + explicit SSDCacheFileBuffer( + const std::string & file_path_, + size_t block_size_, + size_t file_blocks_size_) + : file_path(file_path_ + BIN_FILE_EXT) + , block_size(block_size_) + , file_blocks_size(file_blocks_size_) + { + auto path = std::filesystem::path{file_path}; + auto parent_path_directory = path.parent_path(); + + /// If cache file is in directory that does not exists create it + if (!std::filesystem::exists(parent_path_directory)) + if (!std::filesystem::create_directories(parent_path_directory)) + throw Exception(ErrorCodes::CANNOT_CREATE_DIRECTORY, "Failed to create directories."); + + ProfileEvents::increment(ProfileEvents::FileOpen); + + file.fd = ::open(file_path.c_str(), O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0666); + if (file.fd == -1) + { + auto error_code = (errno == ENOENT) ? 
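`SSDCacheMemoryBuffer::writeKey` above implements a "seal and advance" policy: if the key does not fit in the current block, the block's key count and checksum are finalized and writing moves to the next block; once all write-buffer blocks are full it returns false so the caller can flush the whole buffer to disk. A reduced sketch of that policy over an abstract block type (`Block` is a stand-in, not `SSDCacheBlock`):

```cpp
#include <cstddef>
#include <vector>

/// Stand-in for SSDCacheBlock: tracks only how many bytes are used.
struct Block
{
    size_t capacity = 4096;
    size_t used = 16; /// header: checksum (8) + keys count (8)

    bool tryWrite(size_t bytes)
    {
        if (used + bytes > capacity)
            return false;
        used += bytes;
        return true;
    }

    void seal() { /* in the real class: writeKeysSize() + writeCheckSum() */ }
};

/// Returns true if the entry fits somewhere in the in-memory buffer,
/// false when the buffer is exhausted and must be flushed to the file.
bool writeToBuffer(std::vector<Block> & blocks, size_t & current_block, size_t entry_size)
{
    if (current_block == blocks.size())
        return false;

    if (blocks[current_block].tryWrite(entry_size))
        return true;

    /// Seal the current block and move on to the next one.
    blocks[current_block].seal();
    ++current_block;

    if (current_block == blocks.size())
        return false;

    return blocks[current_block].tryWrite(entry_size); /// fits unless the entry exceeds a whole block
}
```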
ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE; + throwFromErrnoWithPath("Cannot open file " + file_path, file_path, error_code); + } + + allocateSizeForNextPartition(); + } + + void allocateSizeForNextPartition() + { + if (preallocateDiskSpace(file.fd, current_blocks_size * block_size, block_size * file_blocks_size) < 0) + throwFromErrnoWithPath("Cannot preallocate space for the file " + file_path, file_path, ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + current_blocks_size += file_blocks_size; + } + + bool writeBuffer(const char * buffer, size_t buffer_size_in_blocks) + { + if (current_block_index + buffer_size_in_blocks > current_blocks_size) + return false; + + AIOContext aio_context{1}; + + iocb write_request{}; + iocb * write_request_ptr{&write_request}; + + #if defined(__FreeBSD__) + write_request.aio.aio_lio_opcode = LIO_WRITE; + write_request.aio.aio_fildes = file.fd; + write_request.aio.aio_buf = reinterpret_cast(const_cast(buffer)); + write_request.aio.aio_nbytes = block_size * buffer_size_in_blocks; + write_request.aio.aio_offset = current_block_index * block_size; + #else + write_request.aio_lio_opcode = IOCB_CMD_PWRITE; + write_request.aio_fildes = file.fd; + write_request.aio_buf = reinterpret_cast(buffer); + write_request.aio_nbytes = block_size * buffer_size_in_blocks; + write_request.aio_offset = current_block_index * block_size; + #endif + + while (io_submit(aio_context.ctx, 1, &write_request_ptr) < 0) + { + if (errno != EINTR) + throw Exception(ErrorCodes::CANNOT_IO_SUBMIT, "Cannot submit request for asynchronous IO on file {}", file_path); + } + + // CurrentMetrics::Increment metric_increment_write{CurrentMetrics::Write}; + + io_event event; + + while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) < 0) + { + if (errno != EINTR) + throw Exception(ErrorCodes::CANNOT_IO_GETEVENTS, "Failed to wait for asynchronous IO completion on file {}", file_path); + } + + // Unpoison the memory returned from an uninstrumented system function. + __msan_unpoison(&event, sizeof(event)); + + auto bytes_written = eventResult(event); + + ProfileEvents::increment(ProfileEvents::WriteBufferAIOWrite); + ProfileEvents::increment(ProfileEvents::WriteBufferAIOWriteBytes, bytes_written); + + if (bytes_written != static_cast(block_size * buffer_size_in_blocks)) + throw Exception(ErrorCodes::AIO_WRITE_ERROR, + "Not all data was written for asynchronous IO on file {}. 
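Both the read and write paths above wrap `io_submit` / `io_getevents` in retry loops that tolerate only `EINTR`; any other errno is treated as fatal. That discipline generalizes to any interruptible syscall. A minimal sketch using plain `write()` so the example stays self-contained (the real code drives the Linux AIO syscalls through ClickHouse's `AIOContext` wrapper, which is not reproduced here):

```cpp
#include <cerrno>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <unistd.h>

/// Retry an interruptible syscall until it either succeeds or fails with a real error.
/// The dictionary code applies the same pattern to io_submit and io_getevents.
ssize_t writeRetryingOnEintr(int fd, const void * buf, size_t count)
{
    ssize_t written = -1;
    while ((written = ::write(fd, buf, count)) < 0)
    {
        if (errno != EINTR)
            throw std::runtime_error("write failed with errno " + std::to_string(errno));
    }
    return written;
}
```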
returned: {}", + file_path, + std::to_string(bytes_written)); + + if (::fsync(file.fd) < 0) + throwFromErrnoWithPath("Cannot fsync " + file_path, file_path, ErrorCodes::CANNOT_FSYNC); + + current_block_index += buffer_size_in_blocks; + + return true; + } + + bool readKeys(size_t block_start, size_t blocks_length, PaddedPODArray & out) const + { + if (block_start + blocks_length > current_blocks_size) + return false; + + size_t buffer_size_in_bytes = blocks_length * block_size; + + Memory read_buffer_memory(block_size * blocks_length, block_size); + + iocb request{}; + iocb * request_ptr = &request; + + #if defined(__FreeBSD__) + request.aio.aio_lio_opcode = LIO_READ; + request.aio.aio_fildes = file.fd; + request.aio.aio_buf = reinterpret_cast(reinterpret_cast(read_buffer_memory.data())); + request.aio.aio_nbytes = buffer_size_in_bytes; + request.aio.aio_offset = block_start * block_size; + request.aio_data = 0; + #else + request.aio_lio_opcode = IOCB_CMD_PREAD; + request.aio_fildes = file.fd; + request.aio_buf = reinterpret_cast(read_buffer_memory.data()); + request.aio_nbytes = buffer_size_in_bytes; + request.aio_offset = block_start * block_size; + request.aio_data = 0; + #endif + + io_event event{}; + AIOContext aio_context(1); + + while (io_submit(aio_context.ctx, 1, &request_ptr) != 1) + { + if (errno != EINTR) + throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); + } + + while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) != 1) + { + if (errno != EINTR) + throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); + } + + auto read_bytes = eventResult(event); + + if (read_bytes != static_cast(buffer_size_in_bytes)) + throw Exception(ErrorCodes::AIO_READ_ERROR, + "GC: AIO failed to read file {}. Expected bytes {}. 
Actual bytes {}", + file_path, + buffer_size_in_bytes, + read_bytes); + + SSDCacheBlock block(block_size); + + for (size_t i = 0; i < blocks_length; ++i) + { + block.reset(read_buffer_memory.data() + (i * block_size)); + + if constexpr (std::is_same_v) + block.readSimpleKeys(out); + else + block.readComplexKeys(out); + } + + return true; + } + + template + void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const + { + if (blocks_to_fetch.empty()) + return; + + Memory> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096); + + size_t blocks_to_fetch_size = blocks_to_fetch.size(); + + PaddedPODArray requests; + PaddedPODArray pointers; + + requests.reserve(blocks_to_fetch_size); + pointers.reserve(blocks_to_fetch_size); + + for (size_t block_to_fetch_index = 0; block_to_fetch_index < blocks_to_fetch_size; ++block_to_fetch_index) + { + iocb request{}; + + char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); + + #if defined(__FreeBSD__) + request.aio.aio_lio_opcode = LIO_READ; + request.aio.aio_fildes = file.fd; + request.aio.aio_buf = reinterpret_cast(reinterpret_cast(buffer_place)); + request.aio.aio_nbytes = block_size; + request.aio.aio_offset = block_size * blocks_to_fetch[block_to_fetch_index]; + request.aio_data = block_to_fetch_index; + #else + request.aio_lio_opcode = IOCB_CMD_PREAD; + request.aio_fildes = file.fd; + request.aio_buf = reinterpret_cast(buffer_place); + request.aio_nbytes = block_size; + request.aio_offset = block_size * blocks_to_fetch[block_to_fetch_index]; + request.aio_data = block_to_fetch_index; + #endif + + requests.push_back(request); + pointers.push_back(&requests.back()); + } + + AIOContext aio_context(read_from_file_buffer_blocks_size); + + PaddedPODArray processed(requests.size(), false); + PaddedPODArray events; + events.resize_fill(requests.size()); + + size_t to_push = 0; + size_t to_pop = 0; + + while (to_pop < requests.size()) + { + int popped = 0; + + while (to_pop < to_push && (popped = io_getevents(aio_context.ctx, to_push - to_pop, to_push - to_pop, &events[to_pop], nullptr)) <= 0) + { + if (errno != EINTR) + throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); + } + + for (size_t i = to_pop; i < to_pop + popped; ++i) + { + size_t block_to_fetch_index = events[i].data; + const auto & request = requests[block_to_fetch_index]; + + const ssize_t read_bytes = eventResult(events[i]); + + if (read_bytes != static_cast(block_size)) + throw Exception(ErrorCodes::AIO_READ_ERROR, + "GC: AIO failed to read file ({}). Expected bytes ({}). Actual bytes ({})", file_path, block_size, read_bytes); + + char * request_buffer = getRequestBuffer(request); + + // Unpoison the memory returned from an uninstrumented system function. + __msan_unpoison(request_buffer, block_size); + + SSDCacheBlock block(block_size); + block.reset(request_buffer); + + if (!block.checkCheckSum()) + { + std::string calculated_check_sum = std::to_string(block.calculateCheckSum()); + std::string check_sum = std::to_string(block.getCheckSum()); + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Cache data corrupted. Checksum validation failed. 
Calculated {} in block {}", + calculated_check_sum, + check_sum); + } + + std::forward(func)(blocks_to_fetch[block_to_fetch_index], block.getBlockData()); + + processed[block_to_fetch_index] = true; + } + + while (to_pop < requests.size() && processed[to_pop]) + ++to_pop; + + /// add new io tasks + const int new_tasks_count = std::min(read_from_file_buffer_blocks_size - (to_push - to_pop), requests.size() - to_push); + + int pushed = 0; + while (new_tasks_count > 0 && (pushed = io_submit(aio_context.ctx, new_tasks_count, &pointers[to_push])) <= 0) + { + if (errno != EINTR) + throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); + } + + to_push += pushed; + } + } + + inline size_t getCurrentBlockIndex() const { return current_block_index; } + + inline void reset() + { + current_block_index = 0; + } +private: + struct FileDescriptor : private boost::noncopyable + { + + FileDescriptor() = default; + + FileDescriptor(FileDescriptor && rhs) : fd(rhs.fd) { rhs.fd = -1; } + + FileDescriptor & operator=(FileDescriptor && rhs) + { + close(fd); + + fd = rhs.fd; + rhs.fd = -1; + } + + ~FileDescriptor() + { + if (fd != -1) + close(fd); + } + + int fd = -1; + }; + + inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) + { + #if defined(__FreeBSD__) + return posix_fallocate(fd, offset, len); + #else + return fallocate(fd, 0, offset, len); + #endif + } + + inline static char * getRequestBuffer(const iocb & request) + { + char * result = nullptr; + + #if defined(__FreeBSD__) + result = reinterpret_cast(reinterpret_cast(request.aio.aio_buf)); + #else + result = reinterpret_cast(request.aio_buf); + #endif + + return result; + } + + inline static ssize_t eventResult(io_event & event) + { + ssize_t bytes_written; + + #if defined(__FreeBSD__) + bytes_written = aio_return(reinterpret_cast(event.udata)); + #else + bytes_written = event.res; + #endif + + return bytes_written; + } + + String file_path; + size_t block_size; + size_t file_blocks_size; + FileDescriptor file; + + size_t current_block_index = 0; + size_t current_blocks_size = 0; +}; + +/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. + * Data is first written in memory buffer. + * If memory buffer is full then buffer is flushed to disk partition. + * If memory buffer cannot be flushed to associated disk partition, then if partition + * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. + * Index maps key to partition block and offset. 
+ */ +template +class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage +{ +public: + using SSDCacheKeyType = std::conditional_t; + using KeyType = std::conditional_t; + + explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_) + : configuration(configuration_) + , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) + , rnd_engine(randomSeed()) + { + memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size); + } + + bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } + + String getName() const override + { + if (dictionary_key_type == DictionaryKeyType::simple) + return "SSDCache"; + else + return "SSDComplexKeyCache"; + } + + bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } + + SimpleKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return fetchColumnsForKeysImpl(keys, fetch_request); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for complex key storage"); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for complex key storage"); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertDefaultKeysImpl(keys); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeysImpl is not supported for complex key storage"); + } + + PaddedPODArray getCachedSimpleKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return getCachedKeysImpl(); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedSimpleKeys is not supported for complex key storage"); + } + + bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } + + ComplexKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return fetchColumnsForKeysImpl(keys, fetch_request); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fetchColumnsForKeys is not supported for simple key storage"); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for simple key storage"); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertDefaultKeysImpl(keys); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeysImpl is not supported for simple key storage"); + } + + PaddedPODArray getCachedComplexKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return getCachedKeysImpl(); + else + throw 
Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedSimpleKeys is not supported for simple key storage"); + } + + size_t getSize() const override { return index.size(); } + + double getLoadFactor() const override + { + size_t partitions_size = memory_buffer_partitions.size(); + + if (partitions_size == configuration.max_partitions_count) + return 1.0; + + auto & current_memory_partition = memory_buffer_partitions[current_partition_index]; + + size_t full_partitions = partitions_size - 1; + size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex(); + size_t blocks_on_disk = file_buffer.getCurrentBlockIndex(); + + size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count; + + double load_factor = static_cast(blocks_in_memory + blocks_on_disk) / max_blocks_size; + return load_factor; + } + + size_t getBytesAllocated() const override + { + size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size; + size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size; + + return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; + } + +private: + + using TimePoint = std::chrono::system_clock::time_point; + + struct Cell + { + enum CellState + { + in_memory, + on_disk, + default_value + }; + + time_t deadline; + SSDCacheIndex index; + size_t in_memory_partition_index; + CellState state; + + inline bool isInMemory() const { return state == in_memory; } + inline bool isOnDisk() const { return state == on_disk; } + inline bool isDefaultValue() const { return state == default_value; } + }; + + struct KeyToBlockOffset + { + KeyToBlockOffset(size_t key_index_, size_t offset_in_block_) + : key_index(key_index_), offset_in_block(offset_in_block_) + {} + + size_t key_index = 0; + size_t offset_in_block = 0; + }; + + template + Result fetchColumnsForKeysImpl( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) const + { + Result result; + + result.fetched_columns = fetch_request.makeAttributesResultColumns(); + result.key_index_to_state.resize_fill(keys.size()); + + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + size_t fetched_columns_index = 0; + + using BlockIndexToKeysMap = absl::flat_hash_map, DefaultHash>; + BlockIndexToKeysMap block_to_keys_map; + absl::flat_hash_set> unique_blocks_to_request; + PaddedPODArray blocks_to_request; + + time_t strict_max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); + size_t keys_size = keys.size(); + + for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size) + if (fetch_request.shouldFillResultColumnWithIndex(attribute_size)) + result.fetched_columns[attribute_size]->reserve(keys_size); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys[key_index]; + + const auto * it = index.find(key); + + if (!it) + { + ++result.not_found_keys_size; + continue; + } + + const auto & cell = it->getMapped(); + + if (unlikely(now > cell.deadline + strict_max_lifetime_seconds)) + { + ++result.not_found_keys_size; + continue; + } + + bool cell_is_expired = false; + KeyState::State key_state = KeyState::found; + + if (now > cell.deadline) + { + cell_is_expired = true; + 
key_state = KeyState::expired; + } + + result.expired_keys_size += static_cast(cell_is_expired); + result.found_keys_size += static_cast(!cell_is_expired); + + switch (cell.state) + { + case Cell::in_memory: + { + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + ++fetched_columns_index; + + const auto & partition = memory_buffer_partitions[cell.in_memory_partition_index]; + char * serialized_columns_place = partition.getPlace(cell.index); + deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, serialized_columns_place); + break; + } + case Cell::on_disk: + { + PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; + keys_block.emplace_back(key_index, cell.index.offset_in_block); + + KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found; + + /// Fetched column index will be set later during fetch blocks + result.key_index_to_state[key_index] = {state, 0}; + + auto insert_result = unique_blocks_to_request.insert(cell.index.block_index); + bool was_inserted = insert_result.second; + + if (was_inserted) + blocks_to_request.emplace_back(cell.index.block_index); + + break; + } + case Cell::default_value: + { + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + result.key_index_to_state[key_index].setDefault(); + ++fetched_columns_index; + ++result.default_keys_size; + + insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); + break; + } + } + } + + /// Sort blocks by offset before start async io requests + std::sort(blocks_to_request.begin(), blocks_to_request.end()); + + file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) + { + auto & keys_in_block = block_to_keys_map[block_index]; + + for (auto & key_in_block : keys_in_block) + { + char * key_data = block_data + key_in_block.offset_in_block; + deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); + + result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index); + + ++fetched_columns_index; + } + }); + + return result; + } + + void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) + { + size_t columns_to_serialize_size = columns.size(); + PaddedPODArray temporary_column_data(columns_to_serialize_size); + + Arena temporary_values_pool; + + const auto now = std::chrono::system_clock::now(); + + for (size_t key_index = 0; key_index < keys.size(); ++key_index) + { + size_t allocated_size_for_columns = 0; + const char * block_start = nullptr; + + auto key = keys[key_index]; + + for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + { + auto & column = columns[column_index]; + temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); + allocated_size_for_columns += temporary_column_data[column_index].size; + } + + SSDCacheKeyType ssd_cache_key { key, allocated_size_for_columns, block_start }; + + if (!SSDCacheBlock::canBeWrittenInEmptyBlock(ssd_cache_key, configuration.block_size)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Serialized columns size is greater than allowed block size and metadata"); + + /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index + eraseKeyFromIndex(key); + + Cell cell; + setCellDeadline(cell, now); + + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + /// Copy 
complex key into arena and put in cache + size_t key_size = key.size; + char * place_for_key = complex_key_arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + KeyType updated_key{place_for_key, key_size}; + ssd_cache_key.key = updated_key; + } + + insertCell(ssd_cache_key, cell); + + temporary_values_pool.rollback(allocated_size_for_columns); + } + } + + void insertDefaultKeysImpl(const PaddedPODArray & keys) + { + const auto now = std::chrono::system_clock::now(); + + for (auto key : keys) + { + eraseKeyFromIndex(key); + + Cell cell; + + setCellDeadline(cell, now); + cell.index = {0, 0}; + cell.in_memory_partition_index = 0; + cell.state = Cell::default_value; + + + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + /// Copy complex key into arena and put in cache + size_t key_size = key.size; + char * place_for_key = complex_key_arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + KeyType updated_key{place_for_key, key_size}; + key = updated_key; + } + + index[key] = cell; + } + } + + PaddedPODArray getCachedKeysImpl() const + { + PaddedPODArray result; + result.reserve(index.size()); + + for (auto & node : index) + { + auto & cell = node.getMapped(); + + if (cell.state == Cell::default_value) + continue; + + result.emplace_back(node.getKey()); + } + + return result; + } + + void insertCell(SSDCacheKeyType & ssd_cache_key, Cell & cell) + { + /** InsertCell has following flow + + 1. We try to write key into current memory buffer, if write succeeded then return. + 2. Then if we does not write key into current memory buffer, we try to flush current memory buffer + to disk. + + If flush succeeded then reset current memory buffer, write key into it and return. + If flush failed that means that current partition on disk is full, need to allocate new partition + or start reusing old ones. + + Retry to step 1. 
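
      Editor's sketch of the same flow as pseudocode (signatures are simplified; the
      remap/allocate helpers are editorial shorthand, not functions of this class):

          while (true)
          {
              if (memory_partition.writeKey(key, index))        // step 1: key fits into memory buffer
                  break;
              if (file_buffer.writeBuffer(memory_partition))    // step 2: flush memory buffer to disk
              {
                  remapFlushedKeysToDiskBlocks();                // point index cells at file blocks
                  memory_partition.reset();
                  memory_partition.writeKey(key, index);         // must succeed in an empty buffer
                  break;
              }
              allocateNextPartitionOrStartReusingOld();          // current disk partition is full, retry step 1
          }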
+ */ + + SSDCacheIndex cache_index {0, 0}; + + while (true) + { + bool started_reusing_old_partitions = memory_buffer_partitions.size() == configuration.max_partitions_count; + + auto & current_memory_buffer_partition = memory_buffer_partitions[current_partition_index]; + + bool write_into_memory_buffer_result = current_memory_buffer_partition.writeKey(ssd_cache_key, cache_index); + + if (write_into_memory_buffer_result) + { + cell.state = Cell::in_memory; + cell.index = cache_index; + cell.in_memory_partition_index = current_partition_index; + + index[ssd_cache_key.key] = cell; + break; + } + else + { + /// Partition memory write buffer if full flush it to disk and retry + size_t block_index_in_file_before_write = file_buffer.getCurrentBlockIndex(); + + if (started_reusing_old_partitions) + { + /// If we start reusing old partitions we need to remove old keys on disk from index before writing buffer + PaddedPODArray old_keys; + file_buffer.readKeys(block_index_in_file_before_write, configuration.write_buffer_blocks_size, old_keys); + + size_t file_read_end_block_index = block_index_in_file_before_write + configuration.write_buffer_blocks_size; + + for (auto old_key : old_keys) + { + auto * it = index.find(old_key); + + if (it) + { + const Cell & old_key_cell = it->getMapped(); + + size_t old_key_block = old_key_cell.index.block_index; + + /// Check if key in index is key from old partition blocks + if (old_key_cell.isOnDisk() && + old_key_block >= block_index_in_file_before_write && + old_key_block < file_read_end_block_index) + eraseKeyFromIndex(old_key); + } + } + } + + const char * partition_data = current_memory_buffer_partition.getData(); + + bool flush_to_file_result = file_buffer.writeBuffer(partition_data, configuration.write_buffer_blocks_size); + + if (flush_to_file_result) + { + /// Update index cells keys offset and block index + PaddedPODArray keys_to_update; + current_memory_buffer_partition.readKeys(keys_to_update); + + absl::flat_hash_set> updated_keys; + + Int64 keys_to_update_size = static_cast(keys_to_update.size()); + + /// Start from last to first because there can be multiple keys in same partition. + /// The valid key is the latest. 
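                    /// Editor's note (illustrative example): if the same key, say 42, was written
                    /// twice into the flushed buffer, the reverse iteration below reaches its latest
                    /// occurrence first, records the key in updated_keys, and the earlier stale
                    /// occurrence is then skipped.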
+ for (Int64 i = keys_to_update_size - 1; i >= 0; --i) + { + auto key_to_update = keys_to_update[i]; + auto * it = index.find(key_to_update); + + /// If there are no key to update or key to update not in memory + if (!it || it->getMapped().state != Cell::in_memory) + continue; + + /// If there were duplicated keys in memory buffer partition + if (updated_keys.contains(it->getKey())) + continue; + + updated_keys.insert(key_to_update); + + Cell & cell_to_update = it->getMapped(); + + cell_to_update.state = Cell::on_disk; + cell_to_update.index.block_index += block_index_in_file_before_write; + } + + /// Memory buffer partition flushed to disk start reusing it + current_memory_buffer_partition.reset(); + memset(const_cast(current_memory_buffer_partition.getData()), 0, current_memory_buffer_partition.getSizeInBytes()); + + write_into_memory_buffer_result = current_memory_buffer_partition.writeKey(ssd_cache_key, cache_index); + assert(write_into_memory_buffer_result); + + cell.state = Cell::in_memory; + cell.index = cache_index; + cell.in_memory_partition_index = current_partition_index; + + index[ssd_cache_key.key] = cell; + break; + } + else + { + /// Partition is full need to try next partition + + if (memory_buffer_partitions.size() < configuration.max_partitions_count) + { + /// Try tro create next partition without reusing old partitions + ++current_partition_index; + file_buffer.allocateSizeForNextPartition(); + memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size); + } + else + { + /// Start reusing old partitions + current_partition_index = (current_partition_index + 1) % memory_buffer_partitions.size(); + file_buffer.reset(); + } + } + } + } + } + + inline void setCellDeadline(Cell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); + } + + inline void eraseKeyFromIndex(KeyType key) + { + auto it = index.find(key); + + if (it == nullptr) + return; + + /// In case of complex key in arena key is serialized from hash table + KeyType key_copy = it->getKey(); + + index.erase(key); + + if constexpr (std::is_same_v) + complex_key_arena.free(const_cast(key_copy.data), key_copy.size); + } + + SSDCacheDictionaryStorageConfiguration configuration; + + SSDCacheFileBuffer file_buffer; + + std::vector> memory_buffer_partitions; + + pcg64 rnd_engine; + + using SimpleKeyHashMap = HashMap; + using ComplexKeyHashMap = HashMapWithSavedHash; + + using CacheMap = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SimpleKeyHashMap, + ComplexKeyHashMap>; + + ArenaWithFreeLists complex_key_arena; + + CacheMap index; + + size_t current_partition_index = 0; + +}; + +} + +#endif diff --git a/src/Dictionaries/SSDComplexKeyCacheDictionary.cpp b/src/Dictionaries/SSDComplexKeyCacheDictionary.cpp deleted file mode 100644 index cb22dd2be15..00000000000 --- a/src/Dictionaries/SSDComplexKeyCacheDictionary.cpp +++ /dev/null @@ -1,1772 +0,0 @@ -#if 
defined(OS_LINUX) || defined(__FreeBSD__) - -#include "SSDComplexKeyCacheDictionary.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "DictionaryBlockInputStream.h" -#include "DictionaryFactory.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ProfileEvents -{ - extern const Event DictCacheKeysRequested; - extern const Event DictCacheKeysRequestedMiss; - extern const Event DictCacheKeysRequestedFound; - extern const Event DictCacheKeysExpired; - extern const Event DictCacheKeysNotFound; - extern const Event DictCacheKeysHit; - extern const Event DictCacheRequestTimeNs; - extern const Event DictCacheRequests; - extern const Event DictCacheLockWriteNs; - extern const Event DictCacheLockReadNs; - extern const Event FileOpen; - extern const Event WriteBufferAIOWrite; - extern const Event WriteBufferAIOWriteBytes; -} - -namespace CurrentMetrics -{ - extern const Metric DictCacheRequests; - extern const Metric Write; -} - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int AIO_READ_ERROR; - extern const int AIO_WRITE_ERROR; - extern const int BAD_ARGUMENTS; - extern const int CACHE_DICTIONARY_UPDATE_FAIL; - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int CANNOT_CREATE_DIRECTORY; - extern const int CANNOT_FSYNC; - extern const int CANNOT_IO_GETEVENTS; - extern const int CANNOT_IO_SUBMIT; - extern const int CANNOT_OPEN_FILE; - extern const int CORRUPTED_DATA; - extern const int FILE_DOESNT_EXIST; - extern const int NOT_IMPLEMENTED; - extern const int TYPE_MISMATCH; - extern const int UNSUPPORTED_METHOD; -} - -namespace -{ - constexpr size_t DEFAULT_SSD_BLOCK_SIZE_BYTES = DEFAULT_AIO_FILE_BLOCK_SIZE; - constexpr size_t DEFAULT_FILE_SIZE_BYTES = 4 * 1024 * 1024 * 1024ULL; - constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; - constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; - constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; - - constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000; - - constexpr size_t BUFFER_ALIGNMENT = DEFAULT_AIO_FILE_BLOCK_SIZE; - constexpr size_t BLOCK_CHECKSUM_SIZE_BYTES = 8; - constexpr size_t BLOCK_SPECIAL_FIELDS_SIZE_BYTES = 4; - - constexpr UInt64 KEY_METADATA_EXPIRES_AT_MASK = std::numeric_limits::max(); - constexpr UInt64 KEY_METADATA_IS_DEFAULT_MASK = ~KEY_METADATA_EXPIRES_AT_MASK; - - constexpr size_t KEY_IN_MEMORY_BIT = 63; - constexpr size_t KEY_IN_MEMORY = (1ULL << KEY_IN_MEMORY_BIT); - constexpr size_t BLOCK_INDEX_BITS = 32; - constexpr size_t INDEX_IN_BLOCK_BITS = 16; - constexpr size_t INDEX_IN_BLOCK_MASK = (1ULL << INDEX_IN_BLOCK_BITS) - 1; - constexpr size_t BLOCK_INDEX_MASK = ((1ULL << (BLOCK_INDEX_BITS + INDEX_IN_BLOCK_BITS)) - 1) ^ INDEX_IN_BLOCK_MASK; - - constexpr size_t NOT_EXISTS = -1; - - constexpr UInt8 HAS_NOT_FOUND = 2; - - const std::string BIN_FILE_EXT = ".bin"; - const std::string IND_FILE_EXT = ".idx"; - - int preallocateDiskSpace(int fd, size_t len) - { - #if defined(__FreeBSD__) - return posix_fallocate(fd, 0, len); - #else - return fallocate(fd, 0, 0, len); - #endif - } -} - -SSDComplexKeyCachePartition::Metadata::time_point_t SSDComplexKeyCachePartition::Metadata::expiresAt() const -{ - return ext::safe_bit_cast(data & KEY_METADATA_EXPIRES_AT_MASK); -} -void SSDComplexKeyCachePartition::Metadata::setExpiresAt(const time_point_t & t) -{ - data = ext::safe_bit_cast(t); -} - -bool 
SSDComplexKeyCachePartition::Metadata::isDefault() const -{ - return (data & KEY_METADATA_IS_DEFAULT_MASK) == KEY_METADATA_IS_DEFAULT_MASK; -} -void SSDComplexKeyCachePartition::Metadata::setDefault() -{ - data |= KEY_METADATA_IS_DEFAULT_MASK; -} - -bool SSDComplexKeyCachePartition::Index::inMemory() const -{ - return (index & KEY_IN_MEMORY) == KEY_IN_MEMORY; -} - -bool SSDComplexKeyCachePartition::Index::exists() const -{ - return index != NOT_EXISTS; -} - -void SSDComplexKeyCachePartition::Index::setNotExists() -{ - index = NOT_EXISTS; -} - -void SSDComplexKeyCachePartition::Index::setInMemory(const bool in_memory) -{ - index = (index & ~KEY_IN_MEMORY) | (static_cast(in_memory) << KEY_IN_MEMORY_BIT); -} - -size_t SSDComplexKeyCachePartition::Index::getAddressInBlock() const -{ - return index & INDEX_IN_BLOCK_MASK; -} - -void SSDComplexKeyCachePartition::Index::setAddressInBlock(const size_t address_in_block) -{ - index = (index & ~INDEX_IN_BLOCK_MASK) | address_in_block; -} - -size_t SSDComplexKeyCachePartition::Index::getBlockId() const -{ - return (index & BLOCK_INDEX_MASK) >> INDEX_IN_BLOCK_BITS; -} - -void SSDComplexKeyCachePartition::Index::setBlockId(const size_t block_id) -{ - index = (index & ~BLOCK_INDEX_MASK) | (block_id << INDEX_IN_BLOCK_BITS); -} - -SSDComplexKeyCachePartition::SSDComplexKeyCachePartition( - const AttributeUnderlyingType & /* key_structure */, - const std::vector & attributes_structure_, - const std::string & dir_path, - const size_t file_id_, - const size_t max_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : file_id(file_id_) - , max_size(max_size_) - , block_size(block_size_) - , read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , path(dir_path + "/" + std::to_string(file_id)) - , key_to_index(max_stored_keys, KeyDeleter(keys_pool)) - , attributes_structure(attributes_structure_) -{ - if (!std::filesystem::create_directories(std::filesystem::path{dir_path})) - { - if (std::filesystem::exists(std::filesystem::path{dir_path})) - LOG_INFO(&Poco::Logger::get("SSDComplexKeyCachePartition::Constructor"), "Using existing directory '{}' for cache-partition", dir_path); - else - throw Exception{"Failed to create directories.", ErrorCodes::CANNOT_CREATE_DIRECTORY}; - } - - { - ProfileEvents::increment(ProfileEvents::FileOpen); - - const std::string filename = path + BIN_FILE_EXT; - fd = ::open(filename.c_str(), O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0666); - if (fd == -1) - { - auto error_code = (errno == ENOENT) ? 
ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE; - throwFromErrnoWithPath("Cannot open file " + filename, filename, error_code); - } - - if (preallocateDiskSpace(fd, max_size * block_size) < 0) - throwFromErrnoWithPath("Cannot preallocate space for the file " + filename, filename, ErrorCodes::CANNOT_ALLOCATE_MEMORY); - } -} - -SSDComplexKeyCachePartition::~SSDComplexKeyCachePartition() -{ - std::unique_lock lock(rw_lock); - ::close(fd); -} - -size_t SSDComplexKeyCachePartition::appendDefaults( - const KeyRefs & keys_in, - const PaddedPODArray & metadata, - const size_t begin) -{ - std::unique_lock lock(rw_lock); - KeyRefs keys(keys_in.size()); - for (size_t i = 0; i < keys_in.size(); ++i) - keys[i] = keys_pool.copyKeyFrom(keys_in[i]); - - return append(keys, Attributes{}, metadata, begin); -} - -size_t SSDComplexKeyCachePartition::appendBlock( - const Columns & key_columns, const DataTypes & /* key_types */, - const Attributes & new_attributes, const PaddedPODArray & metadata, const size_t begin) -{ - std::unique_lock lock(rw_lock); - if (!new_attributes.empty() && new_attributes.size() != attributes_structure.size()) - throw Exception{"Wrong columns number in block.", ErrorCodes::BAD_ARGUMENTS}; - - const auto keys_size = key_columns.size(); - KeyRefs keys(key_columns.front()->size()); - { - StringRefs tmp_keys_refs(keys_size); - for (size_t i = 0; i < key_columns.front()->size(); ++i) - keys[i] = keys_pool.allocKey(i, key_columns, tmp_keys_refs); - } - - return append(keys, new_attributes, metadata, begin); -} - -size_t SSDComplexKeyCachePartition::append( - const KeyRefs & keys, - const Attributes & new_attributes, - const PaddedPODArray & metadata, - const size_t begin) -{ - if (!memory) - memory.emplace(block_size * write_buffer_size, BUFFER_ALIGNMENT); - - auto init_write_buffer = [&]() - { - write_buffer.emplace(memory->data() + current_memory_block_id * block_size, block_size); - uint64_t tmp = 0; - write_buffer->write(reinterpret_cast(&tmp), BLOCK_CHECKSUM_SIZE_BYTES); - write_buffer->write(reinterpret_cast(&tmp), BLOCK_SPECIAL_FIELDS_SIZE_BYTES); - keys_in_block = 0; - }; - - if (!write_buffer) - init_write_buffer(); - if (!keys_buffer_pool) - keys_buffer_pool.emplace(); - - bool flushed = false; - auto finish_block = [&]() - { - write_buffer.reset(); - std::memcpy(memory->data() + block_size * current_memory_block_id + BLOCK_CHECKSUM_SIZE_BYTES, &keys_in_block, sizeof(keys_in_block)); // set count - uint64_t checksum = CityHash_v1_0_2::CityHash64(memory->data() + block_size * current_memory_block_id + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); // checksum - std::memcpy(memory->data() + block_size * current_memory_block_id, &checksum, sizeof(checksum)); - if (++current_memory_block_id == write_buffer_size) - flush(); - flushed = true; - }; - - for (size_t index = begin; index < keys.size();) - { - Index cache_index; - cache_index.setInMemory(true); - cache_index.setBlockId(current_memory_block_id); - cache_index.setAddressInBlock(write_buffer->offset()); - - flushed = false; - if (keys[index].fullSize() + sizeof(UInt64) > write_buffer->available()) // place for key and metadata - { - finish_block(); - } - else - { - keys_pool.writeKey(keys[index], *write_buffer); - writeBinary(metadata[index].data, *write_buffer); - } - - for (const auto & attribute : new_attributes) - { - if (flushed) - break; - switch (attribute.type) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - { \ - if (sizeof(TYPE) > 
write_buffer->available()) \ - { \ - finish_block(); \ - continue; \ - } \ - else \ - { \ - const auto & values = std::get>(attribute.values); /* NOLINT */ \ - writeBinary(values[index], *write_buffer); \ - } \ - } \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - const auto & value = std::get>(attribute.values)[index]; - if (sizeof(UInt64) + value.size() > write_buffer->available()) - { - finish_block(); - continue; - } - else - { - writeStringBinary(value, *write_buffer); - } - } - break; - } - } - - if (!flushed) - { - key_to_index.setWithDelete(keys[index], cache_index); - keys_buffer.push_back(keys_buffer_pool->copyKeyFrom(keys[index])); - ++index; - ++keys_in_block; - } - else // next block in write buffer or flushed to ssd - { - init_write_buffer(); - } - } - return keys.size() - begin; -} - -void SSDComplexKeyCachePartition::flush() -{ - if (current_file_block_id >= max_size) - clearOldestBlocks(); - - if (keys_buffer.empty()) - return; - - AIOContext aio_context{1}; - - iocb write_request{}; - iocb * write_request_ptr{&write_request}; - -#if defined(__FreeBSD__) - write_request.aio.aio_lio_opcode = LIO_WRITE; - write_request.aio.aio_fildes = fd; - write_request.aio.aio_buf = reinterpret_cast(memory->data()); - write_request.aio.aio_nbytes = block_size * write_buffer_size; - write_request.aio.aio_offset = (current_file_block_id % max_size) * block_size; -#else - write_request.aio_lio_opcode = IOCB_CMD_PWRITE; - write_request.aio_fildes = fd; - write_request.aio_buf = reinterpret_cast(memory->data()); - write_request.aio_nbytes = block_size * write_buffer_size; - write_request.aio_offset = (current_file_block_id % max_size) * block_size; -#endif - - while (io_submit(aio_context.ctx, 1, &write_request_ptr) < 0) - { - if (errno != EINTR) - throw Exception("Cannot submit request for asynchronous IO on file " + path + BIN_FILE_EXT, ErrorCodes::CANNOT_IO_SUBMIT); - } - - CurrentMetrics::Increment metric_increment_write{CurrentMetrics::Write}; - - io_event event; - while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) < 0) - { - if (errno != EINTR) - throw Exception("Failed to wait for asynchronous IO completion on file " + path + BIN_FILE_EXT, ErrorCodes::CANNOT_IO_GETEVENTS); - } - - // Unpoison the memory returned from an uninstrumented system function. - __msan_unpoison(&event, sizeof(event)); - - ssize_t bytes_written; -#if defined(__FreeBSD__) - bytes_written = aio_return(reinterpret_cast(event.udata)); -#else - bytes_written = event.res; -#endif - - ProfileEvents::increment(ProfileEvents::WriteBufferAIOWrite); - ProfileEvents::increment(ProfileEvents::WriteBufferAIOWriteBytes, bytes_written); - - if (bytes_written != static_cast(block_size * write_buffer_size)) - throw Exception("Not all data was written for asynchronous IO on file " + path + BIN_FILE_EXT + ". 
returned: " + std::to_string(bytes_written), ErrorCodes::AIO_WRITE_ERROR); - - if (::fsync(fd) < 0) - throwFromErrnoWithPath("Cannot fsync " + path + BIN_FILE_EXT, path + BIN_FILE_EXT, ErrorCodes::CANNOT_FSYNC); - - /// commit changes in index - for (auto & key : keys_buffer) - { - Index index; - if (key_to_index.getKeyAndValue(key, index)) - { - if (index.inMemory()) // Row can be inserted in the buffer twice, so we need to move to ssd only the last index. - { - index.setInMemory(false); - index.setBlockId((current_file_block_id % max_size) + index.getBlockId()); - } - key_to_index.set(key, index); - } - } - - current_file_block_id += write_buffer_size; - current_memory_block_id = 0; - - /// clear buffer - keys_buffer.clear(); - keys_buffer_pool.reset(); - keys_buffer_pool.emplace(); -} - -template -void SSDComplexKeyCachePartition::getValue( - const size_t attribute_index, - const Columns & key_columns, - const DataTypes & key_types, - ResultArrayType & out, - std::vector & found, - GetDefault & default_value_extractor, - std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - keys_pool.ignoreKey(buf); - Metadata metadata; - readVarUInt(metadata.data, buf); - - if (metadata.expiresAt() > now) - { - if (metadata.isDefault()) - out[index] = default_value_extractor[index]; - else - { - ignoreFromBufferToAttributeIndex(attribute_index, buf); - readBinary(out[index], buf); - } - found[index] = true; - } - }; - - getImpl(key_columns, key_types, set_value, found); -} - -void SSDComplexKeyCachePartition::getString(const size_t attribute_index, - const Columns & key_columns, const DataTypes & key_types, - StringRefs & refs, ArenaWithFreeLists & arena, std::vector & found, - std::vector & default_ids, - std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - keys_pool.ignoreKey(buf); - Metadata metadata; - readBinary(metadata.data, buf); - - if (metadata.expiresAt() > now) - { - if (metadata.isDefault()) - default_ids.push_back(index); - else - { - ignoreFromBufferToAttributeIndex(attribute_index, buf); - size_t size = 0; - readVarUInt(size, buf); - char * string_ptr = arena.alloc(size); - memcpy(string_ptr, buf.position(), size); - refs[index].data = string_ptr; - refs[index].size = size; - } - found[index] = true; - } - }; - - getImpl(key_columns, key_types, set_value, found); -} - -void SSDComplexKeyCachePartition::hasKeys( - const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out, - std::vector & found, std::chrono::system_clock::time_point now) const -{ - auto set_value = [&](const size_t index, ReadBuffer & buf) - { - keys_pool.ignoreKey(buf); - Metadata metadata; - readBinary(metadata.data, buf); - - if (metadata.expiresAt() > now) - out[index] = !metadata.isDefault(); - }; - - getImpl(key_columns, key_types, set_value, found); -} - -template -void SSDComplexKeyCachePartition::getImpl( - const Columns & key_columns, const DataTypes & /* key_types */, - SetFunc & set, std::vector & found) const -{ - TemporalComplexKeysPool tmp_keys_pool; - StringRefs tmp_refs(key_columns.size()); - - std::shared_lock lock(rw_lock); - PaddedPODArray indices(key_columns.front()->size()); - for (size_t i = 0; i < key_columns.front()->size(); ++i) - { - auto key = tmp_keys_pool.allocKey(i, key_columns, tmp_refs); - SCOPE_EXIT(tmp_keys_pool.rollback(key)); - Index index; - if (found[i]) - indices[i].setNotExists(); - else if (key_to_index.get(key, index)) - 
indices[i] = index; - else - indices[i].setNotExists(); - } - - getValueFromMemory(indices, set); - getValueFromStorage(indices, set); -} - -template -void SSDComplexKeyCachePartition::getValueFromMemory(const PaddedPODArray & indices, SetFunc & set) const -{ - // Do not check checksum while reading from memory. - for (size_t i = 0; i < indices.size(); ++i) - { - const auto & index = indices[i]; - if (index.exists() && index.inMemory()) - { - const size_t offset = index.getBlockId() * block_size + index.getAddressInBlock(); - - ReadBufferFromMemory read_buffer(memory->data() + offset, block_size * write_buffer_size - offset); - set(i, read_buffer); - } - } -} - -template -void SSDComplexKeyCachePartition::getValueFromStorage(const PaddedPODArray & indices, SetFunc & set) const -{ - std::vector> index_to_out; - for (size_t i = 0; i < indices.size(); ++i) - { - const auto & index = indices[i]; - if (index.exists() && !index.inMemory()) - index_to_out.emplace_back(index, i); - } - if (index_to_out.empty()) - return; - - /// sort by (block_id, offset_in_block) - std::sort(std::begin(index_to_out), std::end(index_to_out)); - - Memory read_buffer(block_size * read_buffer_size, BUFFER_ALIGNMENT); - - // TODO: merge requests - std::vector requests; - std::vector pointers; - std::vector> blocks_to_indices; - requests.reserve(index_to_out.size()); - pointers.reserve(index_to_out.size()); - blocks_to_indices.reserve(index_to_out.size()); - for (size_t i = 0; i < index_to_out.size(); ++i) - { - #if defined(__FreeBSD__) - const size_t back_offset = requests.empty() ? -1 : static_cast(requests.back().aio.aio_offset); - #else - const size_t back_offset = requests.empty() ? -1 : static_cast(requests.back().aio_offset); - #endif - - if (!requests.empty() && back_offset == index_to_out[i].first.getBlockId() * block_size) - { - blocks_to_indices.back().push_back(i); - continue; - } - - iocb request{}; -#if defined(__FreeBSD__) - request.aio.aio_lio_opcode = LIO_READ; - request.aio.aio_fildes = fd; - request.aio.aio_buf = reinterpret_cast( - reinterpret_cast(read_buffer.data()) + block_size * (requests.size() % read_buffer_size)); - request.aio.aio_nbytes = block_size; - request.aio.aio_offset = index_to_out[i].first.getBlockId() * block_size; - request.aio_data = requests.size(); -#else - request.aio_lio_opcode = IOCB_CMD_PREAD; - request.aio_fildes = fd; - request.aio_buf = reinterpret_cast(read_buffer.data()) + block_size * (requests.size() % read_buffer_size); - request.aio_nbytes = block_size; - request.aio_offset = index_to_out[i].first.getBlockId() * block_size; - request.aio_data = requests.size(); -#endif - requests.push_back(request); - pointers.push_back(&requests.back()); - blocks_to_indices.emplace_back(); - blocks_to_indices.back().push_back(i); - } - - AIOContext aio_context(read_buffer_size); - - std::vector processed(requests.size(), false); - std::vector events(requests.size()); - #if defined(__linux__) - for (auto & event : events) - event.res = -1; - #endif - - - size_t to_push = 0; - size_t to_pop = 0; - while (to_pop < requests.size()) - { - /// get io tasks from previous iteration - int popped = 0; - while (to_pop < to_push && (popped = io_getevents(aio_context.ctx, to_push - to_pop, to_push - to_pop, &events[to_pop], nullptr)) <= 0) - { - if (errno != EINTR) - throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); - } - - for (size_t i = to_pop; i < to_pop + popped; ++i) - { - const auto request_id = events[i].data; - const auto & 
request = requests[request_id]; - - #if defined(__FreeBSD__) - const auto bytes_written = aio_return(reinterpret_cast(events[i].udata)); - #else - const auto bytes_written = events[i].res; - #endif - - if (bytes_written != static_cast(block_size)) - { - #if defined(__FreeBSD__) - throw Exception("AIO failed to read file " + path + BIN_FILE_EXT + ".", ErrorCodes::AIO_READ_ERROR); - #else - throw Exception("AIO failed to read file " + path + BIN_FILE_EXT + ". " + - "request_id= " + std::to_string(request.aio_data) + "/ " + std::to_string(requests.size()) + - ", aio_nbytes=" + std::to_string(request.aio_nbytes) + ", aio_offset=" + std::to_string(request.aio_offset) + - ", returned=" + std::to_string(events[i].res) + ", errno=" + std::to_string(errno), ErrorCodes::AIO_READ_ERROR); - #endif - } - #if defined(__FreeBSD__) - const char* buf_ptr = reinterpret_cast(reinterpret_cast(request.aio.aio_buf)); - #else - const auto* buf_ptr = reinterpret_cast(request.aio_buf); - #endif - - __msan_unpoison(buf_ptr, block_size); - uint64_t checksum = 0; - ReadBufferFromMemory buf_special(buf_ptr, block_size); - readBinary(checksum, buf_special); - uint64_t calculated_checksum = CityHash_v1_0_2::CityHash64(buf_ptr + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); - if (checksum != calculated_checksum) - { - throw Exception("Cache data corrupted. From block = " + std::to_string(checksum) + " calculated = " + std::to_string(calculated_checksum) + ".", ErrorCodes::CORRUPTED_DATA); - } - - for (const size_t idx : blocks_to_indices[request_id]) - { - const auto & [file_index, out_index] = index_to_out[idx]; - ReadBufferFromMemory buf( - buf_ptr + file_index.getAddressInBlock(), - block_size - file_index.getAddressInBlock()); - set(out_index, buf); - } - - processed[request_id] = true; - } - - while (to_pop < requests.size() && processed[to_pop]) - ++to_pop; - - /// add new io tasks - const int new_tasks_count = std::min(read_buffer_size - (to_push - to_pop), requests.size() - to_push); - - int pushed = 0; - while (new_tasks_count > 0 && (pushed = io_submit(aio_context.ctx, new_tasks_count, &pointers[to_push])) <= 0) - { - if (errno != EINTR) - throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); - } - to_push += pushed; - } -} - -void SSDComplexKeyCachePartition::clearOldestBlocks() -{ - // write_buffer_size, because we need to erase the whole buffer. 
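    // Editor's note: this routine reads the oldest region of the file, i.e. blocks in
    //   start_block  = current_file_block_id % max_size
    //   finish_block = start_block + write_buffer_size
    // collects every key stored there, and at the end erases from key_to_index any key
    // whose block id still points into [start_block, finish_block), because that region
    // is about to be overwritten by the next flush.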
- Memory read_buffer_memory(block_size * write_buffer_size, BUFFER_ALIGNMENT); - - iocb request{}; -#if defined(__FreeBSD__) - request.aio.aio_lio_opcode = LIO_READ; - request.aio.aio_fildes = fd; - request.aio.aio_buf = reinterpret_cast(reinterpret_cast(read_buffer_memory.data())); - request.aio.aio_nbytes = block_size * write_buffer_size; - request.aio.aio_offset = (current_file_block_id % max_size) * block_size; - request.aio_data = 0; -#else - request.aio_lio_opcode = IOCB_CMD_PREAD; - request.aio_fildes = fd; - request.aio_buf = reinterpret_cast(read_buffer_memory.data()); - request.aio_nbytes = block_size * write_buffer_size; - request.aio_offset = (current_file_block_id % max_size) * block_size; - request.aio_data = 0; -#endif - - { - iocb* request_ptr = &request; - io_event event{}; - AIOContext aio_context(1); - - while (io_submit(aio_context.ctx, 1, &request_ptr) != 1) - if (errno != EINTR) - throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT); - - while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) != 1) - if (errno != EINTR) - throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS); - -#if defined(__FreeBSD__) - if (aio_return(reinterpret_cast(event.udata)) != static_cast(request.aio.aio_nbytes)) - throw Exception("GC: AIO failed to read file " + path + BIN_FILE_EXT + ".", ErrorCodes::AIO_READ_ERROR); -#else - if (event.res != static_cast(request.aio_nbytes)) - throw Exception("GC: AIO failed to read file " + path + BIN_FILE_EXT + ". " + - "aio_nbytes=" + std::to_string(request.aio_nbytes) + - ", returned=" + std::to_string(event.res) + ".", ErrorCodes::AIO_READ_ERROR); -#endif - __msan_unpoison(read_buffer_memory.data(), read_buffer_memory.size()); - } - - TemporalComplexKeysPool tmp_keys_pool; - KeyRefs keys; - - for (size_t i = 0; i < write_buffer_size; ++i) - { - ReadBufferFromMemory read_buffer(read_buffer_memory.data() + i * block_size, block_size); - - uint64_t checksum = 0; - readBinary(checksum, read_buffer); - uint64_t calculated_checksum = CityHash_v1_0_2::CityHash64(read_buffer_memory.data() + i * block_size + BLOCK_CHECKSUM_SIZE_BYTES, block_size - BLOCK_CHECKSUM_SIZE_BYTES); - if (checksum != calculated_checksum) - { - throw Exception("Cache data corrupted. 
From block = " + std::to_string(checksum) + " calculated = " + std::to_string(calculated_checksum) + ".", ErrorCodes::CORRUPTED_DATA); - } - - uint32_t keys_in_current_block = 0; - readBinary(keys_in_current_block, read_buffer); - - for (uint32_t j = 0; j < keys_in_current_block; ++j) - { - keys.emplace_back(); - tmp_keys_pool.readKey(keys.back(), read_buffer); - - Metadata metadata; - readBinary(metadata.data, read_buffer); - - if (!metadata.isDefault()) - { - for (const auto & attr : attributes_structure) - { - switch (attr) - { - #define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - read_buffer.ignore(sizeof(TYPE)); \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) - #undef DISPATCH - - case AttributeUnderlyingType::utString: - { - size_t size = 0; - readVarUInt(size, read_buffer); - read_buffer.ignore(size); - } - break; - } - } - } - } - } - - const size_t start_block = current_file_block_id % max_size; - const size_t finish_block = start_block + write_buffer_size; - for (const auto& key : keys) - { - Index index; - if (key_to_index.get(key, index)) - { - size_t block_id = index.getBlockId(); - if (start_block <= block_id && block_id < finish_block) - key_to_index.erase(key); - } - } -} - -void SSDComplexKeyCachePartition::ignoreFromBufferToAttributeIndex(const size_t attribute_index, ReadBuffer & buf) const -{ - for (size_t i = 0; i < attribute_index; ++i) - { - switch (attributes_structure[i]) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - buf.ignore(sizeof(TYPE)); \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - size_t size = 0; - readVarUInt(size, buf); - buf.ignore(size); - } - break; - } - } -} - -size_t SSDComplexKeyCachePartition::getId() const -{ - return file_id; -} - -double SSDComplexKeyCachePartition::getLoadFactor() const -{ - std::shared_lock lock(rw_lock); - return static_cast(current_file_block_id) / max_size; -} - -size_t SSDComplexKeyCachePartition::getElementCount() const -{ - std::shared_lock lock(rw_lock); - return key_to_index.size(); -} - -size_t SSDComplexKeyCachePartition::getBytesAllocated() const -{ - std::shared_lock lock(rw_lock); - return 16.5 * key_to_index.capacity() + keys_pool.size() + - (keys_buffer_pool ? keys_buffer_pool->size() : 0) + (memory ? 
memory->size() : 0); -} - -void SSDComplexKeyCachePartition::remove() -{ - std::unique_lock lock(rw_lock); - std::filesystem::remove(std::filesystem::path(path + BIN_FILE_EXT)); -} - -SSDComplexKeyCacheStorage::SSDComplexKeyCacheStorage( - const AttributeTypes & attributes_structure_, - const std::string & path_, - const size_t max_partitions_count_, - const size_t file_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : attributes_structure(attributes_structure_) - , path(path_) - , max_partitions_count(max_partitions_count_) - , file_size(file_size_) - , block_size(block_size_) - , read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , log(&Poco::Logger::get("SSDComplexKeyCacheStorage")) -{ -} - -SSDComplexKeyCacheStorage::~SSDComplexKeyCacheStorage() -{ - std::unique_lock lock(rw_lock); - partition_delete_queue.splice(std::end(partition_delete_queue), partitions); - collectGarbage(); -} - -template -void SSDComplexKeyCacheStorage::getValue( - const size_t attribute_index, const Columns & key_columns, const DataTypes & key_types, - ResultArrayType & out, std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, - GetDefault & get_default, std::chrono::system_clock::time_point now) const -{ - size_t n = key_columns.front()->size(); - std::vector found(n, false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->getValue(attribute_index, key_columns, key_types, out, found, get_default, now); - } - - size_t count_not_found = 0; - StringRefs tmp_refs(key_columns.size()); - for (size_t i = 0; i < n; ++i) - { - if (!found[i]) - { - auto key = not_found_pool.allocKey(i, key_columns, tmp_refs); - not_found[key].push_back(i); - ++count_not_found; - } - } - - query_count.fetch_add(n, std::memory_order_relaxed); - hit_count.fetch_add(n - count_not_found, std::memory_order_release); -} - -void SSDComplexKeyCacheStorage::getString( - const size_t attribute_index, const Columns & key_columns, const DataTypes & key_types, - StringRefs & refs, ArenaWithFreeLists & arena, - std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, - std::vector & default_ids, std::chrono::system_clock::time_point now) const -{ - size_t n = key_columns.front()->size(); - std::vector found(n, false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->getString(attribute_index, key_columns, key_types, refs, arena, found, default_ids, now); - } - - size_t count_not_found = 0; - StringRefs tmp_refs(key_columns.size()); - for (size_t i = 0; i < n; ++i) - { - if (!found[i]) - { - auto key = not_found_pool.allocKey(i, key_columns, tmp_refs); - not_found[key].push_back(i); - ++count_not_found; - } - } - - query_count.fetch_add(n, std::memory_order_relaxed); - hit_count.fetch_add(n - count_not_found, std::memory_order_release); -} - -void SSDComplexKeyCacheStorage::hasKeys( - const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out, - std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, std::chrono::system_clock::time_point now) const -{ - size_t n = key_columns.front()->size(); - for (size_t i = 0; i < n; ++i) - out[i] = HAS_NOT_FOUND; - std::vector found(n, false); - - { - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - partition->hasKeys(key_columns, key_types, out, found, 
now); - } - - size_t count_not_found = 0; - StringRefs tmp_refs(key_columns.size()); - for (size_t i = 0; i < n; ++i) - { - if (out[i] == HAS_NOT_FOUND) - { - auto key = not_found_pool.allocKey(i, key_columns, tmp_refs); - not_found[key].push_back(i); - ++count_not_found; - } - } - - query_count.fetch_add(n, std::memory_order_relaxed); - hit_count.fetch_add(n - count_not_found, std::memory_order_release); -} - -namespace -{ -SSDComplexKeyCachePartition::Attributes createAttributesFromBlock( - const Block & block, const size_t begin_column, const std::vector & structure) -{ - SSDComplexKeyCachePartition::Attributes attributes; - - const auto columns = block.getColumns(); - for (size_t i = 0; i < structure.size(); ++i) - { - const auto & column = columns[i + begin_column]; - switch (structure[i]) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - { \ - SSDComplexKeyCachePartition::Attribute::Container values(column->size()); \ - memcpy(&values[0], column->getRawData().data, sizeof(TYPE) * values.size()); \ - attributes.emplace_back(); \ - attributes.back().type = structure[i]; \ - attributes.back().values = std::move(values); \ - } \ - break; - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - - case AttributeUnderlyingType::utString: - { - attributes.emplace_back(); - SSDComplexKeyCachePartition::Attribute::Container values(column->size()); - for (size_t j = 0; j < column->size(); ++j) - { - const auto ref = column->getDataAt(j); - values[j].resize(ref.size); - memcpy(values[j].data(), ref.data, ref.size); - } - attributes.back().type = structure[i]; - attributes.back().values = std::move(values); - } - break; - } - } - - return attributes; -} -} - -template -void SSDComplexKeyCacheStorage::update( - DictionarySourcePtr & source_ptr, - const Columns & key_columns, - const DataTypes & key_types, - const KeyRefs & required_keys, - const std::vector & required_rows, - TemporalComplexKeysPool & tmp_keys_pool, - PresentIdHandler && on_updated, - AbsentIdHandler && on_key_not_found, - const DictionaryLifetime lifetime) -{ - assert(key_columns.size() == key_types.size()); - - auto append_block = [&key_types, this]( - const Columns & new_keys, - const SSDComplexKeyCachePartition::Attributes & new_attributes, - const PaddedPODArray & metadata) - { - size_t inserted = 0; - while (inserted < metadata.size()) - { - if (!partitions.empty()) - inserted += partitions.front()->appendBlock( - new_keys, key_types, new_attributes, metadata, inserted); - if (inserted < metadata.size()) - { - partitions.emplace_front(std::make_unique( - AttributeUnderlyingType::utUInt64, attributes_structure, path, - (partitions.empty() ? 
0 : partitions.front()->getId() + 1), - file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys)); - } - } - - collectGarbage(); - }; - - CurrentMetrics::Increment metric_increment{CurrentMetrics::DictCacheRequests}; - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, required_keys.size()); - - std::unordered_map remaining_keys{required_keys.size()}; - for (const auto & key : required_keys) - remaining_keys.insert({key, 0}); - - const auto now = std::chrono::system_clock::now(); - - { - const auto keys_size = key_columns.size(); - StringRefs keys(keys_size); - - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - - if (now > backoff_end_time) - { - try - { - if (update_error_count) - { - /// Recover after error: we have to clone the source here because - /// it could keep connections which should be reset after error. - source_ptr = source_ptr->clone(); - } - - Stopwatch watch; - auto stream = source_ptr->loadKeys(key_columns, required_rows); - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto new_key_columns = ext::map( - ext::range(0, keys_size), - [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto new_attributes = createAttributesFromBlock(block, keys_size, attributes_structure); - - const auto rows_num = block.rows(); - PaddedPODArray metadata(rows_num); - - for (const auto i : ext::range(0, rows_num)) - { - auto key = tmp_keys_pool.allocKey(i, new_key_columns, keys); - //SCOPE_EXIT(tmp_keys_pool.rollback(key)); - - std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - metadata[i].setExpiresAt(now + std::chrono::seconds(distribution(rnd_engine))); - /// mark corresponding id as found - on_updated(key, i, new_attributes); - remaining_keys[key] = 1; - } - - append_block(new_key_columns, new_attributes, metadata); - } - - stream->readSuffix(); - - update_error_count = 0; - last_update_exception = std::exception_ptr{}; - backoff_end_time = std::chrono::system_clock::time_point{}; - - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); - } - catch (...) - { - ++update_error_count; - last_update_exception = std::current_exception(); - backoff_end_time = now + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, update_error_count)); - - tryLogException(last_update_exception, log, - "Could not update ssd cache dictionary, next update is scheduled at " + ext::to_string(backoff_end_time)); - } - } - } - - auto append_defaults = [this]( - const KeyRefs & new_keys, - const PaddedPODArray & metadata) - { - size_t inserted = 0; - while (inserted < metadata.size()) - { - if (!partitions.empty()) - inserted += partitions.front()->appendDefaults( - new_keys, metadata, inserted); - if (inserted < metadata.size()) - { - partitions.emplace_front(std::make_unique( - AttributeUnderlyingType::utUInt64, attributes_structure, path, - (partitions.empty() ? 
0 : partitions.front()->getId() + 1), - file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys)); - } - } - - collectGarbage(); - }; - - size_t not_found_num = 0, found_num = 0; - /// Check which ids have not been found and require setting null_value - KeyRefs default_keys; - - PaddedPODArray metadata; - { - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - - for (const auto & key_found_pair : remaining_keys) - { - if (key_found_pair.second) - { - ++found_num; - continue; - } - ++not_found_num; - - const auto key = key_found_pair.first; - - if (update_error_count) - { - /// TODO: use old values. - - // We don't have expired data for that `id` so all we can do is - // to rethrow `last_exception`. We might have to throw the same - // exception for different callers of dictGet() in different - // threads, which might then modify the exception object, so we - // have to throw a copy. - try - { - std::rethrow_exception(last_update_exception); - } - catch (...) - { - throw DB::Exception(ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL, - "Update failed for dictionary '{}': {}", - getPath(), - getCurrentExceptionMessage(true /*with stack trace*/, - true /*check embedded stack trace*/)); - } - } - - std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - metadata.emplace_back(); - metadata.back().setExpiresAt(now + std::chrono::seconds(distribution(rnd_engine))); - metadata.back().setDefault(); - - default_keys.push_back(key); - - /// inform caller that the cell has not been found - on_key_not_found(key); - } - - if (not_found_num) - append_defaults(default_keys, metadata); - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheRequests); -} - -double SSDComplexKeyCacheStorage::getLoadFactor() const -{ - double result = 0; - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - result += partition->getLoadFactor(); - return result / partitions.size(); -} - -size_t SSDComplexKeyCacheStorage::getElementCount() const -{ - size_t result = 0; - std::shared_lock lock(rw_lock); - for (const auto & partition : partitions) - result += partition->getElementCount(); - return result; -} - -void SSDComplexKeyCacheStorage::collectGarbage() -{ - // add partitions to queue - while (partitions.size() > max_partitions_count) - { - partition_delete_queue.splice(std::end(partition_delete_queue), partitions, std::prev(std::end(partitions))); - } - - // drop unused partitions - while (!partition_delete_queue.empty() && partition_delete_queue.front().use_count() == 1) - { - partition_delete_queue.front()->remove(); - partition_delete_queue.pop_front(); - } -} - -SSDComplexKeyCacheDictionary::SSDComplexKeyCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - const std::string & path_, - const size_t max_partitions_count_, - const size_t file_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_) - : IDictionaryBase(dict_id_) - , dict_struct(dict_struct_) - , source_ptr(std::move(source_ptr_)) - , dict_lifetime(dict_lifetime_) - , path(path_) - , max_partitions_count(max_partitions_count_) - , file_size(file_size_) - , block_size(block_size_) - , 
read_buffer_size(read_buffer_size_) - , write_buffer_size(write_buffer_size_) - , max_stored_keys(max_stored_keys_) - , storage(ext::map(dict_struct.attributes, [](const auto & attribute) { return attribute.underlying_type; }), - path, max_partitions_count, file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys) - , log(&Poco::Logger::get("SSDComplexKeyCacheDictionary")) -{ - LOG_INFO(log, "Using storage path '{}'.", path); - if (!this->source_ptr->supportsSelectiveLoad()) - throw Exception{name + ": source cannot be used with CacheDictionary", ErrorCodes::UNSUPPORTED_METHOD}; - - createAttributes(); -} - -ColumnPtr SSDComplexKeyCacheDictionary::getColumn( - const std::string & attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const -{ - ColumnPtr result; - - dict_struct.validateKeyTypes(key_types); - - const auto index = getAttributeIndex(attribute_name); - const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - - auto keys_size = key_columns.front()->size(); - - auto type_call = [&](const auto &dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ColumnProvider = DictionaryAttributeColumnProvider; - - const auto & null_value = std::get(null_values[index]); - DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); - - auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); - - if constexpr (std::is_same_v) - { - auto * out = column.get(); - getItemsStringImpl(index, key_columns, key_types, out, default_value_extractor); - } - else - { - auto & out = column->getData(); - getItemsNumberImpl( - index, - key_columns, - key_types, - out, - default_value_extractor); - } - - result = std::move(column); - }; - - callOnDictionaryAttributeType(dict_struct.attributes[index].underlying_type, type_call); - - return result; -} - -template -void SSDComplexKeyCacheDictionary::getItemsNumberImpl( - const size_t attribute_index, - const Columns & key_columns, - const DataTypes & key_types, - ResultArrayType & out, - DefaultValueExtractor & default_value_extractor) const -{ - assert(dict_struct.key); - assert(key_columns.size() == key_types.size()); - - dict_struct.validateKeyTypes(key_types); - - const auto now = std::chrono::system_clock::now(); - - TemporalComplexKeysPool not_found_pool; - std::unordered_map> not_found_keys; - storage.getValue(attribute_index, key_columns, key_types, out, not_found_keys, not_found_pool, default_value_extractor, now); - if (not_found_keys.empty()) - return; - - std::vector required_keys(not_found_keys.size()); - std::transform(std::begin(not_found_keys), std::end(not_found_keys), std::begin(required_keys), [](const auto & pair) { return pair.first; }); - std::vector required_rows; - required_rows.reserve(required_keys.size()); - for (const auto & key_ref : required_keys) - required_rows.push_back(not_found_keys[key_ref].front()); - - TemporalComplexKeysPool tmp_keys_pool; - storage.update( - source_ptr, - key_columns, - key_types, - required_keys, - required_rows, - tmp_keys_pool, - [&](const auto key, const auto row, const auto & new_attributes) - { - for (const size_t out_row : not_found_keys[key]) - out[out_row] = std::get>(new_attributes[attribute_index].values)[row]; - }, - [&](const auto key) - { - for (const size_t row : not_found_keys[key]) - out[row] = default_value_extractor[row]; - 
}, - getLifetime()); -} - -void SSDComplexKeyCacheDictionary::getItemsStringImpl( - const size_t attribute_index, - const Columns & key_columns, - const DataTypes & key_types, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const -{ - dict_struct.validateKeyTypes(key_types); - - const auto now = std::chrono::system_clock::now(); - - TemporalComplexKeysPool not_found_pool; - std::unordered_map> not_found_keys; - - const size_t n = key_columns.front()->size(); - - StringRefs refs(n); - ArenaWithFreeLists string_arena; - std::vector default_rows; - storage.getString( - attribute_index, key_columns, key_types, - refs, string_arena, not_found_keys, not_found_pool, default_rows, now); - std::sort(std::begin(default_rows), std::end(default_rows)); - - if (not_found_keys.empty()) - { - size_t default_index = 0; - for (size_t row = 0; row < n; ++row) - { - if (unlikely(default_index != default_rows.size() && default_rows[default_index] == row)) - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - ++default_index; - } - else - out->insertData(refs[row].data, refs[row].size); - } - return; - } - - std::vector required_keys(not_found_keys.size()); - std::transform(std::begin(not_found_keys), std::end(not_found_keys), std::begin(required_keys), [](const auto & pair) { return pair.first; }); - - std::unordered_map update_result; - - std::vector required_rows; - required_rows.reserve(required_keys.size()); - for (const auto & key_ref : required_keys) - required_rows.push_back(not_found_keys[key_ref].front()); - - TemporalComplexKeysPool tmp_keys_pool; - storage.update( - source_ptr, - key_columns, - key_types, - required_keys, - required_rows, - tmp_keys_pool, - [&](const auto key, const auto row, const auto & new_attributes) - { - update_result[key] = std::get>(new_attributes[attribute_index].values)[row]; - }, - [&](const auto) {}, - getLifetime()); - - StringRefs tmp_refs(key_columns.size()); - size_t default_index = 0; - for (size_t row = 0; row < n; ++row) - { - const auto key = tmp_keys_pool.allocKey(row, key_columns, tmp_refs); - SCOPE_EXIT(tmp_keys_pool.rollback(key)); - if (unlikely(default_index != default_rows.size() && default_rows[default_index] == row)) - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - ++default_index; - } - else if (auto it = not_found_keys.find(key); it == std::end(not_found_keys)) - { - out->insertData(refs[row].data, refs[row].size); - } - else if (auto it_update = update_result.find(key); it_update != std::end(update_result)) - { - out->insertData(it_update->second.data(), it_update->second.size()); - } - else - { - auto to_insert = default_value_extractor[row]; - out->insertData(to_insert.data, to_insert.size); - } - } -} - -ColumnUInt8::Ptr SSDComplexKeyCacheDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const -{ - dict_struct.validateKeyTypes(key_types); - - const auto rows_num = key_columns.front()->size(); - - auto result = ColumnUInt8::create(rows_num); - auto& out = result->getData(); - - for (const auto row : ext::range(0, rows_num)) - out[row] = false; - - const auto now = std::chrono::system_clock::now(); - - std::unordered_map> not_found_keys; - TemporalComplexKeysPool not_found_pool; - storage.hasKeys(key_columns, key_types, out, not_found_keys, not_found_pool, now); - if (not_found_keys.empty()) - return result; - - std::vector required_keys(not_found_keys.size()); - 
std::transform(std::begin(not_found_keys), std::end(not_found_keys), std::begin(required_keys), [](const auto & pair) { return pair.first; }); - - std::vector required_rows; - required_rows.reserve(required_keys.size()); - for (const auto & key_ref : required_keys) - required_rows.push_back(not_found_keys[key_ref].front()); - - TemporalComplexKeysPool tmp_keys_pool; - storage.update( - source_ptr, - key_columns, - key_types, - required_keys, - required_rows, - tmp_keys_pool, - [&](const auto key, const auto, const auto &) - { - for (const size_t out_row : not_found_keys[key]) - out[out_row] = true; - }, - [&](const auto key) - { - for (const size_t row : not_found_keys[key]) - out[row] = false; - }, - getLifetime()); - - return result; -} - -BlockInputStreamPtr SSDComplexKeyCacheDictionary::getBlockInputStream( - const Names & /* column_names */, size_t /* max_block_size*/) const -{ - throw DB::Exception("Method not supported.", ErrorCodes::NOT_IMPLEMENTED); -} - -size_t SSDComplexKeyCacheDictionary::getAttributeIndex(const std::string & attr_name) const -{ - auto it = attribute_index_by_name.find(attr_name); - if (it == std::end(attribute_index_by_name)) - throw Exception{"Attribute `" + name + "` does not exist.", ErrorCodes::BAD_ARGUMENTS}; - return it->second; -} - -template -AttributeValueVariant SSDComplexKeyCacheDictionary::createAttributeNullValueWithTypeImpl(const Field & null_value) -{ - AttributeValueVariant var_null_value = static_cast(null_value.get>()); - bytes_allocated += sizeof(T); - return var_null_value; -} - -template <> -AttributeValueVariant SSDComplexKeyCacheDictionary::createAttributeNullValueWithTypeImpl(const Field & null_value) -{ - AttributeValueVariant var_null_value = null_value.get(); - bytes_allocated += sizeof(StringRef); - return var_null_value; -} - -AttributeValueVariant SSDComplexKeyCacheDictionary::createAttributeNullValueWithType(const AttributeUnderlyingType type, const Field & null_value) -{ - switch (type) - { -#define DISPATCH(TYPE) \ -case AttributeUnderlyingType::ut##TYPE: \ - return createAttributeNullValueWithTypeImpl(null_value); /* NOLINT */ - - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) - DISPATCH(String) -#undef DISPATCH - } - throw Exception{"Unknown attribute type: " + std::to_string(static_cast(type)), ErrorCodes::TYPE_MISMATCH}; -} - -void SSDComplexKeyCacheDictionary::createAttributes() -{ - null_values.reserve(dict_struct.attributes.size()); - for (size_t i = 0; i < dict_struct.attributes.size(); ++i) - { - const auto & attribute = dict_struct.attributes[i]; - - attribute_index_by_name.emplace(attribute.name, i); - null_values.push_back(createAttributeNullValueWithType(attribute.underlying_type, attribute.null_value)); - - if (attribute.hierarchical) - throw Exception{name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; - } -} - -void registerDictionarySSDComplexKeyCache(DictionaryFactory & factory) -{ - auto create_layout = [=](const std::string & name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); - - if (dict_struct.id) - 
throw Exception{"'id' is not supported for dictionary of layout 'complex_key_cache'", ErrorCodes::UNSUPPORTED_METHOD}; - - if (dict_struct.range_min || dict_struct.range_max) - throw Exception{name - + ": elements .structure.range_min and .structure.range_max should be defined only " - "for a dictionary of layout 'range_hashed'", - ErrorCodes::BAD_ARGUMENTS}; - const auto & layout_prefix = config_prefix + ".layout"; - - const auto max_partitions_count = config.getInt(layout_prefix + ".complex_key_ssd_cache.max_partitions_count", DEFAULT_PARTITIONS_COUNT); - if (max_partitions_count <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) max_partitions_count", ErrorCodes::BAD_ARGUMENTS}; - - const auto block_size = config.getInt(layout_prefix + ".complex_key_ssd_cache.block_size", DEFAULT_SSD_BLOCK_SIZE_BYTES); - if (block_size <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto file_size = config.getInt64(layout_prefix + ".complex_key_ssd_cache.file_size", DEFAULT_FILE_SIZE_BYTES); - if (file_size <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) file_size", ErrorCodes::BAD_ARGUMENTS}; - if (file_size % block_size != 0) - throw Exception{name + ": file_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto read_buffer_size = config.getInt64(layout_prefix + ".complex_key_ssd_cache.read_buffer_size", DEFAULT_READ_BUFFER_SIZE_BYTES); - if (read_buffer_size <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) read_buffer_size", ErrorCodes::BAD_ARGUMENTS}; - if (read_buffer_size % block_size != 0) - throw Exception{name + ": read_buffer_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - const auto write_buffer_size = config.getInt64(layout_prefix + ".complex_key_ssd_cache.write_buffer_size", DEFAULT_WRITE_BUFFER_SIZE_BYTES); - if (write_buffer_size <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) write_buffer_size", ErrorCodes::BAD_ARGUMENTS}; - if (write_buffer_size % block_size != 0) - throw Exception{name + ": write_buffer_size must be a multiple of block_size", ErrorCodes::BAD_ARGUMENTS}; - - auto path = config.getString(layout_prefix + ".complex_key_ssd_cache.path"); - if (path.empty()) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have empty path", - ErrorCodes::BAD_ARGUMENTS}; - if (path.at(0) != '/') - path = std::filesystem::path{config.getString("path")}.concat(path).string(); - - const auto max_stored_keys = config.getInt64(layout_prefix + ".complex_key_ssd_cache.max_stored_keys", DEFAULT_MAX_STORED_KEYS); - if (max_stored_keys <= 0) - throw Exception{name + ": dictionary of layout 'complex_key_ssd_cache' cannot have 0 (or less) max_stored_keys", ErrorCodes::BAD_ARGUMENTS}; - - const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; - return std::make_unique( - dict_id, dict_struct, std::move(source_ptr), dict_lifetime, path, - max_partitions_count, file_size / block_size, block_size, - read_buffer_size / block_size, write_buffer_size / block_size, - max_stored_keys); - }; - factory.registerLayout("complex_key_ssd_cache", create_layout, true); -} - -} - -#endif diff --git a/src/Dictionaries/SSDComplexKeyCacheDictionary.h b/src/Dictionaries/SSDComplexKeyCacheDictionary.h 
deleted file mode 100644 index be65d823e34..00000000000 --- a/src/Dictionaries/SSDComplexKeyCacheDictionary.h +++ /dev/null @@ -1,634 +0,0 @@ -#pragma once - -#if defined(OS_LINUX) || defined(__FreeBSD__) - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryStructure.h" -#include "DictionaryHelpers.h" - -namespace DB -{ - -class KeyRef -{ -public: - explicit KeyRef(char * data) : ptr(data) {} - - KeyRef() : ptr(nullptr) {} - - inline UInt16 size() const - { - UInt16 res; - memcpy(&res, ptr, sizeof(res)); - return res; - } - - inline size_t fullSize() const - { - return static_cast(size()) + sizeof(UInt16); - } - - inline bool isNull() const - { - return ptr == nullptr; - } - - inline char * data() const - { - return ptr + sizeof(UInt16); - } - - inline char * fullData() const - { - return ptr; - } - - inline char * fullData() - { - return ptr; - } - - inline const StringRef getRef() const - { - return StringRef(data(), size()); - } - - inline bool operator==(const KeyRef & other) const - { - return getRef() == other.getRef(); - } - - inline bool operator!=(const KeyRef & other) const - { - return !(*this == other); - } - - inline bool operator<(const KeyRef & other) const - { - return getRef() < other.getRef(); - } - -private: - char * ptr; -}; - -using KeyRefs = std::vector; -} - -namespace std -{ - template <> - struct hash - { - size_t operator() (DB::KeyRef key_ref) const - { - return hasher(key_ref.getRef()); - } - - std::hash hasher; - }; -} - -namespace DB -{ - -using AttributeValueVariant = std::variant< - UInt8, - UInt16, - UInt32, - UInt64, - UInt128, - Int8, - Int16, - Int32, - Int64, - Decimal32, - Decimal64, - Decimal128, - Float32, - Float64, - String>; - -/* - The pool for storing complex keys. 
-*/ -template -class ComplexKeysPoolImpl -{ -public: - KeyRef allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) - { - const auto keys_size = key_columns.size(); - UInt16 sum_keys_size{}; - - for (size_t j = 0; j < keys_size; ++j) - { - keys[j] = key_columns[j]->getDataAt(row); - sum_keys_size += keys[j].size; - if (!key_columns[j]->valuesHaveFixedSize()) // String - sum_keys_size += sizeof(size_t) + 1; - } - - auto place = arena.alloc(sum_keys_size + sizeof(sum_keys_size)); - - auto key_start = place; - memcpy(key_start, &sum_keys_size, sizeof(sum_keys_size)); - key_start += sizeof(sum_keys_size); - for (size_t j = 0; j < keys_size; ++j) - { - if (!key_columns[j]->valuesHaveFixedSize()) // String - { - auto key_size = keys[j].size + 1; - memcpy(key_start, &key_size, sizeof(size_t)); - key_start += sizeof(size_t); - memcpy(key_start, keys[j].data, keys[j].size); - key_start += keys[j].size; - *key_start = '\0'; - ++key_start; - } - else - { - memcpy(key_start, keys[j].data, keys[j].size); - key_start += keys[j].size; - } - } - - return KeyRef(place); - } - - KeyRef copyKeyFrom(const KeyRef & key) - { - char * data = arena.alloc(key.fullSize()); - memcpy(data, key.fullData(), key.fullSize()); - return KeyRef(data); - } - - void freeKey(const KeyRef & key) - { - if constexpr (std::is_same_v) - arena.free(key.fullData(), key.fullSize()); - } - - void rollback(const KeyRef & key) - { - if constexpr (std::is_same_v) - arena.rollback(key.fullSize()); - } - - void writeKey(const KeyRef & key, WriteBuffer & buf) - { - buf.write(key.fullData(), key.fullSize()); - } - - void readKey(KeyRef & key, ReadBuffer & buf) - { - UInt16 sz; - readBinary(sz, buf); - char * data = nullptr; - if constexpr (std::is_same_v) - data = arena.alloc(); - else - data = arena.alloc(sz + sizeof(sz)); - memcpy(data, &sz, sizeof(sz)); - buf.read(data + sizeof(sz), sz); - key = KeyRef(data); - } - - void ignoreKey(ReadBuffer & buf) const - { - UInt16 sz; - readBinary(sz, buf); - buf.ignore(sz); - } - - size_t size() const - { - return arena.size(); - } - -private: - A arena; -}; - -using TemporalComplexKeysPool = ComplexKeysPoolImpl; -using ComplexKeysPool = ComplexKeysPoolImpl; - -struct KeyDeleter -{ - KeyDeleter(ComplexKeysPool & keys_pool_) : keys_pool(keys_pool_) {} - - void operator()(const KeyRef key) const - { - keys_pool.freeKey(key); - } - - ComplexKeysPool & keys_pool; -}; - - -/* - Class for operations with cache file and index. - Supports GET/SET operations. 
-*/ -class SSDComplexKeyCachePartition -{ -public: - struct Index final - { - bool inMemory() const; - void setInMemory(const bool in_memory); - - bool exists() const; - void setNotExists(); - - size_t getAddressInBlock() const; - void setAddressInBlock(const size_t address_in_block); - - size_t getBlockId() const; - void setBlockId(const size_t block_id); - - bool operator< (const Index & rhs) const { return index < rhs.index; } - - /// Stores `is_in_memory` flag, block id, address in uncompressed block - uint64_t index = 0; - }; - - struct Metadata final - { - using time_point_t = std::chrono::system_clock::time_point; - using time_point_rep_t = time_point_t::rep; - using time_point_urep_t = std::make_unsigned_t; - - time_point_t expiresAt() const; - void setExpiresAt(const time_point_t & t); - - bool isDefault() const; - void setDefault(); - - /// Stores both expiration time and `is_default` flag in the most significant bit - time_point_urep_t data = 0; - }; - - using Offset = size_t; - using Offsets = std::vector; - - - SSDComplexKeyCachePartition( - const AttributeUnderlyingType & key_structure, - const std::vector & attributes_structure, - const std::string & dir_path, - const size_t file_id, - const size_t max_size, - const size_t block_size, - const size_t read_buffer_size, - const size_t write_buffer_size, - const size_t max_stored_keys); - - ~SSDComplexKeyCachePartition(); - - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; - - template - void getValue(const size_t attribute_index, - const Columns & key_columns, const DataTypes & key_types, - ResultArrayType & out, std::vector & found, GetDefault & default_value_extractor, - std::chrono::system_clock::time_point now) const; - - void getString(const size_t attribute_index, - const Columns & key_columns, const DataTypes & key_types, - StringRefs & refs, ArenaWithFreeLists & arena, std::vector & found, - std::vector & default_ids, std::chrono::system_clock::time_point now) const; - - void hasKeys(const Columns & key_columns, const DataTypes & key_types, - ResultArrayType & out, std::vector & found, - std::chrono::system_clock::time_point now) const; - - struct Attribute - { - template - using Container = std::vector; - - AttributeUnderlyingType type; - std::variant< - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container, - Container> values; - }; - using Attributes = std::vector; - - size_t appendBlock( - const Columns & key_columns, - const DataTypes & key_types, - const Attributes & new_attributes, - const PaddedPODArray & metadata, - const size_t begin); - - size_t appendDefaults( - const KeyRefs & keys, - const PaddedPODArray & metadata, - const size_t begin); - - void clearOldestBlocks(); - - void flush(); - - void remove(); - - size_t getId() const; - - double getLoadFactor() const; - - size_t getElementCount() const; - - size_t getBytesAllocated() const; - -private: - size_t append( - const KeyRefs & keys, - const Attributes & new_attributes, - const PaddedPODArray & metadata, - const size_t begin); - - template - void getImpl(const Columns & key_columns, const DataTypes & key_types, - SetFunc & set, std::vector & found) const; - - template - void getValueFromMemory(const PaddedPODArray & indices, SetFunc & set) const; - - template - void getValueFromStorage(const PaddedPODArray & indices, SetFunc & set) const; - - void 
ignoreFromBufferToAttributeIndex(const size_t attribute_index, ReadBuffer & buf) const; - - const size_t file_id; - const size_t max_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - const std::string path; - - mutable std::shared_mutex rw_lock; - - int fd = -1; - - ComplexKeysPool keys_pool; - mutable BucketCacheIndex, KeyDeleter> key_to_index; - - std::optional keys_buffer_pool; - KeyRefs keys_buffer; - - const std::vector attributes_structure; - - std::optional> memory; - std::optional write_buffer; - uint32_t keys_in_block = 0; - - size_t current_memory_block_id = 0; - size_t current_file_block_id = 0; -}; - -using SSDComplexKeyCachePartitionPtr = std::shared_ptr; - - -/** Class for managing SSDCachePartition and getting data from source. - */ -class SSDComplexKeyCacheStorage -{ -public: - using AttributeTypes = std::vector; - - SSDComplexKeyCacheStorage( - const AttributeTypes & attributes_structure, - const std::string & path, - const size_t max_partitions_count, - const size_t file_size, - const size_t block_size, - const size_t read_buffer_size, - const size_t write_buffer_size, - const size_t max_stored_keys); - - ~SSDComplexKeyCacheStorage(); - - template - using ResultArrayType = SSDComplexKeyCachePartition::ResultArrayType; - - template - void getValue(const size_t attribute_index, const Columns & key_columns, const DataTypes & key_types, - ResultArrayType & out, std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, - GetDefault & get_default, std::chrono::system_clock::time_point now) const; - - void getString(const size_t attribute_index, const Columns & key_columns, const DataTypes & key_types, - StringRefs & refs, ArenaWithFreeLists & arena, std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, - std::vector & default_ids, std::chrono::system_clock::time_point now) const; - - void hasKeys(const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out, - std::unordered_map> & not_found, - TemporalComplexKeysPool & not_found_pool, std::chrono::system_clock::time_point now) const; - - template - void update(DictionarySourcePtr & source_ptr, - const Columns & key_columns, const DataTypes & key_types, - const KeyRefs & required_keys, const std::vector & required_rows, - TemporalComplexKeysPool & tmp_keys_pool, - PresentIdHandler && on_updated, AbsentIdHandler && on_key_not_found, - const DictionaryLifetime lifetime); - - std::exception_ptr getLastException() const { return last_update_exception; } - - const std::string & getPath() const { return path; } - - size_t getQueryCount() const { return query_count.load(std::memory_order_relaxed); } - - size_t getHitCount() const { return hit_count.load(std::memory_order_acquire); } - - size_t getElementCount() const; - - double getLoadFactor() const; - -private: - void collectGarbage(); - - const AttributeTypes attributes_structure; - - const std::string path; - const size_t max_partitions_count; - const size_t file_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - - mutable std::shared_mutex rw_lock; - std::list partitions; - std::list partition_delete_queue; - - Poco::Logger * const log; - - mutable pcg64 rnd_engine; - - mutable std::exception_ptr last_update_exception; - mutable size_t update_error_count = 0; - mutable std::chrono::system_clock::time_point backoff_end_time; - - mutable std::atomic 
hit_count{0}; - mutable std::atomic query_count{0}; -}; - - -/** Dictionary interface - */ -class SSDComplexKeyCacheDictionary final : public IDictionaryBase -{ -public: - SSDComplexKeyCacheDictionary( - const StorageID & dict_id_, - const DictionaryStructure & dict_struct_, - DictionarySourcePtr source_ptr_, - const DictionaryLifetime dict_lifetime_, - const std::string & path, - const size_t max_partitions_count_, - const size_t file_size_, - const size_t block_size_, - const size_t read_buffer_size_, - const size_t write_buffer_size_, - const size_t max_stored_keys_); - - std::string getKeyDescription() const { return dict_struct.getKeyDescription(); } - - std::string getTypeName() const override { return "SSDComplexKeyCache"; } - - size_t getBytesAllocated() const override { return 0; } // TODO: ? - - size_t getQueryCount() const override { return storage.getQueryCount(); } - - double getHitRate() const override - { - return static_cast(storage.getHitCount()) / storage.getQueryCount(); - } - - size_t getElementCount() const override { return storage.getElementCount(); } - - double getLoadFactor() const override { return storage.getLoadFactor(); } - - bool supportUpdates() const override { return false; } - - std::shared_ptr clone() const override - { - return std::make_shared(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, path, - max_partitions_count, file_size, block_size, read_buffer_size, write_buffer_size, max_stored_keys); - } - - const IDictionarySource * getSource() const override { return source_ptr.get(); } - - const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } - - const DictionaryStructure & getStructure() const override { return dict_struct; } - - bool isInjective(const std::string & attribute_name) const override - { - return dict_struct.attributes[getAttributeIndex(attribute_name)].injective; - } - - std::exception_ptr getLastException() const override { return storage.getLastException(); } - - DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } - - ColumnPtr getColumn( - const std::string& attribute_name, - const DataTypePtr & result_type, - const Columns & key_columns, - const DataTypes & key_types, - const ColumnPtr default_values_column) const override; - - ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; - - template - using ResultArrayType = SSDComplexKeyCacheStorage::ResultArrayType; - - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; - -private: - size_t getAttributeIndex(const std::string & attr_name) const; - - template - AttributeValueVariant createAttributeNullValueWithTypeImpl(const Field & null_value); - AttributeValueVariant createAttributeNullValueWithType(const AttributeUnderlyingType type, const Field & null_value); - void createAttributes(); - - template - void getItemsNumberImpl( - const size_t attribute_index, - const Columns & key_columns, - const DataTypes & key_types, - ResultArrayType & out, - DefaultValueExtractor & default_value_extractor) const; - - void getItemsStringImpl( - const size_t attribute_index, - const Columns & key_columns, - const DataTypes & key_types, - ColumnString * out, - DictionaryDefaultValueExtractor & default_value_extractor) const; - - const std::string name; - const DictionaryStructure dict_struct; - mutable DictionarySourcePtr source_ptr; - const DictionaryLifetime dict_lifetime; - - const std::string path; - const size_t 
max_partitions_count; - const size_t file_size; - const size_t block_size; - const size_t read_buffer_size; - const size_t write_buffer_size; - const size_t max_stored_keys; - - std::map attribute_index_by_name; - std::vector null_values; - mutable SSDComplexKeyCacheStorage storage; - Poco::Logger * const log; - - mutable size_t bytes_allocated = 0; -}; - -} - -#endif diff --git a/src/Dictionaries/XDBCDictionarySource.cpp b/src/Dictionaries/XDBCDictionarySource.cpp index 3615f72605f..5774641a90f 100644 --- a/src/Dictionaries/XDBCDictionarySource.cpp +++ b/src/Dictionaries/XDBCDictionarySource.cpp @@ -11,18 +11,14 @@ #include #include #include -#include #include #include #include "DictionarySourceFactory.h" #include "DictionaryStructure.h" #include "readInvalidateQuery.h" - #include "registerDictionaries.h" +#include -#if USE_ODBC -# include // Y_IGNORE -#endif namespace DB { @@ -41,7 +37,7 @@ namespace const Poco::URI & uri, std::function callback, const Block & sample_block, - const Context & context, + ContextPtr context, UInt64 max_block_size, const ConnectionTimeouts & timeouts, const String name_) @@ -89,9 +85,9 @@ namespace else { if (!schema.empty()) - throw Exception{"Dictionary source of type " + bridge_.getName() + " specifies a schema but schema is not supported by " - + bridge_.getName() + "-driver", - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Dictionary source of type {0} specifies a schema but schema is not supported by {0}-driver", + bridge_.getName()); } return {dict_struct_, db_, schema, table, where_, bridge_.getIdentifierQuotingStyle()}; @@ -106,9 +102,10 @@ XDBCDictionarySource::XDBCDictionarySource( const Poco::Util::AbstractConfiguration & config_, const std::string & config_prefix_, const Block & sample_block_, - const Context & context_, + ContextPtr context_, const BridgeHelperPtr bridge_) - : log(&Poco::Logger::get(bridge_->getName() + "DictionarySource")) + : WithContext(context_->getGlobalContext()) + , log(&Poco::Logger::get(bridge_->getName() + "DictionarySource")) , update_time{std::chrono::system_clock::from_time_t(0)} , dict_struct{dict_struct_} , db{config_.getString(config_prefix_ + ".db", "")} @@ -122,18 +119,18 @@ XDBCDictionarySource::XDBCDictionarySource( , invalidate_query{config_.getString(config_prefix_ + ".invalidate_query", "")} , bridge_helper{bridge_} , timeouts{ConnectionTimeouts::getHTTPTimeouts(context_)} - , global_context(context_.getGlobalContext()) { bridge_url = bridge_helper->getMainURI(); - auto url_params = bridge_helper->getURLParams(sample_block_.getNamesAndTypesList().toString(), max_block_size); + auto url_params = bridge_helper->getURLParams(max_block_size); for (const auto & [name, value] : url_params) bridge_url.addQueryParameter(name, value); } /// copy-constructor is provided in order to support cloneability XDBCDictionarySource::XDBCDictionarySource(const XDBCDictionarySource & other) - : log(&Poco::Logger::get(other.bridge_helper->getName() + "DictionarySource")) + : WithContext(other.getContext()) + , log(&Poco::Logger::get(other.bridge_helper->getName() + "DictionarySource")) , update_time{other.update_time} , dict_struct{other.dict_struct} , db{other.db} @@ -148,18 +145,17 @@ XDBCDictionarySource::XDBCDictionarySource(const XDBCDictionarySource & other) , bridge_helper{other.bridge_helper} , bridge_url{other.bridge_url} , timeouts{other.timeouts} - , global_context{other.global_context} { } + std::string XDBCDictionarySource::getUpdateFieldAndDate() { if 
(update_time != std::chrono::system_clock::from_time_t(0)) { - auto tmp_time = update_time; + time_t hr_time = std::chrono::system_clock::to_time_t(update_time) - 1; + std::string str_time = DateLUT::instance().timeToString(hr_time); update_time = std::chrono::system_clock::now(); - time_t hr_time = std::chrono::system_clock::to_time_t(tmp_time) - 1; - std::string str_time = std::to_string(LocalDateTime(hr_time)); return query_builder.composeUpdateQuery(update_field, str_time); } else @@ -169,52 +165,61 @@ std::string XDBCDictionarySource::getUpdateFieldAndDate() } } + BlockInputStreamPtr XDBCDictionarySource::loadAll() { LOG_TRACE(log, load_all_query); - return loadBase(load_all_query); + return loadFromQuery(bridge_url, sample_block, load_all_query); } + BlockInputStreamPtr XDBCDictionarySource::loadUpdatedAll() { std::string load_query_update = getUpdateFieldAndDate(); LOG_TRACE(log, load_query_update); - return loadBase(load_query_update); + return loadFromQuery(bridge_url, sample_block, load_query_update); } + BlockInputStreamPtr XDBCDictionarySource::loadIds(const std::vector & ids) { const auto query = query_builder.composeLoadIdsQuery(ids); - return loadBase(query); + return loadFromQuery(bridge_url, sample_block, query); } + BlockInputStreamPtr XDBCDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) { const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN); - return loadBase(query); + return loadFromQuery(bridge_url, sample_block, query); } + bool XDBCDictionarySource::supportsSelectiveLoad() const { return true; } + bool XDBCDictionarySource::hasUpdateField() const { return !update_field.empty(); } + DictionarySourcePtr XDBCDictionarySource::clone() const { return std::make_unique(*this); } + std::string XDBCDictionarySource::toString() const { return bridge_helper->getName() + ": " + db + '.' + table + (where.empty() ? 
"" : ", where: " + where); } + bool XDBCDictionarySource::isModified() const { if (!invalidate_query.empty()) @@ -237,51 +242,48 @@ std::string XDBCDictionarySource::doInvalidateQuery(const std::string & request) bridge_helper->startBridgeSync(); auto invalidate_url = bridge_helper->getMainURI(); - auto url_params = bridge_helper->getURLParams(invalidate_sample_block.getNamesAndTypesList().toString(), max_block_size); + auto url_params = bridge_helper->getURLParams(max_block_size); for (const auto & [name, value] : url_params) invalidate_url.addQueryParameter(name, value); - XDBCBridgeBlockInputStream stream( - invalidate_url, - [request](std::ostream & os) { os << "query=" << request; }, - invalidate_sample_block, - global_context, - max_block_size, - timeouts, - bridge_helper->getName() + "BlockInputStream"); - - return readInvalidateQuery(stream); + return readInvalidateQuery(*loadFromQuery(invalidate_url, invalidate_sample_block, request)); } -BlockInputStreamPtr XDBCDictionarySource::loadBase(const std::string & query) const + +BlockInputStreamPtr XDBCDictionarySource::loadFromQuery(const Poco::URI url, const Block & required_sample_block, const std::string & query) const { bridge_helper->startBridgeSync(); + + auto write_body_callback = [required_sample_block, query](std::ostream & os) + { + os << "sample_block=" << escapeForFileName(required_sample_block.getNamesAndTypesList().toString()); + os << "&"; + os << "query=" << escapeForFileName(query); + }; + return std::make_shared( - bridge_url, - [query](std::ostream & os) { os << "query=" << query; }, - sample_block, - global_context, + url, + write_body_callback, + required_sample_block, + getContext(), max_block_size, timeouts, bridge_helper->getName() + "BlockInputStream"); } + void registerDictionarySourceXDBC(DictionarySourceFactory & factory) { -#if USE_ODBC - Poco::Data::ODBC::Connector::registerConnector(); -#endif - auto create_table_source = [=](const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Block & sample_block, - const Context & context, + ContextPtr context, const std::string & /* default_database */, bool /* check_config */) -> DictionarySourcePtr { #if USE_ODBC BridgeHelperPtr bridge = std::make_shared>( - context, context.getSettings().http_receive_timeout, config.getString(config_prefix + ".odbc.connection_string")); + context, context->getSettings().http_receive_timeout, config.getString(config_prefix + ".odbc.connection_string")); return std::make_unique(dict_struct, config, config_prefix + ".odbc", sample_block, context, bridge); #else (void)dict_struct; @@ -289,24 +291,25 @@ void registerDictionarySourceXDBC(DictionarySourceFactory & factory) (void)config_prefix; (void)sample_block; (void)context; - throw Exception{"Dictionary source of type `odbc` is disabled because poco library was built without ODBC support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Dictionary source of type `odbc` is disabled because poco library was built without ODBC support."); #endif }; factory.registerSource("odbc", create_table_source); } + void registerDictionarySourceJDBC(DictionarySourceFactory & factory) { auto create_table_source = [=](const DictionaryStructure & /* dict_struct */, const Poco::Util::AbstractConfiguration & /* config */, const std::string & /* config_prefix */, Block & /* sample_block */, - const Context & /* context */, + ContextPtr /* context */, const std::string & /* default_database 
*/, bool /* check_config */) -> DictionarySourcePtr { - throw Exception{"Dictionary source of type `jdbc` is disabled until consistent support for nullable fields.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Dictionary source of type `jdbc` is disabled until consistent support for nullable fields."); // BridgeHelperPtr bridge = std::make_shared>(config, context.getSettings().http_receive_timeout, config.getString(config_prefix + ".connection_string")); // return std::make_unique(dict_struct, config, config_prefix + ".jdbc", sample_block, context, bridge); }; diff --git a/src/Dictionaries/XDBCDictionarySource.h b/src/Dictionaries/XDBCDictionarySource.h index 87bc42c76ab..bd473e0db8a 100644 --- a/src/Dictionaries/XDBCDictionarySource.h +++ b/src/Dictionaries/XDBCDictionarySource.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include "DictionaryStructure.h" #include "ExternalQueryBuilder.h" #include "IDictionarySource.h" @@ -23,7 +23,7 @@ class Logger; namespace DB { /// Allows loading dictionaries from a XDBC source via bridges -class XDBCDictionarySource final : public IDictionarySource +class XDBCDictionarySource final : public IDictionarySource, WithContext { public: XDBCDictionarySource( @@ -31,7 +31,7 @@ public: const Poco::Util::AbstractConfiguration & config_, const std::string & config_prefix_, const Block & sample_block_, - const Context & context_, + ContextPtr context_, BridgeHelperPtr bridge); /// copy-constructor is provided in order to support cloneability @@ -62,7 +62,7 @@ private: // execute invalidate_query. expects single cell in result std::string doInvalidateQuery(const std::string & request) const; - BlockInputStreamPtr loadBase(const std::string & query) const; + BlockInputStreamPtr loadFromQuery(const Poco::URI url, const Block & required_sample_block, const std::string & query) const; Poco::Logger * log; @@ -82,7 +82,6 @@ private: BridgeHelperPtr bridge_helper; Poco::URI bridge_url; ConnectionTimeouts timeouts; - const Context & global_context; }; } diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index 04ba1db09fc..5ab8213020b 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -299,15 +299,30 @@ void buildPrimaryKeyConfiguration( if (!complex) { if (key_names.size() != 1) - throw Exception("Primary key for simple dictionary must contain exactly one element", - ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "Primary key for simple dictionary must contain exactly one element"); AutoPtr id_element(doc->createElement("id")); root->appendChild(id_element); AutoPtr name_element(doc->createElement("name")); id_element->appendChild(name_element); - const ASTDictionaryAttributeDeclaration * dict_attr = children.front()->as(); + auto identifier_name = key_names.front(); + + auto it = std::find_if(children.begin(), children.end(), [&](const ASTPtr & node) + { + const ASTDictionaryAttributeDeclaration * dict_attr = node->as(); + return dict_attr->name == identifier_name; + }); + + if (it == children.end()) + { + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "Primary key field '{}' not found among attributes", + identifier_name); + } + + const ASTDictionaryAttributeDeclaration * dict_attr = (*it)->as(); AutoPtr name(doc->createTextNode(dict_attr->name)); 
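The buildPrimaryKeyConfiguration hunk in this region replaces the old assumption that the primary key is the first declared attribute (children.front()) with a lookup by name over the attribute declarations, throwing INCORRECT_DICTIONARY_DEFINITION when the named field is missing. A standalone sketch of that lookup pattern follows; the Attr struct and the sample attribute list are illustrative stand-ins for the real AST node types.

    // Illustrative sketch, not ClickHouse code: pick the declaration whose name
    // matches the primary key instead of blindly using the first declaration.
    #include <algorithm>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct Attr { std::string name; std::string type; };

    const Attr & findPrimaryKeyAttr(const std::vector<Attr> & attrs, const std::string & key_name)
    {
        auto it = std::find_if(attrs.begin(), attrs.end(),
                               [&](const Attr & a) { return a.name == key_name; });
        if (it == attrs.end())
            throw std::runtime_error("Primary key field '" + key_name + "' not found among attributes");
        return *it;
    }

    int main()
    {
        const std::vector<Attr> attrs{{"region", "String"}, {"id", "UInt64"}};
        // Prints "UInt64" even though 'id' is not the first declared attribute.
        std::cout << findPrimaryKeyAttr(attrs, "id").type << '\n';
    }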
name_element->appendChild(name); @@ -317,8 +332,8 @@ void buildPrimaryKeyConfiguration( else { if (children.size() < key_names.size()) - throw Exception( - "Primary key fields count is more, than dictionary attributes count.", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "Primary key fields count is more, than dictionary attributes count."); AutoPtr key_element(doc->createElement("key")); root->appendChild(key_element); @@ -336,8 +351,9 @@ void buildPrimaryKeyConfiguration( } } if (!found) - throw Exception( - "Primary key field '" + key_name + "' not found among attributes.", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "Primary key field '{}' not found among attributes.", + key_name); } } } @@ -358,7 +374,7 @@ NamesToTypeNames buildDictionaryAttributesConfiguration( { const ASTDictionaryAttributeDeclaration * dict_attr = child->as(); if (!dict_attr->type) - throw Exception("Dictionary attribute must has type", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Dictionary attribute must has type"); attributes_names_and_types.emplace(dict_attr->name, queryToString(dict_attr->type)); if (std::find(key_columns.begin(), key_columns.end(), dict_attr->name) == key_columns.end()) @@ -374,7 +390,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments( AutoPtr doc, AutoPtr root, const ASTExpressionList * ast_expr_list, - const Context & context) + ContextPtr context) { const auto & children = ast_expr_list->children; for (size_t i = 0; i != children.size(); ++i) @@ -416,9 +432,8 @@ void buildConfigurationFromFunctionWithKeyValueArguments( } else { - throw Exception( - "Incorrect ASTPair contains wrong value, should be literal, identifier or list", - ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "Incorrect ASTPair contains wrong value, should be literal, identifier or list"); } } } @@ -442,7 +457,7 @@ void buildSourceConfiguration( AutoPtr root, const ASTFunctionWithKeyValueArguments * source, const ASTDictionarySettings * settings, - const Context & context) + ContextPtr context) { AutoPtr outer_element(doc->createElement("source")); root->appendChild(outer_element); @@ -470,24 +485,24 @@ void buildSourceConfiguration( void checkAST(const ASTCreateQuery & query) { if (!query.is_dictionary || query.dictionary == nullptr) - throw Exception("Cannot convert dictionary to configuration from non-dictionary AST.", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot convert dictionary to configuration from non-dictionary AST."); if (query.dictionary_attributes_list == nullptr || query.dictionary_attributes_list->children.empty()) - throw Exception("Cannot create dictionary with empty attributes list", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty attributes list"); if (query.dictionary->layout == nullptr) - throw Exception("Cannot create dictionary with empty layout", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty layout"); const auto is_direct_layout = !strcasecmp(query.dictionary->layout->layout_type.data(), "direct") || !strcasecmp(query.dictionary->layout->layout_type.data(), 
"complex_key_direct"); if (query.dictionary->lifetime == nullptr && !is_direct_layout) - throw Exception("Cannot create dictionary with empty lifetime", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty lifetime"); if (query.dictionary->primary_key == nullptr) - throw Exception("Cannot create dictionary without primary key", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary without primary key"); if (query.dictionary->source == nullptr) - throw Exception("Cannot create dictionary with empty source", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty source"); /// Range can be empty } @@ -496,14 +511,14 @@ void checkPrimaryKey(const NamesToTypeNames & all_attrs, const Names & key_attrs { for (const auto & key_attr : key_attrs) if (all_attrs.count(key_attr) == 0) - throw Exception("Unknown key attribute '" + key_attr + "'", ErrorCodes::INCORRECT_DICTIONARY_DEFINITION); + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Unknown key attribute '{}'", key_attr); } } DictionaryConfigurationPtr -getDictionaryConfigurationFromAST(const ASTCreateQuery & query, const Context & context, const std::string & database_) +getDictionaryConfigurationFromAST(const ASTCreateQuery & query, ContextPtr context, const std::string & database_) { checkAST(query); diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.h b/src/Dictionaries/getDictionaryConfigurationFromAST.h index 5132e3c77e0..de8659e4d7b 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.h +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -11,5 +12,5 @@ using DictionaryConfigurationPtr = Poco::AutoPtr 1) - throw Exception("Expected single column in resultset, got " + std::to_string(columns), ErrorCodes::TOO_MANY_COLUMNS); + throw Exception(ErrorCodes::TOO_MANY_COLUMNS, "Expected single column in resultset, got {}", std::to_string(columns)); auto rows = block.rows(); if (rows == 0) - throw Exception("Expected single row in resultset, got 0", ErrorCodes::RECEIVED_EMPTY_DATA); + throw Exception(ErrorCodes::RECEIVED_EMPTY_DATA, "Expected single row in resultset, got 0"); if (rows > 1) - throw Exception("Expected single row in resultset, got at least " + std::to_string(rows), ErrorCodes::TOO_MANY_ROWS); + throw Exception(ErrorCodes::TOO_MANY_ROWS, "Expected single row in resultset, got at least {}", std::to_string(rows)); WriteBufferFromOwnString out; auto & column_type = block.getByPosition(0); - column_type.type->serializeAsTextQuoted(*column_type.column->convertToFullColumnIfConst(), 0, out, FormatSettings()); + column_type.type->getDefaultSerialization()->serializeTextQuoted(*column_type.column->convertToFullColumnIfConst(), 0, out, FormatSettings()); while ((block = block_input_stream.read())) if (block.rows() > 0) - throw Exception("Expected single row in resultset, got at least " + std::to_string(rows + 1), ErrorCodes::TOO_MANY_ROWS); + throw Exception(ErrorCodes::TOO_MANY_ROWS, "Expected single row in resultset, got at least {}", std::to_string(rows + 1)); block_input_stream.readSuffix(); return out.str(); diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp new file mode 100644 index 00000000000..9cc90c44418 --- 
/dev/null +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -0,0 +1,304 @@ +#include "CacheDictionary.h" +#include "CacheDictionaryStorage.h" +#include "SSDCacheDictionaryStorage.h" +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TOO_SMALL_BUFFER_SIZE; + extern const int UNSUPPORTED_METHOD; + extern const int BAD_ARGUMENTS; +} + +CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( + const String & full_name, + const Poco::Util::AbstractConfiguration & config, + const String & layout_prefix, + const DictionaryLifetime & dict_lifetime, + DictionaryKeyType dictionary_key_type) +{ + String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." : ".cache."; + String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; + + const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); + if (size == 0) + throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE, + "{}: cache dictionary cannot have 0 cells", + full_name); + + size_t dict_lifetime_seconds = static_cast(dict_lifetime.max_sec); + const size_t strict_max_lifetime_seconds = config.getUInt64(dictionary_configuration_prefix + "strict_max_lifetime_seconds", dict_lifetime_seconds); + + size_t rounded_size = roundUpToPowerOfTwoOrZero(size); + + CacheDictionaryStorageConfiguration storage_configuration{rounded_size, strict_max_lifetime_seconds, dict_lifetime}; + + return storage_configuration; +} + +#if defined(OS_LINUX) || defined(__FreeBSD__) + +SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( + const String & full_name, + const Poco::Util::AbstractConfiguration & config, + const String & layout_prefix, + const DictionaryLifetime & dict_lifetime, + DictionaryKeyType dictionary_key_type) +{ + String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_ssd_cache." 
: ".ssd_cache."; + String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; + + const size_t strict_max_lifetime_seconds + = config.getUInt64(dictionary_configuration_prefix + "strict_max_lifetime_seconds", static_cast(dict_lifetime.max_sec)); + + static constexpr size_t DEFAULT_SSD_BLOCK_SIZE_BYTES = DEFAULT_AIO_FILE_BLOCK_SIZE; + static constexpr size_t DEFAULT_FILE_SIZE_BYTES = 4 * 1024 * 1024 * 1024ULL; + static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; + static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; + + static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; + + const size_t max_partitions_count + = config.getInt64(dictionary_configuration_prefix + "ssd_cache.max_partitions_count", DEFAULT_PARTITIONS_COUNT); + + const size_t block_size = config.getInt64(dictionary_configuration_prefix + "block_size", DEFAULT_SSD_BLOCK_SIZE_BYTES); + const size_t file_size = config.getInt64(dictionary_configuration_prefix + "file_size", DEFAULT_FILE_SIZE_BYTES); + if (file_size % block_size != 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: file_size must be a multiple of block_size", + full_name); + + const size_t read_buffer_size = config.getInt64(dictionary_configuration_prefix + "read_buffer_size", DEFAULT_READ_BUFFER_SIZE_BYTES); + if (read_buffer_size % block_size != 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: read_buffer_size must be a multiple of block_size", + full_name); + + const size_t write_buffer_size + = config.getInt64(dictionary_configuration_prefix + "write_buffer_size", DEFAULT_WRITE_BUFFER_SIZE_BYTES); + if (write_buffer_size % block_size != 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: write_buffer_size must be a multiple of block_size", + full_name); + + auto directory_path = config.getString(dictionary_configuration_prefix + "path"); + if (directory_path.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: ssd cache dictionary cannot have empty path", + full_name); + + if (directory_path.at(0) != '/') + directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string(); + + SSDCacheDictionaryStorageConfiguration configuration{ + strict_max_lifetime_seconds, + dict_lifetime, + directory_path, + max_partitions_count, + block_size, + file_size / block_size, + read_buffer_size / block_size, + write_buffer_size / block_size}; + + return configuration; +} + +#endif + +CacheDictionaryUpdateQueueConfiguration parseCacheDictionaryUpdateQueueConfiguration( + const String & full_name, + const Poco::Util::AbstractConfiguration & config, + const String & layout_prefix, + DictionaryKeyType key_type) +{ + String layout_type = key_type == DictionaryKeyType::complex ? 
"complex_key_cache" : "cache"; + + const size_t max_update_queue_size = config.getUInt64(layout_prefix + ".cache.max_update_queue_size", 100000); + if (max_update_queue_size == 0) + throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE, + "{}: dictionary of layout '{}' cannot have empty update queue of size 0", + full_name, + layout_type); + + const size_t update_queue_push_timeout_milliseconds + = config.getUInt64(layout_prefix + ".cache.update_queue_push_timeout_milliseconds", 10); + if (update_queue_push_timeout_milliseconds < 10) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: dictionary of layout '{}' have too little update_queue_push_timeout", + full_name, + layout_type); + + const size_t query_wait_timeout_milliseconds = config.getUInt64(layout_prefix + ".cache.query_wait_timeout_milliseconds", 60000); + + const size_t max_threads_for_updates = config.getUInt64(layout_prefix + ".max_threads_for_updates", 4); + if (max_threads_for_updates == 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: dictionary of layout) '{}' cannot have zero threads for updates", + full_name, + layout_type); + + CacheDictionaryUpdateQueueConfiguration update_queue_configuration{ + max_update_queue_size, max_threads_for_updates, update_queue_push_timeout_milliseconds, query_wait_timeout_milliseconds}; + + return update_queue_configuration; +} + +template +DictionaryPtr createCacheDictionaryLayout( + const String & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) +{ + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + if (dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'cache'"); + } + else if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + if (dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_cache'"); + } + + if (dict_struct.range_min || dict_struct.range_max) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + full_name); + + const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); + if (require_nonempty) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: cache dictionary of layout cannot have 'require_nonempty' attribute set", + full_name); + + const auto & layout_prefix = config_prefix + ".layout"; + + const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); + + const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; + + const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); + + auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); + + std::shared_ptr storage = std::make_shared>(dict_struct, storage_configuration); + + auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); + + return std::make_unique>( + dict_id, dict_struct, std::move(source_ptr), storage, update_queue_configuration, dict_lifetime, allow_read_expired_keys); +} + +#if defined(OS_LINUX) || 
defined(__FreeBSD__) + +template +DictionaryPtr createSSDCacheDictionaryLayout( + const String & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) +{ + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); + + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + { + if (dict_struct.key) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for dictionary of layout 'ssd_cache'"); + } + else if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + if (dict_struct.id) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_ssd_cache'"); + } + + if (dict_struct.range_min || dict_struct.range_max) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + full_name); + + const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false); + if (require_nonempty) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "{}: cache dictionary of layout cannot have 'require_nonempty' attribute set", + full_name); + + const auto & layout_prefix = config_prefix + ".layout"; + + const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix); + + const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; + + const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); + + auto storage_configuration = parseSSDCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); + auto storage = std::make_shared>(storage_configuration); + + auto update_queue_configuration + = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); + + return std::make_unique>( + dict_id, dict_struct, std::move(source_ptr), storage, update_queue_configuration, dict_lifetime, allow_read_expired_keys); +} + +#endif + +void registerDictionaryCache(DictionaryFactory & factory) +{ + auto create_simple_cache_layout = [=](const String & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + }; + + factory.registerLayout("cache", create_simple_cache_layout, false); + + auto create_complex_key_cache_layout = [=](const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + }; + + factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true); + +#if defined(OS_LINUX) || defined(__FreeBSD__) + + auto create_simple_ssd_cache_layout = [=](const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createSSDCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, 
std::move(source_ptr)); + }; + + factory.registerLayout("ssd_cache", create_simple_ssd_cache_layout, false); + + auto create_complex_key_ssd_cache_layout = [=](const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr { + return createSSDCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + }; + + factory.registerLayout("complex_key_ssd_cache", create_complex_key_ssd_cache_layout, true); +#endif +} + +} diff --git a/src/Dictionaries/registerDictionaries.cpp b/src/Dictionaries/registerDictionaries.cpp index abcc0ce06ad..8d24a6ea979 100644 --- a/src/Dictionaries/registerDictionaries.cpp +++ b/src/Dictionaries/registerDictionaries.cpp @@ -18,22 +18,17 @@ void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory); void registerDictionarySourcePostgreSQL(DictionarySourceFactory & source_factory); #endif void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory); +void registerDictionarySourceExecutablePool(DictionarySourceFactory & source_factory); void registerDictionarySourceHTTP(DictionarySourceFactory & source_factory); void registerDictionarySourceLibrary(DictionarySourceFactory & source_factory); class DictionaryFactory; void registerDictionaryRangeHashed(DictionaryFactory & factory); void registerDictionaryComplexKeyHashed(DictionaryFactory & factory); -void registerDictionaryComplexKeyCache(DictionaryFactory & factory); -void registerDictionaryComplexKeyDirect(DictionaryFactory & factory); void registerDictionaryTrie(DictionaryFactory & factory); void registerDictionaryFlat(DictionaryFactory & factory); void registerDictionaryHashed(DictionaryFactory & factory); void registerDictionaryCache(DictionaryFactory & factory); -#if defined(__linux__) || defined(__FreeBSD__) -void registerDictionarySSDCache(DictionaryFactory & factory); -void registerDictionarySSDComplexKeyCache(DictionaryFactory & factory); -#endif void registerDictionaryPolygon(DictionaryFactory & factory); void registerDictionaryDirect(DictionaryFactory & factory); @@ -54,6 +49,7 @@ void registerDictionaries() registerDictionarySourcePostgreSQL(source_factory); #endif registerDictionarySourceExecutable(source_factory); + registerDictionarySourceExecutablePool(source_factory); registerDictionarySourceHTTP(source_factory); registerDictionarySourceLibrary(source_factory); } @@ -61,17 +57,10 @@ void registerDictionaries() { auto & factory = DictionaryFactory::instance(); registerDictionaryRangeHashed(factory); - registerDictionaryComplexKeyHashed(factory); - registerDictionaryComplexKeyCache(factory); - registerDictionaryComplexKeyDirect(factory); registerDictionaryTrie(factory); registerDictionaryFlat(factory); registerDictionaryHashed(factory); registerDictionaryCache(factory); -#if defined(OS_LINUX) || defined(__FreeBSD__) - registerDictionarySSDCache(factory); - registerDictionarySSDComplexKeyCache(factory); -#endif registerDictionaryPolygon(factory); registerDictionaryDirect(factory); } diff --git a/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp new file mode 100644 index 00000000000..9fd9dc9b78c --- /dev/null +++ b/src/Dictionaries/tests/gtest_dictionary_ssd_cache_dictionary_storage.cpp @@ -0,0 +1,240 @@ +#if defined(__linux__) || defined(__FreeBSD__) + +#include + +#include + +using 
namespace DB; + +TEST(SSDCacheDictionaryStorage, SSDCacheBlockWithSSDCacheSimpleKey) +{ + static constexpr size_t block_data_size = 4096; + std::unique_ptr block_data(new char[block_data_size]); + memset(block_data.get(), 0, block_data_size); + + { + memset(block_data.get(), 0, block_data_size); + + SSDCacheBlock block(block_data_size); + + block.reset(block_data.get()); + + std::unique_ptr data_to_insert(new char[4000]); + memset(data_to_insert.get(), 1, 4000); + + SSDCacheSimpleKey key(0, 200, data_to_insert.get()); + ASSERT_EQ(block.getKeysSize(), 0); + + bool write_result = false; + size_t offset_in_block = 0; + + ASSERT_TRUE(block.enoughtPlaceToWriteKey(key)); + write_result = block.writeKey(key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 1); + + key.key = 1; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(key)); + write_result = block.writeKey(key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 2); + + key.key = 2; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(key)); + write_result = block.writeKey(key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 3); + + PaddedPODArray expected = {0,1,2}; + PaddedPODArray actual; + block.readSimpleKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + static constexpr size_t block_header_size = SSDCacheBlock::block_header_size; + static constexpr size_t key_metadata_size = sizeof(size_t) * 2; + + std::unique_ptr data_to_insert(new char[4080]); + memset(data_to_insert.get(), 1, 4000); + + SSDCacheSimpleKey key {0, 4064, data_to_insert.get()}; + + ASSERT_TRUE(SSDCacheBlock::canBeWrittenInEmptyBlock(key, block_data_size)); + key.size = 4065; + ASSERT_FALSE(SSDCacheBlock::canBeWrittenInEmptyBlock(key, block_data_size)); + key.size = 4064; + + size_t offset_in_block; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(key)); + ASSERT_TRUE(block.writeKey(key, offset_in_block)); + ASSERT_EQ(offset_in_block, block_header_size + key_metadata_size); + + ASSERT_FALSE(block.enoughtPlaceToWriteKey({1, 4065, data_to_insert.get()})); + offset_in_block = 0; + ASSERT_FALSE(block.writeKey({1, 4065, data_to_insert.get()}, offset_in_block)); + ASSERT_EQ(offset_in_block, 0); + + PaddedPODArray expected = {0}; + PaddedPODArray actual; + block.readSimpleKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + PaddedPODArray expected = {}; + PaddedPODArray actual; + block.readSimpleKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + std::unique_ptr data_to_insert(new char[4000]); + memset(data_to_insert.get(), 1, 4000); + size_t offset_in_block; + SSDCacheSimpleKey key {0, 200, data_to_insert.get()}; + block.writeKey({1, 200, data_to_insert.get()}, offset_in_block); + ASSERT_EQ(block.getKeysSize(), 1); + + ASSERT_FALSE(block.checkCheckSum()); + block.writeCheckSum(); + ASSERT_TRUE(block.checkCheckSum()); + + SSDCacheBlock other_block(block_data_size); + other_block.reset(block_data.get()); + bool write_result = other_block.writeKey({2, 200, data_to_insert.get()}, offset_in_block); + ASSERT_TRUE(write_result); + + ASSERT_FALSE(block.checkCheckSum()); + block.writeCheckSum(); + ASSERT_TRUE(block.checkCheckSum()); + } +} + 
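A note on parseCacheStorageConfiguration earlier in this file: the configured size_in_cells is passed through roundUpToPowerOfTwoOrZero, so the effective number of cache cells can be larger than what the user asked for. The sketch below is illustrative only; it assumes the helper implements the usual "round up to the next power of two, keep zero as zero" bit trick and is not the actual ClickHouse implementation:

// Illustrative sketch, not part of the patch.
#include <cassert>
#include <cstddef>

static size_t roundUpToPowerOfTwoOrZeroSketch(size_t n)
{
    if (n == 0)
        return 0;
    --n;
    for (size_t shift = 1; shift < sizeof(size_t) * 8; shift <<= 1)
        n |= n >> shift;
    return n + 1;
}

int main()
{
    assert(roundUpToPowerOfTwoOrZeroSketch(100000) == 131072); /// size_in_cells = 100000 becomes 131072 cells
    assert(roundUpToPowerOfTwoOrZeroSketch(4096) == 4096);     /// exact powers of two are kept as-is
    return 0;
}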
+TEST(SSDCacheDictionaryStorage, SSDCacheBlockWithSSDCachComplexKey) +{ + static constexpr size_t block_data_size = 4096; + std::unique_ptr block_data(new char[block_data_size]); + memset(block_data.get(), 0, block_data_size); + + { + memset(block_data.get(), 0, block_data_size); + + SSDCacheBlock block(block_data_size); + + block.reset(block_data.get()); + + std::unique_ptr data_to_insert(new char[4000]); + memset(data_to_insert.get(), 1, 4000); + + String key = "0"; + + SSDCacheComplexKey ssd_cache_key(key, 200, data_to_insert.get()); + ASSERT_EQ(block.getKeysSize(), 0); + + bool write_result = false; + size_t offset_in_block = 0; + + ASSERT_TRUE(block.enoughtPlaceToWriteKey(ssd_cache_key)); + write_result = block.writeKey(ssd_cache_key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 1); + + ssd_cache_key.key = "1"; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(ssd_cache_key)); + write_result = block.writeKey(ssd_cache_key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 2); + + ssd_cache_key.key = "2"; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(ssd_cache_key)); + write_result = block.writeKey(ssd_cache_key, offset_in_block); + ASSERT_TRUE(write_result); + ASSERT_EQ(block.getKeysSize(), 3); + + PaddedPODArray expected = {"0","1","2"}; + PaddedPODArray actual; + + block.readComplexKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + static constexpr size_t block_header_size = SSDCacheBlock::block_header_size; + static constexpr size_t key_metadata_size = sizeof(size_t) * 2; + + std::unique_ptr data_to_insert(new char[4080]); + memset(data_to_insert.get(), 1, 4000); + + SSDCacheComplexKey key {"", 4064, data_to_insert.get()}; + + ASSERT_TRUE(SSDCacheBlock::canBeWrittenInEmptyBlock(key, block_data_size)); + key.size = 4065; + ASSERT_FALSE(SSDCacheBlock::canBeWrittenInEmptyBlock(key, block_data_size)); + key.size = 4064; + + size_t offset_in_block; + ASSERT_TRUE(block.enoughtPlaceToWriteKey(key)); + ASSERT_TRUE(block.writeKey(key, offset_in_block)); + ASSERT_EQ(offset_in_block, block_header_size + key_metadata_size); + + ASSERT_FALSE(block.enoughtPlaceToWriteKey({1, 4065, data_to_insert.get()})); + offset_in_block = 0; + ASSERT_FALSE(block.writeKey({1, 4065, data_to_insert.get()}, offset_in_block)); + ASSERT_EQ(offset_in_block, 0); + + PaddedPODArray expected = {0}; + PaddedPODArray actual; + block.readSimpleKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + PaddedPODArray expected = {}; + PaddedPODArray actual; + block.readComplexKeys(actual); + ASSERT_EQ(actual, expected); + } + { + memset(block_data.get(), 0, block_data_size); + SSDCacheBlock block(block_data_size); + block.reset(block_data.get()); + + std::unique_ptr data_to_insert(new char[4000]); + memset(data_to_insert.get(), 1, 4000); + size_t offset_in_block; + SSDCacheComplexKey key {"0", 200, data_to_insert.get()}; + block.writeKey({1, 200, data_to_insert.get()}, offset_in_block); + ASSERT_EQ(block.getKeysSize(), 1); + + ASSERT_FALSE(block.checkCheckSum()); + block.writeCheckSum(); + ASSERT_TRUE(block.checkCheckSum()); + + SSDCacheBlock other_block(block_data_size); + other_block.reset(block_data.get()); + bool write_result = other_block.writeKey({2, 200, data_to_insert.get()}, offset_in_block); + 
ASSERT_TRUE(write_result); + + ASSERT_FALSE(block.checkCheckSum()); + block.writeCheckSum(); + ASSERT_TRUE(block.checkCheckSum()); + } +} + +#endif diff --git a/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp new file mode 100644 index 00000000000..064f57dfe11 --- /dev/null +++ b/src/Dictionaries/tests/gtest_hierarchy_dictionaries_utils.cpp @@ -0,0 +1,225 @@ +#include + +#include + +#include + +using namespace DB; + +TEST(HierarchyDictionariesUtils, getHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 1, 3, 1, 4, 2, 1}; + PaddedPODArray expected_offsets = {1, 3, 5, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + + auto result = DB::detail::getHierarchy( + keys, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 2}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } +} + +TEST(HierarchyDictionariesUtils, getIsInHierarchy) +{ + { + HashMap child_to_parent; + child_to_parent[1] = 0; + child_to_parent[2] = 1; + child_to_parent[3] = 1; + child_to_parent[4] = 2; + + auto is_key_valid_func = [&](auto key) { return child_to_parent.find(key) != nullptr; }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3, 4, 5}; + PaddedPODArray keys_in = {1, 1, 1, 2, 5}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1,1,1,1,0}; + + ASSERT_EQ(actual, expected); + } + { + HashMap child_to_parent; + child_to_parent[1] = 2; + child_to_parent[2] = 1; + + auto is_key_valid_func = [&](auto key) + { + return child_to_parent.find(key) != nullptr; + }; + + auto get_parent_key_func = [&](auto key) + { + auto it = child_to_parent.find(key); + std::optional value = (it != nullptr ? 
std::make_optional(it->getMapped()) : std::nullopt); + return value; + }; + + UInt64 hierarchy_null_value_key = 0; + PaddedPODArray keys = {1, 2, 3}; + PaddedPODArray keys_in = {1, 2, 3}; + + PaddedPODArray actual = DB::detail::getIsInHierarchy( + keys, + keys_in, + hierarchy_null_value_key, + is_key_valid_func, + get_parent_key_func); + + PaddedPODArray expected = {1, 1, 0}; + ASSERT_EQ(actual, expected); + } +} + +TEST(HierarchyDictionariesUtils, getDescendants) +{ + { + HashMap> parent_to_child; + parent_to_child[0].emplace_back(1); + parent_to_child[1].emplace_back(2); + parent_to_child[1].emplace_back(3); + parent_to_child[2].emplace_back(4); + + PaddedPODArray keys = {0, 1, 2, 3, 4}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4, 2, 3, 4, 4}; + PaddedPODArray expected_offsets = {4, 7, 8, 8, 8}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {1, 2, 3, 4}; + PaddedPODArray expected_offsets = {1, 3, 4, 4, 4}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } + { + HashMap> parent_to_child; + parent_to_child[1].emplace_back(2); + parent_to_child[2].emplace_back(1); + + PaddedPODArray keys = {1, 2, 3}; + + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetAllDescendantsStrategy()); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1, 1}; + PaddedPODArray expected_offsets = {2, 3, 3}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + { + auto result = DB::detail::getDescendants( + keys, + parent_to_child, + DB::detail::GetDescendantsAtSpecificLevelStrategy{1}); + + const auto & actual_elements = result.elements; + const auto & actual_offsets = result.offsets; + + PaddedPODArray expected_elements = {2, 1}; + PaddedPODArray expected_offsets = {1, 2, 2}; + + ASSERT_EQ(actual_elements, expected_elements); + ASSERT_EQ(actual_offsets, expected_offsets); + } + } +} diff --git a/src/Dictionaries/ya.make b/src/Dictionaries/ya.make index 4f33dc80559..36152fe439a 100644 --- a/src/Dictionaries/ya.make +++ b/src/Dictionaries/ya.make @@ -9,6 +9,7 @@ PEERDIR( contrib/libs/poco/MongoDB contrib/libs/poco/Redis contrib/libs/sparsehash + contrib/restricted/abseil-cpp ) IF (USE_ODBC) @@ -20,13 +21,12 @@ NO_COMPILER_WARNINGS() SRCS( CacheDictionary.cpp + CacheDictionaryUpdateQueue.cpp CassandraBlockInputStream.cpp CassandraDictionarySource.cpp CassandraHelpers.cpp ClickHouseDictionarySource.cpp - ComplexKeyCacheDictionary.cpp - ComplexKeyDirectDictionary.cpp - ComplexKeyHashedDictionary.cpp + DictionaryBlockInputStream.cpp DictionaryBlockInputStreamBase.cpp DictionaryFactory.cpp DictionarySourceFactory.cpp @@ -42,14 +42,15 @@ SRCS( Embedded/RegionsHierarchy.cpp Embedded/RegionsNames.cpp ExecutableDictionarySource.cpp + ExecutablePoolDictionarySource.cpp ExternalQueryBuilder.cpp FileDictionarySource.cpp FlatDictionary.cpp 
HTTPDictionarySource.cpp HashedDictionary.cpp + HierarchyDictionariesUtils.cpp IPAddressDictionary.cpp LibraryDictionarySource.cpp - LibraryDictionarySourceExternal.cpp MongoDBDictionarySource.cpp MySQLDictionarySource.cpp PolygonDictionary.cpp @@ -58,11 +59,10 @@ SRCS( RangeHashedDictionary.cpp RedisBlockInputStream.cpp RedisDictionarySource.cpp - SSDCacheDictionary.cpp - SSDComplexKeyCacheDictionary.cpp XDBCDictionarySource.cpp getDictionaryConfigurationFromAST.cpp readInvalidateQuery.cpp + registerCacheDictionaries.cpp registerDictionaries.cpp writeParenthesisedString.cpp diff --git a/src/Dictionaries/ya.make.in b/src/Dictionaries/ya.make.in index e52b106d034..aa82fb21ba6 100644 --- a/src/Dictionaries/ya.make.in +++ b/src/Dictionaries/ya.make.in @@ -8,6 +8,7 @@ PEERDIR( contrib/libs/poco/MongoDB contrib/libs/poco/Redis contrib/libs/sparsehash + contrib/restricted/abseil-cpp ) IF (USE_ODBC) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 6d991c17c67..95034b8e107 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -1,7 +1,6 @@ #include "DiskCacheWrapper.h" #include #include -#include #include namespace DB @@ -103,15 +102,21 @@ std::shared_ptr DiskCacheWrapper::acquireDownloadMetadata( } std::unique_ptr -DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskCacheWrapper::readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MMappedFileCache * mmap_cache) const { if (!cache_file_predicate(path)) - return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Read file {} from cache", backQuote(path)); + LOG_DEBUG(log, "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) - return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); auto metadata = acquireDownloadMetadata(path); @@ -122,11 +127,11 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate { /// This thread will responsible for file downloading to cache. metadata->status = DOWNLOADING; - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} doesn't exist in cache. Will download it", backQuote(path)); + LOG_DEBUG(log, "File {} doesn't exist in cache. 
Will download it", backQuote(path)); } else if (metadata->status == DOWNLOADING) { - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Waiting for file {} download to cache", backQuote(path)); + LOG_DEBUG(log, "Waiting for file {} download to cache", backQuote(path)); metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; }); } } @@ -145,13 +150,13 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate auto tmp_path = path + ".tmp"; { - auto src_buffer = DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + auto src_buffer = DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); auto dst_buffer = cache_disk->writeFile(tmp_path, buf_size, WriteMode::Rewrite); copyData(*src_buffer, *dst_buffer); } cache_disk->moveFile(tmp_path, path); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} downloaded to cache", backQuote(path)); + LOG_DEBUG(log, "File {} downloaded to cache", backQuote(path)); } catch (...) { @@ -169,9 +174,9 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate } if (metadata->status == DOWNLOADED) - return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); - return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); } std::unique_ptr @@ -180,7 +185,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode if (!cache_file_predicate(path)) return DiskDecorator::writeFile(path, buf_size, mode); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Write file {} to cache", backQuote(path)); + LOG_DEBUG(log, "Write file {} to cache", backQuote(path)); auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) @@ -191,7 +196,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode [this, path, buf_size, mode]() { /// Copy file from cache to actual disk when cached buffer is finalized. - auto src_buffer = cache_disk->readFile(path, buf_size, 0, 0, 0); + auto src_buffer = cache_disk->readFile(path, buf_size, 0, 0, 0, nullptr); auto dst_buffer = DiskDecorator::writeFile(path, buf_size, mode); copyData(*src_buffer, *dst_buffer); dst_buffer->finalize(); @@ -209,7 +214,13 @@ void DiskCacheWrapper::clearDirectory(const String & path) void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path) { if (cache_disk->exists(from_path)) + { + /// Destination directory may not be empty if previous directory move attempt was failed. 
+ if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path)) + cache_disk->clearDirectory(to_path); + cache_disk->moveDirectory(from_path, to_path); + } DiskDecorator::moveDirectory(from_path, to_path); } @@ -265,6 +276,20 @@ void DiskCacheWrapper::removeRecursive(const String & path) DiskDecorator::removeRecursive(path); } +void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeSharedFile(path, keep_s3); + DiskDecorator::removeSharedFile(path, keep_s3); +} + +void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeSharedRecursive(path, keep_s3); + DiskDecorator::removeSharedRecursive(path, keep_s3); +} + void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) { /// Don't create hardlinks for cache files to shadow directory as it just waste cache disk space. diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index bf1a5df693a..6d58394640f 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "DiskDecorator.h" #include "DiskLocal.h" @@ -32,14 +33,23 @@ public: void moveDirectory(const String & from_path, const String & to_path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; - std::unique_ptr - readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; - std::unique_ptr - writeFile(const String & path, size_t buf_size, WriteMode mode) override; + + std::unique_ptr readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MMappedFileCache * mmap_cache) const override; + + std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; + void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; @@ -54,6 +64,8 @@ private: mutable std::unordered_map> file_downloads; /// Protects concurrent downloading files to cache. 
mutable std::mutex mutex; + + Poco::Logger * log = &Poco::Logger::get("DiskCache"); }; } diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 3ebd1b6cf3b..144dc928dd3 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -114,9 +114,10 @@ void DiskDecorator::listFiles(const String & path, std::vector & file_na } std::unique_ptr -DiskDecorator::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskDecorator::readFile( + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache) const { - return delegate->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return delegate->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); } std::unique_ptr @@ -145,6 +146,16 @@ void DiskDecorator::removeRecursive(const String & path) delegate->removeRecursive(path); } +void DiskDecorator::removeSharedFile(const String & path, bool keep_s3) +{ + delegate->removeSharedFile(path, keep_s3); +} + +void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3) +{ + delegate->removeSharedRecursive(path, keep_s3); +} + void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) { delegate->setLastModified(path, timestamp); diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index c204d10bed9..918223b38d7 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -34,19 +34,36 @@ public: void replaceFile(const String & from_path, const String & to_path) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; void listFiles(const String & path, std::vector & file_names) override; - std::unique_ptr - readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; - std::unique_ptr - writeFile(const String & path, size_t buf_size, WriteMode mode) override; + + std::unique_ptr readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MMappedFileCache * mmap_cache) const override; + + std::unique_ptr writeFile( + const String & path, + size_t buf_size, + WriteMode mode) override; + void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; void createHardLink(const String & src_path, const String & dst_path) override; void truncateFile(const String & path, size_t size) override; + int open(const String & path, mode_t mode) const; + void close(int fd) const; + void sync(int fd) const; + String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } + bool checkUniqueId(const String & id) const override { return delegate->checkUniqueId(id); } DiskType::Type getType() const override { return delegate->getType(); } Executor & getExecutor() override; void onFreeze(const String & path) override; diff 
--git a/src/Disks/DiskFactory.cpp b/src/Disks/DiskFactory.cpp index b604269ae16..03412be9729 100644 --- a/src/Disks/DiskFactory.cpp +++ b/src/Disks/DiskFactory.cpp @@ -24,7 +24,7 @@ DiskPtr DiskFactory::create( const String & name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context) const + ContextConstPtr context) const { const auto disk_type = config.getString(config_prefix + ".type", "local"); diff --git a/src/Disks/DiskFactory.h b/src/Disks/DiskFactory.h index d41f14bd753..ba833d9e25f 100644 --- a/src/Disks/DiskFactory.h +++ b/src/Disks/DiskFactory.h @@ -1,17 +1,18 @@ #pragma once -#include #include +#include +#include + +#include +#include #include #include -#include -#include namespace DB { -class Context; /** * Disk factory. Responsible for creating new disk objects. @@ -23,7 +24,7 @@ public: const String & name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context)>; + ContextConstPtr context)>; static DiskFactory & instance(); @@ -33,7 +34,7 @@ public: const String & name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context) const; + ContextConstPtr context) const; private: using DiskTypeRegistry = std::unordered_map; diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 5035a865191..d0cf6a00344 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -2,13 +2,13 @@ #include #include "DiskFactory.h" +#include #include #include #include -#include #include -#include + #include @@ -96,7 +96,7 @@ bool DiskLocal::tryReserve(UInt64 bytes) std::lock_guard lock(DiskLocal::reservation_mutex); if (bytes == 0) { - LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving 0 bytes on disk {}", backQuote(name)); + LOG_DEBUG(log, "Reserving 0 bytes on disk {}", backQuote(name)); ++reservation_count; return true; } @@ -105,7 +105,7 @@ bool DiskLocal::tryReserve(UInt64 bytes) UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes); if (unreserved_space >= bytes) { - LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving {} on disk {}, having unreserved {}.", + LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.", ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space)); ++reservation_count; reserved_bytes += bytes; @@ -219,9 +219,10 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) } std::unique_ptr -DiskLocal::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskLocal::readFile( + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache) const { - return createReadBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, mmap_threshold, buf_size); + return createReadBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, mmap_threshold, mmap_cache, buf_size); } std::unique_ptr @@ -338,7 +339,7 @@ DiskLocalReservation::~DiskLocalReservation() if (disk->reserved_bytes < size) { disk->reserved_bytes = 0; - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservations size for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservations size for disk '{}'.", disk->getName()); } else { @@ -346,7 +347,7 @@ DiskLocalReservation::~DiskLocalReservation() } if (disk->reservation_count == 0) - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced 
reservation count for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName()); else --disk->reservation_count; } @@ -362,7 +363,7 @@ void registerDiskLocal(DiskFactory & factory) auto creator = [](const String & name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context) -> DiskPtr { + ContextConstPtr context) -> DiskPtr { String path = config.getString(config_prefix + ".path", ""); if (name == "default") { @@ -370,7 +371,7 @@ void registerDiskLocal(DiskFactory & factory) throw Exception( "\"default\" disk path should be provided in not it ", ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); - path = context.getPath(); + path = context->getPath(); } else { @@ -382,7 +383,7 @@ void registerDiskLocal(DiskFactory & factory) if (Poco::File disk{path}; !disk.canRead() || !disk.canWrite()) { - throw Exception("There is no RW access to disk " + name + " (" + path + ")", ErrorCodes::PATH_ACCESS_DENIED); + throw Exception("There is no RW access to the disk " + name + " (" + path + ")", ErrorCodes::PATH_ACCESS_DENIED); } bool has_space_ratio = config.has(config_prefix + ".keep_free_space_ratio"); @@ -401,7 +402,7 @@ void registerDiskLocal(DiskFactory & factory) throw Exception("'keep_free_space_ratio' have to be between 0 and 1", ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG); String tmp_path = path; if (tmp_path.empty()) - tmp_path = context.getPath(); + tmp_path = context->getPath(); // Create tmp disk for getting total disk space. keep_free_space_bytes = static_cast(DiskLocal("tmp", tmp_path, 0).getTotalSpace() * ratio); diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 7dbfbe445f8..567ca24eb50 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -76,7 +77,8 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, @@ -114,6 +116,8 @@ private: UInt64 reservation_count = 0; static std::mutex reservation_mutex; + + Poco::Logger * log = &Poco::Logger::get("DiskLocal"); }; } diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index a0905e67427..68257ec4948 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -314,7 +314,7 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t) const +std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t, MMappedFileCache *) const { std::lock_guard lock(mutex); @@ -451,7 +451,7 @@ void registerDiskMemory(DiskFactory & factory) auto creator = [](const String & name, const Poco::Util::AbstractConfiguration & /*config*/, const String & /*config_prefix*/, - const Context & /*context*/) -> DiskPtr { return std::make_shared(name); }; + ContextConstPtr /*context*/) -> DiskPtr { return std::make_shared(name); }; factory.registerDiskType("memory", creator); } diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index 29ac4919833..d5c57b20a4a 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -67,7 +67,8 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + 
MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskSelector.cpp b/src/Disks/DiskSelector.cpp index 0fb728a4f02..e317d8508da 100644 --- a/src/Disks/DiskSelector.cpp +++ b/src/Disks/DiskSelector.cpp @@ -18,7 +18,7 @@ namespace ErrorCodes extern const int UNKNOWN_DISK; } -DiskSelector::DiskSelector(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Context & context) +DiskSelector::DiskSelector(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextConstPtr context) { Poco::Util::AbstractConfiguration::Keys keys; config.keys(config_prefix, keys); @@ -40,12 +40,12 @@ DiskSelector::DiskSelector(const Poco::Util::AbstractConfiguration & config, con disks.emplace(disk_name, factory.create(disk_name, config, disk_config_prefix, context)); } if (!has_default_disk) - disks.emplace(default_disk_name, std::make_shared(default_disk_name, context.getPath(), 0)); + disks.emplace(default_disk_name, std::make_shared(default_disk_name, context->getPath(), 0)); } DiskSelectorPtr DiskSelector::updateFromConfig( - const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Context & context) const + const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) const { Poco::Util::AbstractConfiguration::Keys keys; config.keys(config_prefix, keys); diff --git a/src/Disks/DiskSelector.h b/src/Disks/DiskSelector.h index 5d023fe1fbc..6b7e8174d3d 100644 --- a/src/Disks/DiskSelector.h +++ b/src/Disks/DiskSelector.h @@ -10,7 +10,6 @@ namespace DB { -class Context; class DiskSelector; using DiskSelectorPtr = std::shared_ptr; using DisksMap = std::map; @@ -20,13 +19,13 @@ using DisksMap = std::map; class DiskSelector { public: - DiskSelector(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Context & context); + DiskSelector(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextConstPtr context); DiskSelector(const DiskSelector & from) : disks(from.disks) { } DiskSelectorPtr updateFromConfig( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context + ContextPtr context ) const; /// Get disk by name diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h new file mode 100644 index 00000000000..4e0ae226af4 --- /dev/null +++ b/src/Disks/DiskType.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +struct DiskType +{ + enum class Type + { + Local, + RAM, + S3 + }; + static String toString(Type disk_type) + { + switch (disk_type) + { + case Type::Local: + return "local"; + case Type::RAM: + return "memory"; + case Type::S3: + return "s3"; + } + __builtin_unreachable(); + } +}; + +} + diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 6f021346174..726145cb5d2 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ using Reservations = std::vector; class ReadBufferFromFileBase; class WriteBufferFromFileBase; +class MMappedFileCache; /** * Mode of opening a file for write. 
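The DiskCacheWrapper, DiskDecorator, DiskLocal and DiskMemory hunks above all thread a new MMappedFileCache * argument through readFile, and the next hunk adds it to IDisk::readFile itself with a nullptr default. A minimal sketch of a call site follows; the helper function and its name are hypothetical and only rely on the declarations shown in this patch:

#include <Core/Defines.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFileBase.h>

/// Hypothetical helper: reads a whole file through the generic IDisk interface.
/// Passing nullptr as mmap_cache preserves the behaviour from before this change.
size_t totalBytesRead(const DB::DiskPtr & disk, const DB::String & path, DB::MMappedFileCache * mmap_cache)
{
    auto in = disk->readFile(
        path,
        DBMS_DEFAULT_BUFFER_SIZE,
        /* estimated_size = */ 0,
        /* aio_threshold = */ 0,
        /* mmap_threshold = */ 0,
        mmap_cache);

    size_t bytes = 0;
    while (!in->eof())
    {
        size_t available = in->available();
        in->ignore(available);
        bytes += available;
    }
    return bytes;
}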
@@ -57,29 +59,6 @@ public: using SpacePtr = std::shared_ptr<Space>; -struct DiskType -{ - enum class Type - { - Local, - RAM, - S3 - }; - static String toString(Type disk_type) - { - switch (disk_type) - { - case Type::Local: - return "local"; - case Type::RAM: - return "memory"; - case Type::S3: - return "s3"; - } - __builtin_unreachable(); - } -}; - /** * A guard, that should synchronize file's or directory's state * with storage device (e.g. fsync in POSIX) in its destructor. @@ -175,7 +154,8 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, size_t estimated_size = 0, size_t aio_threshold = 0, - size_t mmap_threshold = 0) const = 0; + size_t mmap_threshold = 0, + MMappedFileCache * mmap_cache = nullptr) const = 0; /// Open the file for write and return WriteBufferFromFileBase object. virtual std::unique_ptr<WriteBufferFromFileBase> writeFile( @@ -195,6 +175,21 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exist. virtual void removeRecursive(const String & path) = 0; + /// Remove file. Throws exception if file doesn't exist or if directory is not empty. + /// Differs from removeFile for S3 disks + /// Second bool param is a flag to keep (true) or remove (false) shared data on S3 + virtual void removeSharedFile(const String & path, bool) { removeFile(path); } + + /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exist. + /// Differs from removeRecursive for S3 disks + /// Second bool param is a flag to keep (true) or remove (false) shared data on S3 + virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); } + + /// Remove file or directory if it exists. + /// Differs from removeFileIfExists for S3 disks + /// Second bool param is a flag to keep (true) or remove (false) shared data on S3 + virtual void removeSharedFileIfExists(const String & path, bool) { removeFileIfExists(path); } + /// Set last modified time to file or directory at `path`. virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0; @@ -216,6 +211,15 @@ public: /// Invoked when Global Context is shutdown. virtual void shutdown() { } + /// Return some unique string for file, overridden for S3 + /// Required to distinguish different copies of the same part on S3 + virtual String getUniqueId(const String & path) const { return path; } + + /// Check that file exists and ClickHouse has access to it + /// Overridden in DiskS3 + /// Required for S3 to ensure that a replica has access to data written by another node + virtual bool checkUniqueId(const String & id) const { return exists(id); } + /// Returns executor to perform asynchronous operations.
virtual Executor & getExecutor() { return *executor; } diff --git a/src/Disks/IStoragePolicy.h b/src/Disks/IStoragePolicy.h index a41ea87c328..59cff3c85d5 100644 --- a/src/Disks/IStoragePolicy.h +++ b/src/Disks/IStoragePolicy.h @@ -1,4 +1,7 @@ #pragma once + +#include + #include #include #include @@ -36,6 +39,7 @@ public: /// mutations files virtual DiskPtr getAnyDisk() const = 0; virtual DiskPtr getDiskByName(const String & disk_name) const = 0; + virtual Disks getDisksByType(DiskType::Type type) const = 0; /// Get free space from most free disk virtual UInt64 getMaxUnreservedFreeSpace() const = 0; /// Reserves space on any volume with index > min_volume_index or returns nullptr @@ -57,6 +61,7 @@ public: /// Check if we have any volume with stopped merges virtual bool hasAnyVolumeWithDisabledMerges() const = 0; virtual bool containsVolume(const String & volume_name) const = 0; + /// Returns disks by type ordered by volumes priority }; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 3d91d5fbb78..14d3e1e5d5e 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -2,9 +2,11 @@ #include "Disks/DiskFactory.h" +#include #include #include #include +#include #include #include #include @@ -18,7 +20,6 @@ #include #include #include -#include #include #include @@ -90,6 +91,16 @@ void throwIfError(Aws::Utils::Outcome & response) } } +template +void throwIfError(const Aws::Utils::Outcome & response) +{ + if (!response.IsSuccess()) + { + const auto & err = response.GetError(); + throw Exception(err.GetMessage(), static_cast(err.GetErrorType())); + } +} + /** * S3 metadata file layout: * Number of S3 objects, Total size of all S3 objects. @@ -239,8 +250,12 @@ class ReadIndirectBufferFromS3 final : public ReadBufferFromFileBase { public: ReadIndirectBufferFromS3( - std::shared_ptr client_ptr_, const String & bucket_, DiskS3::Metadata metadata_, size_t buf_size_) - : client_ptr(std::move(client_ptr_)), bucket(bucket_), metadata(std::move(metadata_)), buf_size(buf_size_) + std::shared_ptr client_ptr_, const String & bucket_, DiskS3::Metadata metadata_, UInt64 s3_max_single_read_retries_, size_t buf_size_) + : client_ptr(std::move(client_ptr_)) + , bucket(bucket_) + , metadata(std::move(metadata_)) + , s3_max_single_read_retries(s3_max_single_read_retries_) + , buf_size(buf_size_) { } @@ -296,7 +311,7 @@ private: const auto & [path, size] = metadata.s3_objects[i]; if (size > offset) { - auto buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, buf_size); + auto buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, s3_max_single_read_retries, buf_size); buf->seek(offset, SEEK_SET); return buf; } @@ -325,7 +340,7 @@ private: ++current_buf_idx; const auto & path = metadata.s3_objects[current_buf_idx].first; - current_buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, buf_size); + current_buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, s3_max_single_read_retries, buf_size); current_buf->next(); working_buffer = current_buf->buffer(); absolute_position += working_buffer.size(); @@ -336,6 +351,7 @@ private: std::shared_ptr client_ptr; const String & bucket; DiskS3::Metadata metadata; + UInt64 s3_max_single_read_retries; size_t buf_size; size_t absolute_position = 0; @@ -481,7 +497,7 @@ public: if (disk->reserved_bytes < size) { disk->reserved_bytes = 0; - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservations size for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, 
"Unbalanced reservations size for disk '{}'.", disk->getName()); } else { @@ -489,7 +505,7 @@ public: } if (disk->reservation_count == 0) - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservation count for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName()); else --disk->reservation_count; } @@ -525,7 +541,7 @@ public: } catch (...) { - tryLogCurrentException(&Poco::Logger::get("DiskS3"), "Failed to run async task"); + tryLogCurrentException("DiskS3", "Failed to run async task"); try { @@ -549,6 +565,7 @@ DiskS3::DiskS3( String bucket_, String s3_root_path_, String metadata_path_, + UInt64 s3_max_single_read_retries_, size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, @@ -562,6 +579,7 @@ DiskS3::DiskS3( , bucket(std::move(bucket_)) , s3_root_path(std::move(s3_root_path_)) , metadata_path(std::move(metadata_path_)) + , s3_max_single_read_retries(s3_max_single_read_retries_) , min_upload_part_size(min_upload_part_size_) , max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) @@ -609,6 +627,15 @@ void DiskS3::createDirectories(const String & path) Poco::File(metadata_path + path).createDirectories(); } +String DiskS3::getUniqueId(const String & path) const +{ + Metadata metadata(s3_root_path, metadata_path, path); + String id; + if (!metadata.s3_objects.empty()) + id = metadata.s3_root_path + metadata.s3_objects[0].first; + return id; +} + DiskDirectoryIteratorPtr DiskS3::iterateDirectory(const String & path) { return std::make_unique(metadata_path + path, path); @@ -629,7 +656,7 @@ void DiskS3::moveFile(const String & from_path, const String & to_path) if (send_metadata) { auto revision = ++revision_counter; - const DiskS3::ObjectMetadata object_metadata { + const ObjectMetadata object_metadata { {"from_path", from_path}, {"to_path", to_path} }; @@ -652,14 +679,14 @@ void DiskS3::replaceFile(const String & from_path, const String & to_path) moveFile(from_path, to_path); } -std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t) const +std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t, MMappedFileCache *) const { auto metadata = readMeta(path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Read from file by path: {}. Existing S3 objects: {}", + LOG_DEBUG(log, "Read from file by path: {}. Existing S3 objects: {}", backQuote(metadata_path + path), metadata.s3_objects.size()); - auto reader = std::make_unique(client, bucket, metadata, buf_size); + auto reader = std::make_unique(client, bucket, metadata, s3_max_single_read_retries, buf_size); return std::make_unique(std::move(reader), min_bytes_for_seek); } @@ -692,7 +719,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, /// Save empty metadata to disk to have ability to get file size while buffer is not finalized. metadata.save(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Write to file by path: {}. New S3 path: {}", backQuote(metadata_path + path), s3_root_path + s3_path); + LOG_DEBUG(log, "Write to file by path: {}. 
New S3 path: {}", backQuote(metadata_path + path), s3_root_path + s3_path); return std::make_unique( client, bucket, metadata, s3_path, object_metadata, min_upload_part_size, max_single_part_upload_size, buf_size); @@ -701,7 +728,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, { auto metadata = readMeta(path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Append to file by path: {}. New S3 path: {}. Existing S3 objects: {}.", + LOG_DEBUG(log, "Append to file by path: {}. New S3 path: {}. Existing S3 objects: {}.", backQuote(metadata_path + path), s3_root_path + s3_path, metadata.s3_objects.size()); return std::make_unique( @@ -711,7 +738,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Remove file by path: {}", backQuote(metadata_path + path)); + LOG_DEBUG(log, "Remove file by path: {}", backQuote(metadata_path + path)); Poco::File file(metadata_path + path); @@ -743,7 +770,7 @@ void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) if (e.code() == ErrorCodes::UNKNOWN_FORMAT) { LOG_WARNING( - &Poco::Logger::get("DiskS3"), + log, "Metadata file {} can't be read by reason: {}. Removing it forcibly.", backQuote(path), e.nested() ? e.nested()->message() : e.message()); @@ -791,13 +818,6 @@ void DiskS3::removeAws(const AwsS3KeyKeeper & keys) } } -void DiskS3::removeFile(const String & path) -{ - AwsS3KeyKeeper keys; - removeMeta(path, keys); - removeAws(keys); -} - void DiskS3::removeFileIfExists(const String & path) { AwsS3KeyKeeper keys; @@ -813,11 +833,20 @@ void DiskS3::removeDirectory(const String & path) Poco::File(metadata_path + path).remove(); } -void DiskS3::removeRecursive(const String & path) +void DiskS3::removeSharedFile(const String & path, bool keep_s3) +{ + AwsS3KeyKeeper keys; + removeMeta(path, keys); + if (!keep_s3) + removeAws(keys); +} + +void DiskS3::removeSharedRecursive(const String & path, bool keep_s3) { AwsS3KeyKeeper keys; removeMetaRecursive(path, keys); - removeAws(keys); + if (!keep_s3) + removeAws(keys); } bool DiskS3::tryReserve(UInt64 bytes) @@ -825,7 +854,7 @@ bool DiskS3::tryReserve(UInt64 bytes) std::lock_guard lock(reservation_mutex); if (bytes == 0) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Reserving 0 bytes on s3 disk {}", backQuote(name)); + LOG_DEBUG(log, "Reserving 0 bytes on s3 disk {}", backQuote(name)); ++reservation_count; return true; } @@ -834,7 +863,7 @@ bool DiskS3::tryReserve(UInt64 bytes) UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes); if (unreserved_space >= bytes) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Reserving {} on disk {}, having unreserved {}.", + LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.", ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space)); ++reservation_count; reserved_bytes += bytes; @@ -919,34 +948,159 @@ void DiskS3::startup() if (!send_metadata) return; - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting up disk {}", name); + LOG_INFO(log, "Starting up disk {}", name); - /// Find last revision. 
- UInt64 l = 0, r = LATEST_REVISION; - while (l < r) - { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check revision in bounds {}-{}", l, r); + if (readSchemaVersion(bucket, s3_root_path) < RESTORABLE_SCHEMA_VERSION) + migrateToRestorableSchema(); - auto revision = l + (r - l + 1) / 2; - auto revision_str = revisionToString(revision); + findLastRevision(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check object with revision {}", revision); - - /// Check file or operation with such revision exists. - if (checkObjectExists(s3_root_path + "r" + revision_str) - || checkObjectExists(s3_root_path + "operations/r" + revision_str)) - l = revision; - else - r = revision - 1; - } - revision_counter = l; - LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {} for disk {}", revision_counter, name); + LOG_INFO(log, "Disk {} started up", name); } -bool DiskS3::checkObjectExists(const String & prefix) +void DiskS3::findLastRevision() +{ + /// Construct revision number from high to low bits. + String revision; + revision.reserve(64); + for (int bit = 0; bit < 64; bit++) + { + auto revision_prefix = revision + "1"; + + LOG_DEBUG(log, "Check object exists with revision prefix {}", revision_prefix); + + /// Check file or operation with such revision prefix exists. + if (checkObjectExists(bucket, s3_root_path + "r" + revision_prefix) + || checkObjectExists(bucket, s3_root_path + "operations/r" + revision_prefix)) + revision += "1"; + else + revision += "0"; + } + revision_counter = static_cast(std::bitset<64>(revision).to_ullong()); + LOG_INFO(log, "Found last revision number {} for disk {}", revision_counter, name); +} + +int DiskS3::readSchemaVersion(const String & source_bucket, const String & source_path) +{ + int version = 0; + if (!checkObjectExists(source_bucket, source_path + SCHEMA_VERSION_OBJECT)) + return version; + + ReadBufferFromS3 buffer(client, source_bucket, source_path + SCHEMA_VERSION_OBJECT, s3_max_single_read_retries); + readIntText(version, buffer); + + return version; +} + +void DiskS3::saveSchemaVersion(const int & version) +{ + WriteBufferFromS3 buffer (client, bucket, s3_root_path + SCHEMA_VERSION_OBJECT, min_upload_part_size, max_single_part_upload_size); + writeIntText(version, buffer); + buffer.finalize(); +} + +void DiskS3::updateObjectMetadata(const String & key, const ObjectMetadata & metadata) +{ + Aws::S3::Model::CopyObjectRequest request; + request.SetCopySource(bucket + "/" + key); + request.SetBucket(bucket); + request.SetKey(key); + request.SetMetadata(metadata); + request.SetMetadataDirective(Aws::S3::Model::MetadataDirective::REPLACE); + + auto outcome = client->CopyObject(request); + throwIfError(outcome); +} + +void DiskS3::migrateFileToRestorableSchema(const String & path) +{ + LOG_DEBUG(log, "Migrate file {} to restorable schema", metadata_path + path); + + auto meta = readMeta(path); + + for (const auto & [key, _] : meta.s3_objects) + { + ObjectMetadata metadata { + {"path", path} + }; + updateObjectMetadata(s3_root_path + key, metadata); + } +} + +void DiskS3::migrateToRestorableSchemaRecursive(const String & path, Futures & results) +{ + checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. 
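+    /// Migration strategy (summary of the code below): a directory that contains only files is
+    /// migrated as a whole by a single asynchronous task, while a directory with subdirectories
+    /// is traversed recursively and each of its files is migrated by its own asynchronous task.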
+ + LOG_DEBUG(log, "Migrate directory {} to restorable schema", metadata_path + path); + + bool dir_contains_only_files = true; + for (auto it = iterateDirectory(path); it->isValid(); it->next()) + if (isDirectory(it->path())) + { + dir_contains_only_files = false; + break; + } + + /// The whole directory can be migrated asynchronously. + if (dir_contains_only_files) + { + auto result = getExecutor().execute([this, path] + { + for (auto it = iterateDirectory(path); it->isValid(); it->next()) + migrateFileToRestorableSchema(it->path()); + }); + + results.push_back(std::move(result)); + } + else + { + for (auto it = iterateDirectory(path); it->isValid(); it->next()) + if (!isDirectory(it->path())) + { + auto source_path = it->path(); + auto result = getExecutor().execute([this, source_path] + { + migrateFileToRestorableSchema(source_path); + }); + + results.push_back(std::move(result)); + } + else + migrateToRestorableSchemaRecursive(it->path(), results); + } +} + +void DiskS3::migrateToRestorableSchema() +{ + try + { + LOG_INFO(log, "Start migration to restorable schema for disk {}", name); + + Futures results; + + for (const auto & root : data_roots) + if (exists(root)) + migrateToRestorableSchemaRecursive(root + '/', results); + + for (auto & result : results) + result.wait(); + for (auto & result : results) + result.get(); + + saveSchemaVersion(RESTORABLE_SCHEMA_VERSION); + } + catch (const Exception &) + { + tryLogCurrentException(log, fmt::format("Failed to migrate to restorable schema for disk {}", name)); + + throw; + } +} + +bool DiskS3::checkObjectExists(const String & source_bucket, const String & prefix) { Aws::S3::Model::ListObjectsV2Request request; - request.SetBucket(bucket); + request.SetBucket(source_bucket); request.SetPrefix(prefix); request.SetMaxKeys(1); @@ -956,6 +1110,23 @@ bool DiskS3::checkObjectExists(const String & prefix) return !outcome.GetResult().GetContents().empty(); } +bool DiskS3::checkUniqueId(const String & id) const +{ + /// Check that we have right s3 and have access rights + /// Actually interprets id as s3 object name and checks if it exists + Aws::S3::Model::ListObjectsV2Request request; + request.SetBucket(bucket); + request.SetPrefix(id); + auto resp = client->ListObjectsV2(request); + throwIfError(resp); + Aws::Vector object_list = resp.GetResult().GetContents(); + + for (const auto & object : object_list) + if (object.GetKey() == id) + return true; + return false; +} + Aws::S3::Model::HeadObjectResult DiskS3::headObject(const String & source_bucket, const String & key) { Aws::S3::Model::HeadObjectRequest request; @@ -1006,46 +1177,64 @@ struct DiskS3::RestoreInformation UInt64 revision = LATEST_REVISION; String source_bucket; String source_path; + bool detached = false; }; void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_information) { - ReadBufferFromFile buffer(metadata_path + restore_file_name, 512); + ReadBufferFromFile buffer(metadata_path + RESTORE_FILE_NAME, 512); buffer.next(); - /// Empty file - just restore all metadata. 
- if (!buffer.hasPendingData()) - return; - try { - readIntText(restore_information.revision, buffer); - assertChar('\n', buffer); + std::map properties; - if (!buffer.hasPendingData()) - return; + while (buffer.hasPendingData()) + { + String property; + readText(property, buffer); + assertChar('\n', buffer); - readText(restore_information.source_bucket, buffer); - assertChar('\n', buffer); + auto pos = property.find('='); + if (pos == String::npos || pos == 0 || pos == property.length()) + throw Exception(fmt::format("Invalid property {} in restore file", property), ErrorCodes::UNKNOWN_FORMAT); - if (!buffer.hasPendingData()) - return; + auto key = property.substr(0, pos); + auto value = property.substr(pos + 1); - readText(restore_information.source_path, buffer); - assertChar('\n', buffer); + auto it = properties.find(key); + if (it != properties.end()) + throw Exception(fmt::format("Property key duplication {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT); - if (buffer.hasPendingData()) - throw Exception("Extra information at the end of restore file", ErrorCodes::UNKNOWN_FORMAT); + properties[key] = value; + } + + for (const auto & [key, value] : properties) + { + ReadBufferFromString value_buffer (value); + + if (key == "revision") + readIntText(restore_information.revision, value_buffer); + else if (key == "source_bucket") + readText(restore_information.source_bucket, value_buffer); + else if (key == "source_path") + readText(restore_information.source_path, value_buffer); + else if (key == "detached") + readBoolTextWord(restore_information.detached, value_buffer); + else + throw Exception(fmt::format("Unknown key {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT); + } } - catch (const Exception & e) + catch (const Exception &) { - throw Exception("Failed to read restore information", e, ErrorCodes::UNKNOWN_FORMAT); + tryLogCurrentException(log, "Failed to read restore information"); + throw; } } void DiskS3::restore() { - if (!exists(restore_file_name)) + if (!exists(RESTORE_FILE_NAME)) return; try @@ -1072,33 +1261,43 @@ void DiskS3::restore() throw Exception("Restoring to the same bucket is allowed only if source path is not a sub-path of configured path in S3 disk", ErrorCodes::BAD_ARGUMENTS); } - ///TODO: Cleanup FS and bucket if previous restore was failed. - - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting to restore disk {}. Revision: {}, Source bucket: {}, Source path: {}", + LOG_INFO(log, "Starting to restore disk {}. 
Revision: {}, Source bucket: {}, Source path: {}", name, information.revision, information.source_bucket, information.source_path); - restoreFiles(information.source_bucket, information.source_path, information.revision); - restoreFileOperations(information.source_bucket, information.source_path, information.revision); + if (readSchemaVersion(information.source_bucket, information.source_path) < RESTORABLE_SCHEMA_VERSION) + throw Exception("Source bucket doesn't have restorable schema.", ErrorCodes::BAD_ARGUMENTS); - Poco::File restore_file(metadata_path + restore_file_name); + LOG_INFO(log, "Removing old metadata..."); + + bool cleanup_s3 = information.source_bucket != bucket || information.source_path != s3_root_path; + for (const auto & root : data_roots) + if (exists(root)) + removeSharedRecursive(root + '/', !cleanup_s3); + + restoreFiles(information); + restoreFileOperations(information); + + Poco::File restore_file(metadata_path + RESTORE_FILE_NAME); restore_file.remove(); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Restore disk {} finished", name); + saveSchemaVersion(RESTORABLE_SCHEMA_VERSION); + + LOG_INFO(log, "Restore disk {} finished", name); } - catch (const Exception & e) + catch (const Exception &) { - LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to restore disk. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString()); + tryLogCurrentException(log, fmt::format("Failed to restore disk {}", name)); throw; } } -void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision) +void DiskS3::restoreFiles(const RestoreInformation & restore_information) { - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore files for disk {}", name); + LOG_INFO(log, "Starting restore files for disk {}", name); std::vector> results; - listObjects(source_bucket, source_path, [this, &source_bucket, &source_path, &target_revision, &results](auto list_result) + auto restore_files = [this, &restore_information, &results](auto list_result) { std::vector keys; for (const auto & row : list_result.GetContents()) @@ -1111,7 +1310,7 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa const auto [revision, _] = extractRevisionAndOperationFromKey(key); /// Filter early if it's possible to get revision from key. - if (revision > target_revision) + if (revision > restore_information.revision) continue; keys.push_back(key); @@ -1119,23 +1318,26 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa if (!keys.empty()) { - auto result = getExecutor().execute([this, &source_bucket, &source_path, keys]() + auto result = getExecutor().execute([this, &restore_information, keys]() { - processRestoreFiles(source_bucket, source_path, keys); + processRestoreFiles(restore_information.source_bucket, restore_information.source_path, keys); }); results.push_back(std::move(result)); } return true; - }); + }; + + /// Execute. 
+ listObjects(restore_information.source_bucket, restore_information.source_path, restore_files); for (auto & result : results) result.wait(); for (auto & result : results) result.get(); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Files are restored for disk {}", name); + LOG_INFO(log, "Files are restored for disk {}", name); } void DiskS3::processRestoreFiles(const String & source_bucket, const String & source_path, Strings keys) @@ -1148,7 +1350,11 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so /// Restore file if object has 'path' in metadata. auto path_entry = object_metadata.find("path"); if (path_entry == object_metadata.end()) - throw Exception("Failed to restore key " + key + " because it doesn't have 'path' in metadata", ErrorCodes::S3_ERROR); + { + /// Such keys can remain after migration, we can skip them. + LOG_WARNING(log, "Skip key {} because it doesn't have 'path' in metadata", key); + continue; + } const auto & path = path_entry->second; @@ -1163,18 +1369,19 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so metadata.addObject(relative_key, head_result.GetContentLength()); metadata.save(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored file {}", path); + LOG_DEBUG(log, "Restored file {}", path); } } -void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision) +void DiskS3::restoreFileOperations(const RestoreInformation & restore_information) { - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); + LOG_INFO(log, "Starting restore file operations for disk {}", name); /// Enable recording file operations if we restore to different bucket / path. - send_metadata = bucket != source_bucket || s3_root_path != source_path; + send_metadata = bucket != restore_information.source_bucket || s3_root_path != restore_information.source_path; - listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) + std::set renames; + auto restore_file_operations = [this, &restore_information, &renames](auto list_result) { const String rename = "rename"; const String hardlink = "hardlink"; @@ -1186,20 +1393,20 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & const auto [revision, operation] = extractRevisionAndOperationFromKey(key); if (revision == UNKNOWN_REVISION) { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", key); + LOG_WARNING(log, "Skip key {} with unknown revision", key); continue; } /// S3 ensures that keys will be listed in ascending UTF-8 bytes order (revision order). /// We can stop processing if revision of the object is already more than required. - if (revision > target_revision) + if (revision > restore_information.revision) return false; /// Keep original revision if restore to different bucket / path. if (send_metadata) revision_counter = revision - 1; - auto object_metadata = headObject(source_bucket, key).GetMetadata(); + auto object_metadata = headObject(restore_information.source_bucket, key).GetMetadata(); if (operation == rename) { auto from_path = object_metadata["from_path"]; @@ -1207,7 +1414,23 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & if (exists(from_path)) { moveFile(from_path, to_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored rename {} -> {}", revision, from_path, to_path); + LOG_DEBUG(log, "Revision {}. 
Restored rename {} -> {}", revision, from_path, to_path); + + if (restore_information.detached && isDirectory(to_path)) + { + /// Sometimes directory paths are passed without trailing '/'. We should keep them in one consistent way. + if (!from_path.ends_with('/')) + from_path += '/'; + if (!to_path.ends_with('/')) + to_path += '/'; + + /// Always keep latest actual directory path to avoid 'detaching' not existing paths. + auto it = renames.find(from_path); + if (it != renames.end()) + renames.erase(it); + + renames.insert(to_path); + } } } else if (operation == hardlink) @@ -1218,27 +1441,55 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { createDirectories(directoryPath(dst_path)); createHardLink(src_path, dst_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path); + LOG_DEBUG(log, "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path); } } } return true; - }); + }; + + /// Execute. + listObjects(restore_information.source_bucket, restore_information.source_path + "operations/", restore_file_operations); + + if (restore_information.detached) + { + Strings not_finished_prefixes{"tmp_", "delete_tmp_", "attaching_", "deleting_"}; + + for (const auto & path : renames) + { + /// Skip already detached parts. + if (path.find("/detached/") != std::string::npos) + continue; + + /// Skip not finished parts. They shouldn't be in 'detached' directory, because CH wouldn't be able to finish processing them. + Poco::Path directory_path (path); + auto directory_name = directory_path.directory(directory_path.depth() - 1); + auto predicate = [&directory_name](String & prefix) { return directory_name.starts_with(prefix); }; + if (std::any_of(not_finished_prefixes.begin(), not_finished_prefixes.end(), predicate)) + continue; + + auto detached_path = pathToDetached(path); + + LOG_DEBUG(log, "Move directory to 'detached' {} -> {}", path, detached_path); + + Poco::File(metadata_path + path).moveTo(metadata_path + detached_path); + } + } send_metadata = true; - LOG_INFO(&Poco::Logger::get("DiskS3"), "File operations restored for disk {}", name); + LOG_INFO(log, "File operations restored for disk {}", name); } std::tuple DiskS3::extractRevisionAndOperationFromKey(const String & key) { - UInt64 revision = UNKNOWN_REVISION; + String revision_str; String operation; - re2::RE2::FullMatch(key, key_regexp, &revision, &operation); + re2::RE2::FullMatch(key, key_regexp, &revision_str, &operation); - return {revision, operation}; + return {(revision_str.empty() ? UNKNOWN_REVISION : static_cast(std::bitset<64>(revision_str).to_ullong())), operation}; } String DiskS3::shrinkKey(const String & path, const String & key) @@ -1251,15 +1502,12 @@ String DiskS3::shrinkKey(const String & path, const String & key) String DiskS3::revisionToString(UInt64 revision) { - static constexpr size_t max_digits = 19; /// UInt64 max digits in decimal representation. + return std::bitset<64>(revision).to_string(); +} - /// Align revision number with leading zeroes to have strict lexicographical order of them. 
- auto revision_str = std::to_string(revision); - auto digits_to_align = max_digits - revision_str.length(); - for (size_t i = 0; i < digits_to_align; ++i) - revision_str = "0" + revision_str; - - return revision_str; +String DiskS3::pathToDetached(const String & source_path) +{ + return Poco::Path(source_path).parent().append(Poco::Path("detached")).toString() + '/'; } void DiskS3::onFreeze(const String & path) diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 5182ae4801b..758d4055a3e 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "Disks/DiskFactory.h" #include "Disks/Executor.h" #include "ProxyConfiguration.h" @@ -25,6 +26,7 @@ class DiskS3 : public IDisk { public: using ObjectMetadata = std::map; + using Futures = std::vector>; friend class DiskS3Reservation; @@ -39,6 +41,7 @@ public: String bucket_, String s3_root_path_, String metadata_path_, + UInt64 s3_max_single_read_retries_, size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, @@ -89,17 +92,21 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, size_t buf_size, WriteMode mode) override; - void removeFile(const String & path) override; + void removeFile(const String & path) override { removeSharedFile(path, false); } void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; - void removeRecursive(const String & path) override; + void removeRecursive(const String & path) override { removeSharedRecursive(path, false); } + + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; @@ -115,6 +122,14 @@ public: void shutdown() override; + /// Return a unique string for the file + /// Required to distinguish different copies of the same part on S3 + String getUniqueId(const String & path) const override; + + /// Check that the file exists and ClickHouse has access to it + /// Required for S3 to ensure that the replica has access to data written by another node + bool checkUniqueId(const String & id) const override; + /// Actions performed after disk creation. void startup(); @@ -135,29 +150,43 @@ private: Metadata createMeta(const String & path) const; void createFileOperationObject(const String & operation_name, UInt64 revision, const ObjectMetadata & metadata); + /// Converts revision to a binary string with leading zeroes (64 bit). 
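+    /// E.g. revisionToString(5) returns "00...00101" (64 characters), so object keys that embed
+    /// the revision ("r{revision}-...") sort lexicographically in the same order as the numeric
+    /// revisions, which findLastRevision() relies on when it probes key prefixes bit by bit.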
static String revisionToString(UInt64 revision); - bool checkObjectExists(const String & prefix); + bool checkObjectExists(const String & source_bucket, const String & prefix); + void findLastRevision(); + + int readSchemaVersion(const String & source_bucket, const String & source_path); + void saveSchemaVersion(const int & version); + void updateObjectMetadata(const String & key, const ObjectMetadata & metadata); + void migrateFileToRestorableSchema(const String & path); + void migrateToRestorableSchemaRecursive(const String & path, Futures & results); + void migrateToRestorableSchema(); + Aws::S3::Model::HeadObjectResult headObject(const String & source_bucket, const String & key); void listObjects(const String & source_bucket, const String & source_path, std::function callback); void copyObject(const String & src_bucket, const String & src_key, const String & dst_bucket, const String & dst_key); void readRestoreInformation(RestoreInformation & restore_information); - void restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision); + void restoreFiles(const RestoreInformation & restore_information); void processRestoreFiles(const String & source_bucket, const String & source_path, std::vector keys); - void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision); + void restoreFileOperations(const RestoreInformation & restore_information); /// Remove 'path' prefix from 'key' to get relative key. /// It's needed to store keys to metadata files in RELATIVE_PATHS version. static String shrinkKey(const String & path, const String & key); std::tuple extractRevisionAndOperationFromKey(const String & key); + /// Forms detached path '../../detached/part_name/' from '../../part_name/' + static String pathToDetached(const String & source_path); + const String name; std::shared_ptr client; std::shared_ptr proxy_configuration; const String bucket; const String s3_root_path; - const String metadata_path; + String metadata_path; + UInt64 s3_max_single_read_retries; size_t min_upload_part_size; size_t max_single_part_upload_size; size_t min_bytes_for_seek; @@ -168,16 +197,25 @@ private: std::mutex reservation_mutex; std::atomic revision_counter; - static constexpr UInt64 LATEST_REVISION = (static_cast(1)) << 63; + static constexpr UInt64 LATEST_REVISION = std::numeric_limits::max(); static constexpr UInt64 UNKNOWN_REVISION = 0; /// File at path {metadata_path}/restore contains metadata restore information - const String restore_file_name = "restore"; + inline static const String RESTORE_FILE_NAME = "restore"; /// The number of keys listed in one request (1000 is max value) int list_object_keys_size; /// Key has format: ../../r{revision}-{operation} const re2::RE2 key_regexp {".*/r(\\d+)-(\\w+).*"}; + + /// Object contains information about schema version. + inline static const String SCHEMA_VERSION_OBJECT = ".SCHEMA_VERSION"; + /// Version with possibility to backup-restore metadata. + static constexpr int RESTORABLE_SCHEMA_VERSION = 1; + /// Directories with data. 
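+    /// They are traversed when migrating to the restorable schema, and their metadata
+    /// (and S3 data, when restoring from a different bucket/path) is removed before restore.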
+ const std::vector data_roots {"data", "store"}; + + Poco::Logger * log = &Poco::Logger::get("DiskS3"); }; } diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index 3ce2f909760..352f7467ba0 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -1,9 +1,17 @@ -#include +#if !defined(ARCADIA_BUILD) + #include +#endif + #include -#include #include #include #include + + +#if USE_AWS_S3 + +#include +#include #include "DiskS3.h" #include "Disks/DiskCacheWrapper.h" #include "Disks/DiskFactory.h" @@ -109,20 +117,20 @@ void registerDiskS3(DiskFactory & factory) auto creator = [](const String & name, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Context & context) -> DiskPtr { - Poco::File disk{context.getPath() + "disks/" + name}; + ContextConstPtr context) -> DiskPtr { + Poco::File disk{context->getPath() + "disks/" + name}; disk.createDirectories(); S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( - context.getRemoteHostFilter(), - context.getGlobalContext().getSettingsRef().s3_max_redirects); + context->getRemoteHostFilter(), + context->getGlobalContext()->getSettingsRef().s3_max_redirects); S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint"))); if (uri.key.back() != '/') throw Exception("S3 path must ends with '/', but '" + uri.key + "' doesn't.", ErrorCodes::BAD_ARGUMENTS); client_configuration.connectTimeoutMs = config.getUInt(config_prefix + ".connect_timeout_ms", 10000); - client_configuration.httpRequestTimeoutMs = config.getUInt(config_prefix + ".request_timeout_ms", 5000); + client_configuration.requestTimeoutMs = config.getUInt(config_prefix + ".request_timeout_ms", 5000); client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", 100); client_configuration.endpointOverride = uri.endpoint; @@ -140,10 +148,11 @@ void registerDiskS3(DiskFactory & factory) config.getString(config_prefix + ".secret_access_key", ""), config.getString(config_prefix + ".server_side_encryption_customer_key_base64", ""), {}, - config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false)) + config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false)), + config.getBool(config_prefix + ".use_insecure_imds_request", config.getBool("s3.use_insecure_imds_request", false)) ); - String metadata_path = config.getString(config_prefix + ".metadata_path", context.getPath() + "disks/" + name + "/"); + String metadata_path = config.getString(config_prefix + ".metadata_path", context->getPath() + "disks/" + name + "/"); auto s3disk = std::make_shared( name, @@ -152,8 +161,9 @@ void registerDiskS3(DiskFactory & factory) uri.bucket, uri.key, metadata_path, - context.getSettingsRef().s3_min_upload_part_size, - context.getSettingsRef().s3_max_single_part_upload_size, + context->getSettingsRef().s3_max_single_read_retries, + context->getSettingsRef().s3_min_upload_part_size, + context->getSettingsRef().s3_max_single_part_upload_size, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getBool(config_prefix + ".send_metadata", false), config.getInt(config_prefix + ".thread_pool_size", 16), @@ -174,7 +184,7 @@ void registerDiskS3(DiskFactory & factory) if (cache_enabled) { - String cache_path = config.getString(config_prefix + ".cache_path", context.getPath() + "disks/" + name + 
"/cache/"); + String cache_path = config.getString(config_prefix + ".cache_path", context->getPath() + "disks/" + name + "/cache/"); if (metadata_path == cache_path) throw Exception("Metadata and cache path should be different: " + metadata_path, ErrorCodes::BAD_ARGUMENTS); @@ -196,3 +206,10 @@ void registerDiskS3(DiskFactory & factory) } } + +#else + +void registerDiskS3(DiskFactory &) {} + +#endif + diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index a1345879c83..cff2685ca24 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -159,6 +159,17 @@ Disks StoragePolicy::getDisks() const } +Disks StoragePolicy::getDisksByType(DiskType::Type type) const +{ + Disks res; + for (const auto & volume : volumes) + for (const auto & disk : volume->getDisks()) + if (disk->getType() == type) + res.push_back(disk); + return res; +} + + DiskPtr StoragePolicy::getAnyDisk() const { /// StoragePolicy must contain at least one Volume diff --git a/src/Disks/StoragePolicy.h b/src/Disks/StoragePolicy.h index 6676ab19043..71773e91f70 100644 --- a/src/Disks/StoragePolicy.h +++ b/src/Disks/StoragePolicy.h @@ -47,6 +47,9 @@ public: /// Returns disks ordered by volumes priority Disks getDisks() const override; + /// Returns disks by type ordered by volumes priority + Disks getDisksByType(DiskType::Type type) const override; + /// Returns any disk /// Used when it's not important, for example for /// mutations files diff --git a/src/Disks/tests/gtest_disk.cpp b/src/Disks/tests/gtest_disk.cpp index 525b5e6ce38..3b9dca63002 100644 --- a/src/Disks/tests/gtest_disk.cpp +++ b/src/Disks/tests/gtest_disk.cpp @@ -4,7 +4,7 @@ #include #include "gtest_disk.h" -#if !__clang__ +#if !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-override" #endif diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index f7f32cf9b6f..6774c5eed88 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -43,16 +43,15 @@ const FormatFactory::Creators & FormatFactory::getCreators(const String & name) throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT); } -FormatSettings getFormatSettings(const Context & context) +FormatSettings getFormatSettings(ContextConstPtr context) { - const auto & settings = context.getSettingsRef(); + const auto & settings = context->getSettingsRef(); return getFormatSettings(context, settings); } template -FormatSettings getFormatSettings(const Context & context, - const Settings & settings) +FormatSettings getFormatSettings(ContextConstPtr context, const Settings & settings) { FormatSettings format_settings; @@ -99,8 +98,8 @@ FormatSettings getFormatSettings(const Context & context, format_settings.regexp.regexp = settings.format_regexp; format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched; format_settings.schema.format_schema = settings.format_schema; - format_settings.schema.format_schema_path = context.getFormatSchemaPath(); - format_settings.schema.is_server = context.hasGlobalContext() && (context.getGlobalContext().getApplicationType() == Context::ApplicationType::SERVER); + format_settings.schema.format_schema_path = context->getFormatSchemaPath(); + format_settings.schema.is_server = context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER); format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields; format_settings.template_settings.resultset_format = 
settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; @@ -120,26 +119,22 @@ FormatSettings getFormatSettings(const Context & context, { const Poco::URI & avro_schema_registry_url = settings.format_avro_schema_registry_url; if (!avro_schema_registry_url.empty()) - context.getRemoteHostFilter().checkURL(avro_schema_registry_url); + context->getRemoteHostFilter().checkURL(avro_schema_registry_url); } return format_settings; } -template -FormatSettings getFormatSettings(const Context & context, - const FormatFactorySettings & settings); +template FormatSettings getFormatSettings(ContextConstPtr context, const FormatFactorySettings & settings); -template -FormatSettings getFormatSettings(const Context & context, - const Settings & settings); +template FormatSettings getFormatSettings(ContextConstPtr context, const Settings & settings); InputFormatPtr FormatFactory::getInput( const String & name, ReadBuffer & buf, const Block & sample, - const Context & context, + ContextConstPtr context, UInt64 max_block_size, const std::optional & _format_settings) const { @@ -154,7 +149,7 @@ InputFormatPtr FormatFactory::getInput( throw Exception("Format " + name + " is not suitable for input (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT); } - const Settings & settings = context.getSettingsRef(); + const Settings & settings = context->getSettingsRef(); const auto & file_segmentation_engine = getCreators(name).file_segmentation_engine; // Doesn't make sense to use parallel parsing with less than four threads @@ -204,13 +199,17 @@ InputFormatPtr FormatFactory::getInput( return format; } -BlockOutputStreamPtr FormatFactory::getOutputStreamParallelIfPossible(const String & name, - WriteBuffer & buf, const Block & sample, const Context & context, - WriteCallback callback, const std::optional & _format_settings) const +BlockOutputStreamPtr FormatFactory::getOutputStreamParallelIfPossible( + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback, + const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_processor_creator; - const Settings & settings = context.getSettingsRef(); + const Settings & settings = context->getSettingsRef(); bool parallel_formatting = settings.output_format_parallel_formatting; if (output_getter && parallel_formatting && getCreators(name).supports_parallel_formatting @@ -237,12 +236,15 @@ BlockOutputStreamPtr FormatFactory::getOutputStreamParallelIfPossible(const Stri } -BlockOutputStreamPtr FormatFactory::getOutputStream(const String & name, - WriteBuffer & buf, const Block & sample, const Context & context, - WriteCallback callback, const std::optional & _format_settings) const +BlockOutputStreamPtr FormatFactory::getOutputStream( + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback, + const std::optional & _format_settings) const { - auto format_settings = _format_settings - ? *_format_settings : getFormatSettings(context); + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); if (!getCreators(name).output_processor_creator) { @@ -267,7 +269,7 @@ InputFormatPtr FormatFactory::getInputFormat( const String & name, ReadBuffer & buf, const Block & sample, - const Context & context, + ContextConstPtr context, UInt64 max_block_size, const std::optional & _format_settings) const { @@ -275,13 +277,12 @@ InputFormatPtr FormatFactory::getInputFormat( if (!input_getter) throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT); - const Settings & settings = context.getSettingsRef(); + const Settings & settings = context->getSettingsRef(); - if (context.hasQueryContext() && settings.log_queries) - context.getQueryContext().addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + if (context->hasQueryContext() && settings.log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); - auto format_settings = _format_settings - ? *_format_settings : getFormatSettings(context); + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); RowInputFormatParams params; params.max_block_size = max_block_size; @@ -289,7 +290,6 @@ InputFormatPtr FormatFactory::getInputFormat( params.allow_errors_ratio = format_settings.input_allow_errors_ratio; params.max_execution_time = settings.max_execution_time; params.timeout_overflow_mode = settings.timeout_overflow_mode; - auto format = input_getter(buf, sample, params, format_settings); /// It's a kludge. Because I cannot remove context from values format. @@ -300,18 +300,20 @@ InputFormatPtr FormatFactory::getInputFormat( } OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( - const String & name, WriteBuffer & buf, const Block & sample, - const Context & context, WriteCallback callback, + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_processor_creator; if (!output_getter) throw Exception("Format " + name + " is not suitable for output (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT); - auto format_settings = _format_settings - ? *_format_settings : getFormatSettings(context); + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); - const Settings & settings = context.getSettingsRef(); + const Settings & settings = context->getSettingsRef(); if (settings.output_format_parallel_formatting && getCreators(name).supports_parallel_formatting && !settings.output_format_json_array_of_rows) @@ -322,8 +324,8 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( ParallelFormattingOutputFormat::Params builder{buf, sample, formatter_creator, settings.max_threads}; - if (context.hasQueryContext() && settings.log_queries) - context.getQueryContext().addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + if (context->hasQueryContext() && settings.log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); return std::make_shared(builder); } @@ -333,16 +335,19 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( OutputFormatPtr FormatFactory::getOutputFormat( - const String & name, WriteBuffer & buf, const Block & sample, - const Context & context, WriteCallback callback, + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_processor_creator; if (!output_getter) throw Exception("Format " + name + " is not suitable for output (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT); - if (context.hasQueryContext() && context.getSettingsRef().log_queries) - context.getQueryContext().addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + if (context->hasQueryContext() && context->getSettingsRef().log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); RowOutputFormatParams params; params.callback = std::move(callback); @@ -412,11 +417,26 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na { auto & target = dict[name].supports_parallel_formatting; if (target) - throw Exception("FormatFactory: Output format " + name + " is already marked as supporting parallel formatting.", ErrorCodes::LOGICAL_ERROR); + throw Exception("FormatFactory: Output format " + name + " is already marked as supporting parallel formatting", ErrorCodes::LOGICAL_ERROR); target = true; } +void FormatFactory::markFormatAsColumnOriented(const String & name) +{ + auto & target = dict[name].is_column_oriented; + if (target) + throw Exception("FormatFactory: Format " + name + " is already marked as column oriented", ErrorCodes::LOGICAL_ERROR); + target = true; +} + + +bool FormatFactory::checkIfFormatIsColumnOriented(const String & name) +{ + const auto & target = getCreators(name); + return target.is_column_oriented; +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 4fa7e9a0c01..d5e4b1b0341 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -1,21 +1,22 @@ #pragma once -#include #include #include #include +#include #include +#include + +#include #include #include #include -#include namespace DB { class Block; -class Context; struct Settings; struct FormatFactorySettings; @@ -34,11 +35,10 @@ struct RowOutputFormatParams; using InputFormatPtr = std::shared_ptr; using OutputFormatPtr = std::shared_ptr; -FormatSettings getFormatSettings(const Context & context); +FormatSettings getFormatSettings(ContextConstPtr context); template 
-FormatSettings getFormatSettings(const Context & context, - const T & settings); +FormatSettings getFormatSettings(ContextConstPtr context, const T & settings); /** Allows to create an IBlockInputStream or IBlockOutputStream by the name of the format. * Note: format and compression are independent things. @@ -101,6 +101,7 @@ private: OutputProcessorCreator output_processor_creator; FileSegmentationEngine file_segmentation_engine; bool supports_parallel_formatting{false}; + bool is_column_oriented{false}; }; using FormatsDictionary = std::unordered_map; @@ -112,38 +113,52 @@ public: const String & name, ReadBuffer & buf, const Block & sample, - const Context & context, + ContextConstPtr context, UInt64 max_block_size, const std::optional & format_settings = std::nullopt) const; /// Checks all preconditions. Returns ordinary stream if parallel formatting cannot be done. /// Currently used only in Client. Don't use it something else! Better look at getOutputFormatParallelIfPossible. - BlockOutputStreamPtr getOutputStreamParallelIfPossible(const String & name, WriteBuffer & buf, - const Block & sample, const Context & context, WriteCallback callback = {}, + BlockOutputStreamPtr getOutputStreamParallelIfPossible( + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback = {}, const std::optional & format_settings = std::nullopt) const; /// Currently used only in Client. Don't use it something else! Better look at getOutputFormat. - BlockOutputStreamPtr getOutputStream(const String & name, WriteBuffer & buf, - const Block & sample, const Context & context, WriteCallback callback = {}, + BlockOutputStreamPtr getOutputStream( + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback = {}, const std::optional & format_settings = std::nullopt) const; InputFormatPtr getInputFormat( const String & name, ReadBuffer & buf, const Block & sample, - const Context & context, + ContextConstPtr context, UInt64 max_block_size, const std::optional & format_settings = std::nullopt) const; /// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done. OutputFormatPtr getOutputFormatParallelIfPossible( - const String & name, WriteBuffer & buf, const Block & sample, - const Context & context, WriteCallback callback = {}, + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback = {}, const std::optional & format_settings = std::nullopt) const; OutputFormatPtr getOutputFormat( - const String & name, WriteBuffer & buf, const Block & sample, - const Context & context, WriteCallback callback = {}, + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextConstPtr context, + WriteCallback callback = {}, const std::optional & format_settings = std::nullopt) const; /// Register format by its name. 
@@ -155,6 +170,9 @@ public: void registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator); void markOutputFormatSupportsParallelFormatting(const String & name); + void markFormatAsColumnOriented(const String & name); + + bool checkIfFormatIsColumnOriented(const String & name); const FormatsDictionary & getAllFormats() const { diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index c1f02c65748..33d51b1797f 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -27,7 +27,7 @@ struct FormatSettings bool with_names_use_header = false; bool write_statistics = true; bool import_nested_json = false; - bool null_as_default = false; + bool null_as_default = true; enum class DateTimeInputFormat { diff --git a/src/Formats/IRowInputStream.cpp b/src/Formats/IRowInputStream.cpp deleted file mode 100644 index f3603982de5..00000000000 --- a/src/Formats/IRowInputStream.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -void IRowInputStream::syncAfterError() -{ - throw Exception("Method syncAfterError is not implemented for input format", ErrorCodes::NOT_IMPLEMENTED); -} - -} diff --git a/src/Formats/IRowInputStream.h b/src/Formats/IRowInputStream.h deleted file mode 100644 index e0b8a574f17..00000000000 --- a/src/Formats/IRowInputStream.h +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - - -namespace DB -{ - -/// Contains extra information about read data. -struct RowReadExtension -{ - /// IRowInputStream.read() output. It contains non zero for columns that actually read from the source and zero otherwise. - /// It's used to attach defaults for partially filled rows. - /// Can be empty, this means that all columns are read. - std::vector read_columns; -}; - -/** Interface of stream, that allows to read data by rows. - */ -class IRowInputStream : private boost::noncopyable -{ -public: - /** Read next row and append it to the columns. - * If no more rows - return false. - */ - virtual bool read(MutableColumns & columns, RowReadExtension & extra) = 0; - - virtual void readPrefix() {} /// delimiter before begin of result - virtual void readSuffix() {} /// delimiter after end of result - - /// Skip data until next row. - /// This is intended for text streams, that allow skipping of errors. - /// By default - throws not implemented exception. - virtual bool allowSyncAfterError() const { return false; } - virtual void syncAfterError(); - - /// In case of parse error, try to roll back and parse last one or two rows very carefully - /// and collect as much as possible diagnostic information about error. - /// If not implemented, returns empty string. 
- virtual std::string getDiagnosticInfo() { return {}; } - - virtual ~IRowInputStream() {} -}; - -using RowInputStreamPtr = std::shared_ptr; - -} diff --git a/src/Formats/IRowOutputStream.cpp b/src/Formats/IRowOutputStream.cpp deleted file mode 100644 index f84d810b8e8..00000000000 --- a/src/Formats/IRowOutputStream.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - - -void IRowOutputStream::write(const Block & block, size_t row_num) -{ - size_t columns = block.columns(); - - writeRowStartDelimiter(); - - for (size_t i = 0; i < columns; ++i) - { - if (i != 0) - writeFieldDelimiter(); - - const auto & col = block.getByPosition(i); - writeField(*col.column, *col.type, row_num); - } - - writeRowEndDelimiter(); -} - -void IRowOutputStream::writeField(const IColumn &, const IDataType &, size_t) -{ - throw Exception("Method writeField is not implemented for output format", ErrorCodes::NOT_IMPLEMENTED); -} - -} diff --git a/src/Formats/IRowOutputStream.h b/src/Formats/IRowOutputStream.h deleted file mode 100644 index 7cf6251cd0d..00000000000 --- a/src/Formats/IRowOutputStream.h +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -class Block; -class IColumn; -class IDataType; -struct Progress; - - -/** Interface of stream for writing data by rows (for example: for output to terminal). - */ -class IRowOutputStream : private boost::noncopyable -{ -public: - - /** Write a row. - * Default implementation calls methods to write single values and delimiters - * (except delimiter between rows (writeRowBetweenDelimiter())). - */ - virtual void write(const Block & block, size_t row_num); - - /** Write single value. */ - virtual void writeField(const IColumn & column, const IDataType & type, size_t row_num); - - /** Write delimiter. */ - virtual void writeFieldDelimiter() {} /// delimiter between values - virtual void writeRowStartDelimiter() {} /// delimiter before each row - virtual void writeRowEndDelimiter() {} /// delimiter after each row - virtual void writeRowBetweenDelimiter() {} /// delimiter between rows - virtual void writePrefix() {} /// delimiter before resultset - virtual void writeSuffix() {} /// delimiter after resultset - - /** Flush output buffers if any. */ - virtual void flush() {} - - /** Methods to set additional information for output in formats, that support it. - */ - virtual void setRowsBeforeLimit(size_t /*rows_before_limit*/) {} - virtual void setTotals(const Block & /*totals*/) {} - virtual void setExtremes(const Block & /*extremes*/) {} - - /** Notify about progress. Method could be called from different threads. - * Passed value are delta, that must be summarized. - */ - virtual void onProgress(const Progress & /*progress*/) {} - - /** Content-Type to set when sending HTTP response. 
*/ - virtual String getContentType() const { return "text/plain; charset=UTF-8"; } - - virtual ~IRowOutputStream() {} -}; - -using RowOutputStreamPtr = std::shared_ptr; - -} diff --git a/src/Formats/MySQLBlockInputStream.cpp b/src/Formats/MySQLBlockInputStream.cpp index 87df0c1f4b1..be0cb31f22d 100644 --- a/src/Formats/MySQLBlockInputStream.cpp +++ b/src/Formats/MySQLBlockInputStream.cpp @@ -1,31 +1,44 @@ #if !defined(ARCADIA_BUILD) -# include "config_core.h" +#include "config_core.h" #endif #if USE_MYSQL -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include "MySQLBlockInputStream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MySQLBlockInputStream.h" + namespace DB { + namespace ErrorCodes { extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int NOT_IMPLEMENTED; } +StreamSettings::StreamSettings(const Settings & settings, bool auto_close_, bool fetch_by_name_, size_t max_retry_) + : max_read_mysql_row_nums((settings.external_storage_max_read_rows) ? settings.external_storage_max_read_rows : settings.max_block_size) + , max_read_mysql_bytes_size(settings.external_storage_max_read_bytes) + , auto_close(auto_close_) + , fetch_by_name(fetch_by_name_) + , default_num_tries_on_connection_loss(max_retry_) +{ +} + MySQLBlockInputStream::Connection::Connection( const mysqlxx::PoolWithFailover::Entry & entry_, const std::string & query_str) @@ -35,72 +48,137 @@ MySQLBlockInputStream::Connection::Connection( { } +/// Used in MaterializeMySQL and in doInvalidateQuery for dictionary source. MySQLBlockInputStream::MySQLBlockInputStream( const mysqlxx::PoolWithFailover::Entry & entry, const std::string & query_str, const Block & sample_block, - const UInt64 max_block_size_, - const bool auto_close_, - const bool fetch_by_name_) - : connection{std::make_unique(entry, query_str)} - , max_block_size{max_block_size_} - , auto_close{auto_close_} - , fetch_by_name(fetch_by_name_) + const StreamSettings & settings_) + : log(&Poco::Logger::get("MySQLBlockInputStream")) + , connection{std::make_unique(entry, query_str)} + , settings{std::make_unique(settings_)} { description.init(sample_block); initPositionMappingFromQueryResultStructure(); } +/// For descendant MySQLWithFailoverBlockInputStream + MySQLBlockInputStream::MySQLBlockInputStream(const Block &sample_block_, const StreamSettings & settings_) + : log(&Poco::Logger::get("MySQLBlockInputStream")) + , settings(std::make_unique(settings_)) +{ + description.init(sample_block_); +} + +/// Used by MySQL storage / table function and dictionary source. +MySQLWithFailoverBlockInputStream::MySQLWithFailoverBlockInputStream( + mysqlxx::PoolWithFailoverPtr pool_, + const std::string & query_str_, + const Block & sample_block_, + const StreamSettings & settings_) +: MySQLBlockInputStream(sample_block_, settings_) +, pool(pool_) +, query_str(query_str_) +{ +} + +void MySQLWithFailoverBlockInputStream::readPrefix() +{ + size_t count_connect_attempts = 0; + + /// For recovering from "Lost connection to MySQL server during query" errors + while (true) + { + try + { + connection = std::make_unique(pool->get(), query_str); + break; + } + catch (const mysqlxx::ConnectionLost & ecl) /// There are two retriable failures: CR_SERVER_GONE_ERROR, CR_SERVER_LOST + { + LOG_WARNING(log, "Failed connection ({}/{}). Trying to reconnect... 
(Info: {})", count_connect_attempts, settings->default_num_tries_on_connection_loss, ecl.displayText()); + } + + if (++count_connect_attempts > settings->default_num_tries_on_connection_loss) + { + LOG_ERROR(log, "Failed to create connection to MySQL. ({}/{})", count_connect_attempts, settings->default_num_tries_on_connection_loss); + throw; + } + } + + initPositionMappingFromQueryResultStructure(); +} namespace { using ValueType = ExternalResultDescription::ValueType; - void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value) + void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value, size_t & read_bytes_size) { switch (type) { case ValueType::vtUInt8: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 1; break; case ValueType::vtUInt16: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 2; break; case ValueType::vtUInt32: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 4; break; case ValueType::vtUInt64: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 8; break; case ValueType::vtInt8: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 1; break; case ValueType::vtInt16: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 2; break; case ValueType::vtInt32: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 4; break; case ValueType::vtInt64: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 8; break; case ValueType::vtFloat32: assert_cast(column).insertValue(value.getDouble()); + read_bytes_size += 4; break; case ValueType::vtFloat64: assert_cast(column).insertValue(value.getDouble()); + read_bytes_size += 8; break; case ValueType::vtString: assert_cast(column).insertData(value.data(), value.size()); + read_bytes_size += assert_cast(column).byteSize(); break; case ValueType::vtDate: assert_cast(column).insertValue(UInt16(value.getDate().getDayNum())); + read_bytes_size += 2; break; case ValueType::vtDateTime: - assert_cast(column).insertValue(UInt32(value.getDateTime())); + { + ReadBufferFromString in(value); + time_t time = 0; + readDateTimeText(time, in); + if (time < 0) + time = 0; + assert_cast(column).insertValue(time); + read_bytes_size += 4; break; + } case ValueType::vtUUID: assert_cast(column).insert(parse(value.data(), value.size())); + read_bytes_size += assert_cast(column).byteSize(); break; case ValueType::vtDateTime64:[[fallthrough]]; case ValueType::vtDecimal32: [[fallthrough]]; @@ -109,11 +187,13 @@ namespace case ValueType::vtDecimal256: { ReadBuffer buffer(const_cast(value.data()), value.size(), 0); - data_type.deserializeAsWholeText(column, buffer, FormatSettings{}); + data_type.getDefaultSerialization()->deserializeWholeText(column, buffer, FormatSettings{}); + read_bytes_size += column.sizeOfValueIfFixed(); break; } case ValueType::vtFixedString: assert_cast(column).insertData(value.data(), value.size()); + read_bytes_size += column.sizeOfValueIfFixed(); break; default: throw Exception("Unsupported value type", ErrorCodes::NOT_IMPLEMENTED); @@ -129,8 +209,9 @@ Block MySQLBlockInputStream::readImpl() auto row = connection->result.fetch(); if (!row) { - if (auto_close) + if (settings->auto_close) connection->entry.disconnect(); + return {}; } @@ -139,6 +220,8 @@ Block MySQLBlockInputStream::readImpl() columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); size_t num_rows = 0; 
+ size_t read_bytes_size = 0; + while (row) { for (size_t index = 0; index < position_mapping.size(); ++index) @@ -154,12 +237,12 @@ Block MySQLBlockInputStream::readImpl() { ColumnNullable & column_nullable = assert_cast(*columns[index]); const auto & data_type = assert_cast(*sample.type); - insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, value); + insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, value, read_bytes_size); column_nullable.getNullMapData().emplace_back(false); } else { - insertValue(*sample.type, *columns[index], description.types[index].first, value); + insertValue(*sample.type, *columns[index], description.types[index].first, value, read_bytes_size); } } else @@ -175,7 +258,7 @@ Block MySQLBlockInputStream::readImpl() } ++num_rows; - if (num_rows == max_block_size) + if (num_rows == settings->max_read_mysql_row_nums || (settings->max_read_mysql_bytes_size && read_bytes_size >= settings->max_read_mysql_bytes_size)) break; row = connection->result.fetch(); @@ -183,23 +266,11 @@ Block MySQLBlockInputStream::readImpl() return description.sample_block.cloneWithColumns(std::move(columns)); } -MySQLBlockInputStream::MySQLBlockInputStream( - const Block & sample_block_, - UInt64 max_block_size_, - bool auto_close_, - bool fetch_by_name_) - : max_block_size(max_block_size_) - , auto_close(auto_close_) - , fetch_by_name(fetch_by_name_) -{ - description.init(sample_block_); -} - void MySQLBlockInputStream::initPositionMappingFromQueryResultStructure() { position_mapping.resize(description.sample_block.columns()); - if (!fetch_by_name) + if (!settings->fetch_by_name) { if (description.sample_block.columns() != connection->result.getNumFields()) throw Exception{"mysqlxx::UseQueryResult contains " + toString(connection->result.getNumFields()) + " columns while " @@ -242,25 +313,6 @@ void MySQLBlockInputStream::initPositionMappingFromQueryResultStructure() } } -MySQLLazyBlockInputStream::MySQLLazyBlockInputStream( - mysqlxx::Pool & pool_, - const std::string & query_str_, - const Block & sample_block_, - const UInt64 max_block_size_, - const bool auto_close_, - const bool fetch_by_name_) - : MySQLBlockInputStream(sample_block_, max_block_size_, auto_close_, fetch_by_name_) - , pool(pool_) - , query_str(query_str_) -{ -} - -void MySQLLazyBlockInputStream::readPrefix() -{ - connection = std::make_unique(pool.get(), query_str); - initPositionMappingFromQueryResultStructure(); -} - } #endif diff --git a/src/Formats/MySQLBlockInputStream.h b/src/Formats/MySQLBlockInputStream.h index 269b630fcc7..12deb9c3146 100644 --- a/src/Formats/MySQLBlockInputStream.h +++ b/src/Formats/MySQLBlockInputStream.h @@ -6,11 +6,24 @@ #include #include #include - +#include namespace DB { +struct StreamSettings +{ + /// Check if setting is enabled, otherwise use common `max_block_size` setting. 
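+    /// (max_read_mysql_row_nums falls back to max_block_size when external_storage_max_read_rows
+    /// is zero; see the StreamSettings constructor in MySQLBlockInputStream.cpp.)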
+ size_t max_read_mysql_row_nums; + size_t max_read_mysql_bytes_size; + bool auto_close; + bool fetch_by_name; + size_t default_num_tries_on_connection_loss; + + StreamSettings(const Settings & settings, bool auto_close_ = false, bool fetch_by_name_ = false, size_t max_retry_ = 5); + +}; + /// Allows processing results of a MySQL query as a sequence of Blocks, simplifies chaining class MySQLBlockInputStream : public IBlockInputStream { @@ -19,16 +32,14 @@ public: const mysqlxx::PoolWithFailover::Entry & entry, const std::string & query_str, const Block & sample_block, - const UInt64 max_block_size_, - const bool auto_close_ = false, - const bool fetch_by_name_ = false); + const StreamSettings & settings_); String getName() const override { return "MySQL"; } Block getHeader() const override { return description.sample_block.cloneEmpty(); } protected: - MySQLBlockInputStream(const Block & sample_block_, UInt64 max_block_size_, bool auto_close_, bool fetch_by_name_); + MySQLBlockInputStream(const Block & sample_block_, const StreamSettings & settings); Block readImpl() override; void initPositionMappingFromQueryResultStructure(); @@ -41,32 +52,31 @@ protected: mysqlxx::UseQueryResult result; }; + Poco::Logger * log; std::unique_ptr connection; - const UInt64 max_block_size; - const bool auto_close; - const bool fetch_by_name; + const std::unique_ptr settings; std::vector position_mapping; ExternalResultDescription description; }; /// Like MySQLBlockInputStream, but allocates connection only when reading is starting. /// It allows to create a lot of stream objects without occupation of all connection pool. -class MySQLLazyBlockInputStream final : public MySQLBlockInputStream +/// Also makes attempts to reconnect in case of connection failures. +class MySQLWithFailoverBlockInputStream final : public MySQLBlockInputStream { public: - MySQLLazyBlockInputStream( - mysqlxx::Pool & pool_, + + MySQLWithFailoverBlockInputStream( + mysqlxx::PoolWithFailoverPtr pool_, const std::string & query_str_, const Block & sample_block_, - const UInt64 max_block_size_, - const bool auto_close_ = false, - const bool fetch_by_name_ = false); + const StreamSettings & settings_); private: void readPrefix() override; - mysqlxx::Pool & pool; + mysqlxx::PoolWithFailoverPtr pool; std::string query_str; }; diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 82149460773..3539628f98e 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -24,6 +24,8 @@ # include # include # include +# include +# include # include # include # include @@ -525,16 +527,16 @@ namespace { public: using ColumnType = std::conditional_t; - using StringDataType = std::conditional_t; ProtobufSerializerString( - const StringDataType & string_data_type_, + const std::shared_ptr & fixed_string_data_type_, const google::protobuf::FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_) : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + , fixed_string_data_type(fixed_string_data_type_) + , n(fixed_string_data_type->getN()) { static_assert(is_fixed_string, "This constructor for FixedString only"); - n = string_data_type_.getN(); setFunctions(); prepareEnumMapping(); } @@ -583,11 +585,11 @@ namespace { if (row_num < old_size) { - ColumnFixedString::alignStringLength(text_buffer, n, 0); + SerializationFixedString::alignStringLength(n, text_buffer, 0); memcpy(data.data() + row_num * n, text_buffer.data(), n); } else - 
ColumnFixedString::alignStringLength(data, n, old_data_size); + SerializationFixedString::alignStringLength(n, data, old_data_size); } else { @@ -817,7 +819,7 @@ namespace auto str = default_function(); arr.insert(str.data(), str.data() + str.size()); if constexpr (is_fixed_string) - ColumnFixedString::alignStringLength(arr, n, 0); + SerializationFixedString::alignStringLength(n, arr, 0); default_string = std::move(arr); } return *default_string; @@ -865,7 +867,8 @@ namespace str.insert(name.data(), name.data() + name.length()); } - size_t n = 0; + const std::shared_ptr fixed_string_data_type; + const size_t n = 0; std::function write_function; std::function &)> read_function; std::function default_function; @@ -1325,7 +1328,7 @@ namespace if constexpr (std::is_same_v) readDateTime64Text(decimal, scale, buf); else - DataTypeDecimal::readText(decimal, buf, precision, scale); + SerializationDecimal::readText(decimal, buf, precision, scale); return decimal; } @@ -1485,6 +1488,8 @@ namespace ReadBufferFromString buf{str}; time_t tm = 0; readDateTimeText(tm, buf); + if (tm < 0) + tm = 0; return tm; } @@ -2765,7 +2770,7 @@ namespace case TypeIndex::DateTime: return std::make_unique(field_descriptor, reader_or_writer); case TypeIndex::DateTime64: return std::make_unique(assert_cast(*data_type), field_descriptor, reader_or_writer); case TypeIndex::String: return std::make_unique>(field_descriptor, reader_or_writer); - case TypeIndex::FixedString: return std::make_unique>(assert_cast(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::FixedString: return std::make_unique>(typeid_cast>(data_type), field_descriptor, reader_or_writer); case TypeIndex::Enum8: return std::make_unique>(typeid_cast>(data_type), field_descriptor, reader_or_writer); case TypeIndex::Enum16: return std::make_unique>(typeid_cast>(data_type), field_descriptor, reader_or_writer); case TypeIndex::Decimal32: return std::make_unique>(assert_cast &>(*data_type), field_descriptor, reader_or_writer); @@ -2810,12 +2815,7 @@ namespace const auto & array_data_type = assert_cast(*data_type); if (!allow_repeat) - { - throw Exception( - "The field " + quoteString(field_descriptor.full_name()) - + " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}), - ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); - } + throwFieldNotRepeated(field_descriptor, column_name); auto nested_serializer = buildFieldSerializer(column_name, array_data_type.getNestedType(), field_descriptor, /* allow_repeat = */ false); // We do our repeating now, so for nested type we forget about the repeating. @@ -2860,12 +2860,7 @@ namespace /// Serialize as a repeated field. 
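Earlier in this hunk the calls move from `ColumnFixedString::alignStringLength` to `SerializationFixedString::alignStringLength`. A rough sketch of what that helper does, assuming `std::vector<char>` as a stand-in for the padded array and a generic exception in place of the real `TOO_LARGE_STRING_SIZE` error:

```cpp
#include <vector>
#include <stdexcept>
#include <cassert>

// Pad the region that starts at old_size with zero bytes so it is exactly n bytes long;
// a longer region is rejected (assumed behaviour for FixedString(N) serialization).
static void alignStringLength(size_t n, std::vector<char> & data, size_t old_size)
{
    const size_t length = data.size() - old_size;
    if (length > n)
        throw std::length_error("String too long for FixedString(N)");
    if (length < n)
        data.resize(old_size + n, '\0');   // zero-pad the tail up to exactly N bytes
}

int main()
{
    std::vector<char> buf = {'a', 'b'};
    alignStringLength(4, buf, 0);          // buf becomes {'a','b','\0','\0'}
    assert(buf.size() == 4);
}
```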
if (!allow_repeat && (size_of_tuple > 1)) - { - throw Exception( - "The field " + quoteString(field_descriptor.full_name()) - + " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}), - ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); - } + throwFieldNotRepeated(field_descriptor, column_name); std::vector> nested_serializers; for (const auto & nested_data_type : tuple_data_type.getElements()) @@ -2891,6 +2886,21 @@ namespace } } + [[noreturn]] static void throwFieldNotRepeated(const FieldDescriptor & field_descriptor, const std::string_view & column_name) + { + if (!field_descriptor.is_repeated()) + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + + " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}), + ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); + + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + + " is repeated but the level of repeatedness is not enough to serialize a multidimensional array from the column " + + backQuote(StringRef{column_name}) + ". It's recommended to make the parent field repeated as well.", + ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); + } + const ProtobufReaderOrWriter reader_or_writer; }; } diff --git a/src/Formats/ya.make b/src/Formats/ya.make index 8fe938be125..476e13f9a4f 100644 --- a/src/Formats/ya.make +++ b/src/Formats/ya.make @@ -13,8 +13,6 @@ PEERDIR( SRCS( FormatFactory.cpp FormatSchemaInfo.cpp - IRowInputStream.cpp - IRowOutputStream.cpp JSONEachRowUtils.cpp MySQLBlockInputStream.cpp NativeFormat.cpp diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 1c3beb2e47d..7cbca175c0d 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -1,5 +1,7 @@ configure_file(config_functions.h.in ${ConfigIncludePath}/config_functions.h) +add_subdirectory(divide) + include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake) add_headers_and_sources(clickhouse_functions .) @@ -25,7 +27,7 @@ target_link_libraries(clickhouse_functions PRIVATE ${ZLIB_LIBRARIES} boost::filesystem - libdivide + divide_impl ) if (OPENSSL_CRYPTO_LIBRARY) diff --git a/src/Functions/CustomWeekTransforms.h b/src/Functions/CustomWeekTransforms.h index afcbadc835c..28da546eb93 100644 --- a/src/Functions/CustomWeekTransforms.h +++ b/src/Functions/CustomWeekTransforms.h @@ -33,14 +33,21 @@ static inline UInt32 dateIsNotSupported(const char * name) /// This factor transformation will say that the function is monotone everywhere. struct ZeroTransform { - static inline UInt16 execute(UInt32, UInt8, const DateLUTImpl &) { return 0; } static inline UInt16 execute(UInt16, UInt8, const DateLUTImpl &) { return 0; } + static inline UInt16 execute(UInt32, UInt8, const DateLUTImpl &) { return 0; } + static inline UInt16 execute(Int64, UInt8, const DateLUTImpl &) { return 0; } }; struct ToWeekImpl { static constexpr auto name = "toWeek"; + static inline UInt8 execute(Int64 t, UInt8 week_mode, const DateLUTImpl & time_zone) + { + // TODO: ditch conversion to DayNum, since it doesn't support extended range. 
+ YearWeek yw = time_zone.toYearWeek(time_zone.toDayNum(t), week_mode); + return yw.second; + } static inline UInt8 execute(UInt32 t, UInt8 week_mode, const DateLUTImpl & time_zone) { YearWeek yw = time_zone.toYearWeek(time_zone.toDayNum(t), week_mode); @@ -59,6 +66,13 @@ struct ToYearWeekImpl { static constexpr auto name = "toYearWeek"; + static inline UInt32 execute(Int64 t, UInt8 week_mode, const DateLUTImpl & time_zone) + { + // TODO: ditch toDayNum() + YearWeek yw = time_zone.toYearWeek(time_zone.toDayNum(t), week_mode | static_cast(WeekModeFlag::YEAR)); + return yw.first * 100 + yw.second; + } + static inline UInt32 execute(UInt32 t, UInt8 week_mode, const DateLUTImpl & time_zone) { YearWeek yw = time_zone.toYearWeek(time_zone.toDayNum(t), week_mode | static_cast(WeekModeFlag::YEAR)); @@ -77,13 +91,19 @@ struct ToStartOfWeekImpl { static constexpr auto name = "toStartOfWeek"; + static inline UInt16 execute(Int64 t, UInt8 week_mode, const DateLUTImpl & time_zone) + { + return time_zone.toFirstDayNumOfWeek(time_zone.toDayNum(t), week_mode); +// return time_zone.toFirstDayNumOfWeek(t, week_mode); + } static inline UInt16 execute(UInt32 t, UInt8 week_mode, const DateLUTImpl & time_zone) { return time_zone.toFirstDayNumOfWeek(time_zone.toDayNum(t), week_mode); +// return time_zone.toFirstDayNumOfWeek(t, week_mode); } static inline UInt16 execute(UInt16 d, UInt8 week_mode, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(DayNum(d), week_mode); + return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d), week_mode); } using FactorTransform = ZeroTransform; diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 333b397312d..c299b9c4169 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -3,6 +3,7 @@ #include #include #include +//#include #include #include #include @@ -33,14 +34,15 @@ namespace ErrorCodes * factor-transformation F is "round to the nearest month" (2015-02-03 -> 2015-02-01). */ -static inline UInt32 dateIsNotSupported(const char * name) -{ - throw Exception("Illegal type Date of argument for function " + std::string(name), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); -} + static inline UInt32 dateIsNotSupported(const char * name) + { + throw Exception("Illegal type Date of argument for function " + std::string(name), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } /// This factor transformation will say that the function is monotone everywhere. struct ZeroTransform { + static inline UInt16 execute(Int64, const DateLUTImpl &) { return 0; } static inline UInt16 execute(UInt32, const DateLUTImpl &) { return 0; } static inline UInt16 execute(UInt16, const DateLUTImpl &) { return 0; } }; @@ -49,6 +51,10 @@ struct ToDateImpl { static constexpr auto name = "toDate"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return UInt16(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return UInt16(time_zone.toDayNum(t)); @@ -65,13 +71,18 @@ struct ToStartOfDayImpl { static constexpr auto name = "toStartOfDay"; + //TODO: right now it is hardcoded to produce DateTime only, needs fixing later. See date_and_time_type_details::ResultDataTypeMap for deduction of result type example. 
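These transform hunks consistently add `Int64` overloads and replace `DayNum(d)` with `ExtendedDayNum(d)`. A rough, leap-day-ignoring illustration of why the narrower 16-bit day number cannot cover the extended DateTime64 range (years 1925..2283 in this release), while a signed 32-bit day number can:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    const int64_t days_to_2283 = (2283 - 1970) * 365;   // roughly 114245 days after the epoch
    const int64_t days_to_1925 = (1925 - 1970) * 365;   // negative: before the epoch

    std::cout << "UInt16 day number for 2283: "
              << static_cast<uint16_t>(days_to_2283) << " (wrapped)\n";
    std::cout << "Int32 day number for 2283:  "
              << static_cast<int32_t>(days_to_2283) << "\n";
    std::cout << "Int32 day number for 1925:  "
              << static_cast<int32_t>(days_to_1925) << "\n";
}
```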
+ static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toDate(static_cast(t.whole)); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toDate(t); } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDate(DayNum(d)); + return time_zone.toDate(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -81,13 +92,19 @@ struct ToMondayImpl { static constexpr auto name = "toMonday"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + //return time_zone.toFirstDayNumOfWeek(time_zone.toDayNum(t)); + return time_zone.toFirstDayNumOfWeek(t); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(time_zone.toDayNum(t)); + //return time_zone.toFirstDayNumOfWeek(time_zone.toDayNum(t)); + return time_zone.toFirstDayNumOfWeek(t); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(DayNum(d)); + return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -97,13 +114,17 @@ struct ToStartOfMonthImpl { static constexpr auto name = "toStartOfMonth"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toFirstDayNumOfMonth(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toFirstDayNumOfMonth(time_zone.toDayNum(t)); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfMonth(DayNum(d)); + return time_zone.toFirstDayNumOfMonth(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -113,13 +134,17 @@ struct ToStartOfQuarterImpl { static constexpr auto name = "toStartOfQuarter"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toFirstDayNumOfQuarter(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toFirstDayNumOfQuarter(time_zone.toDayNum(t)); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfQuarter(DayNum(d)); + return time_zone.toFirstDayNumOfQuarter(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -129,13 +154,17 @@ struct ToStartOfYearImpl { static constexpr auto name = "toStartOfYear"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toFirstDayNumOfYear(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toFirstDayNumOfYear(time_zone.toDayNum(t)); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfYear(DayNum(d)); + return time_zone.toFirstDayNumOfYear(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -144,9 +173,13 @@ struct ToStartOfYearImpl struct ToTimeImpl { + /// When transforming to time, the date will be equated to 1970-01-01. static constexpr auto name = "toTime"; - /// When transforming to time, the date will be equated to 1970-01-02. 
+ static UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toTime(t.whole) + 86400; + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toTime(t) + 86400; @@ -164,6 +197,10 @@ struct ToStartOfMinuteImpl { static constexpr auto name = "toStartOfMinute"; + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toStartOfMinute(t.whole); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toStartOfMinute(t); @@ -215,6 +252,10 @@ struct ToStartOfFiveMinuteImpl { static constexpr auto name = "toStartOfFiveMinute"; + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toStartOfFiveMinute(t.whole); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toStartOfFiveMinute(t); @@ -231,6 +272,10 @@ struct ToStartOfTenMinutesImpl { static constexpr auto name = "toStartOfTenMinutes"; + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toStartOfTenMinutes(t.whole); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toStartOfTenMinutes(t); @@ -247,6 +292,10 @@ struct ToStartOfFifteenMinutesImpl { static constexpr auto name = "toStartOfFifteenMinutes"; + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toStartOfFifteenMinutes(t.whole); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toStartOfFifteenMinutes(t); @@ -264,6 +313,12 @@ struct TimeSlotImpl { static constexpr auto name = "timeSlot"; + //static inline DecimalUtils::DecimalComponents execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl &) + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl &) + { + return t.whole / 1800 * 1800; + } + static inline UInt32 execute(UInt32 t, const DateLUTImpl &) { return t / 1800 * 1800; @@ -281,6 +336,11 @@ struct ToStartOfHourImpl { static constexpr auto name = "toStartOfHour"; + static inline UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) + { + return time_zone.toStartOfHour(t.whole); + } + static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toStartOfHour(t); @@ -298,13 +358,17 @@ struct ToYearImpl { static constexpr auto name = "toYear"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toYear(t); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toYear(t); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(DayNum(d)); + return time_zone.toYear(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -314,13 +378,17 @@ struct ToQuarterImpl { static constexpr auto name = "toQuarter"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toQuarter(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toQuarter(t); } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toQuarter(DayNum(d)); + return time_zone.toQuarter(ExtendedDayNum(d)); } using FactorTransform = 
ToStartOfYearImpl; @@ -330,13 +398,17 @@ struct ToMonthImpl { static constexpr auto name = "toMonth"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toMonth(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toMonth(t); } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toMonth(DayNum(d)); + return time_zone.toMonth(ExtendedDayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -346,13 +418,17 @@ struct ToDayOfMonthImpl { static constexpr auto name = "toDayOfMonth"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toDayOfMonth(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toDayOfMonth(t); } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfMonth(DayNum(d)); + return time_zone.toDayOfMonth(ExtendedDayNum(d)); } using FactorTransform = ToStartOfMonthImpl; @@ -362,13 +438,17 @@ struct ToDayOfWeekImpl { static constexpr auto name = "toDayOfWeek"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toDayOfWeek(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toDayOfWeek(t); } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfWeek(DayNum(d)); + return time_zone.toDayOfWeek(ExtendedDayNum(d)); } using FactorTransform = ToMondayImpl; @@ -378,13 +458,17 @@ struct ToDayOfYearImpl { static constexpr auto name = "toDayOfYear"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toDayOfYear(t); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toDayOfYear(t); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfYear(DayNum(d)); + return time_zone.toDayOfYear(ExtendedDayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -394,6 +478,10 @@ struct ToHourImpl { static constexpr auto name = "toHour"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toHour(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toHour(t); @@ -411,6 +499,11 @@ struct TimezoneOffsetImpl { static constexpr auto name = "timezoneOffset"; + static inline time_t execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.timezoneOffset(t); + } + static inline time_t execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.timezoneOffset(t); @@ -428,6 +521,10 @@ struct ToMinuteImpl { static constexpr auto name = "toMinute"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toMinute(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toMinute(t); @@ -444,6 +541,10 @@ struct ToSecondImpl { static constexpr auto name = "toSecond"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toSecond(t); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toSecond(t); @@ -460,13 +561,17 @@ struct ToISOYearImpl { static constexpr auto name = "toISOYear"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toISOYear(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const 
DateLUTImpl & time_zone) { return time_zone.toISOYear(time_zone.toDayNum(t)); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOYear(DayNum(d)); + return time_zone.toISOYear(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -476,13 +581,17 @@ struct ToStartOfISOYearImpl { static constexpr auto name = "toStartOfISOYear"; + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toFirstDayNumOfISOYear(time_zone.toDayNum(t)); + } static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toFirstDayNumOfISOYear(time_zone.toDayNum(t)); } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfISOYear(DayNum(d)); + return time_zone.toFirstDayNumOfISOYear(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -492,13 +601,17 @@ struct ToISOWeekImpl { static constexpr auto name = "toISOWeek"; + static inline UInt8 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toISOWeek(time_zone.toDayNum(t)); + } static inline UInt8 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toISOWeek(time_zone.toDayNum(t)); } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOWeek(DayNum(d)); + return time_zone.toISOWeek(ExtendedDayNum(d)); } using FactorTransform = ToISOYearImpl; @@ -508,13 +621,17 @@ struct ToRelativeYearNumImpl { static constexpr auto name = "toRelativeYearNum"; - static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toYear(t); } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toYear(static_cast(t)); + } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(DayNum(d)); + return time_zone.toYear(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -524,13 +641,17 @@ struct ToRelativeQuarterNumImpl { static constexpr auto name = "toRelativeQuarterNum"; - static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toRelativeQuarterNum(t); } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toRelativeQuarterNum(static_cast(t)); + } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeQuarterNum(DayNum(d)); + return time_zone.toRelativeQuarterNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -540,13 +661,17 @@ struct ToRelativeMonthNumImpl { static constexpr auto name = "toRelativeMonthNum"; - static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toRelativeMonthNum(t); } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toRelativeMonthNum(static_cast(t)); + } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMonthNum(DayNum(d)); + return time_zone.toRelativeMonthNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -556,13 +681,17 @@ struct ToRelativeWeekNumImpl { static constexpr auto name = "toRelativeWeekNum"; - static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt16 
execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toRelativeWeekNum(t); } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toRelativeWeekNum(static_cast(t)); + } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeWeekNum(DayNum(d)); + return time_zone.toRelativeWeekNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -572,10 +701,14 @@ struct ToRelativeDayNumImpl { static constexpr auto name = "toRelativeDayNum"; - static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toDayNum(t); } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toDayNum(static_cast(t)); + } static inline UInt16 execute(UInt16 d, const DateLUTImpl &) { return static_cast(d); @@ -589,13 +722,17 @@ struct ToRelativeHourNumImpl { static constexpr auto name = "toRelativeHourNum"; - static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt32 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toRelativeHourNum(t); } + static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toRelativeHourNum(static_cast(t)); + } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeHourNum(DayNum(d)); + return time_zone.toRelativeHourNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -605,13 +742,17 @@ struct ToRelativeMinuteNumImpl { static constexpr auto name = "toRelativeMinuteNum"; - static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) + static inline UInt32 execute(Int64 t, const DateLUTImpl & time_zone) { return time_zone.toRelativeMinuteNum(t); } + static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toRelativeMinuteNum(static_cast(t)); + } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMinuteNum(DayNum(d)); + return time_zone.toRelativeMinuteNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -621,13 +762,17 @@ struct ToRelativeSecondNumImpl { static constexpr auto name = "toRelativeSecondNum"; + static inline Int64 execute(Int64 t, const DateLUTImpl &) + { + return t; + } static inline UInt32 execute(UInt32 t, const DateLUTImpl &) { return t; } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(DayNum(d)); + return time_zone.fromDayNum(ExtendedDayNum(d)); } using FactorTransform = ZeroTransform; @@ -637,6 +782,10 @@ struct ToYYYYMMImpl { static constexpr auto name = "toYYYYMM"; + static inline UInt32 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toNumYYYYMM(t); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toNumYYYYMM(t); @@ -653,6 +802,10 @@ struct ToYYYYMMDDImpl { static constexpr auto name = "toYYYYMMDD"; + static inline UInt32 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toNumYYYYMMDD(t); + } static inline UInt32 execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toNumYYYYMMDD(t); @@ -669,6 +822,10 @@ struct ToYYYYMMDDhhmmssImpl { static constexpr auto name = "toYYYYMMDDhhmmss"; + static inline UInt64 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toNumYYYYMMDDhhmmss(t); + } static inline UInt64 
execute(UInt32 t, const DateLUTImpl & time_zone) { return time_zone.toNumYYYYMMDDhhmmss(t); diff --git a/src/Functions/DivisionUtils.h b/src/Functions/DivisionUtils.h index 2b4c07b1cff..174ea7ee797 100644 --- a/src/Functions/DivisionUtils.h +++ b/src/Functions/DivisionUtils.h @@ -105,7 +105,7 @@ struct DivideIntegralImpl auto res = checkedDivision(CastA(a), CastB(b)); if constexpr (std::is_floating_point_v) - if (isNaN(res) || res >= std::numeric_limits::max() || res <= std::numeric_limits::lowest()) + if (isNaN(res) || res >= static_cast(std::numeric_limits::max()) || res <= std::numeric_limits::lowest()) throw Exception("Cannot perform integer division, because it will produce infinite or too large number", ErrorCodes::ILLEGAL_DIVISION); diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 4887e9fe8cc..d060b86c54b 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -61,7 +61,7 @@ class FunctionBase64Conversion : public IFunction public: static constexpr auto name = Func::name; - static FunctionPtr create(const Context &) + static FunctionPtr create(ContextPtr) { return std::make_shared(); } diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index bb85ae32622..ef9d05b4b5e 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -512,7 +512,7 @@ class FunctionBinaryArithmetic : public IFunction static constexpr const bool is_multiply = IsOperation::multiply; static constexpr const bool is_division = IsOperation::division; - const Context & context; + ContextPtr context; bool check_decimal_overflow = true; template @@ -593,7 +593,7 @@ class FunctionBinaryArithmetic : public IFunction } static FunctionOverloadResolverPtr - getFunctionForIntervalArithmetic(const DataTypePtr & type0, const DataTypePtr & type1, const Context & context) + getFunctionForIntervalArithmetic(const DataTypePtr & type0, const DataTypePtr & type1, ContextPtr context) { bool first_is_date_or_datetime = isDateOrDateTime(type0); bool second_is_date_or_datetime = isDateOrDateTime(type1); @@ -939,9 +939,9 @@ class FunctionBinaryArithmetic : public IFunction public: static constexpr auto name = Name::name; - static FunctionPtr create(const Context & context) { return std::make_shared(context); } + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } - explicit FunctionBinaryArithmetic(const Context & context_) + explicit FunctionBinaryArithmetic(ContextPtr context_) : context(context_), check_decimal_overflow(decimalCheckArithmeticOverflow(context)) {} @@ -955,7 +955,7 @@ public: return getReturnTypeImplStatic(arguments, context); } - static DataTypePtr getReturnTypeImplStatic(const DataTypes & arguments, const Context & context) + static DataTypePtr getReturnTypeImplStatic(const DataTypes & arguments, ContextPtr context) { /// Special case when multiply aggregate function state if (isAggregateMultiply(arguments[0], arguments[1])) @@ -1367,7 +1367,7 @@ public: const ColumnWithTypeAndName & left_, const ColumnWithTypeAndName & right_, const DataTypePtr & return_type_, - const Context & context) + ContextPtr context) { return std::make_shared(left_, right_, return_type_, context); } @@ -1376,7 +1376,7 @@ public: const ColumnWithTypeAndName & left_, const ColumnWithTypeAndName & right_, const DataTypePtr & return_type_, - const Context & context_) + ContextPtr context_) : Base(context_), left(left_), 
right(right_), return_type(return_type_) { } @@ -1530,12 +1530,12 @@ class BinaryArithmeticOverloadResolver : public IFunctionOverloadResolverImpl { public: static constexpr auto name = Name::name; - static FunctionOverloadResolverImplPtr create(const Context & context) + static FunctionOverloadResolverImplPtr create(ContextPtr context) { return std::make_unique(context); } - explicit BinaryArithmeticOverloadResolver(const Context & context_) : context(context_) {} + explicit BinaryArithmeticOverloadResolver(ContextPtr context_) : context(context_) {} String getName() const override { return name; } size_t getNumberOfArguments() const override { return 2; } @@ -1571,6 +1571,6 @@ public: } private: - const Context & context; + ContextPtr context; }; } diff --git a/src/Functions/FunctionBitTestMany.h b/src/Functions/FunctionBitTestMany.h index 6d527c66390..a2be56c42b0 100644 --- a/src/Functions/FunctionBitTestMany.h +++ b/src/Functions/FunctionBitTestMany.h @@ -24,7 +24,7 @@ struct FunctionBitTestMany : public IFunction { public: static constexpr auto name = Name::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } diff --git a/src/Functions/FunctionCustomWeekToSomething.h b/src/Functions/FunctionCustomWeekToSomething.h index 8a343cffb95..c6f56fdc50a 100644 --- a/src/Functions/FunctionCustomWeekToSomething.h +++ b/src/Functions/FunctionCustomWeekToSomething.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -23,7 +23,7 @@ class FunctionCustomWeekToSomething : public IFunction { public: static constexpr auto name = Transform::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 5f964b899b4..8a9d454028c 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -25,31 +26,6 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -/// AddOnDateTime64DefaultImpl provides default implementation of add-X functionality for DateTime64. -/// -/// Default implementation is not to change fractional part, but only modify whole part as if it was DateTime. -/// That means large whole values (for scale less than 9) might not fit into UInt32-range, -/// and hence default implementation will produce incorrect results. -template -struct AddOnDateTime64DefaultImpl -{ - AddOnDateTime64DefaultImpl(UInt32 scale_ = 0) - : scale_multiplier(DecimalUtils::scaleMultiplier(scale_)) - {} - - // Default implementation for add/sub on DateTime64: do math on whole part (the same way as for DateTime), leave fractional as it is. 
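The removed `AddOnDateTime64DefaultImpl` base is replaced below by per-operation overloads that work on a whole/fractional decomposition of DateTime64. A simplified sketch of that decomposition, with names and split logic standing in for `DecimalUtils::splitWithScaleMultiplier` and friends (the real helpers also handle negative values correctly):

```cpp
#include <cstdint>
#include <iostream>

// Split a DateTime64 value into whole seconds and a sub-second fraction,
// shift only the whole part, then recombine.
struct Components { int64_t whole; int64_t fractional; };

static Components split(int64_t dt64, int64_t scale_multiplier)
{
    return {dt64 / scale_multiplier, dt64 % scale_multiplier};
}

static int64_t combine(Components c, int64_t scale_multiplier)
{
    return c.whole * scale_multiplier + c.fractional;
}

int main()
{
    const int64_t scale_multiplier = 1000;        // DateTime64(3): milliseconds
    const int64_t t = 1617235200123;              // some instant with a .123 fraction
    Components c = split(t, scale_multiplier);
    c.whole += 30;                                // addSeconds(..., 30) touches only the whole part
    std::cout << combine(c, scale_multiplier) << '\n';   // the .123 fraction is preserved
}
```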
- inline DateTime64 execute(const DateTime64 & t, Int64 delta, const DateLUTImpl & time_zone) const - { - const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); - - const auto whole = static_cast(this)->execute(static_cast(components.whole), delta, time_zone); - return DecimalUtils::decimalFromComponentsWithMultiplier(static_cast(whole), components.fractional, scale_multiplier); - } - - UInt32 scale_multiplier = 1; -}; - - /// Type of first argument of 'execute' function overload defines what INPUT DataType it is used for. /// Return type defines what is the OUTPUT (return) type of the CH function. /// Corresponding types: @@ -60,14 +36,16 @@ struct AddOnDateTime64DefaultImpl /// - 'AddSecondsImpl::execute(UInt32, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(DateTime, ...) -> DateTime' /// - 'AddSecondsImpl::execute(UInt16, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(Date, ...) -> DateTime' -struct AddSecondsImpl : public AddOnDateTime64DefaultImpl +struct AddSecondsImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addSeconds"; + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + { + return {t.whole + delta, t.fractional}; + } + static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &) { return t + delta; @@ -75,18 +53,20 @@ struct AddSecondsImpl : public AddOnDateTime64DefaultImpl static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(DayNum(d)) + delta; + return time_zone.fromDayNum(ExtendedDayNum(d)) + delta; } }; -struct AddMinutesImpl : public AddOnDateTime64DefaultImpl +struct AddMinutesImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addMinutes"; + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + { + return {t.whole + delta * 60, t.fractional}; + } + static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &) { return t + delta * 60; @@ -94,18 +74,19 @@ struct AddMinutesImpl : public AddOnDateTime64DefaultImpl static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(DayNum(d)) + delta * 60; + return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 60; } }; -struct AddHoursImpl : public AddOnDateTime64DefaultImpl +struct AddHoursImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addHours"; + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + { + return {t.whole + delta * 3600, t.fractional}; + } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &) { return t + delta * 3600; @@ -113,19 +94,21 @@ struct AddHoursImpl : public AddOnDateTime64DefaultImpl static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(DayNum(d)) + delta * 3600; + return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 3600; } }; -struct AddDaysImpl : public 
AddOnDateTime64DefaultImpl +struct AddDaysImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addDays"; - static inline UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + { + return {time_zone.addDays(t.whole, delta), t.fractional}; + } + + static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) { return time_zone.addDays(t, delta); } @@ -136,14 +119,16 @@ struct AddDaysImpl : public AddOnDateTime64DefaultImpl } }; -struct AddWeeksImpl : public AddOnDateTime64DefaultImpl +struct AddWeeksImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addWeeks"; + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + { + return {time_zone.addWeeks(t.whole, delta), t.fractional}; + } + static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) { return time_zone.addWeeks(t, delta); @@ -155,14 +140,16 @@ struct AddWeeksImpl : public AddOnDateTime64DefaultImpl } }; -struct AddMonthsImpl : public AddOnDateTime64DefaultImpl +struct AddMonthsImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addMonths"; + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + { + return {time_zone.addMonths(t.whole, delta), t.fractional}; + } + static inline UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) { return time_zone.addMonths(t, delta); @@ -170,18 +157,20 @@ struct AddMonthsImpl : public AddOnDateTime64DefaultImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addMonths(DayNum(d), delta); + return time_zone.addMonths(ExtendedDayNum(d), delta); } }; -struct AddQuartersImpl : public AddOnDateTime64DefaultImpl +struct AddQuartersImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addQuarters"; + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + { + return {time_zone.addQuarters(t.whole, delta), t.fractional}; + } + static inline UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) { return time_zone.addQuarters(t, delta); @@ -189,18 +178,20 @@ struct AddQuartersImpl : public AddOnDateTime64DefaultImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addQuarters(DayNum(d), delta); + return time_zone.addQuarters(ExtendedDayNum(d), delta); } }; -struct AddYearsImpl : public AddOnDateTime64DefaultImpl +struct AddYearsImpl { - using Base = AddOnDateTime64DefaultImpl; - using Base::Base; - using Base::execute; - static constexpr auto name = "addYears"; + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + { + return {time_zone.addYears(t.whole, delta), t.fractional}; + } + static inline UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl & time_zone) { return 
time_zone.addYears(t, delta); @@ -208,7 +199,7 @@ struct AddYearsImpl : public AddOnDateTime64DefaultImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addYears(DayNum(d), delta); + return time_zone.addYears(ExtendedDayNum(d), delta); } }; @@ -282,14 +273,16 @@ struct Adder private: template - NO_INLINE NO_SANITIZE_UNDEFINED void vectorVector(const FromVectorType & vec_from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const + NO_INLINE NO_SANITIZE_UNDEFINED void vectorVector( + const FromVectorType & vec_from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const { for (size_t i = 0; i < size; ++i) vec_to[i] = transform.execute(vec_from[i], delta.getData()[i], time_zone); } template - NO_INLINE NO_SANITIZE_UNDEFINED void constantVector(const FromType & from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const + NO_INLINE NO_SANITIZE_UNDEFINED void constantVector( + const FromType & from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const { for (size_t i = 0; i < size; ++i) vec_to[i] = transform.execute(from, delta.getData()[i], time_zone); @@ -351,6 +344,7 @@ template <> struct ResultDataTypeMap { using ResultDataType = DataTy template <> struct ResultDataTypeMap { using ResultDataType = DataTypeDateTime; }; template <> struct ResultDataTypeMap { using ResultDataType = DataTypeDateTime; }; template <> struct ResultDataTypeMap { using ResultDataType = DataTypeDateTime64; }; +template <> struct ResultDataTypeMap { using ResultDataType = DataTypeDateTime64; }; } template @@ -358,7 +352,7 @@ class FunctionDateOrDateTimeAddInterval : public IFunction { public: static constexpr auto name = Transform::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { @@ -417,10 +411,18 @@ public: } } + // TransformDateTime64 helps choosing correct overload of exec and does some transformations + // on input and output parameters to simplify support of DateTime64 in concrete Transform. + template + using TransformType = std::conditional_t< + std::is_same_v, + TransformDateTime64, + Transform>; + /// Helper templates to deduce return type based on argument type, since some overloads may promote or denote types, /// e.g. addSeconds(Date, 1) => DateTime template - using TransformExecuteReturnType = decltype(std::declval().execute(FieldType(), 0, std::declval())); + using TransformExecuteReturnType = decltype(std::declval>().execute(FieldType(), 0, std::declval())); // Deduces RETURN DataType from INPUT DataType, based on return type of Transform{}.execute(INPUT_TYPE, UInt64, DateLUTImpl). // e.g. 
for Transform-type that has execute()-overload with 'UInt16' input and 'UInt32' return, @@ -475,8 +477,9 @@ public: } else if (const auto * datetime64_type = assert_cast(from_type)) { - return DateTimeAddIntervalImpl, Transform>::execute( - Transform{datetime64_type->getScale()}, arguments, result_type); + using WrappedTransformType = TransformType; + return DateTimeAddIntervalImpl, WrappedTransformType>::execute( + WrappedTransformType{datetime64_type->getScale()}, arguments, result_type); } else throw Exception("Illegal type " + arguments[0].type->getName() + " of first argument of function " + getName(), diff --git a/src/Functions/FunctionDateOrDateTimeToSomething.h b/src/Functions/FunctionDateOrDateTimeToSomething.h index e0676f3dc0f..b9d0a7f45fe 100644 --- a/src/Functions/FunctionDateOrDateTimeToSomething.h +++ b/src/Functions/FunctionDateOrDateTimeToSomething.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -23,7 +24,7 @@ class FunctionDateOrDateTimeToSomething : public IFunction { public: static constexpr auto name = Transform::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { @@ -107,6 +108,7 @@ public: else if (which.isDateTime64()) { const auto scale = static_cast(from_type)->getScale(); + const TransformDateTime64 transformer(scale); return DateTimeTransformImpl::execute(arguments, result_type, input_rows_count, transformer); } @@ -133,7 +135,6 @@ public: /// This method is called only if the function has one argument. Therefore, we do not care about the non-local time zone. const DateLUTImpl & date_lut = DateLUT::instance(); - if (left.isNull() || right.isNull()) return is_not_monotonic; diff --git a/src/Functions/FunctionFQDN.cpp b/src/Functions/FunctionFQDN.cpp index 7b3b89eb511..9cad9011cd4 100644 --- a/src/Functions/FunctionFQDN.cpp +++ b/src/Functions/FunctionFQDN.cpp @@ -12,7 +12,7 @@ class FunctionFQDN : public IFunction { public: static constexpr auto name = "FQDN"; - static FunctionPtr create(const Context &) + static FunctionPtr create(ContextPtr) { return std::make_shared(); } diff --git a/src/Functions/FunctionFactory.cpp b/src/Functions/FunctionFactory.cpp index 09fd360a925..35ac9ab647b 100644 --- a/src/Functions/FunctionFactory.cpp +++ b/src/Functions/FunctionFactory.cpp @@ -52,7 +52,7 @@ void FunctionFactory::registerFunction(const FunctionOverloadResolverImplPtr FunctionFactory::getImpl( const std::string & name, - const Context & context) const + ContextPtr context) const { auto res = tryGetImpl(name, context); if (!res) @@ -82,14 +82,14 @@ std::vector FunctionFactory::getAllNames() const FunctionOverloadResolverPtr FunctionFactory::get( const std::string & name, - const Context & context) const + ContextPtr context) const { return std::make_shared(getImpl(name, context)); } FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( const std::string & name_param, - const Context & context) const + ContextPtr context) const { String name = getAliasToOrName(name_param); FunctionOverloadResolverImplPtr res; @@ -99,7 +99,8 @@ FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( res = it->second(context); else { - it = case_insensitive_functions.find(Poco::toLower(name)); + name = Poco::toLower(name); + it = case_insensitive_functions.find(name); if (case_insensitive_functions.end() != it) res = it->second(context); } @@ -109,7 +110,7 @@ FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( if 
(CurrentThread::isInitialized()) { - const auto * query_context = CurrentThread::get().getQueryContext(); + auto query_context = CurrentThread::get().getQueryContext(); if (query_context && query_context->getSettingsRef().log_queries) query_context->addQueryFactoriesInfo(Context::QueryLogFactories::Function, name); } @@ -119,7 +120,7 @@ FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( FunctionOverloadResolverPtr FunctionFactory::tryGet( const std::string & name, - const Context & context) const + ContextPtr context) const { auto impl = tryGetImpl(name, context); return impl ? std::make_shared(std::move(impl)) diff --git a/src/Functions/FunctionFactory.h b/src/Functions/FunctionFactory.h index 7990e78daf8..96238a88420 100644 --- a/src/Functions/FunctionFactory.h +++ b/src/Functions/FunctionFactory.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -12,14 +13,12 @@ namespace DB { -class Context; - - /** Creates function by name. * Function could use for initialization (take ownership of shared_ptr, for example) * some dictionaries from Context. */ -class FunctionFactory : private boost::noncopyable, public IFactoryWithAliases> +class FunctionFactory : private boost::noncopyable, + public IFactoryWithAliases> { public: static FunctionFactory & instance(); @@ -43,14 +42,14 @@ public: std::vector getAllNames() const; /// Throws an exception if not found. - FunctionOverloadResolverPtr get(const std::string & name, const Context & context) const; + FunctionOverloadResolverPtr get(const std::string & name, ContextPtr context) const; /// Returns nullptr if not found. - FunctionOverloadResolverPtr tryGet(const std::string & name, const Context & context) const; + FunctionOverloadResolverPtr tryGet(const std::string & name, ContextPtr context) const; /// The same methods to get developer interface implementation. - FunctionOverloadResolverImplPtr getImpl(const std::string & name, const Context & context) const; - FunctionOverloadResolverImplPtr tryGetImpl(const std::string & name, const Context & context) const; + FunctionOverloadResolverImplPtr getImpl(const std::string & name, ContextPtr context) const; + FunctionOverloadResolverImplPtr tryGetImpl(const std::string & name, ContextPtr context) const; private: using Functions = std::unordered_map; @@ -59,7 +58,7 @@ private: Functions case_insensitive_functions; template - static FunctionOverloadResolverImplPtr createDefaultFunction(const Context & context) + static FunctionOverloadResolverImplPtr createDefaultFunction(ContextPtr context) { return std::make_unique(Function::create(context)); } diff --git a/src/Functions/FunctionFile.cpp b/src/Functions/FunctionFile.cpp index f477f6123c3..9247152367b 100644 --- a/src/Functions/FunctionFile.cpp +++ b/src/Functions/FunctionFile.cpp @@ -21,12 +21,12 @@ namespace ErrorCodes } /// A function to read file as a string. 
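The recurring change in these function hunks is the migration from `const Context &` to `ContextPtr`, with classes picking up the context through a `WithContext`-style mix-in instead of storing a reference. A simplified, illustrative sketch of the pattern (the `Context` fields and class names here are placeholders, not the real ClickHouse declarations):

```cpp
#include <memory>
#include <string>
#include <iostream>

struct Context { std::string user_files_path = "/var/lib/clickhouse/user_files/"; };
using ContextPtr = std::shared_ptr<const Context>;

// Mix-in that keeps a shared pointer to the context.
class WithContextSketch
{
public:
    explicit WithContextSketch(ContextPtr context_) : context(std::move(context_)) {}
    ContextPtr getContext() const { return context; }
private:
    ContextPtr context;
};

// Function-like class that reads settings through getContext() rather than a stored reference.
class FunctionFileSketch : public WithContextSketch
{
public:
    explicit FunctionFileSketch(ContextPtr context_) : WithContextSketch(std::move(context_)) {}
    std::string userFilesPath() const { return getContext()->user_files_path; }
};

int main()
{
    auto ctx = std::make_shared<const Context>();
    FunctionFileSketch f(ctx);
    std::cout << f.userFilesPath() << '\n';
}
```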
-class FunctionFile : public IFunction +class FunctionFile : public IFunction, WithContext { public: static constexpr auto name = "file"; - static FunctionPtr create(const Context &context) { return std::make_shared(context); } - explicit FunctionFile(const Context &context_) : context(context_) {} + static FunctionPtr create(ContextPtr context_) { return std::make_shared(context_); } + explicit FunctionFile(ContextPtr context_) : WithContext(context_) {} String getName() const override { return name; } @@ -68,7 +68,7 @@ public: { const char * filename = reinterpret_cast(&chars[source_offset]); - const String user_files_path = context.getUserFilesPath(); + const String user_files_path = getContext()->getUserFilesPath(); String user_files_absolute_path = Poco::Path(user_files_path).makeAbsolute().makeDirectory().toString(); Poco::Path poco_filepath = Poco::Path(filename); if (poco_filepath.isRelative()) @@ -113,7 +113,7 @@ private: void checkReadIsAllowedOrThrow(const std::string & user_files_absolute_path, const std::string & file_absolute_path) const { // If run in Local mode, no need for path checking. - if (context.getApplicationType() != Context::ApplicationType::LOCAL) + if (getContext()->getApplicationType() != Context::ApplicationType::LOCAL) if (file_absolute_path.find(user_files_absolute_path) != 0) throw Exception("File is not inside " + user_files_absolute_path, ErrorCodes::DATABASE_ACCESS_DENIED); @@ -121,8 +121,6 @@ private: if (path_poco_file.exists() && path_poco_file.isDirectory()) throw Exception("File can't be a directory", ErrorCodes::INCORRECT_FILE_NAME); } - - const Context & context; }; diff --git a/src/Functions/FunctionJoinGet.cpp b/src/Functions/FunctionJoinGet.cpp index 6b15bf821b2..2ad9985d44c 100644 --- a/src/Functions/FunctionJoinGet.cpp +++ b/src/Functions/FunctionJoinGet.cpp @@ -25,16 +25,18 @@ ColumnPtr ExecutableFunctionJoinGet::execute(const ColumnsWithTypeAndNa auto key = arguments[i]; keys.emplace_back(std::move(key)); } - return join->joinGet(keys, result_columns).column; + return storage_join->joinGet(keys, result_columns).column; } template ExecutableFunctionImplPtr FunctionJoinGet::prepare(const ColumnsWithTypeAndName &) const { - return std::make_unique>(join, DB::Block{{return_type->createColumn(), return_type, attr_name}}); + Block result_columns {{return_type->createColumn(), return_type, attr_name}}; + return std::make_unique>(table_lock, storage_join, result_columns); } -static auto getJoin(const ColumnsWithTypeAndName & arguments, const Context & context) +static std::pair, String> +getJoin(const ColumnsWithTypeAndName & arguments, ContextPtr context) { String join_name; if (const auto * name_col = checkAndGetColumnConst(arguments[0].column.get())) @@ -50,7 +52,7 @@ static auto getJoin(const ColumnsWithTypeAndName & arguments, const Context & co String database_name; if (dot == String::npos) { - database_name = context.getCurrentDatabase(); + database_name = context->getCurrentDatabase(); dot = 0; } else @@ -86,14 +88,13 @@ FunctionBaseImplPtr JoinGetOverloadResolver::build(const ColumnsWithTyp "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) + ", should be greater or equal to 3", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - auto [storage_join, attr_name] = getJoin(arguments, context); - auto join = storage_join->getJoin(); + auto [storage_join, attr_name] = getJoin(arguments, getContext()); DataTypes data_types(arguments.size() - 2); for (size_t i = 2; i < arguments.size(); ++i) 
data_types[i - 2] = arguments[i].type; - auto return_type = join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); - auto table_lock = storage_join->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout); - return std::make_unique>(table_lock, storage_join, join, attr_name, data_types, return_type); + auto return_type = storage_join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); + auto table_lock = storage_join->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); + return std::make_unique>(table_lock, storage_join, attr_name, data_types, return_type); } void registerFunctionJoinGet(FunctionFactory & factory) diff --git a/src/Functions/FunctionJoinGet.h b/src/Functions/FunctionJoinGet.h index 27f348e9698..016ef15f2d5 100644 --- a/src/Functions/FunctionJoinGet.h +++ b/src/Functions/FunctionJoinGet.h @@ -1,5 +1,7 @@ #pragma once + #include +#include #include #include #include @@ -7,16 +9,21 @@ namespace DB { -class Context; class HashJoin; -using HashJoinPtr = std::shared_ptr; +class StorageJoin; +using StorageJoinPtr = std::shared_ptr; template class ExecutableFunctionJoinGet final : public IExecutableFunctionImpl { public: - ExecutableFunctionJoinGet(HashJoinPtr join_, const DB::Block & result_columns_) - : join(std::move(join_)), result_columns(result_columns_) {} + ExecutableFunctionJoinGet(TableLockHolder table_lock_, + StorageJoinPtr storage_join_, + const DB::Block & result_columns_) + : table_lock(std::move(table_lock_)) + , storage_join(std::move(storage_join_)) + , result_columns(result_columns_) + {} static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet"; @@ -29,7 +36,8 @@ public: String getName() const override { return name; } private: - HashJoinPtr join; + TableLockHolder table_lock; + StorageJoinPtr storage_join; DB::Block result_columns; }; @@ -39,12 +47,11 @@ class FunctionJoinGet final : public IFunctionBaseImpl public: static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet"; - FunctionJoinGet(TableLockHolder table_lock_, StoragePtr storage_join_, - HashJoinPtr join_, String attr_name_, + FunctionJoinGet(TableLockHolder table_lock_, + StorageJoinPtr storage_join_, String attr_name_, DataTypes argument_types_, DataTypePtr return_type_) : table_lock(std::move(table_lock_)) - , storage_join(std::move(storage_join_)) - , join(std::move(join_)) + , storage_join(storage_join_) , attr_name(std::move(attr_name_)) , argument_types(std::move(argument_types_)) , return_type(std::move(return_type_)) @@ -60,21 +67,20 @@ public: private: TableLockHolder table_lock; - StoragePtr storage_join; - HashJoinPtr join; + StorageJoinPtr storage_join; const String attr_name; DataTypes argument_types; DataTypePtr return_type; }; template -class JoinGetOverloadResolver final : public IFunctionOverloadResolverImpl +class JoinGetOverloadResolver final : public IFunctionOverloadResolverImpl, WithContext { public: static constexpr auto name = or_null ? 
"joinGetOrNull" : "joinGet"; - static FunctionOverloadResolverImplPtr create(const Context & context) { return std::make_unique(context); } + static FunctionOverloadResolverImplPtr create(ContextPtr context_) { return std::make_unique(context_); } - explicit JoinGetOverloadResolver(const Context & context_) : context(context_) {} + explicit JoinGetOverloadResolver(ContextPtr context_) : WithContext(context_) {} String getName() const override { return name; } @@ -87,9 +93,6 @@ public: bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - -private: - const Context & context; }; } diff --git a/src/Functions/FunctionMathBinaryFloat64.h b/src/Functions/FunctionMathBinaryFloat64.h index 0a0688dc75c..00f6909f65c 100644 --- a/src/Functions/FunctionMathBinaryFloat64.h +++ b/src/Functions/FunctionMathBinaryFloat64.h @@ -28,7 +28,7 @@ class FunctionMathBinaryFloat64 : public IFunction { public: static constexpr auto name = Impl::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } static_assert(Impl::rows_per_iteration > 0, "Impl must process at least one row per iteration"); bool useDefaultImplementationForConstants() const override { return true; } diff --git a/src/Functions/FunctionMathConstFloat64.h b/src/Functions/FunctionMathConstFloat64.h index f03f469bc35..04ade5b4637 100644 --- a/src/Functions/FunctionMathConstFloat64.h +++ b/src/Functions/FunctionMathConstFloat64.h @@ -13,7 +13,7 @@ class FunctionMathConstFloat64 : public IFunction { public: static constexpr auto name = Impl::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } private: String getName() const override { return name; } diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h index 49b0428811a..e24d19e5a9b 100644 --- a/src/Functions/FunctionMathUnary.h +++ b/src/Functions/FunctionMathUnary.h @@ -35,7 +35,7 @@ class FunctionMathUnary : public IFunction { public: static constexpr auto name = Impl::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } private: String getName() const override { return name; } diff --git a/src/Functions/FunctionNumericPredicate.h b/src/Functions/FunctionNumericPredicate.h index 72a17adac4c..9bb0f700fe0 100644 --- a/src/Functions/FunctionNumericPredicate.h +++ b/src/Functions/FunctionNumericPredicate.h @@ -1,8 +1,10 @@ #pragma once + #include #include #include #include +#include #include @@ -21,7 +23,7 @@ class FunctionNumericPredicate : public IFunction { public: static constexpr auto name = Impl::name; - static FunctionPtr create(const Context &) + static FunctionPtr create(ContextPtr) { return std::make_shared(); } diff --git a/src/Functions/FunctionStartsEndsWith.h b/src/Functions/FunctionStartsEndsWith.h index 2899bc259d5..87c07c5efd2 100644 --- a/src/Functions/FunctionStartsEndsWith.h +++ b/src/Functions/FunctionStartsEndsWith.h @@ -142,7 +142,7 @@ template class FunctionStartsEndsWith : public TargetSpecific::Default::FunctionStartsEndsWith { public: - explicit FunctionStartsEndsWith(const Context & context) : selector(context) + explicit FunctionStartsEndsWith(ContextPtr context) : selector(context) { 
         selector.registerImplementation<TargetArch::Default,
             TargetSpecific::Default::FunctionStartsEndsWith<Name>>();
@@ -164,7 +164,7 @@ public:
         return selector.selectAndExecute(arguments, result_type, input_rows_count);
     }
 
-    static FunctionPtr create(const Context & context)
+    static FunctionPtr create(ContextPtr context)
     {
         return std::make_shared<FunctionStartsEndsWith<Name>>(context);
     }
diff --git a/src/Functions/FunctionStringOrArrayToT.h b/src/Functions/FunctionStringOrArrayToT.h
index e00acc006cf..40ce62a5666 100644
--- a/src/Functions/FunctionStringOrArrayToT.h
+++ b/src/Functions/FunctionStringOrArrayToT.h
@@ -25,7 +25,7 @@ class FunctionStringOrArrayToT : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &)
+    static FunctionPtr create(ContextPtr)
     {
         return std::make_shared<FunctionStringOrArrayToT>();
     }
diff --git a/src/Functions/FunctionStringReplace.h b/src/Functions/FunctionStringReplace.h
index bd8edbf9202..2e99f58531a 100644
--- a/src/Functions/FunctionStringReplace.h
+++ b/src/Functions/FunctionStringReplace.h
@@ -23,7 +23,7 @@ class FunctionStringReplace : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionStringReplace>(); }
 
     String getName() const override { return name; }
diff --git a/src/Functions/FunctionStringToString.h b/src/Functions/FunctionStringToString.h
index 4123b41c547..36009dc30c0 100644
--- a/src/Functions/FunctionStringToString.h
+++ b/src/Functions/FunctionStringToString.h
@@ -1,9 +1,11 @@
 #pragma once
+
 #include
 #include
 #include
 #include
 #include
+#include
 
 
 namespace DB
@@ -21,7 +23,7 @@ class FunctionStringToString : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &)
+    static FunctionPtr create(ContextPtr)
     {
         return std::make_shared<FunctionStringToString>();
     }
diff --git a/src/Functions/FunctionUnaryArithmetic.h b/src/Functions/FunctionUnaryArithmetic.h
index 06469a83e3e..9145c4a2ec5 100644
--- a/src/Functions/FunctionUnaryArithmetic.h
+++ b/src/Functions/FunctionUnaryArithmetic.h
@@ -110,7 +110,7 @@ class FunctionUnaryArithmetic : public IFunction
 
 public:
     static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionUnaryArithmetic>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUnaryArithmetic>(); }
 
     String getName() const override
     {
diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h
index 20e225990bd..18f9c7a8b02 100644
--- a/src/Functions/FunctionUnixTimestamp64.h
+++ b/src/Functions/FunctionUnixTimestamp64.h
@@ -33,7 +33,7 @@ public:
 
     static_assert(std::is_same_v || std::is_same_v);
 
-    static auto create(const Context &)
+    static auto create(ContextPtr)
     {
         return std::make_shared>();
     }
diff --git a/src/Functions/FunctionsAES.h b/src/Functions/FunctionsAES.h
index 132e94907f5..b76b454fd77 100644
--- a/src/Functions/FunctionsAES.h
+++ b/src/Functions/FunctionsAES.h
@@ -141,7 +141,7 @@ class FunctionEncrypt : public IFunction
 public:
     static constexpr OpenSSLDetails::CompatibilityMode compatibility_mode = Impl::compatibility_mode;
     static constexpr auto name = Impl::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionEncrypt>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionEncrypt>(); }
 
 private:
     using CipherMode = OpenSSLDetails::CipherMode;
@@ -416,7 +416,7 @@ class FunctionDecrypt : public IFunction
 public:
     static constexpr OpenSSLDetails::CompatibilityMode compatibility_mode = Impl::compatibility_mode;
     static constexpr auto name = Impl::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionDecrypt>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionDecrypt>(); }
 
 private:
     using CipherMode = OpenSSLDetails::CipherMode;
@@ -538,8 +538,9 @@ private:
 
         [[maybe_unused]] const auto block_size = static_cast<size_t>(EVP_CIPHER_block_size(evp_cipher));
         [[maybe_unused]] const auto iv_size = static_cast<size_t>(EVP_CIPHER_iv_length(evp_cipher));
-        const auto key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
-        const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
+
+        const size_t key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
+        static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
 
         auto decrypted_result_column = ColumnString::create();
         auto & decrypted_result_column_data = decrypted_result_column->getChars();
@@ -549,9 +550,17 @@ private:
 
         size_t resulting_size = 0;
         for (size_t r = 0; r < input_rows_count; ++r)
         {
-            resulting_size += input_column->getDataAt(r).size + 1;
+            size_t string_size = input_column->getDataAt(r).size;
+            resulting_size += string_size + 1;  /// With terminating zero.
+            if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM)
+            {
+                if (string_size < tag_size)
+                    throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.",
+                        ErrorCodes::BAD_ARGUMENTS);
+                resulting_size -= tag_size;
+            }
         }
 
 #if defined(MEMORY_SANITIZER)
@@ -565,6 +574,7 @@ private:
         decrypted_result_column_data.resize(resulting_size);
 #endif
         }
+
         auto * decrypted = decrypted_result_column_data.data();
 
         KeyHolder<mode> key_holder;
@@ -631,7 +641,7 @@ private:
                 // 1.a.2: Set AAD if present
                 if (aad_column)
                 {
-                    const auto aad_data = aad_column->getDataAt(r);
+                    StringRef aad_data = aad_column->getDataAt(r);
                     int tmp_len = 0;
                     if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len,
                         reinterpret_cast<const unsigned char *>(aad_data.data), aad_data.size) != 1)
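The decrypt hunks above change how the result buffer is sized for AEAD (AES-GCM) inputs: every GCM ciphertext ends with a 16-byte authentication tag, so the plaintext needs at most `input_size - tag_size` bytes (plus a terminating zero), and inputs shorter than the tag are rejected up front. A small stand-alone sketch of that sizing rule follows; the function name and the boolean flag are illustrative, not ClickHouse API.

#include <cstddef>
#include <stdexcept>

constexpr std::size_t tag_size = 16;  // per RFC 5116, section 5.1

// Returns how many bytes to reserve for one decrypted value.
std::size_t plaintextBufferSize(std::size_t ciphertext_size, bool aead_mode)
{
    std::size_t result = ciphertext_size + 1;  // +1 for a terminating zero, as in the diff
    if (aead_mode)
    {
        if (ciphertext_size < tag_size)
            throw std::invalid_argument("Encrypted data is smaller than the AEAD tag, cannot decrypt.");
        result -= tag_size;  // the tag is verified, not copied into the result
    }
    return result;
}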
diff --git a/src/Functions/FunctionsBitmap.h b/src/Functions/FunctionsBitmap.h
index 054f8800630..aa4a6b50ea3 100644
--- a/src/Functions/FunctionsBitmap.h
+++ b/src/Functions/FunctionsBitmap.h
@@ -93,7 +93,7 @@ class FunctionBitmapBuildImpl : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapBuildImpl>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapBuildImpl>(); }
 
     String getName() const override { return name; }
@@ -221,7 +221,7 @@ class FunctionBitmapToArrayImpl : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapToArrayImpl>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapToArrayImpl>(); }
 
     String getName() const override { return name; }
@@ -311,7 +311,7 @@ class FunctionBitmapSubset : public IFunction
 {
 public:
     static constexpr auto name = Impl::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSubset<Impl>>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapSubset<Impl>>(); }
 
     String getName() const override { return name; }
@@ -469,7 +469,7 @@ class FunctionBitmapTransform : public IFunction
 {
 public:
     static constexpr auto name = "bitmapTransform";
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapTransform>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapTransform>(); }
 
     String getName() const override { return name; }
@@ -635,7 +635,7 @@ class FunctionBitmapSelfCardinalityImpl : public IFunction
 {
 public:
     static constexpr auto name = Impl::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSelfCardinalityImpl<Impl>>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapSelfCardinalityImpl<Impl>>(); }
 
     String getName() const override { return name; }
@@ -807,7 +807,7 @@ class FunctionBitmapContains : public IFunction
 {
 public:
     static constexpr auto name = "bitmapContains";
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapContains>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapContains>(); }
 
     String getName() const override { return name; }
@@ -911,7 +911,7 @@ class FunctionBitmapCardinality : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapCardinality>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmapCardinality>(); }
 
     String getName() const override { return name; }
@@ -1054,7 +1054,7 @@ class FunctionBitmap : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmap>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmap>(); }
 
     String getName() const override { return name; }
@@ -1100,14 +1100,14 @@ public:
             return executeBitmapData(arguments, input_rows_count);
         else if (which.isUInt64())
             return executeBitmapData(arguments, input_rows_count);
-        else if (which.isUInt8())
-            return executeBitmapData(arguments, input_rows_count);
-        else if (which.isUInt16())
-            return executeBitmapData(arguments, input_rows_count);
-        else if (which.isUInt32())
-            return executeBitmapData(arguments, input_rows_count);
-        else if (which.isUInt64())
-            return executeBitmapData(arguments, input_rows_count);
+        else if (which.isInt8())
+            return executeBitmapData(arguments, input_rows_count);
+        else if (which.isInt16())
+            return executeBitmapData(arguments, input_rows_count);
+        else if (which.isInt32())
+            return executeBitmapData(arguments, input_rows_count);
+        else if (which.isInt64())
+            return executeBitmapData(arguments, input_rows_count);
         else
             throw Exception(
                 "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
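The last bitmap hunk above fixes a copy-paste bug: the second group of branches repeated the unsigned checks (which could never match after the earlier ones), so signed integer arrays fell through to the "Unexpected type" exception. The dispatch now covers `Int8`/`Int16`/`Int32`/`Int64` as well. A minimal sketch of the corrected dispatch shape follows, with the `WhichDataType` checks reduced to an illustrative enum and a placeholder `executeBitmapData`.

#include <cstddef>
#include <cstdint>
#include <stdexcept>

enum class TypeIndex { UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Other };

// Stand-in for building a roaring bitmap over values of type T.
template <typename T>
std::size_t executeBitmapData() { return sizeof(T); }

std::size_t buildBitmap(TypeIndex which)
{
    switch (which)
    {
        case TypeIndex::UInt8:  return executeBitmapData<std::uint8_t>();
        case TypeIndex::UInt16: return executeBitmapData<std::uint16_t>();
        case TypeIndex::UInt32: return executeBitmapData<std::uint32_t>();
        case TypeIndex::UInt64: return executeBitmapData<std::uint64_t>();
        case TypeIndex::Int8:   return executeBitmapData<std::int8_t>();   // each signed width now has its own branch
        case TypeIndex::Int16:  return executeBitmapData<std::int16_t>();
        case TypeIndex::Int32:  return executeBitmapData<std::int32_t>();
        case TypeIndex::Int64:  return executeBitmapData<std::int64_t>();
        default: throw std::invalid_argument("Unexpected type of argument for bitmapBuild");
    }
}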
diff --git a/src/Functions/FunctionsCoding.cpp b/src/Functions/FunctionsCoding.cpp
index cf85b4512d5..c1e20a657b2 100644
--- a/src/Functions/FunctionsCoding.cpp
+++ b/src/Functions/FunctionsCoding.cpp
@@ -20,7 +20,7 @@ void registerFunctionsCoding(FunctionFactory & factory)
     factory.registerFunction();
     factory.registerFunction();
     factory.registerFunction(FunctionFactory::CaseInsensitive);
-    factory.registerFunction();
+    factory.registerFunction(FunctionFactory::CaseInsensitive);
     factory.registerFunction(FunctionFactory::CaseInsensitive);
     factory.registerFunction();
     factory.registerFunction();
diff --git a/src/Functions/FunctionsCoding.h b/src/Functions/FunctionsCoding.h
index abfb1e83a77..933d4f01b92 100644
--- a/src/Functions/FunctionsCoding.h
+++ b/src/Functions/FunctionsCoding.h
@@ -1,28 +1,29 @@
 #pragma once
 
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
-#include
+#include
+#include
+#include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
@@ -68,7 +69,7 @@ class FunctionIPv6NumToString : public IFunction
 {
 public:
     static constexpr auto name = "IPv6NumToString";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv6NumToString>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv6NumToString>(); }
 
     String getName() const override { return name; }
@@ -138,7 +139,7 @@ class FunctionCutIPv6 : public IFunction
 {
 public:
     static constexpr auto name = "cutIPv6";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionCutIPv6>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionCutIPv6>(); }
 
     String getName() const override { return name; }
@@ -261,7 +262,7 @@ class FunctionIPv6StringToNum : public IFunction
 {
 public:
     static constexpr auto name = "IPv6StringToNum";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv6StringToNum>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv6StringToNum>(); }
 
     static inline bool tryParseIPv4(const char * pos)
     {
@@ -339,7 +340,7 @@ class FunctionIPv4NumToString : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv4NumToString<mask_tail_octets, Name>>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv4NumToString<mask_tail_octets, Name>>(); }
 
     String getName() const override
     {
@@ -400,7 +401,7 @@ class FunctionIPv4StringToNum : public IFunction
 {
 public:
     static constexpr auto name = "IPv4StringToNum";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv4StringToNum>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv4StringToNum>(); }
 
     String getName() const override
     {
@@ -463,7 +464,7 @@ class FunctionIPv4ToIPv6 : public IFunction
 {
 public:
     static constexpr auto name = "IPv4ToIPv6";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv4ToIPv6>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv4ToIPv6>(); }
 
     String getName() const override { return name; }
@@ -518,7 +519,7 @@ class FunctionToIPv4 : public FunctionIPv4StringToNum
 {
 public:
     static constexpr auto name = "toIPv4";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionToIPv4>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionToIPv4>(); }
 
     String getName() const override
     {
@@ -541,7 +542,7 @@ class FunctionToIPv6 : public FunctionIPv6StringToNum
 {
 public:
     static constexpr auto name = "toIPv6";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionToIPv6>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionToIPv6>(); }
 
     String getName() const override { return name; }
@@ -559,7 +560,7 @@ class FunctionMACNumToString : public IFunction
 {
 public:
     static constexpr auto name = "MACNumToString";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionMACNumToString>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionMACNumToString>(); }
 
     String getName() const override
     {
@@ -689,7 +690,7 @@ class FunctionMACStringTo : public IFunction
 {
 public:
     static constexpr auto name = Impl::name;
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionMACStringTo<Impl>>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionMACStringTo<Impl>>(); }
 
     String getName() const override
     {
@@ -752,7 +753,7 @@ class FunctionUUIDNumToString : public IFunction
 
 public:
     static constexpr auto name = "UUIDNumToString";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionUUIDNumToString>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUUIDNumToString>(); }
 
     String getName() const override
     {
@@ -850,7 +851,7 @@ private:
 
 public:
     static constexpr auto name = "UUIDStringToNum";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionUUIDStringToNum>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUUIDStringToNum>(); }
 
     String getName() const override
     {
@@ -954,7 +955,7 @@ class FunctionHex : public IFunction
 {
 public:
     static constexpr auto name = "hex";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionHex>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionHex>(); }
 
     String getName() const override
     {
@@ -1237,7 +1238,7 @@ class FunctionUnhex : public IFunction
 {
 public:
     static constexpr auto name = "unhex";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionUnhex>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUnhex>(); }
 
     String getName() const override
     {
@@ -1326,7 +1327,7 @@ class FunctionChar : public IFunction
 {
 public:
     static constexpr auto name = "char";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionChar>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionChar>(); }
 
     String getName() const override
     {
@@ -1421,7 +1422,7 @@ class FunctionBitmaskToArray : public IFunction
 {
 public:
     static constexpr auto name = "bitmaskToArray";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmaskToArray>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionBitmaskToArray>(); }
 
     String getName() const override
     {
@@ -1507,7 +1508,7 @@ class FunctionToStringCutToZero : public IFunction
 {
 public:
     static constexpr auto name = "toStringCutToZero";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionToStringCutToZero>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionToStringCutToZero>(); }
 
     String getName() const override
     {
@@ -1645,7 +1646,7 @@ private:
     static inline void applyCIDRMask(const UInt8 * __restrict src, UInt8 * __restrict dst_lower, UInt8 * __restrict dst_upper, UInt8 bits_to_keep)
     {
-        __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(getCIDRMaskIPv6(bits_to_keep)));
+        __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(getCIDRMaskIPv6(bits_to_keep).data()));
         __m128i lower = _mm_and_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)), mask);
         _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_lower), lower);
 
@@ -1659,7 +1660,7 @@ private:
     /// NOTE IPv6 is stored in memory in big endian format that makes some difficulties.
     static void applyCIDRMask(const UInt8 * __restrict src, UInt8 * __restrict dst_lower, UInt8 * __restrict dst_upper, UInt8 bits_to_keep)
     {
-        const auto * mask = getCIDRMaskIPv6(bits_to_keep);
+        const auto & mask = getCIDRMaskIPv6(bits_to_keep);
 
         for (size_t i = 0; i < 16; ++i)
         {
@@ -1672,7 +1673,7 @@ private:
 
 public:
     static constexpr auto name = "IPv6CIDRToRange";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv6CIDRToRange>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv6CIDRToRange>(); }
     String getName() const override { return name; }
     size_t getNumberOfArguments() const override { return 2; }
 
@@ -1786,7 +1787,7 @@ private:
 
 public:
     static constexpr auto name = "IPv4CIDRToRange";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv4CIDRToRange>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIPv4CIDRToRange>(); }
     String getName() const override { return name; }
     size_t getNumberOfArguments() const override { return 2; }
 
@@ -1867,7 +1868,7 @@ class FunctionIsIPv4String : public FunctionIPv4StringToNum
 {
 public:
     static constexpr auto name = "isIPv4String";
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIsIPv4String>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIsIPv4String>(); }
 
     String getName() const override { return name; }
@@ -1913,7 +1914,7 @@ class FunctionIsIPv6String : public FunctionIPv6StringToNum
 {
 public:
     static constexpr auto name = "isIPv6String";
 
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionIsIPv6String>(); }
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIsIPv6String>(); }
 
     String getName() const override { return name; }
diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h
index bbfa1ea20d8..025b7c67b21 100644
--- a/src/Functions/FunctionsComparison.h
+++ b/src/Functions/FunctionsComparison.h
@@ -552,22 +552,18 @@
 struct NameLessOrEquals { static constexpr auto name = "lessOrEquals"; };
 struct NameGreaterOrEquals { static constexpr auto name = "greaterOrEquals"; };
 
-template <
-    template <typename, typename> class Op,
-    typename Name>
+template