diff --git a/.gitignore b/.gitignore index 4bc162c1b0f..8a745655cbf 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,7 @@ website/package-lock.json /programs/server/store /programs/server/uuid /programs/server/coordination +/programs/server/workload # temporary test files tests/queries/0_stateless/test_* diff --git a/.gitmodules b/.gitmodules index bd61c52a5e0..a3b6450032a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -227,12 +227,6 @@ [submodule "contrib/minizip-ng"] path = contrib/minizip-ng url = https://github.com/zlib-ng/minizip-ng -[submodule "contrib/qpl"] - path = contrib/qpl - url = https://github.com/intel/qpl -[submodule "contrib/idxd-config"] - path = contrib/idxd-config - url = https://github.com/intel/idxd-config [submodule "contrib/QAT-ZSTD-Plugin"] path = contrib/QAT-ZSTD-Plugin url = https://github.com/intel/QAT-ZSTD-Plugin @@ -338,7 +332,7 @@ url = https://github.com/ClickHouse/usearch.git [submodule "contrib/SimSIMD"] path = contrib/SimSIMD - url = https://github.com/ashvardanian/SimSIMD.git + url = https://github.com/ClickHouse/SimSIMD.git [submodule "contrib/FP16"] path = contrib/FP16 url = https://github.com/Maratyszcza/FP16.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c0d21a4698..dacee73440f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v24.10, 2024-10-31](#2410)**
**[ClickHouse release v24.9, 2024-09-26](#249)**
**[ClickHouse release v24.8 LTS, 2024-08-20](#248)**
**[ClickHouse release v24.7, 2024-07-30](#247)**
@@ -12,6 +13,165 @@ # 2024 Changelog +### ClickHouse release 24.10, 2024-10-31 + +#### Backward Incompatible Change +* Allow to write `SETTINGS` before `FORMAT` in a chain of queries with `UNION` when subqueries are inside parentheses. This closes [#39712](https://github.com/ClickHouse/ClickHouse/issues/39712). Change the behavior when a query has the SETTINGS clause specified twice in a sequence: the closest SETTINGS clause now takes precedence for the corresponding subquery. In previous versions, the outermost SETTINGS clause could take precedence over the inner one. [#68614](https://github.com/ClickHouse/ClickHouse/pull/68614) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Reordering of filter conditions from the `[PRE]WHERE` clause is now allowed by default. It can be disabled by setting `allow_reorder_prewhere_conditions` to `false`. [#70657](https://github.com/ClickHouse/ClickHouse/pull/70657) ([Nikita Taranov](https://github.com/nickitat)). +* Remove the `idxd-config` library, which has an incompatible license. This also removes the experimental Intel DeflateQPL codec. [#70987](https://github.com/ClickHouse/ClickHouse/pull/70987) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### New Feature +* Allow to grant access to wildcard prefixes: `GRANT SELECT ON db.table_prefix_* TO user`. [#65311](https://github.com/ClickHouse/ClickHouse/pull/65311) ([pufit](https://github.com/pufit)). +* If you press the space bar during query runtime, the client will display a real-time table with detailed metrics. You can enable it globally with the new `--progress-table` option in clickhouse-client; a new `--enable-progress-table-toggle` option is associated with the `--progress-table` option and toggles the rendering of the progress table by pressing the control key (Space). [#63689](https://github.com/ClickHouse/ClickHouse/pull/63689) ([Maria Khristenko](https://github.com/mariaKhr)), [#70423](https://github.com/ClickHouse/ClickHouse/pull/70423) ([Julia Kartseva](https://github.com/jkartseva)). +* Allow to cache read files for object storage table engines and data lakes, using a hash of the ETag + file path as the cache key. [#70135](https://github.com/ClickHouse/ClickHouse/pull/70135) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support creating a table with a query: `CREATE TABLE ... CLONE AS ...`. It clones the source table's schema and then attaches all partitions to the newly created table. This feature is only supported with tables of the `MergeTree` family. Closes [#65015](https://github.com/ClickHouse/ClickHouse/issues/65015). [#69091](https://github.com/ClickHouse/ClickHouse/pull/69091) ([tuanpach](https://github.com/tuanpach)). +* Add a new system table, `system.query_metric_log`, which contains the history of memory and metric values from the table `system.events` for individual queries, periodically flushed to disk. [#66532](https://github.com/ClickHouse/ClickHouse/pull/66532) ([Pablo Marcos](https://github.com/pamarcos)). +* A simple SELECT query can be written with implicit SELECT to enable calculator-style expressions, e.g., `ch "1 + 2"`. This is controlled by a new setting, `implicit_select`. [#68502](https://github.com/ClickHouse/ClickHouse/pull/68502) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support the `--copy` mode for clickhouse local as a shortcut for format conversion [#68503](https://github.com/ClickHouse/ClickHouse/issues/68503).
[#68583](https://github.com/ClickHouse/ClickHouse/pull/68583) ([Denis Hananein](https://github.com/denis-hananein)). +* Add a built-in HTML page for visualizing merges, available at the `/merges` path. [#70821](https://github.com/ClickHouse/ClickHouse/pull/70821) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add support for the `arrayUnion` function. [#68989](https://github.com/ClickHouse/ClickHouse/pull/68989) ([Peter Nguyen](https://github.com/petern48)). +* Allow parametrised SQL aliases. [#50665](https://github.com/ClickHouse/ClickHouse/pull/50665) ([Anton Kozlov](https://github.com/tonickkozlov)). +* A new aggregate function `quantileExactWeightedInterpolated`, which is an interpolated version based on quantileExactWeighted. Some people may wonder why we need a new `quantileExactWeightedInterpolated` since we already have `quantileExactInterpolatedWeighted`. The reason is that the new one is more accurate than the old one. This is for Spark compatibility. [#69619](https://github.com/ClickHouse/ClickHouse/pull/69619) ([李扬](https://github.com/taiyang-li)). +* A new function `arrayElementOrNull`. It returns `NULL` if the array index is out of range or a Map key is not found. [#69646](https://github.com/ClickHouse/ClickHouse/pull/69646) ([李扬](https://github.com/taiyang-li)). +* Allows users to specify regular expressions through new `message_regexp` and `message_regexp_negative` fields in the `config.xml` file to filter out logging. The filtering is applied to the formatted un-colored text for the most intuitive developer experience. [#69657](https://github.com/ClickHouse/ClickHouse/pull/69657) ([Peter Nguyen](https://github.com/petern48)). +* Added `RIPEMD160` function, which computes the RIPEMD-160 cryptographic hash of a string. Example: `SELECT HEX(RIPEMD160('The quick brown fox jumps over the lazy dog'))` returns `37F332F68DB77BD9D7EDD4969571AD671CF9DD3B`. [#70087](https://github.com/ClickHouse/ClickHouse/pull/70087) ([Dergousov Maxim](https://github.com/m7kss1)). +* Support reading `Iceberg` tables on `HDFS`. [#70268](https://github.com/ClickHouse/ClickHouse/pull/70268) ([flynn](https://github.com/ucasfl)). +* Support for CTE in the form of `WITH ... INSERT`, as previously we only supported `INSERT ... WITH ...`. [#70593](https://github.com/ClickHouse/ClickHouse/pull/70593) ([Shichao Jin](https://github.com/jsc0218)). +* MongoDB integration: support for all MongoDB types, support for WHERE and ORDER BY statements on the MongoDB side, restriction for expressions unsupported by MongoDB. Note that the new integration is disabled by default; to use it, please set `` to `false` in the server config. [#63279](https://github.com/ClickHouse/ClickHouse/pull/63279) ([Kirill Nikiforov](https://github.com/allmazz)). +* A new function `getSettingOrDefault` was added to return the default value and avoid an exception if a custom setting is not found in the current profile. [#69917](https://github.com/ClickHouse/ClickHouse/pull/69917) ([Shankar](https://github.com/shiyer7474)). + +#### Experimental feature +* Refreshable materialized views are production ready. [#70550](https://github.com/ClickHouse/ClickHouse/pull/70550) ([Michael Kolupaev](https://github.com/al13n321)). Refreshable materialized views are now supported in Replicated databases. [#60669](https://github.com/ClickHouse/ClickHouse/pull/60669) ([Michael Kolupaev](https://github.com/al13n321)). +* Parallel replicas are moved from experimental to beta. Reworked settings that control the behavior of parallel replicas algorithms.
A quick recap: ClickHouse has four different algorithms for parallel reading involving multiple replicas, which is reflected in the setting `parallel_replicas_mode`; its default value is `read_tasks`. Additionally, the toggle-switch setting `enable_parallel_replicas` has been added. [#63151](https://github.com/ClickHouse/ClickHouse/pull/63151) ([Alexey Milovidov](https://github.com/alexey-milovidov)), ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Support for the `Dynamic` type in most functions by executing them on internal types inside `Dynamic`. [#69691](https://github.com/ClickHouse/ClickHouse/pull/69691) ([Pavel Kruglov](https://github.com/Avogar)). +* Allow to read/write the `JSON` type as a binary string in the `RowBinary` format under the settings `input_format_binary_read_json_as_string/output_format_binary_write_json_as_string`. [#70288](https://github.com/ClickHouse/ClickHouse/pull/70288) ([Pavel Kruglov](https://github.com/Avogar)). +* Allow to serialize/deserialize a `JSON` column as a single String column in the Native format. For output, use the setting `output_format_native_write_json_as_string`. For input, use serialization version `1` before the column data. [#70312](https://github.com/ClickHouse/ClickHouse/pull/70312) ([Pavel Kruglov](https://github.com/Avogar)). +* Introduced a special (experimental) mode of a merge selector for MergeTree tables which makes it more aggressive for partitions that are close to the limit on the number of parts. It is controlled by the `merge_selector_use_blurry_base` MergeTree-level setting. [#70645](https://github.com/ClickHouse/ClickHouse/pull/70645) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Implement generic ser/de between Avro's `Union` and ClickHouse's `Variant` types. Resolves [#69713](https://github.com/ClickHouse/ClickHouse/issues/69713). [#69712](https://github.com/ClickHouse/ClickHouse/pull/69712) ([Jiří Kozlovský](https://github.com/jirislav)). + +#### Performance Improvement +* Refactor `IDisk` and `IObjectStorage` for better performance. Tables from `plain` and `plain_rewritable` object storages will initialize faster. [#68146](https://github.com/ClickHouse/ClickHouse/pull/68146) ([Alexey Milovidov](https://github.com/alexey-milovidov), [Julia Kartseva](https://github.com/jkartseva)). Do not call the LIST object storage API when determining if a file or directory exists on the plain rewritable disk, as it can be cost-inefficient. [#70852](https://github.com/ClickHouse/ClickHouse/pull/70852) ([Julia Kartseva](https://github.com/jkartseva)). Reduce the number of object storage HEAD API requests in the plain_rewritable disk. [#70915](https://github.com/ClickHouse/ClickHouse/pull/70915) ([Julia Kartseva](https://github.com/jkartseva)). +* Added the ability to parse data directly into sparse columns. [#69828](https://github.com/ClickHouse/ClickHouse/pull/69828) ([Anton Popov](https://github.com/CurtizJ)). +* Improved performance of parsing formats with a high number of missing values (e.g. `JSONEachRow`). [#69875](https://github.com/ClickHouse/ClickHouse/pull/69875) ([Anton Popov](https://github.com/CurtizJ)). +* Support parallel reading of Parquet row groups and prefetching of row groups in single-threaded mode. [#69862](https://github.com/ClickHouse/ClickHouse/pull/69862) ([LiuNeng](https://github.com/liuneng1994)). +* Support minmax index for `pointInPolygon`. [#62085](https://github.com/ClickHouse/ClickHouse/pull/62085) ([JackyWoo](https://github.com/JackyWoo)).
+* Use bloom filters when reading Parquet files. [#62966](https://github.com/ClickHouse/ClickHouse/pull/62966) ([Arthur Passos](https://github.com/arthurpassos)). +* Lock-free parts rename to avoid INSERT affecting SELECT (due to the parts lock): under normal circumstances with `fsync_part_directory`, QPS of SELECT with INSERT in parallel increased 2x; under heavy load the effect is even bigger. Note, this only includes `ReplicatedMergeTree` for now. [#64955](https://github.com/ClickHouse/ClickHouse/pull/64955) ([Azat Khuzhin](https://github.com/azat)). +* Respect `ttl_only_drop_parts` on `materialize ttl`; only read necessary columns to recalculate TTL and drop parts by replacing them with an empty one. [#65488](https://github.com/ClickHouse/ClickHouse/pull/65488) ([Andrey Zvonov](https://github.com/zvonand)). +* Optimized thread creation in the ThreadPool to minimize lock contention. Thread creation is now performed outside of the critical section to avoid delays in job scheduling and thread management under high load conditions. This leads to a much more responsive ClickHouse under heavy concurrent load. [#68694](https://github.com/ClickHouse/ClickHouse/pull/68694) ([filimonov](https://github.com/filimonov)). +* Enable reading `LowCardinality` string columns from `ORC`. [#69481](https://github.com/ClickHouse/ClickHouse/pull/69481) ([李扬](https://github.com/taiyang-li)). +* Use `LowCardinality` for `ProfileEvents` in system logs such as `part_log`, `query_views_log`, `filesystem_cache_log`. [#70152](https://github.com/ClickHouse/ClickHouse/pull/70152) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve performance of `fromUnixTimestamp`/`toUnixTimestamp` functions. [#71042](https://github.com/ClickHouse/ClickHouse/pull/71042) ([kevinyhzou](https://github.com/KevinyhZou)). +* Don't disable nonblocking read from page cache for the entire server when reading from blocking I/O. This led to poorer performance when a single filesystem (e.g., tmpfs) didn't support the `preadv2` syscall while others did. [#70299](https://github.com/ClickHouse/ClickHouse/pull/70299) ([Antonio Andelic](https://github.com/antonio2368)). +* `ALTER TABLE .. REPLACE PARTITION` no longer waits for mutations/merges that happen in other partitions. [#59138](https://github.com/ClickHouse/ClickHouse/pull/59138) ([Vasily Nemkov](https://github.com/Enmk)). +* Don't do validation when synchronizing ACL from Keeper. It's validated during creation. It shouldn't matter that much, but there are installations with tens of thousands of users or more created, and the unnecessary hash validation can take a long time to finish during server startup (it synchronizes everything from Keeper). [#70644](https://github.com/ClickHouse/ClickHouse/pull/70644) ([Raúl Marín](https://github.com/Algunenano)). + +#### Improvement +* `CREATE TABLE AS` will copy `PRIMARY KEY`, `ORDER BY`, and similar clauses (of `MergeTree` tables). [#69739](https://github.com/ClickHouse/ClickHouse/pull/69739) ([sakulali](https://github.com/sakulali)). +* Support 64-bit XID in Keeper. It can be enabled with the `use_xid_64` configuration value. [#69908](https://github.com/ClickHouse/ClickHouse/pull/69908) ([Antonio Andelic](https://github.com/antonio2368)). +* Command-line arguments for Bool settings are set to true when no value is provided for the argument (e.g. `clickhouse-client --optimize_aggregation_in_order --query "SELECT 1"`).
[#70459](https://github.com/ClickHouse/ClickHouse/pull/70459) ([davidtsuk](https://github.com/davidtsuk)). +* Added user-level settings `min_free_disk_bytes_to_throw_insert` and `min_free_disk_ratio_to_throw_insert` to prevent insertions on disks that are almost full. [#69755](https://github.com/ClickHouse/ClickHouse/pull/69755) ([Marco Vilas Boas](https://github.com/marco-vb)). +* Embedded documentation for settings will be strictly more detailed and complete than the documentation on the website. This is the first step before making the website documentation always auto-generated from the source code. This has long-standing implications: - it will be guaranteed to have every setting; - there is no chance of default values becoming obsolete; - we can generate this documentation for each ClickHouse version; - the documentation can be displayed by the server itself even without Internet access. Generate the docs on the website from the source code. [#70289](https://github.com/ClickHouse/ClickHouse/pull/70289) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow an empty needle in the function `replace`, the same behavior as PostgreSQL. [#69918](https://github.com/ClickHouse/ClickHouse/pull/69918) ([zhanglistar](https://github.com/zhanglistar)). +* Allow an empty needle in the functions `replaceRegexp*`. [#70053](https://github.com/ClickHouse/ClickHouse/pull/70053) ([zhanglistar](https://github.com/zhanglistar)). +* Symbolic links for tables in the `data/database_name/` directory are created for the actual paths to the table's data, depending on the storage policy, instead of the `store/...` directory on the default disk. [#61777](https://github.com/ClickHouse/ClickHouse/pull/61777) ([Kirill](https://github.com/kirillgarbar)). +* While parsing an `Enum` field from `JSON`, a string containing an integer will be interpreted as the corresponding `Enum` element. This closes [#65119](https://github.com/ClickHouse/ClickHouse/issues/65119). [#66801](https://github.com/ClickHouse/ClickHouse/pull/66801) ([scanhex12](https://github.com/scanhex12)). +* Allow `TRIM`-ing a `LEADING` or `TRAILING` empty string as a no-op. Closes [#67792](https://github.com/ClickHouse/ClickHouse/issues/67792). [#68455](https://github.com/ClickHouse/ClickHouse/pull/68455) ([Peter Nguyen](https://github.com/petern48)). +* Improve compatibility of `cast(timestamp as String)` with Spark. [#69179](https://github.com/ClickHouse/ClickHouse/pull/69179) ([Wenzheng Liu](https://github.com/lwz9103)). +* Always use the new analyzer to calculate constant expressions when `enable_analyzer` is set to `true`. Support calculation of `executable` table function arguments without using a `SELECT` query for constant expressions. [#69292](https://github.com/ClickHouse/ClickHouse/pull/69292) ([Dmitry Novik](https://github.com/novikd)). +* Add a setting `enable_secure_identifiers` to disallow identifiers with special characters. [#69411](https://github.com/ClickHouse/ClickHouse/pull/69411) ([tuanpach](https://github.com/tuanpach)). +* Add `show_create_query_identifier_quoting_rule` to define identifier quoting behavior in the `SHOW CREATE TABLE` query result. Possible values: - `user_display`: when the identifier is a keyword. - `when_necessary`: when the identifier is one of `{"distinct", "all", "table"}` and when it could lead to ambiguity: column names, dictionary attribute names. - `always`: always quote identifiers. [#69448](https://github.com/ClickHouse/ClickHouse/pull/69448) ([tuanpach](https://github.com/tuanpach)).
+* Improve restoring of access entities' dependencies. [#69563](https://github.com/ClickHouse/ClickHouse/pull/69563) ([Vitaly Baranov](https://github.com/vitlibar)). +* If you run `clickhouse-client` or another CLI application and it starts up slowly due to an overloaded server, and you start typing your query, such as `SELECT`, previous versions would display the remainder of the terminal echo contents before printing the greeting message, such as `SELECTClickHouse local version 24.10.1.1.` instead of `ClickHouse local version 24.10.1.1.`. Now it is fixed. This closes [#31696](https://github.com/ClickHouse/ClickHouse/issues/31696). [#69856](https://github.com/ClickHouse/ClickHouse/pull/69856) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a new column `readonly_duration` to the `system.replicas` table. Needed to be able to distinguish actual readonly replicas from sentinel ones in alerts. [#69871](https://github.com/ClickHouse/ClickHouse/pull/69871) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Change the type of the `join_output_by_rowlist_perkey_rows_threshold` setting to unsigned integer. [#69886](https://github.com/ClickHouse/ClickHouse/pull/69886) ([kevinyhzou](https://github.com/KevinyhZou)). +* Enhance OpenTelemetry span logging to include query settings. [#70011](https://github.com/ClickHouse/ClickHouse/pull/70011) ([sharathks118](https://github.com/sharathks118)). +* Add diagnostic info about higher-order array functions if the lambda result type is unexpected. [#70093](https://github.com/ClickHouse/ClickHouse/pull/70093) ([ttanay](https://github.com/ttanay)). +* Keeper improvement: less locking during cluster changes. [#70275](https://github.com/ClickHouse/ClickHouse/pull/70275) ([Antonio Andelic](https://github.com/antonio2368)). +* Add `WITH IMPLICIT` and `FINAL` keywords to the `SHOW GRANTS` command. Fix a minor bug with implicit grants: [#70094](https://github.com/ClickHouse/ClickHouse/issues/70094). [#70293](https://github.com/ClickHouse/ClickHouse/pull/70293) ([pufit](https://github.com/pufit)). +* Respect `compatibility` for MergeTree settings. The `compatibility` value is taken from the `default` profile on server startup, and default MergeTree settings are changed accordingly. Further changes of the `compatibility` setting do not affect MergeTree settings. [#70322](https://github.com/ClickHouse/ClickHouse/pull/70322) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Avoid spamming the logs with large HTTP response bodies in case of errors during inter-server communication. [#70487](https://github.com/ClickHouse/ClickHouse/pull/70487) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Added a new setting `max_parts_to_move` to control the maximum number of parts that can be moved at once. [#70520](https://github.com/ClickHouse/ClickHouse/pull/70520) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Limit the frequency of certain log messages. [#70601](https://github.com/ClickHouse/ClickHouse/pull/70601) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* `CHECK TABLE` with the `PART` qualifier was incorrectly formatted in the client. [#70660](https://github.com/ClickHouse/ClickHouse/pull/70660) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support writing the column index and the offset index using the Parquet native writer. [#70669](https://github.com/ClickHouse/ClickHouse/pull/70669) ([LiuNeng](https://github.com/liuneng1994)).
+* Support parsing `DateTime64` for microseconds and timezone in Joda syntax ("Joda" is a popular Java library for date and time, and the "Joda syntax" is that library's style). [#70737](https://github.com/ClickHouse/ClickHouse/pull/70737) ([kevinyhzou](https://github.com/KevinyhZou)). +* Changed the approach to figure out if a cloud storage supports [batch delete](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) or not. [#70786](https://github.com/ClickHouse/ClickHouse/pull/70786) ([Vitaly Baranov](https://github.com/vitlibar)). +* Support for Parquet page v2 in the native reader. [#70807](https://github.com/ClickHouse/ClickHouse/pull/70807) ([Arthur Passos](https://github.com/arthurpassos)). +* Added a check if a table has both `storage_policy` and `disk` set, and a check if a new storage policy is compatible with the old one when using the `disk` setting. [#70839](https://github.com/ClickHouse/ClickHouse/pull/70839) ([Kirill](https://github.com/kirillgarbar)). +* Add `system.s3_queue_settings` and `system.azure_queue_settings`. [#70841](https://github.com/ClickHouse/ClickHouse/pull/70841) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Functions `base58Encode` and `base58Decode` now accept arguments of type `FixedString`. Example: `SELECT base58Encode(toFixedString('plaintext', 9));`. [#70846](https://github.com/ClickHouse/ClickHouse/pull/70846) ([Faizan Patel](https://github.com/faizan2786)). +* Add the `partition` column to every entry type of the part log. Previously, it was set only for some entries. This closes [#70819](https://github.com/ClickHouse/ClickHouse/issues/70819). [#70848](https://github.com/ClickHouse/ClickHouse/pull/70848) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `MergeStart` and `MutateStart` events into `system.part_log`, which helps with merge analysis and visualization. [#70850](https://github.com/ClickHouse/ClickHouse/pull/70850) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a profile event about the number of merged source parts. It allows the monitoring of the fanout of the merge tree in production. [#70908](https://github.com/ClickHouse/ClickHouse/pull/70908) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Background downloads to the filesystem cache have been re-enabled. [#70929](https://github.com/ClickHouse/ClickHouse/pull/70929) ([Nikita Taranov](https://github.com/nickitat)). +* Add a new merge selector algorithm, named `Trivial`, for professional usage only. It is worse than the `Simple` merge selector. [#70969](https://github.com/ClickHouse/ClickHouse/pull/70969) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support for atomic `CREATE OR REPLACE VIEW`. [#70536](https://github.com/ClickHouse/ClickHouse/pull/70536) ([tuanpach](https://github.com/tuanpach)). +* Added `strict_once` mode to the aggregate function `windowFunnel` to avoid counting one event several times in case it matches multiple conditions, closes [#21835](https://github.com/ClickHouse/ClickHouse/issues/21835). [#69738](https://github.com/ClickHouse/ClickHouse/pull/69738) ([Vladimir Cherkasov](https://github.com/vdimir)). + +#### Bug Fix (user-visible misbehavior in an official stable release) +* Apply configuration updates in the global context object. It fixes issues like [#62308](https://github.com/ClickHouse/ClickHouse/issues/62308). [#62944](https://github.com/ClickHouse/ClickHouse/pull/62944) ([Amos Bird](https://github.com/amosbird)).
+* Fix `ReadSettings` not using user-set values; only defaults were used. [#65625](https://github.com/ClickHouse/ClickHouse/pull/65625) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix type mismatch issue in `sumMapFiltered` when using signed arguments. [#58408](https://github.com/ClickHouse/ClickHouse/pull/58408) ([Chen768959](https://github.com/Chen768959)). +* Fix toHour-like conversion functions' monotonicity when an optional time zone argument is passed. [#60264](https://github.com/ClickHouse/ClickHouse/pull/60264) ([Amos Bird](https://github.com/amosbird)). +* Relax `supportsPrewhere` check for `Merge` tables. This fixes [#61064](https://github.com/ClickHouse/ClickHouse/issues/61064). It was hardened unnecessarily in [#60082](https://github.com/ClickHouse/ClickHouse/issues/60082). [#61091](https://github.com/ClickHouse/ClickHouse/pull/61091) ([Amos Bird](https://github.com/amosbird)). +* Fix `use_concurrency_control` setting handling for proper `concurrent_threads_soft_limit_num` limit enforcing. This enables concurrency control by default because previously it was broken. [#61473](https://github.com/ClickHouse/ClickHouse/pull/61473) ([Sergei Trifonov](https://github.com/serxa)). +* Fix incorrect `JOIN ON` section optimization in case of an `IS NULL` check under any other function (like `NOT`) that may lead to wrong results. Closes [#67915](https://github.com/ClickHouse/ClickHouse/issues/67915). [#68049](https://github.com/ClickHouse/ClickHouse/pull/68049) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Prevent `ALTER` queries that would make the `CREATE` query of tables invalid. [#68574](https://github.com/ClickHouse/ClickHouse/pull/68574) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix inconsistent AST formatting for `negate` (`-`) and `NOT` functions with tuples and arrays. [#68600](https://github.com/ClickHouse/ClickHouse/pull/68600) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix insertion of incomplete type into `Dynamic` during deserialization. It could lead to `Parameter out of bound` errors. [#69291](https://github.com/ClickHouse/ClickHouse/pull/69291) ([Pavel Kruglov](https://github.com/Avogar)). +* Zero-copy replication, which is experimental and should not be used in production: fix infinite loop after `restore replica` in the replicated merge tree with zero copy. [#69293](https://github.com/ClickHouse/ClickHouse/pull/69293) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Return the default value of `processing_threads_num` back to the number of CPU cores in storage `S3Queue`. [#69384](https://github.com/ClickHouse/ClickHouse/pull/69384) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Bypass try/catch flow when de/serializing nested repeated protobuf to nested columns (fixes [#41971](https://github.com/ClickHouse/ClickHouse/issues/41971)). [#69556](https://github.com/ClickHouse/ClickHouse/pull/69556) ([Eliot Hautefeuille](https://github.com/hileef)). +* Fix crash during insertion into a FixedString column in the PostgreSQL engine. [#69584](https://github.com/ClickHouse/ClickHouse/pull/69584) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix crash when executing `create view t as (with recursive 42 as ttt select ttt);`. [#69676](https://github.com/ClickHouse/ClickHouse/pull/69676) ([Han Fei](https://github.com/hanfei1991)). +* Fixed `maxMapState` throwing 'Bad get' if the value type is DateTime64. [#69787](https://github.com/ClickHouse/ClickHouse/pull/69787) ([Michael Kolupaev](https://github.com/al13n321)).
+* Fix `getSubcolumn` with `LowCardinality` columns by overriding `useDefaultImplementationForLowCardinalityColumns` to return `true`. [#69831](https://github.com/ClickHouse/ClickHouse/pull/69831) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Fix permanently blocked distributed sends if a DROP of a distributed table failed. [#69843](https://github.com/ClickHouse/ClickHouse/pull/69843) ([Azat Khuzhin](https://github.com/azat)). +* Fix non-cancellable queries containing WITH FILL with NaN keys. This closes [#69261](https://github.com/ClickHouse/ClickHouse/issues/69261). [#69845](https://github.com/ClickHouse/ClickHouse/pull/69845) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix analyzer default with old compatibility value. [#69895](https://github.com/ClickHouse/ClickHouse/pull/69895) ([Raúl Marín](https://github.com/Algunenano)). +* Don't check dependencies during CREATE OR REPLACE VIEW during DROP of the old table. Previously, the CREATE OR REPLACE query failed when there were dependent tables of the recreated view. [#69907](https://github.com/ClickHouse/ClickHouse/pull/69907) ([Pavel Kruglov](https://github.com/Avogar)). +* Something for Decimal. Fixes [#69730](https://github.com/ClickHouse/ClickHouse/issues/69730). [#69978](https://github.com/ClickHouse/ClickHouse/pull/69978) ([Arthur Passos](https://github.com/arthurpassos)). +* Now DEFINER/INVOKER will work with parameterized views. [#69984](https://github.com/ClickHouse/ClickHouse/pull/69984) ([pufit](https://github.com/pufit)). +* Fix parsing for view's definers. [#69985](https://github.com/ClickHouse/ClickHouse/pull/69985) ([pufit](https://github.com/pufit)). +* Fixed a bug where the timezone could change the result of a query with `Date` or `Date32` arguments. [#70036](https://github.com/ClickHouse/ClickHouse/pull/70036) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fixes `Block structure mismatch` for queries with nested views and a `WHERE` condition. Fixes [#66209](https://github.com/ClickHouse/ClickHouse/issues/66209). [#70054](https://github.com/ClickHouse/ClickHouse/pull/70054) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Avoid reusing columns among different named tuples when evaluating `tuple` functions. This fixes [#70022](https://github.com/ClickHouse/ClickHouse/issues/70022). [#70103](https://github.com/ClickHouse/ClickHouse/pull/70103) ([Amos Bird](https://github.com/amosbird)). +* Fix wrong LOGICAL_ERROR when replacing literals in ranges. [#70122](https://github.com/ClickHouse/ClickHouse/pull/70122) ([Pablo Marcos](https://github.com/pamarcos)). +* Check for the Nullable(Nothing) type during ALTER TABLE MODIFY COLUMN/QUERY to prevent tables with such a data type. [#70123](https://github.com/ClickHouse/ClickHouse/pull/70123) ([Pavel Kruglov](https://github.com/Avogar)). +* Proper error message for the illegal query `JOIN ... ON *`, closes [#68650](https://github.com/ClickHouse/ClickHouse/issues/68650). [#70124](https://github.com/ClickHouse/ClickHouse/pull/70124) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix wrong result with a skipping index. [#70127](https://github.com/ClickHouse/ClickHouse/pull/70127) ([Raúl Marín](https://github.com/Algunenano)). +* Fix data race in the ColumnObject/ColumnTuple decompress method that could lead to heap use after free. [#70137](https://github.com/ClickHouse/ClickHouse/pull/70137) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix possible hang in ALTER COLUMN with Dynamic type.
[#70144](https://github.com/ClickHouse/ClickHouse/pull/70144) ([Pavel Kruglov](https://github.com/Avogar)). +* Now ClickHouse will consider more errors as retriable and will not mark data parts as broken in case of such errors. [#70145](https://github.com/ClickHouse/ClickHouse/pull/70145) ([alesapin](https://github.com/alesapin)). +* Use correct `max_types` parameter during Dynamic type creation for JSON subcolumn. [#70147](https://github.com/ClickHouse/ClickHouse/pull/70147) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix the password being displayed in `system.query_log` for users with bcrypt password authentication method. [#70148](https://github.com/ClickHouse/ClickHouse/pull/70148) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix event counter for the native interface (InterfaceNativeSendBytes). [#70153](https://github.com/ClickHouse/ClickHouse/pull/70153) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix possible crash related to JSON columns. [#70172](https://github.com/ClickHouse/ClickHouse/pull/70172) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix multiple issues with arrayMin and arrayMax. [#70207](https://github.com/ClickHouse/ClickHouse/pull/70207) ([Raúl Marín](https://github.com/Algunenano)). +* Respect setting allow_simdjson in the JSON type parser. [#70218](https://github.com/ClickHouse/ClickHouse/pull/70218) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix a null pointer dereference on creating a materialized view with two selects and an `INTERSECT`, e.g. `CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1);`. [#70264](https://github.com/ClickHouse/ClickHouse/pull/70264) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Don't modify global settings with startup scripts. Previously, changing a setting in a startup script would change it globally. [#70310](https://github.com/ClickHouse/ClickHouse/pull/70310) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix ALTER of `Dynamic` type with reducing max_types parameter that could lead to server crash. [#70328](https://github.com/ClickHouse/ClickHouse/pull/70328) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix crash when using WITH FILL incorrectly. [#70338](https://github.com/ClickHouse/ClickHouse/pull/70338) ([Raúl Marín](https://github.com/Algunenano)). +* Fix possible use-after-free in `SYSTEM DROP FORMAT SCHEMA CACHE FOR Protobuf`. [#70358](https://github.com/ClickHouse/ClickHouse/pull/70358) ([Azat Khuzhin](https://github.com/azat)). +* Fix crash during GROUP BY JSON sub-object subcolumn. [#70374](https://github.com/ClickHouse/ClickHouse/pull/70374) ([Pavel Kruglov](https://github.com/Avogar)). +* Don't prefetch parts for vertical merges if part has no rows. [#70452](https://github.com/ClickHouse/ClickHouse/pull/70452) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix crash in WHERE with lambda functions. [#70464](https://github.com/ClickHouse/ClickHouse/pull/70464) ([Raúl Marín](https://github.com/Algunenano)). +* Fix table creation with `CREATE ... AS table_function(...)` with database `Replicated` and unavailable table function source on secondary replica. [#70511](https://github.com/ClickHouse/ClickHouse/pull/70511) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Ignore all output on async insert with `wait_for_async_insert=1`. Closes [#62644](https://github.com/ClickHouse/ClickHouse/issues/62644). [#70530](https://github.com/ClickHouse/ClickHouse/pull/70530) ([Konstantin Bogdanov](https://github.com/thevar1able)). 
+* Ignore frozen_metadata.txt while traversing shadow directory from system.remote_data_paths. [#70590](https://github.com/ClickHouse/ClickHouse/pull/70590) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Fix creation of stateful window functions on misaligned memory. [#70631](https://github.com/ClickHouse/ClickHouse/pull/70631) ([Raúl Marín](https://github.com/Algunenano)). +* Fixed rare crashes in `SELECT`-s and merges after adding a column of `Array` type with non-empty default expression. [#70695](https://github.com/ClickHouse/ClickHouse/pull/70695) ([Anton Popov](https://github.com/CurtizJ)). +* Insert into table function s3 will respect query settings. [#70696](https://github.com/ClickHouse/ClickHouse/pull/70696) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix infinite recursion when inferring a protobuf schema when skipping unsupported fields is enabled. [#70697](https://github.com/ClickHouse/ClickHouse/pull/70697) ([Raúl Marín](https://github.com/Algunenano)). +* Disable enable_named_columns_in_function_tuple by default. [#70833](https://github.com/ClickHouse/ClickHouse/pull/70833) ([Raúl Marín](https://github.com/Algunenano)). +* Fix S3Queue table engine setting processing_threads_num not being effective in case it was deduced from the number of cpu cores on the server. [#70837](https://github.com/ClickHouse/ClickHouse/pull/70837) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Normalize named tuple arguments in aggregation states. This fixes [#69732](https://github.com/ClickHouse/ClickHouse/issues/69732) . [#70853](https://github.com/ClickHouse/ClickHouse/pull/70853) ([Amos Bird](https://github.com/amosbird)). +* Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix `limit by`, `limit with ties` for distributed and parallel replicas. [#70880](https://github.com/ClickHouse/ClickHouse/pull/70880) ([Nikita Taranov](https://github.com/nickitat)). + + ### ClickHouse release 24.9, 2024-09-26 #### Backward Incompatible Change @@ -328,6 +488,7 @@ * Remove `is_deterministic` field from the `system.functions` table. [#66630](https://github.com/ClickHouse/ClickHouse/pull/66630) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Function `tuple` will now try to construct named tuples in query (controlled by `enable_named_columns_in_function_tuple`). Introduce function `tupleNames` to extract names from tuples. [#54881](https://github.com/ClickHouse/ClickHouse/pull/54881) ([Amos Bird](https://github.com/amosbird)). * Change how deduplication for Materialized Views works. Fixed a lot of cases like: - on destination table: data is split for 2 or more blocks and that blocks is considered as duplicate when that block is inserted in parallel. - on MV destination table: the equal blocks are deduplicated, that happens when MV often produces equal data as a result for different input data due to performing aggregation. - on MV destination table: the equal blocks which comes from different MV are deduplicated. [#61601](https://github.com/ClickHouse/ClickHouse/pull/61601) ([Sema Checherinda](https://github.com/CheSema)). +* Functions `bitShiftLeft` and `bitShitfRight` return an error for out of bounds shift positions [#65838](https://github.com/ClickHouse/ClickHouse/pull/65838) ([Pablo Marcos](https://github.com/pamarcos)). 
#### New Feature * Add `ASOF JOIN` support for `full_sorting_join` algorithm. [#55051](https://github.com/ClickHouse/ClickHouse/pull/55051) ([vdimir](https://github.com/vdimir)). @@ -439,7 +600,6 @@ * Functions `bitTest`, `bitTestAll`, and `bitTestAny` now return an error if the specified bit index is out-of-bounds [#65818](https://github.com/ClickHouse/ClickHouse/pull/65818) ([Pablo Marcos](https://github.com/pamarcos)). * Setting `join_any_take_last_row` is supported in any query with hash join. [#65820](https://github.com/ClickHouse/ClickHouse/pull/65820) ([vdimir](https://github.com/vdimir)). * Better handling of join conditions involving `IS NULL` checks (for example `ON (a = b AND (a IS NOT NULL) AND (b IS NOT NULL) ) OR ( (a IS NULL) AND (b IS NULL) )` is rewritten to `ON a <=> b`), fix incorrect optimization when condition other then `IS NULL` are present. [#65835](https://github.com/ClickHouse/ClickHouse/pull/65835) ([vdimir](https://github.com/vdimir)). -* Functions `bitShiftLeft` and `bitShitfRight` return an error for out of bounds shift positions [#65838](https://github.com/ClickHouse/ClickHouse/pull/65838) ([Pablo Marcos](https://github.com/pamarcos)). * Fix growing memory usage in S3Queue. [#65839](https://github.com/ClickHouse/ClickHouse/pull/65839) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix tie handling in `arrayAUC` to match sklearn. [#65840](https://github.com/ClickHouse/ClickHouse/pull/65840) ([gabrielmcg44](https://github.com/gabrielmcg44)). * Fix possible issues with MySQL server protocol TLS connections. [#65917](https://github.com/ClickHouse/ClickHouse/pull/65917) ([Azat Khuzhin](https://github.com/azat)). diff --git a/CMakeLists.txt b/CMakeLists.txt index f0965530739..a165be799c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC) list(REVERSE CMAKE_FIND_LIBRARY_SUFFIXES) option (ENABLE_FUZZING "Fuzzy testing using libfuzzer" OFF) +option (ENABLE_FUZZER_TEST "Build testing fuzzers in order to test libFuzzer functionality" OFF) if (ENABLE_FUZZING) # Also set WITH_COVERAGE=1 for better fuzzing process diff --git a/README.md b/README.md index 3b5209dcbe9..abaf27abf11 100644 --- a/README.md +++ b/README.md @@ -42,31 +42,20 @@ Keep an eye out for upcoming meetups and events around the world. 
Somewhere else Upcoming meetups -* [Jakarta Meetup](https://www.meetup.com/clickhouse-indonesia-user-group/events/303191359/) - October 1 -* [Singapore Meetup](https://www.meetup.com/clickhouse-singapore-meetup-group/events/303212064/) - October 3 -* [Madrid Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096564/) - October 22 -* [Oslo Meetup](https://www.meetup.com/open-source-real-time-data-warehouse-real-time-analytics/events/302938622) - October 31 * [Barcelona Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096876/) - November 12 * [Ghent Meetup](https://www.meetup.com/clickhouse-belgium-user-group/events/303049405/) - November 19 * [Dubai Meetup](https://www.meetup.com/clickhouse-dubai-meetup-group/events/303096989/) - November 21 * [Paris Meetup](https://www.meetup.com/clickhouse-france-user-group/events/303096434) - November 26 +* [Amsterdam Meetup](https://www.meetup.com/clickhouse-netherlands-user-group/events/303638814) - December 3 +* [Stockholm Meetup](https://www.meetup.com/clickhouse-stockholm-user-group/events/304382411) - December 9 +* [New York Meetup](https://www.meetup.com/clickhouse-new-york-user-group/events/304268174) - December 9 +* [San Francisco Meetup](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/304286951/) - December 12 Recently completed meetups -* [ClickHouse Guangzhou User Group Meetup](https://mp.weixin.qq.com/s/GSvo-7xUoVzCsuUvlLTpCw) - August 25 -* [Seattle Meetup (Statsig)](https://www.meetup.com/clickhouse-seattle-user-group/events/302518075/) - August 27 -* [Melbourne Meetup](https://www.meetup.com/clickhouse-australia-user-group/events/302732666/) - August 27 -* [Sydney Meetup](https://www.meetup.com/clickhouse-australia-user-group/events/302862966/) - September 5 -* [Zurich Meetup](https://www.meetup.com/clickhouse-switzerland-meetup-group/events/302267429/) - September 5 -* [San Francisco Meetup (Cloudflare)](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/302540575) - September 5 -* [Raleigh Meetup (Deutsche Bank)](https://www.meetup.com/triangletechtalks/events/302723486/) - September 9 -* [New York Meetup (Rokt)](https://www.meetup.com/clickhouse-new-york-user-group/events/302575342) - September 10 -* [Toronto Meetup (Shopify)](https://www.meetup.com/clickhouse-toronto-user-group/events/301490855/) - September 10 -* [Chicago Meetup (Jump Capital)](https://lu.ma/43tvmrfw) - September 12 -* [London Meetup](https://www.meetup.com/clickhouse-london-user-group/events/302977267) - September 17 -* [Austin Meetup](https://www.meetup.com/clickhouse-austin-user-group/events/302558689/) - September 17 -* [Bangalore Meetup](https://www.meetup.com/clickhouse-bangalore-user-group/events/303208274/) - September 18 -* [Tel Aviv Meetup](https://www.meetup.com/clickhouse-meetup-israel/events/303095121) - September 22 +* [Madrid Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096564/) - October 22 +* [Singapore Meetup](https://www.meetup.com/clickhouse-singapore-meetup-group/events/303212064/) - October 3 +* [Jakarta Meetup](https://www.meetup.com/clickhouse-indonesia-user-group/events/303191359/) - October 1 ## Recent Recordings * **Recent Meetup Videos**: [Meetup Playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3iNDUzpY1S3L_iV4nARda_U) Whenever possible recordings of the ClickHouse Community Meetups are edited and presented as individual talks. 
Current featuring "Modern SQL in 2023", "Fast, Concurrent, and Consistent Asynchronous INSERTS in ClickHouse", and "Full-Text Indices: Design and Experiments" diff --git a/SECURITY.md b/SECURITY.md index db302da8ecd..1b0648dc489 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -14,9 +14,10 @@ The following versions of ClickHouse server are currently supported with securit | Version | Supported | |:-|:-| +| 24.10 | ✔️ | | 24.9 | ✔️ | | 24.8 | ✔️ | -| 24.7 | ✔️ | +| 24.7 | ❌ | | 24.6 | ❌ | | 24.5 | ❌ | | 24.4 | ❌ | diff --git a/base/base/StringRef.h b/base/base/StringRef.h index af3441c2a75..74878b50545 100644 --- a/base/base/StringRef.h +++ b/base/base/StringRef.h @@ -86,7 +86,7 @@ using StringRefs = std::vector; * For more information, see hash_map_string_2.cpp */ -inline bool compare8(const char * p1, const char * p2) +inline bool compare16(const char * p1, const char * p2) { return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1)), @@ -115,7 +115,7 @@ inline bool compare64(const char * p1, const char * p2) #elif defined(__aarch64__) && defined(__ARM_NEON) -inline bool compare8(const char * p1, const char * p2) +inline bool compare16(const char * p1, const char * p2) { uint64_t mask = getNibbleMask(vceqq_u8( vld1q_u8(reinterpret_cast(p1)), vld1q_u8(reinterpret_cast(p2)))); @@ -185,13 +185,22 @@ inline bool memequalWide(const char * p1, const char * p2, size_t size) switch (size / 16) // NOLINT(bugprone-switch-missing-default-case) { - case 3: if (!compare8(p1 + 32, p2 + 32)) return false; [[fallthrough]]; - case 2: if (!compare8(p1 + 16, p2 + 16)) return false; [[fallthrough]]; - case 1: if (!compare8(p1, p2)) return false; [[fallthrough]]; + case 3: + if (!compare16(p1 + 32, p2 + 32)) + return false; + [[fallthrough]]; + case 2: + if (!compare16(p1 + 16, p2 + 16)) + return false; + [[fallthrough]]; + case 1: + if (!compare16(p1, p2)) + return false; + [[fallthrough]]; default: ; } - return compare8(p1 + size - 16, p2 + size - 16); + return compare16(p1 + size - 16, p2 + size - 16); } #endif @@ -369,11 +378,15 @@ namespace PackedZeroTraits { template class PackedPairNoInit> inline bool check(const PackedPairNoInit p) - { return 0 == p.key.size; } + { + return 0 == p.key.size; + } template class PackedPairNoInit> inline void set(PackedPairNoInit & p) - { p.key.size = 0; } + { + p.key.size = 0; + } } diff --git a/base/base/chrono_io.h b/base/base/chrono_io.h index 4ee8dec6634..d55aa11bc1d 100644 --- a/base/base/chrono_io.h +++ b/base/base/chrono_io.h @@ -4,6 +4,7 @@ #include #include #include +#include inline std::string to_string(const std::time_t & time) @@ -11,18 +12,6 @@ inline std::string to_string(const std::time_t & time) return cctz::format("%Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(time), cctz::local_time_zone()); } -template -std::string to_string(const std::chrono::time_point & tp) -{ - // Don't use DateLUT because it shows weird characters for - // TimePoint::max(). I wish we could use C++20 format, but it's not - // there yet. 
- // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); - - auto in_time_t = std::chrono::system_clock::to_time_t(tp); - return to_string(in_time_t); -} - template > std::string to_string(const std::chrono::duration & duration) { @@ -33,6 +22,20 @@ std::string to_string(const std::chrono::duration & duration) return std::to_string(seconds_as_double.count()) + "s"; } +template +std::string to_string(const std::chrono::time_point & tp) +{ + // Don't use DateLUT because it shows weird characters for + // TimePoint::max(). I wish we could use C++20 format, but it's not + // there yet. + // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); + + if constexpr (std::is_same_v) + return to_string(std::chrono::system_clock::to_time_t(tp)); + else + return to_string(tp.time_since_epoch()); +} + template std::ostream & operator<<(std::ostream & o, const std::chrono::time_point & tp) { @@ -44,3 +47,23 @@ std::ostream & operator<<(std::ostream & o, const std::chrono::duration +struct fmt::formatter> : fmt::formatter +{ + template + auto format(const std::chrono::time_point & tp, FormatCtx & ctx) const + { + return fmt::formatter::format(::to_string(tp), ctx); + } +}; + +template +struct fmt::formatter> : fmt::formatter +{ + template + auto format(const std::chrono::duration & duration, FormatCtx & ctx) const + { + return fmt::formatter::format(::to_string(duration), ctx); + } +}; diff --git a/base/glibc-compatibility/musl/getauxval.c b/base/glibc-compatibility/musl/getauxval.c index ec2cce1e4aa..cc0cdf25b03 100644 --- a/base/glibc-compatibility/musl/getauxval.c +++ b/base/glibc-compatibility/musl/getauxval.c @@ -25,9 +25,10 @@ // We don't have libc struct available here. // Compute aux vector manually (from /proc/self/auxv). // -// Right now there is only 51 AT_* constants, -// so 64 should be enough until this implementation will be replaced with musl. -static unsigned long __auxv_procfs[64]; +// Right now there are 51 AT_* constants. Custom kernels have been encountered +// making use of up to 71. 128 should be enough until this implementation is +// replaced with musl. 
+static unsigned long __auxv_procfs[128]; static unsigned long __auxv_secure = 0; // Common static unsigned long * __auxv_environ = NULL; diff --git a/base/poco/Foundation/include/Poco/Logger.h b/base/poco/Foundation/include/Poco/Logger.h index 74ddceea9dd..f7da3c08fa3 100644 --- a/base/poco/Foundation/include/Poco/Logger.h +++ b/base/poco/Foundation/include/Poco/Logger.h @@ -952,6 +952,8 @@ private: static std::pair add(Logger * pLogger); static std::optional find(const std::string & name); static Logger * findRawPtr(const std::string & name); + void unsafeSetChannel(Channel * pChannel); + Channel* unsafeGetChannel() const; Logger(); Logger(const Logger &); diff --git a/base/poco/Foundation/src/Logger.cpp b/base/poco/Foundation/src/Logger.cpp index 779af384b0b..55564a7a175 100644 --- a/base/poco/Foundation/src/Logger.cpp +++ b/base/poco/Foundation/src/Logger.cpp @@ -61,6 +61,13 @@ Logger::~Logger() void Logger::setChannel(Channel* pChannel) +{ + std::lock_guard lock(getLoggerMutex()); + unsafeSetChannel(pChannel); +} + + +void Logger::unsafeSetChannel(Channel* pChannel) { if (_pChannel) _pChannel->release(); _pChannel = pChannel; @@ -69,6 +76,14 @@ void Logger::setChannel(Channel* pChannel) Channel* Logger::getChannel() const +{ + std::lock_guard lock(getLoggerMutex()); + + return unsafeGetChannel(); +} + + +Channel* Logger::unsafeGetChannel() const { return _pChannel; } @@ -89,7 +104,7 @@ void Logger::setLevel(const std::string& level) void Logger::setProperty(const std::string& name, const std::string& value) { if (name == "channel") - setChannel(LoggingRegistry::defaultRegistry().channelForName(value)); + unsafeSetChannel(LoggingRegistry::defaultRegistry().channelForName(value)); else if (name == "level") setLevel(value); else @@ -160,7 +175,7 @@ void Logger::setChannel(const std::string& name, Channel* pChannel) if (len == 0 || (it.first.compare(0, len, name) == 0 && (it.first.length() == len || it.first[len] == '.'))) { - it.second.logger->setChannel(pChannel); + it.second.logger->unsafeSetChannel(pChannel); } } } @@ -393,7 +408,7 @@ std::pair Logger::unsafeGet(const std::string& else { Logger& par = parent(name); - logger = new Logger(name, par.getChannel(), par.getLevel()); + logger = new Logger(name, par.unsafeGetChannel(), par.getLevel()); } return add(logger); diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 00000000000..192243d598b --- /dev/null +++ b/ci/README.md @@ -0,0 +1 @@ +Note: This directory is under active development for CI improvements and is not currently in use within the scope of the existing CI pipeline. diff --git a/ci/docker/fasttest/Dockerfile b/ci/docker/fasttest/Dockerfile new file mode 100644 index 00000000000..66e48b163b8 --- /dev/null +++ b/ci/docker/fasttest/Dockerfile @@ -0,0 +1,109 @@ +# docker build -t clickhouse/fasttest . 
+FROM ubuntu:22.04 + +# ARG for quick switch to a given ubuntu mirror +ARG apt_archive="http://archive.ubuntu.com" +RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list + +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=18 + +RUN apt-get update \ + && apt-get install \ + apt-transport-https \ + apt-utils \ + ca-certificates \ + curl \ + gnupg \ + lsb-release \ + wget \ + git \ + --yes --no-install-recommends --verbose-versions \ + && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ + && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ + && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ + && apt-key add /tmp/llvm-snapshot.gpg.key \ + && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ + && echo "deb https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ + /etc/apt/sources.list \ + && apt-get update \ + && apt-get install --yes --no-install-recommends --verbose-versions llvm-${LLVM_VERSION} \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* + +# moreutils - provides ts for FT +# expect, bzip2 - required by FT +# bsdmainutils - provides hexdump for FT +# nasm - nasm compiler for one of the submodules, required for the normal build +# yasm - assembler for libhdfs3, required for the normal build + +RUN apt-get update \ + && apt-get install \ + clang-${LLVM_VERSION} \ + cmake \ + libclang-${LLVM_VERSION}-dev \ + libclang-rt-${LLVM_VERSION}-dev \ + lld-${LLVM_VERSION} \ + llvm-${LLVM_VERSION}-dev \ + lsof \ + ninja-build \ + python3 \ + python3-pip \ + zstd \ + moreutils \ + expect \ + bsdmainutils \ + pv \ + jq \ + bzip2 \ + nasm \ + yasm \ + --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* + +COPY --from=clickhouse/cctools:0d6b90a7a490 /opt/gdb /opt/gdb +# Give suid to gdb to grant it attach permissions +RUN chmod u+s /opt/gdb/bin/gdb +ENV PATH="/opt/gdb/bin:${PATH}" + +# This symlink is required by gcc to find the lld linker +RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld +# FIXME: workaround for "The imported target "merge-fdata" references the file" error +# https://salsa.debian.org/pkg-llvm-team/llvm-toolchain/-/commit/992e52c0b156a5ba9c6a8a54f8c4857ddd3d371d +RUN sed -i '/_IMPORT_CHECK_FILES_FOR_\(mlir-\|llvm-bolt\|merge-fdata\|MLIR\)/ {s|^|#|}' /usr/lib/llvm-${LLVM_VERSION}/lib/cmake/llvm/LLVMExports-*.cmake + +# LLVM changes paths for compiler-rt libraries. For some reason clang-18.1.8 cannot pick up libraries from the default install path. +# It's a very dirty workaround; better to build the compiler and LLVM ourselves and use that. Details: https://github.com/llvm/llvm-project/issues/95792 +RUN test ! -d /usr/lib/llvm-18/lib/clang/18/lib/x86_64-pc-linux-gnu || ln -s /usr/lib/llvm-18/lib/clang/18/lib/x86_64-pc-linux-gnu /usr/lib/llvm-18/lib/clang/18/lib/x86_64-unknown-linux-gnu + +ARG TARGETARCH +ARG SCCACHE_VERSION=v0.7.7 +ENV SCCACHE_IGNORE_SERVER_IO_ERROR=1 +# sccache requires a value for the region.
So by default we use The Default Region +ENV SCCACHE_REGION=us-east-1 +RUN arch=${TARGETARCH} \ + && case $arch in \ + amd64) rarch=x86_64 ;; \ + arm64) rarch=aarch64 ;; \ + esac \ + && curl -Ls "https://github.com/mozilla/sccache/releases/download/$SCCACHE_VERSION/sccache-$SCCACHE_VERSION-$rarch-unknown-linux-musl.tar.gz" | \ + tar xz -C /tmp \ + && mv "/tmp/sccache-$SCCACHE_VERSION-$rarch-unknown-linux-musl/sccache" /usr/bin \ + && rm "/tmp/sccache-$SCCACHE_VERSION-$rarch-unknown-linux-musl" -r + +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r /requirements.txt + +# chmod 777 to make the container user independent +RUN mkdir -p /var/lib/clickhouse \ + && chmod 777 /var/lib/clickhouse + +ENV TZ=Europe/Amsterdam +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +RUN groupadd --system --gid 1000 clickhouse \ + && useradd --system --gid 1000 --uid 1000 -m clickhouse \ + && mkdir -p /.cache/sccache && chmod 777 /.cache/sccache + +ENV PYTHONPATH="/wd" +ENV PYTHONUNBUFFERED=1 diff --git a/ci/docker/fasttest/requirements.txt b/ci/docker/fasttest/requirements.txt new file mode 100644 index 00000000000..a1488ee33f0 --- /dev/null +++ b/ci/docker/fasttest/requirements.txt @@ -0,0 +1,6 @@ +Jinja2==3.1.3 +numpy==1.26.4 +requests==2.32.3 +pandas==1.5.3 +scipy==1.12.0 +#https://clickhouse-builds.s3.amazonaws.com/packages/praktika-0.1-py3-none-any.whl diff --git a/ci_v2/docker/style-test/Dockerfile b/ci/docker/style-test/Dockerfile similarity index 100% rename from ci_v2/docker/style-test/Dockerfile rename to ci/docker/style-test/Dockerfile diff --git a/ci/docker/style-test/requirements.txt b/ci/docker/style-test/requirements.txt new file mode 100644 index 00000000000..ab48f245fd2 --- /dev/null +++ b/ci/docker/style-test/requirements.txt @@ -0,0 +1,5 @@ +requests==2.32.3 +yamllint==1.26.3 +codespell==2.2.1 +#use praktika from CH repo +#https://clickhouse-builds.s3.amazonaws.com/packages/praktika-0.1-py3-none-any.whl diff --git a/ci/jobs/build_clickhouse.py b/ci/jobs/build_clickhouse.py new file mode 100644 index 00000000000..21ed8091608 --- /dev/null +++ b/ci/jobs/build_clickhouse.py @@ -0,0 +1,102 @@ +import argparse + +from praktika.result import Result +from praktika.settings import Settings +from praktika.utils import MetaClasses, Shell, Utils + + +class JobStages(metaclass=MetaClasses.WithIter): + CHECKOUT_SUBMODULES = "checkout" + CMAKE = "cmake" + BUILD = "build" + + +def parse_args(): + parser = argparse.ArgumentParser(description="ClickHouse Build Job") + parser.add_argument("BUILD_TYPE", help="Type: ") + parser.add_argument("--param", help="Optional custom job start stage", default=None) + return parser.parse_args() + + +def main(): + + args = parse_args() + + stop_watch = Utils.Stopwatch() + + stages = list(JobStages) + stage = args.param or JobStages.CHECKOUT_SUBMODULES + if stage: + assert stage in JobStages, f"--param must be one of [{list(JobStages)}]" + print(f"Job will start from stage [{stage}]") + while stage in stages: + stages.pop(0) + stages.insert(0, stage) + + cmake_build_type = "Release" + sanitizer = "" + + if "debug" in args.BUILD_TYPE.lower(): + print("Build type set: debug") + cmake_build_type = "Debug" + + if "asan" in args.BUILD_TYPE.lower(): + print("Sanitizer set: address") + sanitizer = "address" + + # if Environment.is_local_run(): + # build_cache_type = "disabled" + # else: + build_cache_type = "sccache" + + current_directory = Utils.cwd() + build_dir = f"{Settings.TEMP_DIR}/build" + + res = True + results = [] + + if res 
and JobStages.CHECKOUT_SUBMODULES in stages: + Shell.check(f"rm -rf {build_dir} && mkdir -p {build_dir}") + results.append( + Result.create_from_command_execution( + name="Checkout Submodules", + command=f"git submodule sync --recursive && git submodule init && git submodule update --depth 1 --recursive --jobs {min([Utils.cpu_count(), 20])}", + ) + ) + res = results[-1].is_ok() + + if res and JobStages.CMAKE in stages: + results.append( + Result.create_from_command_execution( + name="Cmake configuration", + command=f"cmake --debug-trycompile -DCMAKE_VERBOSE_MAKEFILE=1 -LA -DCMAKE_BUILD_TYPE={cmake_build_type} \ + -DSANITIZE={sanitizer} -DENABLE_CHECK_HEAVY_BUILDS=1 -DENABLE_CLICKHOUSE_SELF_EXTRACTING=1 -DENABLE_TESTS=0 \ + -DENABLE_UTILS=0 -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_INSTALL_LOCALSTATEDIR=/var -DCMAKE_SKIP_INSTALL_ALL_DEPENDENCY=ON \ + -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 -DCOMPILER_CACHE={build_cache_type} -DENABLE_TESTS=1 \ + -DENABLE_BUILD_PROFILING=1 {current_directory}", + workdir=build_dir, + with_log=True, + ) + ) + res = results[-1].is_ok() + + if res and JobStages.BUILD in stages: + Shell.check("sccache --show-stats") + results.append( + Result.create_from_command_execution( + name="Build ClickHouse", + command="ninja clickhouse-bundle clickhouse-odbc-bridge clickhouse-library-bridge", + workdir=build_dir, + with_log=True, + ) + ) + Shell.check("sccache --show-stats") + Shell.check(f"ls -l {build_dir}/programs/") + res = results[-1].is_ok() + + Result.create_from(results=results, stopwatch=stop_watch).finish_job_accordingly() + + +if __name__ == "__main__": + main() diff --git a/ci_v2/jobs/check_style.py b/ci/jobs/check_style.py similarity index 77% rename from ci_v2/jobs/check_style.py rename to ci/jobs/check_style.py index 4dd3864e865..f9cdc76302d 100644 --- a/ci_v2/jobs/check_style.py +++ b/ci/jobs/check_style.py @@ -2,7 +2,6 @@ import math import multiprocessing import os import re -import sys from concurrent.futures import ProcessPoolExecutor from pathlib import Path @@ -51,25 +50,6 @@ def run_check_concurrent(check_name, check_function, files, nproc=NPROC): return result -def run_simple_check(check_name, check_function, **kwargs): - stop_watch = Utils.Stopwatch() - - error = check_function(**kwargs) - - result = Result( - name=check_name, - status=Result.Status.SUCCESS if not error else Result.Status.FAILED, - start_time=stop_watch.start_time, - duration=stop_watch.duration, - info=error, - ) - return result - - -def run_check(check_name, check_function, files): - return run_check_concurrent(check_name, check_function, files, nproc=1) - - def check_duplicate_includes(file_path): includes = [] with open(file_path, "r", encoding="utf-8", errors="ignore") as f: @@ -88,7 +68,7 @@ def check_duplicate_includes(file_path): def check_whitespaces(file_paths): for file in file_paths: exit_code, out, err = Shell.get_res_stdout_stderr( - f'./ci_v2/jobs/scripts/check_style/double_whitespaces.pl "{file}"', + f'./ci/jobs/scripts/check_style/double_whitespaces.pl "{file}"', verbose=False, ) if out or err: @@ -117,7 +97,7 @@ def check_xmllint(file_paths): def check_functional_test_cases(files): """ Queries with event_date should have yesterday() not today() - NOTE: it is not that accuate, but at least something. + NOTE: it is not that accurate, but at least something. 
""" patterns = [ @@ -194,7 +174,7 @@ def check_broken_links(path, exclude_paths): def check_cpp_code(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check_cpp.sh" + "./ci/jobs/scripts/check_style/check_cpp.sh" ) if err: out += err @@ -203,7 +183,7 @@ def check_cpp_code(): def check_repo_submodules(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check_submodules.sh" + "./ci/jobs/scripts/check_style/check_submodules.sh" ) if err: out += err @@ -212,7 +192,7 @@ def check_repo_submodules(): def check_other(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/checks_to_refactor.sh" + "./ci/jobs/scripts/check_style/checks_to_refactor.sh" ) if err: out += err @@ -221,7 +201,7 @@ def check_other(): def check_codespell(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check_typos.sh" + "./ci/jobs/scripts/check_style/check_typos.sh" ) if err: out += err @@ -230,7 +210,7 @@ def check_codespell(): def check_aspell(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check_aspell.sh" + "./ci/jobs/scripts/check_style/check_aspell.sh" ) if err: out += err @@ -239,7 +219,7 @@ def check_aspell(): def check_mypy(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check-mypy" + "./ci/jobs/scripts/check_style/check-mypy" ) if err: out += err @@ -248,7 +228,7 @@ def check_mypy(): def check_pylint(): res, out, err = Shell.get_res_stdout_stderr( - "./ci_v2/jobs/scripts/check_style/check-pylint" + "./ci/jobs/scripts/check_style/check-pylint" ) if err: out += err @@ -345,66 +325,58 @@ if __name__ == "__main__": ) ) results.append( - run_check( - check_name="Check Tests Numbers", - check_function=check_gaps_in_tests_numbers, - files=functional_test_files, + Result.create_from_command_execution( + name="Check Tests Numbers", + command=check_gaps_in_tests_numbers, + command_args=[functional_test_files], ) ) results.append( - run_simple_check( - check_name="Check Broken Symlinks", - check_function=check_broken_links, - path="./", - exclude_paths=["contrib/", "metadata/", "programs/server/data"], + Result.create_from_command_execution( + name="Check Broken Symlinks", + command=check_broken_links, + command_kwargs={ + "path": "./", + "exclude_paths": ["contrib/", "metadata/", "programs/server/data"], + }, ) ) results.append( - run_simple_check( - check_name="Check CPP code", - check_function=check_cpp_code, + Result.create_from_command_execution( + name="Check CPP code", + command=check_cpp_code, ) ) results.append( - run_simple_check( - check_name="Check Submodules", - check_function=check_repo_submodules, + Result.create_from_command_execution( + name="Check Submodules", + command=check_repo_submodules, ) ) results.append( - run_check( - check_name="Check File Names", - check_function=check_file_names, - files=all_files, + Result.create_from_command_execution( + name="Check File Names", + command=check_file_names, + command_args=[all_files], ) ) results.append( - run_simple_check( - check_name="Check Many Different Things", - check_function=check_other, + Result.create_from_command_execution( + name="Check Many Different Things", + command=check_other, ) ) results.append( - run_simple_check( - check_name="Check Codespell", - check_function=check_codespell, + Result.create_from_command_execution( + name="Check Codespell", + command=check_codespell, ) ) results.append( - run_simple_check( - check_name="Check Aspell", - 
check_function=check_aspell, + Result.create_from_command_execution( + name="Check Aspell", + command=check_aspell, ) ) - res = Result.create_from(results=results, stopwatch=stop_watch).dump() - - if not res.is_ok(): - print("Style check: failed") - for result in results: - if not result.is_ok(): - print("Failed check:") - print(" | ", result) - sys.exit(1) - else: - print("Style check: ok") + Result.create_from(results=results, stopwatch=stop_watch).finish_job_accordingly() diff --git a/ci/jobs/fast_test.py b/ci/jobs/fast_test.py new file mode 100644 index 00000000000..1dcd65b6ed2 --- /dev/null +++ b/ci/jobs/fast_test.py @@ -0,0 +1,337 @@ +import argparse +import threading +from pathlib import Path + +from praktika.result import Result +from praktika.settings import Settings +from praktika.utils import MetaClasses, Shell, Utils + +from ci.jobs.scripts.functional_tests_results import FTResultsProcessor + + +class ClickHouseProc: + def __init__(self): + self.ch_config_dir = f"{Settings.TEMP_DIR}/etc/clickhouse-server" + self.pid_file = f"{self.ch_config_dir}/clickhouse-server.pid" + self.config_file = f"{self.ch_config_dir}/config.xml" + self.user_files_path = f"{self.ch_config_dir}/user_files" + self.test_output_file = f"{Settings.OUTPUT_DIR}/test_result.txt" + self.command = f"clickhouse-server --config-file {self.config_file} --pid-file {self.pid_file} -- --path {self.ch_config_dir} --user_files_path {self.user_files_path} --top_level_domains_path {self.ch_config_dir}/top_level_domains --keeper_server.storage_path {self.ch_config_dir}/coordination" + self.proc = None + self.pid = 0 + nproc = int(Utils.cpu_count() / 2) + self.fast_test_command = f"clickhouse-test --hung-check --fast-tests-only --no-random-settings --no-random-merge-tree-settings --no-long --testname --shard --zookeeper --check-zookeeper-session --order random --print-time --report-logs-stats --jobs {nproc} -- '' | ts '%Y-%m-%d %H:%M:%S' \ + | tee -a \"{self.test_output_file}\"" + # TODO: store info in case of failure + self.info = "" + self.info_file = "" + + Utils.set_env("CLICKHOUSE_CONFIG_DIR", self.ch_config_dir) + Utils.set_env("CLICKHOUSE_CONFIG", self.config_file) + Utils.set_env("CLICKHOUSE_USER_FILES", self.user_files_path) + Utils.set_env("CLICKHOUSE_SCHEMA_FILES", f"{self.ch_config_dir}/format_schemas") + + def start(self): + print("Starting ClickHouse server") + Shell.check(f"rm {self.pid_file}") + + def run_clickhouse(): + self.proc = Shell.run_async( + self.command, verbose=True, suppress_output=True + ) + + thread = threading.Thread(target=run_clickhouse) + thread.daemon = True # Allow program to exit even if thread is still running + thread.start() + + # self.proc = Shell.run_async(self.command, verbose=True) + + started = False + try: + for _ in range(5): + pid = Shell.get_output(f"cat {self.pid_file}").strip() + if not pid: + Utils.sleep(1) + continue + started = True + print(f"Got pid from fs [{pid}]") + _ = int(pid) + break + except Exception: + pass + + if not started: + stdout = self.proc.stdout.read().strip() if self.proc.stdout else "" + stderr = self.proc.stderr.read().strip() if self.proc.stderr else "" + Utils.print_formatted_error("Failed to start ClickHouse", stdout, stderr) + return False + + print(f"ClickHouse server started successfully, pid [{pid}]") + return True + + def wait_ready(self): + res, out, err = 0, "", "" + attempts = 30 + delay = 2 + for attempt in range(attempts): + res, out, err = Shell.get_res_stdout_stderr( + 'clickhouse-client --query "select 1"', verbose=True + ) + if 
out.strip() == "1": + print("Server ready") + break + else: + print(f"Server not ready, wait") + Utils.sleep(delay) + else: + Utils.print_formatted_error( + f"Server not ready after [{attempts*delay}s]", out, err + ) + return False + return True + + def run_fast_test(self): + if Path(self.test_output_file).exists(): + Path(self.test_output_file).unlink() + exit_code = Shell.run(self.fast_test_command) + return exit_code == 0 + + def terminate(self): + print("Terminate ClickHouse process") + timeout = 10 + if self.proc: + Utils.terminate_process_group(self.proc.pid) + + self.proc.terminate() + try: + self.proc.wait(timeout=10) + print(f"Process {self.proc.pid} terminated gracefully.") + except Exception: + print( + f"Process {self.proc.pid} did not terminate in {timeout} seconds, killing it..." + ) + Utils.terminate_process_group(self.proc.pid, force=True) + self.proc.wait() # Wait for the process to be fully killed + print(f"Process {self.proc} was killed.") + + +def clone_submodules(): + submodules_to_update = [ + "contrib/sysroot", + "contrib/magic_enum", + "contrib/abseil-cpp", + "contrib/boost", + "contrib/zlib-ng", + "contrib/libxml2", + "contrib/libunwind", + "contrib/fmtlib", + "contrib/aklomp-base64", + "contrib/cctz", + "contrib/libcpuid", + "contrib/libdivide", + "contrib/double-conversion", + "contrib/llvm-project", + "contrib/lz4", + "contrib/zstd", + "contrib/fastops", + "contrib/rapidjson", + "contrib/re2", + "contrib/sparsehash-c11", + "contrib/croaring", + "contrib/miniselect", + "contrib/xz", + "contrib/dragonbox", + "contrib/fast_float", + "contrib/NuRaft", + "contrib/jemalloc", + "contrib/replxx", + "contrib/wyhash", + "contrib/c-ares", + "contrib/morton-nd", + "contrib/xxHash", + "contrib/expected", + "contrib/simdjson", + "contrib/liburing", + "contrib/libfiu", + "contrib/incbin", + "contrib/yaml-cpp", + ] + + res = Shell.check("git submodule sync", verbose=True, strict=True) + res = res and Shell.check("git submodule init", verbose=True, strict=True) + res = res and Shell.check( + command=f"xargs --max-procs={min([Utils.cpu_count(), 20])} --null --no-run-if-empty --max-args=1 git submodule update --depth 1 --single-branch", + stdin_str="\0".join(submodules_to_update) + "\0", + timeout=120, + retries=3, + verbose=True, + ) + res = res and Shell.check("git submodule foreach git reset --hard", verbose=True) + res = res and Shell.check("git submodule foreach git checkout @ -f", verbose=True) + res = res and Shell.check("git submodule foreach git clean -xfd", verbose=True) + return res + + +def update_path_ch_config(config_file_path=""): + print("Updating path in clickhouse config") + config_file_path = ( + config_file_path or f"{Settings.TEMP_DIR}/etc/clickhouse-server/config.xml" + ) + ssl_config_file_path = ( + f"{Settings.TEMP_DIR}/etc/clickhouse-server/config.d/ssl_certs.xml" + ) + try: + with open(config_file_path, "r", encoding="utf-8") as file: + content = file.read() + + with open(ssl_config_file_path, "r", encoding="utf-8") as file: + ssl_config_content = file.read() + content = content.replace(">/var/", f">{Settings.TEMP_DIR}/var/") + content = content.replace(">/etc/", f">{Settings.TEMP_DIR}/etc/") + ssl_config_content = ssl_config_content.replace( + ">/etc/", f">{Settings.TEMP_DIR}/etc/" + ) + with open(config_file_path, "w", encoding="utf-8") as file: + file.write(content) + with open(ssl_config_file_path, "w", encoding="utf-8") as file: + file.write(ssl_config_content) + except Exception as e: + print(f"ERROR: failed to update config, exception: {e}") + 
return False + return True + + +class JobStages(metaclass=MetaClasses.WithIter): + CHECKOUT_SUBMODULES = "checkout" + CMAKE = "cmake" + BUILD = "build" + CONFIG = "config" + TEST = "test" + + +def parse_args(): + parser = argparse.ArgumentParser(description="ClickHouse Fast Test Job") + parser.add_argument("--param", help="Optional custom job start stage", default=None) + return parser.parse_args() + + +def main(): + args = parse_args() + stop_watch = Utils.Stopwatch() + + stages = list(JobStages) + stage = args.param or JobStages.CHECKOUT_SUBMODULES + if stage: + assert stage in JobStages, f"--param must be one of [{list(JobStages)}]" + print(f"Job will start from stage [{stage}]") + while stage in stages: + stages.pop(0) + stages.insert(0, stage) + + current_directory = Utils.cwd() + build_dir = f"{Settings.TEMP_DIR}/build" + + Utils.add_to_PATH(f"{build_dir}/programs:{current_directory}/tests") + + res = True + results = [] + + if res and JobStages.CHECKOUT_SUBMODULES in stages: + Shell.check(f"rm -rf {build_dir} && mkdir -p {build_dir}") + results.append( + Result.create_from_command_execution( + name="Checkout Submodules for Minimal Build", + command=clone_submodules, + ) + ) + res = results[-1].is_ok() + + if res and JobStages.CMAKE in stages: + results.append( + Result.create_from_command_execution( + name="Cmake configuration", + command=f"cmake {current_directory} -DCMAKE_CXX_COMPILER=clang++-18 -DCMAKE_C_COMPILER=clang-18 \ + -DCMAKE_TOOLCHAIN_FILE={current_directory}/cmake/linux/toolchain-x86_64-musl.cmake -DENABLE_LIBRARIES=0 \ + -DENABLE_TESTS=0 -DENABLE_UTILS=0 -DENABLE_THINLTO=0 -DENABLE_NURAFT=1 -DENABLE_SIMDJSON=1 \ + -DENABLE_JEMALLOC=1 -DENABLE_LIBURING=1 -DENABLE_YAML_CPP=1 -DCOMPILER_CACHE=sccache", + workdir=build_dir, + with_log=True, + ) + ) + res = results[-1].is_ok() + + if res and JobStages.BUILD in stages: + Shell.check("sccache --show-stats") + results.append( + Result.create_from_command_execution( + name="Build ClickHouse", + command="ninja clickhouse-bundle clickhouse-stripped", + workdir=build_dir, + with_log=True, + ) + ) + Shell.check("sccache --show-stats") + res = results[-1].is_ok() + + if res and JobStages.BUILD in stages: + commands = [ + f"mkdir -p {Settings.OUTPUT_DIR}/binaries", + f"cp ./programs/clickhouse {Settings.OUTPUT_DIR}/binaries/clickhouse", + f"zstd --threads=0 --force programs/clickhouse-stripped -o {Settings.OUTPUT_DIR}/binaries/clickhouse-stripped.zst", + "sccache --show-stats", + "clickhouse-client --version", + "clickhouse-test --help", + ] + results.append( + Result.create_from_command_execution( + name="Check and Compress binary", + command=commands, + workdir=build_dir, + with_log=True, + ) + ) + res = results[-1].is_ok() + + if res and JobStages.CONFIG in stages: + commands = [ + f"rm -rf {Settings.TEMP_DIR}/etc/ && mkdir -p {Settings.TEMP_DIR}/etc/clickhouse-client {Settings.TEMP_DIR}/etc/clickhouse-server", + f"cp {current_directory}/programs/server/config.xml {current_directory}/programs/server/users.xml {Settings.TEMP_DIR}/etc/clickhouse-server/", + f"{current_directory}/tests/config/install.sh {Settings.TEMP_DIR}/etc/clickhouse-server {Settings.TEMP_DIR}/etc/clickhouse-client", + # f"cp -a {current_directory}/programs/server/config.d/log_to_console.xml {Settings.TEMP_DIR}/etc/clickhouse-server/config.d/", + f"rm -f {Settings.TEMP_DIR}/etc/clickhouse-server/config.d/secure_ports.xml", + update_path_ch_config, + ] + results.append( + Result.create_from_command_execution( + name="Install ClickHouse Config", + command=commands, 
+ with_log=True, + ) + ) + res = results[-1].is_ok() + + CH = ClickHouseProc() + if res and JobStages.TEST in stages: + stop_watch_ = Utils.Stopwatch() + step_name = "Start ClickHouse Server" + print(step_name) + res = CH.start() + res = res and CH.wait_ready() + results.append( + Result.create_from(name=step_name, status=res, stopwatch=stop_watch_) + ) + + if res and JobStages.TEST in stages: + step_name = "Tests" + print(step_name) + res = res and CH.run_fast_test() + if res: + results.append(FTResultsProcessor(wd=Settings.OUTPUT_DIR).run()) + + CH.terminate() + + Result.create_from(results=results, stopwatch=stop_watch).finish_job_accordingly() + + +if __name__ == "__main__": + main() diff --git a/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt b/ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt similarity index 100% rename from ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt rename to ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt diff --git a/ci_v2/jobs/scripts/check_style/check_aspell.sh b/ci/jobs/scripts/check_style/check_aspell.sh similarity index 100% rename from ci_v2/jobs/scripts/check_style/check_aspell.sh rename to ci/jobs/scripts/check_style/check_aspell.sh diff --git a/ci_v2/jobs/scripts/check_style/check_cpp.sh b/ci/jobs/scripts/check_style/check_cpp.sh similarity index 87% rename from ci_v2/jobs/scripts/check_style/check_cpp.sh rename to ci/jobs/scripts/check_style/check_cpp.sh index 1611fac8c5e..2e47b253bac 100755 --- a/ci_v2/jobs/scripts/check_style/check_cpp.sh +++ b/ci/jobs/scripts/check_style/check_cpp.sh @@ -14,7 +14,8 @@ LC_ALL="en_US.UTF-8" ROOT_PATH="." -EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' +EXCLUDE='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' +EXCLUDE_DOCS='Settings\.cpp|FormatFactorySettingsDeclaration\.h' # From [1]: # But since array_to_string_internal() in array.c still loops over array @@ -31,7 +32,8 @@ function in_array() } find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | + grep -vP $EXCLUDE_DOCS | xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' | # a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | missing whitespace after for/if/while... 
before opening brace | whitespaces inside braces grep -v -P '(//|:\s+\*|\$\(\()| \)"' @@ -39,39 +41,19 @@ find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/n # Tabs find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | - xargs grep $@ -F $'\t' + grep -vP $EXCLUDE | + xargs grep $@ -F $'\t' && echo '^ tabs are not allowed' # // namespace comments are unneeded find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep $@ -P '}\s*//+\s*namespace\s*' # Broken symlinks find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symlinks found" # Duplicated or incorrect setting declarations -SETTINGS_FILE=$(mktemp) -cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep "M(" | awk '{print substr($2, 0, length($2) - 1) " " substr($1, 3, length($1) - 3) " SettingsDeclaration" }' > ${SETTINGS_FILE} -find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep "extern const Settings" -T | awk '{print substr($5, 0, length($5) -1) " " substr($4, 9) " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE} - -# Duplicate extern declarations for settings -awk '{if (seen[$0]++) print $3 " -> " $1 ;}' ${SETTINGS_FILE} | while read line; -do - echo "Found duplicated setting declaration in: $line" -done - -# Incorrect declarations for settings -for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | sort | uniq | awk '{ print $1 }' | sort | uniq -d); -do - expected=$(grep "^$setting " ${SETTINGS_FILE} | grep SettingsDeclaration | awk '{ print $2 }') - grep "^$setting " ${SETTINGS_FILE} | grep -v " $expected" | awk '{ print $3 " found setting " $1 " with type " $2 }' | while read line; - do - echo "In $line but it should be $expected" - done -done - -rm ${SETTINGS_FILE} +bash $ROOT_PATH/utils/check-style/check-settings-style # Unused/Undefined/Duplicates ErrorCodes/ProfileEvents/CurrentMetrics declare -A EXTERN_TYPES @@ -91,12 +73,14 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::Timer ProfileEvents::Type ProfileEvents::TypeEnum + ProfileEvents::ValueType ProfileEvents::dumpToMapColumn ProfileEvents::getProfileEvents ProfileEvents::ThreadIdToCountersSnapshot ProfileEvents::LOCAL_NAME ProfileEvents::keeper_profile_events ProfileEvents::CountersIncrement + ProfileEvents::size CurrentMetrics::add CurrentMetrics::sub @@ -108,6 +92,7 @@ EXTERN_TYPES_EXCLUDES=( CurrentMetrics::values CurrentMetrics::Value CurrentMetrics::keeper_metrics + CurrentMetrics::size ErrorCodes::ErrorCode ErrorCodes::getName @@ -130,7 +115,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # and this matches with zkutil::CreateMode grep -v -e 'src/Common/ZooKeeper/Types.h' -e 'src/Coordination/KeeperConstants.cpp' } | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "extern const $type_of_extern $allowed_chars" } | while read file; do grep -P "extern const $type_of_extern $allowed_chars;" $file | sed -r -e "s/^.*?extern const $type_of_extern ($allowed_chars);.*?$/\1/" | while read val; do if ! 
grep -q "$extern_type::$val" $file; then @@ -148,7 +133,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # sed -i -r "0,/(\s*)extern const $type_of_extern [$allowed_chars]+/s//\1extern const $type_of_extern $val;\n&/" $file || \ # awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace $extern_type\n{\n extern const $type_of_extern '$val';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file ) find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "$extern_type::$allowed_chars" } | while read file; do grep -P "$extern_type::$allowed_chars" $file | grep -P -v '^\s*//' | sed -r -e "s/^.*?$extern_type::($allowed_chars).*?$/\1/" | while read val; do if ! grep -q "extern const $type_of_extern $val" $file; then @@ -161,7 +146,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # Duplicates find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "$extern_type::$allowed_chars" } | while read file; do grep -P "extern const $type_of_extern $allowed_chars;" $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate $extern_type in file $file" done @@ -169,32 +154,32 @@ done # Three or more consecutive empty lines find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done # Check that every header file has #pragma once in first line find $ROOT_PATH/{src,programs,utils} -name '*.h' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | while read file; do [[ $(head -n1 $file) != '#pragma once' ]] && echo "File $file must have '#pragma once' in first line"; done # Too many exclamation marks find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F '!!!' | grep -P '.' && echo "Too many exclamation marks (looks dirty, unconfident)." # Exclamation mark in a message find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F '!",' | grep -P '.' && echo "No need for an exclamation mark (looks dirty, unconfident)." # Trailing whitespaces find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -n -P ' $' | grep -n -P '.' && echo "^ Trailing whitespaces." 
# Forbid stringstream because it's easy to use them incorrectly and hard to debug possible issues find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P 'std::[io]?stringstream' | grep -v "STYLE_CHECK_ALLOW_STD_STRING_STREAM" && echo "Use WriteBufferFromOwnString or ReadBufferFromString instead of std::stringstream" # Forbid std::cerr/std::cout in src (fine in programs/utils) @@ -204,6 +189,7 @@ std_cerr_cout_excludes=( _fuzzer # OK src/Common/ProgressIndication.cpp + src/Common/ProgressTable.cpp # only under #ifdef DBMS_HASH_MAP_DEBUG_RESIZES, that is used only in tests src/Common/HashTable/HashTable.h # SensitiveDataMasker::printStats() @@ -230,11 +216,10 @@ std_cerr_cout_excludes=( ) sources_with_std_cerr_cout=( $( find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \ - grep -vP $EXCLUDE_DIRS | \ + grep -vP $EXCLUDE | \ grep -F -v $(printf -- "-e %s " "${std_cerr_cout_excludes[@]}") | \ xargs grep -F --with-filename -e std::cerr -e std::cout | cut -d: -f1 | sort -u ) ) - # Exclude comments for src in "${sources_with_std_cerr_cout[@]}"; do # suppress stderr, since it may contain warning for #pargma once in headers @@ -279,23 +264,23 @@ fi # Forbid std::filesystem::is_symlink and std::filesystem::read_symlink, because it's easy to use them incorrectly find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '::(is|read)_symlink' | grep -v "STYLE_CHECK_ALLOW_STD_FS_SYMLINK" && echo "Use DB::FS::isSymlink and DB::FS::readSymlink instead" # Forbid __builtin_unreachable(), because it's hard to debug when it becomes reachable find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '__builtin_unreachable' && echo "Use UNREACHABLE() from defines.h instead" # Forbid mt19937() and random_device() which are outdated and slow find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '(std::mt19937|std::mersenne_twister_engine|std::random_device)' && echo "Use pcg64_fast (from pcg_random.h) and randomSeed (from Common/randomSeed.h) instead" # Require checking return value of close(), # since it can hide fd misuse and break other places. find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -e ' close(.*fd' -e ' ::close(' | grep -v = && echo "Return value of close() should be checked" # A small typo can lead to debug code in release builds, see https://github.com/ClickHouse/ClickHouse/pull/47647 @@ -322,18 +307,15 @@ ls -1d $ROOT_PATH/contrib/*-cmake | xargs -I@ find @ -name 'CMakeLists.txt' -or # Wrong spelling of abbreviations, e.g. SQL is right, Sql is wrong. XMLHttpRequest is very wrong. find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P 'Sql|Html|Xml|Cpu|Tcp|Udp|Http|Db|Json|Yaml' | grep -v -P 'RabbitMQ|Azure|Aws|aws|Avro|IO/S3' && echo "Abbreviations such as SQL, XML, HTTP, should be in all caps. For example, SQL is right, Sql is wrong. XMLHttpRequest is very wrong." 
find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F -i 'ErrorCodes::LOGICAL_ERROR, "Logical error:' && echo "If an exception has LOGICAL_ERROR code, there is no need to include the text 'Logical error' in the exception message, because then the phrase 'Logical error' will be printed twice." -# There shouldn't be any code snippets under GPL or LGPL -find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" - PATTERN="allow_"; DIFF=$(comm -3 <(grep -o "\b$PATTERN\w*\b" $ROOT_PATH/src/Core/Settings.cpp | sort -u) <(grep -o -h "\b$PATTERN\w*\b" $ROOT_PATH/src/Databases/enableAllExperimentalSettings.cpp $ROOT_PATH/utils/check-style/experimental_settings_ignore.txt | sort -u)); [ -n "$DIFF" ] && echo "$DIFF" && echo "^^ Detected 'allow_*' settings that might need to be included in src/Databases/enableAllExperimentalSettings.cpp" && echo "Alternatively, consider adding an exception to utils/check-style/experimental_settings_ignore.txt" diff --git a/ci_v2/jobs/scripts/check_style/check_submodules.sh b/ci/jobs/scripts/check_style/check_submodules.sh similarity index 100% rename from ci_v2/jobs/scripts/check_style/check_submodules.sh rename to ci/jobs/scripts/check_style/check_submodules.sh diff --git a/ci_v2/jobs/scripts/check_style/check_typos.sh b/ci/jobs/scripts/check_style/check_typos.sh similarity index 100% rename from ci_v2/jobs/scripts/check_style/check_typos.sh rename to ci/jobs/scripts/check_style/check_typos.sh diff --git a/ci_v2/jobs/scripts/check_style/checks_to_refactor.sh b/ci/jobs/scripts/check_style/checks_to_refactor.sh similarity index 100% rename from ci_v2/jobs/scripts/check_style/checks_to_refactor.sh rename to ci/jobs/scripts/check_style/checks_to_refactor.sh diff --git a/ci_v2/jobs/scripts/check_style/double_whitespaces.pl b/ci/jobs/scripts/check_style/double_whitespaces.pl similarity index 100% rename from ci_v2/jobs/scripts/check_style/double_whitespaces.pl rename to ci/jobs/scripts/check_style/double_whitespaces.pl diff --git a/ci/jobs/scripts/functional_tests_results.py b/ci/jobs/scripts/functional_tests_results.py new file mode 100755 index 00000000000..5ac9d6b985d --- /dev/null +++ b/ci/jobs/scripts/functional_tests_results.py @@ -0,0 +1,284 @@ +import dataclasses +from typing import List + +from praktika.environment import Environment +from praktika.result import Result + +OK_SIGN = "[ OK " +FAIL_SIGN = "[ FAIL " +TIMEOUT_SIGN = "[ Timeout! 
" +UNKNOWN_SIGN = "[ UNKNOWN " +SKIPPED_SIGN = "[ SKIPPED " +HUNG_SIGN = "Found hung queries in processlist" +SERVER_DIED_SIGN = "Server died, terminating all processes" +SERVER_DIED_SIGN2 = "Server does not respond to health check" +DATABASE_SIGN = "Database: " + +SUCCESS_FINISH_SIGNS = ["All tests have finished", "No tests were run"] + +RETRIES_SIGN = "Some tests were restarted" + + +# def write_results(results_file, status_file, results, status): +# with open(results_file, "w", encoding="utf-8") as f: +# out = csv.writer(f, delimiter="\t") +# out.writerows(results) +# with open(status_file, "w", encoding="utf-8") as f: +# out = csv.writer(f, delimiter="\t") +# out.writerow(status) + +BROKEN_TESTS_ANALYZER_TECH_DEBT = [ + "01624_soft_constraints", + # Check after ConstantNode refactoring + "02944_variant_as_common_type", +] + + +class FTResultsProcessor: + @dataclasses.dataclass + class Summary: + total: int + skipped: int + unknown: int + failed: int + success: int + test_results: List[Result] + hung: bool = False + server_died: bool = False + retries: bool = False + success_finish: bool = False + test_end: bool = True + + def __init__(self, wd): + self.tests_output_file = f"{wd}/test_result.txt" + # self.test_results_parsed_file = f"{wd}/test_result.tsv" + # self.status_file = f"{wd}/check_status.tsv" + self.broken_tests = BROKEN_TESTS_ANALYZER_TECH_DEBT + + def _process_test_output(self): + total = 0 + skipped = 0 + unknown = 0 + failed = 0 + success = 0 + hung = False + server_died = False + retries = False + success_finish = False + test_results = [] + test_end = True + + with open(self.tests_output_file, "r", encoding="utf-8") as test_file: + for line in test_file: + original_line = line + line = line.strip() + + if any(s in line for s in SUCCESS_FINISH_SIGNS): + success_finish = True + # Ignore hung check report, since it may be quite large. + # (and may break python parser which has limit of 128KiB for each row). + if HUNG_SIGN in line: + hung = True + break + if SERVER_DIED_SIGN in line or SERVER_DIED_SIGN2 in line: + server_died = True + if RETRIES_SIGN in line: + retries = True + if any( + sign in line + for sign in (OK_SIGN, FAIL_SIGN, UNKNOWN_SIGN, SKIPPED_SIGN) + ): + test_name = line.split(" ")[2].split(":")[0] + + test_time = "" + try: + time_token = line.split("]")[1].strip().split()[0] + float(time_token) + test_time = time_token + except: + pass + + total += 1 + if TIMEOUT_SIGN in line: + if test_name in self.broken_tests: + success += 1 + test_results.append((test_name, "BROKEN", test_time, [])) + else: + failed += 1 + test_results.append((test_name, "Timeout", test_time, [])) + elif FAIL_SIGN in line: + if test_name in self.broken_tests: + success += 1 + test_results.append((test_name, "BROKEN", test_time, [])) + else: + failed += 1 + test_results.append((test_name, "FAIL", test_time, [])) + elif UNKNOWN_SIGN in line: + unknown += 1 + test_results.append((test_name, "FAIL", test_time, [])) + elif SKIPPED_SIGN in line: + skipped += 1 + test_results.append((test_name, "SKIPPED", test_time, [])) + else: + if OK_SIGN in line and test_name in self.broken_tests: + skipped += 1 + test_results.append( + ( + test_name, + "NOT_FAILED", + test_time, + [ + "This test passed. 
Update analyzer_tech_debt.txt.\n" + ], + ) + ) + else: + success += int(OK_SIGN in line) + test_results.append((test_name, "OK", test_time, [])) + test_end = False + elif ( + len(test_results) > 0 + and test_results[-1][1] == "FAIL" + and not test_end + ): + test_results[-1][3].append(original_line) + # Database printed after everything else in case of failures, + # so this is a stop marker for capturing test output. + # + # And it is handled after everything else to include line with database into the report. + if DATABASE_SIGN in line: + test_end = True + + test_results = [ + Result( + name=test[0], + status=test[1], + start_time=None, + duration=float(test[2]), + info="".join(test[3])[:8192], + ) + for test in test_results + ] + + s = self.Summary( + total=total, + skipped=skipped, + unknown=unknown, + failed=failed, + success=success, + test_results=test_results, + hung=hung, + server_died=server_died, + success_finish=success_finish, + retries=retries, + ) + + return s + + def run(self): + state = Result.Status.SUCCESS + s = self._process_test_output() + test_results = s.test_results + + # # Check test_results.tsv for sanitizer asserts, crashes and other critical errors. + # # If the file is present, it's expected to be generated by stress_test.lib check for critical errors + # # In the end this file will be fully regenerated, including both results from critical errors check and + # # functional test results. + # if test_results_path and os.path.exists(test_results_path): + # with open(test_results_path, "r", encoding="utf-8") as test_results_file: + # existing_test_results = list( + # csv.reader(test_results_file, delimiter="\t") + # ) + # for test in existing_test_results: + # if len(test) < 2: + # unknown += 1 + # else: + # test_results.append(test) + # + # if test[1] != "OK": + # failed += 1 + # else: + # success += 1 + + # is_flaky_check = 1 < int(os.environ.get("NUM_TRIES", 1)) + # logging.info("Is flaky check: %s", is_flaky_check) + # # If no tests were run (success == 0) it indicates an error (e.g. server did not start or crashed immediately) + # # But it's Ok for "flaky checks" - they can contain just one test for check which is marked as skipped. + # if failed != 0 or unknown != 0 or (success == 0 and (not is_flaky_check)): + if s.failed != 0 or s.unknown != 0: + state = Result.Status.FAILED + + if s.hung: + state = Result.Status.FAILED + test_results.append( + Result("Some queries hung", "FAIL", info="Some queries hung") + ) + elif s.server_died: + state = Result.Status.FAILED + # When ClickHouse server crashes, some tests are still running + # and fail because they cannot connect to server + for result in test_results: + if result.status == "FAIL": + result.status = "SERVER_DIED" + test_results.append(Result("Server died", "FAIL", info="Server died")) + elif not s.success_finish: + state = Result.Status.FAILED + test_results.append( + Result("Tests are not finished", "FAIL", info="Tests are not finished") + ) + elif s.retries: + test_results.append( + Result("Some tests restarted", "SKIPPED", info="Some tests restarted") + ) + else: + pass + + # TODO: !!! 
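For context on the index-based parsing in _process_test_output above: with the `ts '%Y-%m-%d %H:%M:%S'` prefix added by the fast-test command, each status line is expected to look roughly like the hypothetical sample below (the exact clickhouse-test output format may differ):

```python
# Hypothetical sample line; illustrates the tokenization used above, not the
# exact clickhouse-test output format.
sample = "2024-11-01 12:00:00 00001_select_1:        [ OK ] 0.52 sec."

test_name = sample.split(" ")[2].split(":")[0]        # -> "00001_select_1"
test_time = sample.split("]")[1].strip().split()[0]   # -> "0.52"
print(test_name, test_time)
```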
+ # def test_result_comparator(item): + # # sort by status then by check name + # order = { + # "FAIL": 0, + # "SERVER_DIED": 1, + # "Timeout": 2, + # "NOT_FAILED": 3, + # "BROKEN": 4, + # "OK": 5, + # "SKIPPED": 6, + # } + # return order.get(item[1], 10), str(item[0]), item[1] + # + # test_results.sort(key=test_result_comparator) + + return Result.create_from( + name=Environment.JOB_NAME, + results=test_results, + status=state, + files=[self.tests_output_file], + with_info_from_results=False, + ) + + +# if __name__ == "__main__": +# logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") +# parser = argparse.ArgumentParser( +# description="ClickHouse script for parsing results of functional tests" +# ) +# +# parser.add_argument("--out-results-file", default="/test_output/test_results.tsv") +# parser.add_argument("--out-status-file", default="/test_output/check_status.tsv") +# args = parser.parse_args() +# +# broken_tests = [] +# state, description, test_results = process_result( +# args.in_results_dir, +# broken_tests, +# args.in_test_result_file, +# args.in_results_file, +# ) +# logging.info("Result parsed") +# status = (state, description) +# +# +# +# write_results(args.out_results_file, args.out_status_file, test_results, status) +# logging.info("Result written") diff --git a/ci/praktika/__init__.py b/ci/praktika/__init__.py new file mode 100644 index 00000000000..bde8fd6066a --- /dev/null +++ b/ci/praktika/__init__.py @@ -0,0 +1,5 @@ +from .artifact import Artifact +from .docker import Docker +from .job import Job +from .secret import Secret +from .workflow import Workflow diff --git a/ci/praktika/__main__.py b/ci/praktika/__main__.py new file mode 100644 index 00000000000..7f472ecd9ae --- /dev/null +++ b/ci/praktika/__main__.py @@ -0,0 +1,94 @@ +import argparse +import sys + +from praktika.html_prepare import Html +from praktika.utils import Utils +from praktika.validator import Validator +from praktika.yaml_generator import YamlGenerator + + +def create_parser(): + parser = argparse.ArgumentParser(prog="python3 -m praktika") + + subparsers = parser.add_subparsers(dest="command", help="Available subcommands") + + run_parser = subparsers.add_parser("run", help="Job Runner") + run_parser.add_argument("--job", help="Job Name", type=str, required=True) + run_parser.add_argument( + "--workflow", + help="Workflow Name (required if job name is not uniq per config)", + type=str, + default="", + ) + run_parser.add_argument( + "--no-docker", + help="Do not run job in docker even if job config says so, for local test", + action="store_true", + ) + run_parser.add_argument( + "--docker", + help="Custom docker image for job run, for local test", + type=str, + default="", + ) + run_parser.add_argument( + "--param", + help="Custom parameter to pass into a job script, it's up to job script how to use it, for local test", + type=str, + default=None, + ) + run_parser.add_argument( + "--ci", + help="When not set - dummy env will be generated, for local test", + action="store_true", + default="", + ) + + _yaml_parser = subparsers.add_parser("yaml", help="Generates Yaml Workflows") + + _html_parser = subparsers.add_parser("html", help="Uploads HTML page for reports") + + return parser + + +if __name__ == "__main__": + parser = create_parser() + args = parser.parse_args() + + if args.command == "yaml": + Validator().validate() + YamlGenerator().generate() + elif args.command == "html": + Html.prepare() + elif args.command == "run": + from praktika.mangle import _get_workflows + from 
praktika.runner import Runner + + workflows = _get_workflows(name=args.workflow or None) + job_workflow_pairs = [] + for workflow in workflows: + job = workflow.find_job(args.job, lazy=True) + if job: + job_workflow_pairs.append((job, workflow)) + if not job_workflow_pairs: + Utils.raise_with_error( + f"Failed to find job [{args.job}] workflow [{args.workflow}]" + ) + elif len(job_workflow_pairs) > 1: + Utils.raise_with_error( + f"More than one job [{args.job}] found - try specifying workflow name with --workflow" + ) + else: + job, workflow = job_workflow_pairs[0][0], job_workflow_pairs[0][1] + print(f"Going to run job [{job.name}], workflow [{workflow.name}]") + Runner().run( + workflow=workflow, + job=job, + docker=args.docker, + dummy_env=not args.ci, + no_docker=args.no_docker, + param=args.param, + ) + else: + parser.print_help() + sys.exit(1) diff --git a/ci/praktika/_environment.py b/ci/praktika/_environment.py new file mode 100644 index 00000000000..ce9c6f5b486 --- /dev/null +++ b/ci/praktika/_environment.py @@ -0,0 +1,198 @@ +import dataclasses +import json +import os +from pathlib import Path +from types import SimpleNamespace +from typing import Any, Dict, List, Type + +from praktika import Workflow +from praktika._settings import _Settings +from praktika.utils import MetaClasses, T + + +@dataclasses.dataclass +class _Environment(MetaClasses.Serializable): + WORKFLOW_NAME: str + JOB_NAME: str + REPOSITORY: str + BRANCH: str + SHA: str + PR_NUMBER: int + EVENT_TYPE: str + JOB_OUTPUT_STREAM: str + EVENT_FILE_PATH: str + CHANGE_URL: str + COMMIT_URL: str + BASE_BRANCH: str + RUN_ID: str + RUN_URL: str + INSTANCE_TYPE: str + INSTANCE_ID: str + INSTANCE_LIFE_CYCLE: str + LOCAL_RUN: bool = False + PARAMETER: Any = None + REPORT_INFO: List[str] = dataclasses.field(default_factory=list) + name = "environment" + + @classmethod + def file_name_static(cls, _name=""): + return f"{_Settings.TEMP_DIR}/{cls.name}.json" + + @classmethod + def from_dict(cls: Type[T], obj: Dict[str, Any]) -> T: + JOB_OUTPUT_STREAM = os.getenv("GITHUB_OUTPUT", "") + obj["JOB_OUTPUT_STREAM"] = JOB_OUTPUT_STREAM + if "PARAMETER" in obj: + obj["PARAMETER"] = _to_object(obj["PARAMETER"]) + return cls(**obj) + + def add_info(self, info): + self.REPORT_INFO.append(info) + self.dump() + + @classmethod + def get(cls): + if Path(cls.file_name_static()).is_file(): + return cls.from_fs("environment") + else: + print("WARNING: Environment: get from env") + env = cls.from_env() + env.dump() + return env + + def set_job_name(self, job_name): + self.JOB_NAME = job_name + self.dump() + return self + + @staticmethod + def get_needs_statuses(): + if Path(_Settings.WORKFLOW_STATUS_FILE).is_file(): + with open(_Settings.WORKFLOW_STATUS_FILE, "r", encoding="utf8") as f: + return json.load(f) + else: + print( + f"ERROR: Status file [{_Settings.WORKFLOW_STATUS_FILE}] does not exist" + ) + raise RuntimeError() + + @classmethod + def from_env(cls) -> "_Environment": + WORKFLOW_NAME = os.getenv("GITHUB_WORKFLOW", "") + JOB_NAME = os.getenv("JOB_NAME", "") + REPOSITORY = os.getenv("GITHUB_REPOSITORY", "") + BRANCH = os.getenv("GITHUB_HEAD_REF", "") + + EVENT_FILE_PATH = os.getenv("GITHUB_EVENT_PATH", "") + JOB_OUTPUT_STREAM = os.getenv("GITHUB_OUTPUT", "") + RUN_ID = os.getenv("GITHUB_RUN_ID", "0") + RUN_URL = f"https://github.com/{REPOSITORY}/actions/runs/{RUN_ID}" + BASE_BRANCH = os.getenv("GITHUB_BASE_REF", "") + + if EVENT_FILE_PATH: + with open(EVENT_FILE_PATH, "r", encoding="utf-8") as f: + github_event = json.load(f) + if 
"pull_request" in github_event: + EVENT_TYPE = Workflow.Event.PULL_REQUEST + PR_NUMBER = github_event["pull_request"]["number"] + SHA = github_event["pull_request"]["head"]["sha"] + CHANGE_URL = github_event["pull_request"]["html_url"] + COMMIT_URL = CHANGE_URL + f"/commits/{SHA}" + elif "commits" in github_event: + EVENT_TYPE = Workflow.Event.PUSH + SHA = github_event["after"] + CHANGE_URL = github_event["head_commit"]["url"] # commit url + PR_NUMBER = 0 + COMMIT_URL = CHANGE_URL + else: + assert False, "TODO: not supported" + else: + print("WARNING: Local execution - dummy Environment will be generated") + SHA = "TEST" + PR_NUMBER = -1 + EVENT_TYPE = Workflow.Event.PUSH + CHANGE_URL = "" + COMMIT_URL = "" + + INSTANCE_TYPE = ( + os.getenv("INSTANCE_TYPE", None) + # or Shell.get_output("ec2metadata --instance-type") + or "" + ) + INSTANCE_ID = ( + os.getenv("INSTANCE_ID", None) + # or Shell.get_output("ec2metadata --instance-id") + or "" + ) + INSTANCE_LIFE_CYCLE = ( + os.getenv("INSTANCE_LIFE_CYCLE", None) + # or Shell.get_output( + # "curl -s --fail http://169.254.169.254/latest/meta-data/instance-life-cycle" + # ) + or "" + ) + + return _Environment( + WORKFLOW_NAME=WORKFLOW_NAME, + JOB_NAME=JOB_NAME, + REPOSITORY=REPOSITORY, + BRANCH=BRANCH, + EVENT_FILE_PATH=EVENT_FILE_PATH, + JOB_OUTPUT_STREAM=JOB_OUTPUT_STREAM, + SHA=SHA, + EVENT_TYPE=EVENT_TYPE, + PR_NUMBER=PR_NUMBER, + RUN_ID=RUN_ID, + CHANGE_URL=CHANGE_URL, + COMMIT_URL=COMMIT_URL, + RUN_URL=RUN_URL, + BASE_BRANCH=BASE_BRANCH, + INSTANCE_TYPE=INSTANCE_TYPE, + INSTANCE_ID=INSTANCE_ID, + INSTANCE_LIFE_CYCLE=INSTANCE_LIFE_CYCLE, + REPORT_INFO=[], + ) + + def get_s3_prefix(self, latest=False): + return self.get_s3_prefix_static(self.PR_NUMBER, self.BRANCH, self.SHA, latest) + + @classmethod + def get_s3_prefix_static(cls, pr_number, branch, sha, latest=False): + prefix = "" + if pr_number > 0: + prefix += f"{pr_number}" + else: + prefix += f"{branch}" + if latest: + prefix += f"/latest" + elif sha: + prefix += f"/{sha}" + return prefix + + # TODO: find a better place for the function. 
This file should not import praktika.settings + # as it's requires reading users config, that's why imports nested inside the function + def get_report_url(self): + import urllib + + from praktika.settings import Settings + from praktika.utils import Utils + + path = Settings.HTML_S3_PATH + for bucket, endpoint in Settings.S3_BUCKET_TO_HTTP_ENDPOINT.items(): + if bucket in path: + path = path.replace(bucket, endpoint) + break + REPORT_URL = f"https://{path}/{Path(Settings.HTML_PAGE_FILE).name}?PR={self.PR_NUMBER}&sha={self.SHA}&name_0={urllib.parse.quote(self.WORKFLOW_NAME, safe='')}&name_1={urllib.parse.quote(self.JOB_NAME, safe='')}" + return REPORT_URL + + def is_local_run(self): + return self.LOCAL_RUN + + +def _to_object(data): + if isinstance(data, dict): + return SimpleNamespace(**{k: _to_object(v) for k, v in data.items()}) + elif isinstance(data, list): + return [_to_object(i) for i in data] + else: + return data diff --git a/ci/praktika/_settings.py b/ci/praktika/_settings.py new file mode 100644 index 00000000000..3052d8ef877 --- /dev/null +++ b/ci/praktika/_settings.py @@ -0,0 +1,124 @@ +import dataclasses +from pathlib import Path +from typing import Dict, Iterable, List, Optional + + +@dataclasses.dataclass +class _Settings: + ###################################### + # Pipeline generation settings # + ###################################### + CI_PATH = "./ci" + WORKFLOW_PATH_PREFIX: str = "./.github/workflows" + WORKFLOWS_DIRECTORY: str = f"{CI_PATH}/workflows" + SETTINGS_DIRECTORY: str = f"{CI_PATH}/settings" + CI_CONFIG_JOB_NAME = "Config Workflow" + DOCKER_BUILD_JOB_NAME = "Docker Builds" + FINISH_WORKFLOW_JOB_NAME = "Finish Workflow" + READY_FOR_MERGE_STATUS_NAME = "Ready for Merge" + CI_CONFIG_RUNS_ON: Optional[List[str]] = None + DOCKER_BUILD_RUNS_ON: Optional[List[str]] = None + VALIDATE_FILE_PATHS: bool = True + + ###################################### + # Runtime Settings # + ###################################### + MAX_RETRIES_S3 = 3 + MAX_RETRIES_GH = 3 + + ###################################### + # S3 (artifact storage) settings # + ###################################### + S3_ARTIFACT_PATH: str = "" + + ###################################### + # CI workspace settings # + ###################################### + TEMP_DIR: str = "/tmp/praktika" + OUTPUT_DIR: str = f"{TEMP_DIR}/output" + INPUT_DIR: str = f"{TEMP_DIR}/input" + PYTHON_INTERPRETER: str = "python3" + PYTHON_PACKET_MANAGER: str = "pip3" + PYTHON_VERSION: str = "3.9" + INSTALL_PYTHON_FOR_NATIVE_JOBS: bool = False + INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS: str = "./ci/requirements.txt" + ENVIRONMENT_VAR_FILE: str = f"{TEMP_DIR}/environment.json" + RUN_LOG: str = f"{TEMP_DIR}/praktika_run.log" + + SECRET_GH_APP_ID: str = "GH_APP_ID" + SECRET_GH_APP_PEM_KEY: str = "GH_APP_PEM_KEY" + + ENV_SETUP_SCRIPT: str = "/tmp/praktika_setup_env.sh" + WORKFLOW_STATUS_FILE: str = f"{TEMP_DIR}/workflow_status.json" + + ###################################### + # CI Cache settings # + ###################################### + CACHE_VERSION: int = 1 + CACHE_DIGEST_LEN: int = 20 + CACHE_S3_PATH: str = "" + CACHE_LOCAL_PATH: str = f"{TEMP_DIR}/ci_cache" + + ###################################### + # Report settings # + ###################################### + HTML_S3_PATH: str = "" + HTML_PAGE_FILE: str = "./praktika/json.html" + TEXT_CONTENT_EXTENSIONS: Iterable[str] = frozenset([".txt", ".log"]) + S3_BUCKET_TO_HTTP_ENDPOINT: Optional[Dict[str, str]] = None + + DOCKERHUB_USERNAME: str = "" + DOCKERHUB_SECRET: str = "" + DOCKER_WD: 
str = "/wd" + + ###################################### + # CI DB Settings # + ###################################### + SECRET_CI_DB_URL: str = "CI_DB_URL" + SECRET_CI_DB_PASSWORD: str = "CI_DB_PASSWORD" + CI_DB_DB_NAME = "" + CI_DB_TABLE_NAME = "" + CI_DB_INSERT_TIMEOUT_SEC = 5 + + +_USER_DEFINED_SETTINGS = [ + "S3_ARTIFACT_PATH", + "CACHE_S3_PATH", + "HTML_S3_PATH", + "S3_BUCKET_TO_HTTP_ENDPOINT", + "TEXT_CONTENT_EXTENSIONS", + "TEMP_DIR", + "OUTPUT_DIR", + "INPUT_DIR", + "CI_CONFIG_RUNS_ON", + "DOCKER_BUILD_RUNS_ON", + "CI_CONFIG_JOB_NAME", + "PYTHON_INTERPRETER", + "PYTHON_VERSION", + "PYTHON_PACKET_MANAGER", + "INSTALL_PYTHON_FOR_NATIVE_JOBS", + "INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS", + "MAX_RETRIES_S3", + "MAX_RETRIES_GH", + "VALIDATE_FILE_PATHS", + "DOCKERHUB_USERNAME", + "DOCKERHUB_SECRET", + "READY_FOR_MERGE_STATUS_NAME", + "SECRET_CI_DB_URL", + "SECRET_CI_DB_PASSWORD", + "CI_DB_DB_NAME", + "CI_DB_TABLE_NAME", + "CI_DB_INSERT_TIMEOUT_SEC", + "SECRET_GH_APP_PEM_KEY", + "SECRET_GH_APP_ID", +] + + +class GHRunners: + ubuntu = "ubuntu-latest" + + +if __name__ == "__main__": + for setting in _USER_DEFINED_SETTINGS: + print(_Settings().__getattribute__(setting)) + # print(dataclasses.asdict(_Settings())) diff --git a/ci/praktika/artifact.py b/ci/praktika/artifact.py new file mode 100644 index 00000000000..ba05f18b9b1 --- /dev/null +++ b/ci/praktika/artifact.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass + + +class Artifact: + class Type: + GH = "github" + S3 = "s3" + PHONY = "phony" + + @dataclass + class Config: + """ + name - artifact name + type - artifact type, see Artifact.Type + path - file path or glob, e.g. "path/**/[abc]rtifac?/*" + """ + + name: str + type: str + path: str + _provided_by: str = "" + _s3_path: str = "" + + def is_s3_artifact(self): + return self.type == Artifact.Type.S3 + + @classmethod + def define_artifact(cls, name, type, path): + return cls.Config(name=name, type=type, path=path) + + @classmethod + def define_gh_artifact(cls, name, path): + return cls.define_artifact(name=name, type=cls.Type.GH, path=path) diff --git a/ci/praktika/cache.py b/ci/praktika/cache.py new file mode 100644 index 00000000000..cbaea9b489b --- /dev/null +++ b/ci/praktika/cache.py @@ -0,0 +1,127 @@ +import dataclasses +import json +from pathlib import Path + +from praktika import Artifact, Job, Workflow +from praktika._environment import _Environment +from praktika.digest import Digest +from praktika.s3 import S3 +from praktika.settings import Settings +from praktika.utils import Utils + + +class Cache: + @dataclasses.dataclass + class CacheRecord: + class Type: + SUCCESS = "success" + + type: str + sha: str + pr_number: int + branch: str + + def dump(self, path): + with open(path, "w", encoding="utf8") as f: + json.dump(dataclasses.asdict(self), f) + + @classmethod + def from_fs(cls, path): + with open(path, "r", encoding="utf8") as f: + return Cache.CacheRecord(**json.load(f)) + + @classmethod + def from_dict(cls, obj): + return Cache.CacheRecord(**obj) + + def __init__(self): + self.digest = Digest() + self.success = {} # type Dict[str, Any] + + @classmethod + def push_success_record(cls, job_name, job_digest, sha): + type_ = Cache.CacheRecord.Type.SUCCESS + record = Cache.CacheRecord( + type=type_, + sha=sha, + pr_number=_Environment.get().PR_NUMBER, + branch=_Environment.get().BRANCH, + ) + assert ( + Settings.CACHE_S3_PATH + ), f"Setting CACHE_S3_PATH must be defined with enabled CI Cache" + record_path = 
f"{Settings.CACHE_S3_PATH}/v{Settings.CACHE_VERSION}/{Utils.normalize_string(job_name)}/{job_digest}" + record_file = Path(Settings.TEMP_DIR) / type_ + record.dump(record_file) + S3.copy_file_to_s3(s3_path=record_path, local_path=record_file) + record_file.unlink() + + def fetch_success(self, job_name, job_digest): + type_ = Cache.CacheRecord.Type.SUCCESS + assert ( + Settings.CACHE_S3_PATH + ), f"Setting CACHE_S3_PATH must be defined with enabled CI Cache" + record_path = f"{Settings.CACHE_S3_PATH}/v{Settings.CACHE_VERSION}/{Utils.normalize_string(job_name)}/{job_digest}/{type_}" + record_file_local_dir = ( + f"{Settings.CACHE_LOCAL_PATH}/{Utils.normalize_string(job_name)}/" + ) + Path(record_file_local_dir).mkdir(parents=True, exist_ok=True) + + if S3.head_object(record_path): + res = S3.copy_file_from_s3( + s3_path=record_path, local_path=record_file_local_dir + ) + else: + res = None + + if res: + print(f"Cache record found, job [{job_name}], digest [{job_digest}]") + self.success[job_name] = True + return Cache.CacheRecord.from_fs(Path(record_file_local_dir) / type_) + return None + + +if __name__ == "__main__": + # test + c = Cache() + workflow = Workflow.Config( + name="TEST", + event=Workflow.Event.PULL_REQUEST, + jobs=[ + Job.Config( + name="JobA", + runs_on=["some"], + command="python -m unittest ./ci/tests/example_1/test_example_produce_artifact.py", + provides=["greet"], + job_requirements=Job.Requirements( + python_requirements_txt="./ci/requirements.txt" + ), + digest_config=Job.CacheDigestConfig( + # example: use glob to include files + include_paths=["./ci/tests/example_1/test_example_consume*.py"], + ), + ), + Job.Config( + name="JobB", + runs_on=["some"], + command="python -m unittest ./ci/tests/example_1/test_example_consume_artifact.py", + requires=["greet"], + job_requirements=Job.Requirements( + python_requirements_txt="./ci/requirements.txt" + ), + digest_config=Job.CacheDigestConfig( + # example: use dir to include files recursively + include_paths=["./ci/tests/example_1"], + # example: use glob to exclude files from digest + exclude_paths=[ + "./ci/tests/example_1/test_example_consume*", + "./**/*.pyc", + ], + ), + ), + ], + artifacts=[Artifact.Config(type="s3", name="greet", path="hello")], + enable_cache=True, + ) + for job in workflow.jobs: + print(c.digest.calc_job_digest(job)) diff --git a/ci/praktika/cidb.py b/ci/praktika/cidb.py new file mode 100644 index 00000000000..087845ec762 --- /dev/null +++ b/ci/praktika/cidb.py @@ -0,0 +1,136 @@ +import copy +import dataclasses +import json +from typing import Optional + +import requests +from praktika._environment import _Environment +from praktika.result import Result +from praktika.settings import Settings +from praktika.utils import Utils + + +class CIDB: + @dataclasses.dataclass + class TableRecord: + pull_request_number: int + commit_sha: str + commit_url: str + check_name: str + check_status: str + check_duration_ms: int + check_start_time: int + report_url: str + pull_request_url: str + base_ref: str + base_repo: str + head_ref: str + head_repo: str + task_url: str + instance_type: str + instance_id: str + test_name: str + test_status: str + test_duration_ms: Optional[int] + test_context_raw: str + + def __init__(self, url, passwd): + self.url = url + self.auth = { + "X-ClickHouse-User": "default", + "X-ClickHouse-Key": passwd, + } + + @classmethod + def json_data_generator(cls, result: Result): + env = _Environment.get() + base_record = cls.TableRecord( + pull_request_number=env.PR_NUMBER, + 
commit_sha=env.SHA, + commit_url=env.COMMIT_URL, + check_name=result.name, + check_status=result.status, + check_duration_ms=int(result.duration * 1000), + check_start_time=Utils.timestamp_to_str(result.start_time), + report_url=env.get_report_url(), + pull_request_url=env.CHANGE_URL, + base_ref=env.BASE_BRANCH, + base_repo=env.REPOSITORY, + head_ref=env.BRANCH, + # TODO: remove from table? + head_repo=env.REPOSITORY, + # TODO: remove from table? + task_url="", + instance_type=",".join([env.INSTANCE_TYPE, env.INSTANCE_LIFE_CYCLE]), + instance_id=env.INSTANCE_ID, + test_name="", + test_status="", + test_duration_ms=None, + test_context_raw=result.info, + ) + yield json.dumps(dataclasses.asdict(base_record)) + for result_ in result.results: + record = copy.deepcopy(base_record) + record.test_name = result_.name + if result_.start_time: + record.check_start_time = (Utils.timestamp_to_str(result.start_time),) + record.test_status = result_.status + record.test_duration_ms = int(result_.duration * 1000) + record.test_context_raw = result_.info + yield json.dumps(dataclasses.asdict(record)) + + def insert(self, result: Result): + # Create a session object + params = { + "database": Settings.CI_DB_DB_NAME, + "query": f"INSERT INTO {Settings.CI_DB_TABLE_NAME} FORMAT JSONEachRow", + "date_time_input_format": "best_effort", + "send_logs_level": "warning", + } + + session = requests.Session() + + for json_str in self.json_data_generator(result): + try: + response1 = session.post( + url=self.url, + params=params, + data=json_str, + headers=self.auth, + timeout=Settings.CI_DB_INSERT_TIMEOUT_SEC, + ) + except Exception as ex: + raise ex + + session.close() + + def check(self): + # Create a session object + params = { + "database": Settings.CI_DB_DB_NAME, + "query": f"SELECT 1", + } + try: + response = requests.post( + url=self.url, + params=params, + data="", + headers=self.auth, + timeout=Settings.CI_DB_INSERT_TIMEOUT_SEC, + ) + if not response.ok: + print("ERROR: No connection to CI DB") + return ( + False, + f"ERROR: No connection to CI DB [{response.status_code}/{response.reason}]", + ) + if not response.json() == 1: + print("ERROR: CI DB smoke test failed select 1 == 1") + return ( + False, + f"ERROR: CI DB smoke test failed [select 1 ==> {response.json()}]", + ) + except Exception as ex: + print(f"ERROR: Exception [{ex}]") + return False, "CIDB: ERROR: Exception [{ex}]" + return True, "" diff --git a/ci/praktika/digest.py b/ci/praktika/digest.py new file mode 100644 index 00000000000..93b62b13dc0 --- /dev/null +++ b/ci/praktika/digest.py @@ -0,0 +1,112 @@ +import dataclasses +import hashlib +import os +from hashlib import md5 +from pathlib import Path +from typing import List + +from praktika import Job +from praktika.docker import Docker +from praktika.settings import Settings +from praktika.utils import Utils + + +class Digest: + def __init__(self): + self.digest_cache = {} + + @staticmethod + def _hash_digest_config(digest_config: Job.CacheDigestConfig) -> str: + data_dict = dataclasses.asdict(digest_config) + hash_obj = md5() + hash_obj.update(str(data_dict).encode()) + hash_string = hash_obj.hexdigest() + return hash_string + + def calc_job_digest(self, job_config: Job.Config): + config = job_config.digest_config + if not config: + return "f" * Settings.CACHE_DIGEST_LEN + + cache_key = self._hash_digest_config(config) + + if cache_key in self.digest_cache: + return self.digest_cache[cache_key] + + included_files = Utils.traverse_paths( + job_config.digest_config.include_paths, + 
job_config.digest_config.exclude_paths, + sorted=True, + ) + + print( + f"calc digest for job [{job_config.name}]: hash_key [{cache_key}], include [{len(included_files)}] files" + ) + # Sort files to ensure consistent hash calculation + included_files.sort() + + # Calculate MD5 hash + res = "" + if not included_files: + res = "f" * Settings.CACHE_DIGEST_LEN + print(f"NOTE: empty digest config [{config}] - return dummy digest") + else: + hash_md5 = hashlib.md5() + for file_path in included_files: + res = self._calc_file_digest(file_path, hash_md5) + assert res + self.digest_cache[cache_key] = res + return res + + def calc_docker_digest( + self, + docker_config: Docker.Config, + dependency_configs: List[Docker.Config], + hash_md5=None, + ): + """ + + :param hash_md5: + :param dependency_configs: list of Docker.Config(s) that :param docker_config: depends on + :param docker_config: Docker.Config to calculate digest for + :return: + """ + print(f"Calculate digest for docker [{docker_config.name}]") + paths = Utils.traverse_path(docker_config.path, sorted=True) + if not hash_md5: + hash_md5 = hashlib.md5() + + dependencies = [] + for dependency_name in docker_config.depends_on: + for dependency_config in dependency_configs: + if dependency_config.name == dependency_name: + print( + f"Add docker [{dependency_config.name}] as dependency for docker [{docker_config.name}] digest calculation" + ) + dependencies.append(dependency_config) + + for dependency in dependencies: + _ = self.calc_docker_digest(dependency, dependency_configs, hash_md5) + + for path in paths: + _ = self._calc_file_digest(path, hash_md5=hash_md5) + + return hash_md5.hexdigest()[: Settings.CACHE_DIGEST_LEN] + + @staticmethod + def _calc_file_digest(file_path, hash_md5): + # Resolve file path if it's a symbolic link + resolved_path = file_path + if Path(file_path).is_symlink(): + resolved_path = os.path.realpath(file_path) + if not Path(resolved_path).is_file(): + print( + f"WARNING: No valid file resolved by link {file_path} -> {resolved_path} - skipping digest calculation" + ) + return hash_md5.hexdigest()[: Settings.CACHE_DIGEST_LEN] + + with open(resolved_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + + return hash_md5.hexdigest()[: Settings.CACHE_DIGEST_LEN] diff --git a/ci/praktika/docker.py b/ci/praktika/docker.py new file mode 100644 index 00000000000..82e97b4624c --- /dev/null +++ b/ci/praktika/docker.py @@ -0,0 +1,60 @@ +import dataclasses +from typing import List + +from praktika.utils import Shell + + +class Docker: + class Platforms: + ARM = "linux/arm64" + AMD = "linux/amd64" + arm_amd = [ARM, AMD] + + @dataclasses.dataclass + class Config: + name: str + path: str + depends_on: List[str] + platforms: List[str] + + @classmethod + def build(cls, config: "Docker.Config", log_file, digests, add_latest): + tags_substr = f" -t {config.name}:{digests[config.name]}" + if add_latest: + tags_substr = f" -t {config.name}:latest" + + from_tag = "" + if config.depends_on: + assert ( + len(config.depends_on) == 1 + ), f"Only one dependency in depends_on is currently supported, docker [{config}]" + from_tag = f" --build-arg FROM_TAG={digests[config.depends_on[0]]}" + + command = f"docker buildx build --platform {','.join(config.platforms)} {tags_substr} {from_tag} --cache-to type=inline --cache-from type=registry,ref={config.name} --push {config.path}" + return Shell.run(command, log_file=log_file, verbose=True) + + @classmethod + def sort_in_build_order(cls, dockers: 
List["Docker.Config"]): + ready_names = [] + i = 0 + while i < len(dockers): + docker = dockers[i] + if not docker.depends_on or all( + dep in ready_names for dep in docker.depends_on + ): + ready_names.append(docker.name) + i += 1 + else: + dockers.append(dockers.pop(i)) + return dockers + + @classmethod + def login(cls, user_name, user_password): + print("Docker: log in to dockerhub") + return Shell.check( + f"docker login --username '{user_name}' --password-stdin", + strict=True, + stdin_str=user_password, + encoding="utf-8", + verbose=True, + ) diff --git a/ci/praktika/environment.py b/ci/praktika/environment.py new file mode 100644 index 00000000000..8f53aa6230b --- /dev/null +++ b/ci/praktika/environment.py @@ -0,0 +1,3 @@ +from praktika._environment import _Environment + +Environment = _Environment.get() diff --git a/tests/integration/test_system_replicated_fetches/__init__.py b/ci/praktika/execution/__init__.py similarity index 100% rename from tests/integration/test_system_replicated_fetches/__init__.py rename to ci/praktika/execution/__init__.py diff --git a/ci/praktika/execution/__main__.py b/ci/praktika/execution/__main__.py new file mode 100644 index 00000000000..c1f08fcca6a --- /dev/null +++ b/ci/praktika/execution/__main__.py @@ -0,0 +1,4 @@ +from praktika.execution.machine_init import run + +if __name__ == "__main__": + run() diff --git a/ci/praktika/execution/execution_settings.py b/ci/praktika/execution/execution_settings.py new file mode 100644 index 00000000000..d04b9a773ec --- /dev/null +++ b/ci/praktika/execution/execution_settings.py @@ -0,0 +1,31 @@ +import os + +from praktika.utils import MetaClasses + + +class ScalingType(metaclass=MetaClasses.WithIter): + DISABLED = "disabled" + AUTOMATIC_SCALE_DOWN = "scale_down" + AUTOMATIC_SCALE_UP_DOWN = "scale" + + +class DefaultExecutionSettings: + GH_ACTIONS_DIRECTORY: str = "/home/ubuntu/gh_actions" + RUNNER_SCALING_TYPE: str = ScalingType.AUTOMATIC_SCALE_UP_DOWN + MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC: int = 30 + + +class ExecutionSettings: + GH_ACTIONS_DIRECTORY = os.getenv( + "GH_ACTIONS_DIRECTORY", DefaultExecutionSettings.GH_ACTIONS_DIRECTORY + ) + RUNNER_SCALING_TYPE = os.getenv( + "RUNNER_SCALING_TYPE", DefaultExecutionSettings.RUNNER_SCALING_TYPE + ) + MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC = int( + os.getenv( + "MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC", + DefaultExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC, + ) + ) + LOCAL_EXECUTION = bool(os.getenv("CLOUD", "0") == "0") diff --git a/ci/praktika/execution/machine_init.py b/ci/praktika/execution/machine_init.py new file mode 100644 index 00000000000..7829538c5a9 --- /dev/null +++ b/ci/praktika/execution/machine_init.py @@ -0,0 +1,338 @@ +import os +import platform +import signal +import time +import traceback + +import requests +from praktika.execution.execution_settings import ExecutionSettings, ScalingType +from praktika.utils import ContextManager, Shell + + +class StateMachine: + class StateNames: + INIT = "init" + WAIT = "wait" + RUN = "run" + + def __init__(self): + self.state = self.StateNames.INIT + self.scale_type = ExecutionSettings.RUNNER_SCALING_TYPE + self.machine = Machine(scaling_type=self.scale_type).update_instance_info() + self.state_updated_at = int(time.time()) + self.forked = False + + def kick(self): + if self.state == self.StateNames.INIT: + self.machine.config_actions().run_actions_async() + print("State Machine: INIT -> WAIT") + self.state = self.StateNames.WAIT + self.state_updated_at = int(time.time()) + # TODO: add monitoring + if not 
self.machine.is_actions_process_healthy(): + print(f"ERROR: GH runner process unexpectedly died") + self.machine.self_terminate(decrease_capacity=False) + elif self.state == self.StateNames.WAIT: + res = self.machine.check_job_assigned() + if res: + print("State Machine: WAIT -> RUN") + self.state = self.StateNames.RUN + self.state_updated_at = int(time.time()) + self.check_scale_up() + else: + self.check_scale_down() + elif self.state == self.StateNames.RUN: + res = self.machine.check_job_running() + if res: + pass + else: + print("State Machine: RUN -> INIT") + self.state = self.StateNames.INIT + self.state_updated_at = int(time.time()) + + def check_scale_down(self): + if self.scale_type not in ( + ScalingType.AUTOMATIC_SCALE_DOWN, + ScalingType.AUTOMATIC_SCALE_UP_DOWN, + ): + return + if ScalingType.AUTOMATIC_SCALE_UP_DOWN and not self.forked: + print( + f"Scaling type is AUTOMATIC_SCALE_UP_DOWN and machine has not run a job - do not scale down" + ) + return + if ( + int(time.time()) - self.state_updated_at + > ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC + ): + print( + f"No job assigned for more than MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC [{ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC}] - scale down the instance" + ) + if not ExecutionSettings.LOCAL_EXECUTION: + self.machine.self_terminate(decrease_capacity=True) + else: + print("Local execution - skip scaling operation") + + def check_scale_up(self): + if self.scale_type not in (ScalingType.AUTOMATIC_SCALE_UP_DOWN,): + return + if self.forked: + print("This instance already forked once - do not scale up") + return + self.machine.self_fork() + self.forked = True + + def run(self): + self.machine.unconfig_actions() + while True: + self.kick() + time.sleep(5) + + def terminate(self): + try: + self.machine.unconfig_actions() + except: + print("WARNING: failed to unconfig runner") + if not ExecutionSettings.LOCAL_EXECUTION: + if self.machine is not None: + self.machine.self_terminate(decrease_capacity=False) + time.sleep(10) + # wait termination + print("ERROR: failed to terminate instance via aws cli - try os call") + os.system("sudo shutdown now") + else: + print("NOTE: Local execution - machine won't be terminated") + + +class Machine: + @staticmethod + def get_latest_gh_actions_release(): + url = f"https://api.github.com/repos/actions/runner/releases/latest" + response = requests.get(url, timeout=5) + if response.status_code == 200: + latest_release = response.json() + return latest_release["tag_name"].removeprefix("v") + else: + print(f"Failed to get the latest release: {response.status_code}") + return None + + def __init__(self, scaling_type): + self.os_name = platform.system().lower() + assert self.os_name == "linux", f"Unsupported OS [{self.os_name}]" + if platform.machine() == "x86_64": + self.arch = "x64" + elif "aarch64" in platform.machine().lower(): + self.arch = "arm64" + else: + assert False, f"Unsupported arch [{platform.machine()}]" + self.instance_id = None + self.asg_name = None + self.runner_api_endpoint = None + self.runner_type = None + self.labels = [] + self.proc = None + assert scaling_type in ScalingType + self.scaling_type = scaling_type + + def install_gh_actions_runner(self): + gh_actions_version = self.get_latest_gh_actions_release() + assert self.os_name and gh_actions_version and self.arch + Shell.check( + f"rm -rf {ExecutionSettings.GH_ACTIONS_DIRECTORY}", + strict=True, + verbose=True, + ) + Shell.check( + f"mkdir {ExecutionSettings.GH_ACTIONS_DIRECTORY}", strict=True, verbose=True + ) + 
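+        # download the runner release matching this OS/arch, unpack it, install its OS dependencies, and hand the directory over to the ubuntu user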
with ContextManager.cd(ExecutionSettings.GH_ACTIONS_DIRECTORY): + Shell.check( + f"curl -O -L https://github.com/actions/runner/releases/download/v{gh_actions_version}/actions-runner-{self.os_name}-{self.arch}-{gh_actions_version}.tar.gz", + strict=True, + verbose=True, + ) + Shell.check(f"tar xzf *tar.gz", strict=True, verbose=True) + Shell.check(f"rm -f *tar.gz", strict=True, verbose=True) + Shell.check(f"sudo ./bin/installdependencies.sh", strict=True, verbose=True) + Shell.check( + f"chown -R ubuntu:ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}", + strict=True, + verbose=True, + ) + + def _get_gh_token_from_ssm(self): + gh_token = Shell.get_output_or_raise( + "/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value" + ) + return gh_token + + def update_instance_info(self): + self.instance_id = Shell.get_output_or_raise("ec2metadata --instance-id") + assert self.instance_id + self.asg_name = Shell.get_output( + f"aws ec2 describe-instances --instance-id {self.instance_id} --query \"Reservations[].Instances[].Tags[?Key=='aws:autoscaling:groupName'].Value\" --output text" + ) + # self.runner_type = Shell.get_output_or_raise( + # f'/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values={self.instance_id}" --query "Tags[?Key==\'github:runner-type\'].Value" --output text' + # ) + self.runner_type = self.asg_name + if ( + self.scaling_type != ScalingType.DISABLED + and not ExecutionSettings.LOCAL_EXECUTION + ): + assert ( + self.asg_name and self.runner_type + ), f"Failed to retrieve ASG name, which is required for scaling_type [{self.scaling_type}]" + org = os.getenv("MY_ORG", "") + assert ( + org + ), "MY_ORG env variable myst be set to use init script for runner machine" + self.runner_api_endpoint = f"https://github.com/{org}" + + self.labels = ["self-hosted", self.runner_type] + return self + + @classmethod + def check_job_assigned(cls): + runner_pid = Shell.get_output_or_raise("pgrep Runner.Listener") + if not runner_pid: + print("check_job_assigned: No runner pid") + return False + log_file = Shell.get_output_or_raise( + f"lsof -p {runner_pid} | grep -o {ExecutionSettings.GH_ACTIONS_DIRECTORY}/_diag/Runner.*log" + ) + if not log_file: + print("check_job_assigned: No log file") + return False + return Shell.check(f"grep -q 'Terminal] .* Running job:' {log_file}") + + def check_job_running(self): + if self.proc is None: + print(f"WARNING: No job started") + return False + exit_code = self.proc.poll() + if exit_code is None: + return True + else: + print(f"Job runner finished with exit code [{exit_code}]") + self.proc = None + return False + + def config_actions(self): + if not self.instance_id: + self.update_instance_info() + token = self._get_gh_token_from_ssm() + assert token and self.instance_id and self.runner_api_endpoint and self.labels + command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh --token {token} \ + --url {self.runner_api_endpoint} --ephemeral --unattended --replace \ + --runnergroup Default --labels {','.join(self.labels)} --work wd --name {self.instance_id}" + res = 1 + i = 0 + while i < 10 and res != 0: + res = Shell.run(command) + i += 1 + if res != 0: + print( + f"ERROR: failed to configure GH actions runner after [{i}] attempts, exit code [{res}], retry after 10s" + ) + time.sleep(10) + self._get_gh_token_from_ssm() + if res == 0: + print("GH action runner has been configured") + else: + assert False, "GH actions runner configuration failed" + 
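+        # returning self allows call chaining, e.g. machine.config_actions().run_actions_async()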
return self + + def unconfig_actions(self): + token = self._get_gh_token_from_ssm() + command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh remove --token {token}" + Shell.check(command, strict=True) + return self + + def run_actions_async(self): + command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/run.sh" + self.proc = Shell.run_async(command) + assert self.proc is not None + return self + + def is_actions_process_healthy(self): + try: + if self.proc.poll() is None: + return True + + stdout, stderr = self.proc.communicate() + + if self.proc.returncode != 0: + # Handle failure + print( + f"GH Action process failed with return code {self.proc.returncode}" + ) + print(f"Error output: {stderr}") + return False + else: + print(f"GH Action process is not running") + return False + except Exception as e: + print(f"GH Action process exception: {e}") + return False + + def self_terminate(self, decrease_capacity): + print( + f"WARNING: Self terminate is called, decrease_capacity [{decrease_capacity}]" + ) + traceback.print_stack() + if not self.instance_id: + self.update_instance_info() + assert self.instance_id + command = f"aws autoscaling terminate-instance-in-auto-scaling-group --instance-id {self.instance_id}" + if decrease_capacity: + command += " --should-decrement-desired-capacity" + else: + command += " --no-should-decrement-desired-capacity" + Shell.check( + command=command, + verbose=True, + ) + + def self_fork(self): + current_capacity = Shell.get_output( + f'aws autoscaling describe-auto-scaling-groups --auto-scaling-group-name {self.asg_name} \ + --query "AutoScalingGroups[0].DesiredCapacity" --output text' + ) + current_capacity = int(current_capacity) + if not current_capacity: + print("ERROR: failed to get current capacity - cannot scale up") + return + desired_capacity = current_capacity + 1 + command = f"aws autoscaling set-desired-capacity --auto-scaling-group-name {self.asg_name} --desired-capacity {desired_capacity}" + print(f"Increase capacity [{current_capacity} -> {desired_capacity}]") + res = Shell.check( + command=command, + verbose=True, + ) + if not res: + print("ERROR: failed to increase capacity - cannot scale up") + + +def handle_signal(signum, _frame): + print(f"FATAL: Received signal {signum}") + raise RuntimeError(f"killed by signal {signum}") + + +def run(): + signal.signal(signal.SIGINT, handle_signal) + signal.signal(signal.SIGTERM, handle_signal) + m = None + try: + m = StateMachine() + m.run() + except Exception as e: + print(f"FATAL: Exception [{e}] - terminate instance") + time.sleep(10) + if m: + m.terminate() + raise e + + +if __name__ == "__main__": + run() diff --git a/ci/praktika/favicon/lambda_function.py b/ci/praktika/favicon/lambda_function.py new file mode 100644 index 00000000000..7d89566de8c --- /dev/null +++ b/ci/praktika/favicon/lambda_function.py @@ -0,0 +1,102 @@ +import base64 +import random +import struct +import zlib + + +def create_favicon(): + # Image dimensions + width = 32 + height = 32 + + # Initialize a transparent background image (RGBA: 4 bytes per pixel) + image_data = bytearray( + [0, 0, 0, 0] * width * height + ) # Set alpha to 0 for transparency + + # Draw 4 vertical lines with color #FAFF68 (RGB: 250, 255, 104) + line_color = [250, 255, 104, 255] # RGBA for #FAFF68 with full opacity + line_width = 4 + space_width = 3 + x_start = space_width + line_number = 4 + + line_height = height - space_width + + for i in range(line_number): + # Randomly pick a starting y position for each line + 
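+        # the line wraps around vertically: each y position is taken modulo the image height below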
y_start = random.randint(0, height - 1) + + # Draw the line with random shift along Y-axis + for y in range(line_height): + y_pos = (y + y_start) % height + for x in range(line_width): + pixel_index = (y_pos * width + x_start + x) * 4 + image_data[pixel_index : pixel_index + 4] = line_color + + x_start += line_width + space_width + + # Convert the RGBA image to PNG format + png_data = create_png(width, height, image_data) + + # Convert PNG to ICO format + ico_data = create_ico(png_data) + + return ico_data + + +def create_png(width, height, image_data): + def write_chunk(chunk_type, data): + chunk_len = struct.pack(">I", len(data)) + chunk_crc = struct.pack(">I", zlib.crc32(chunk_type + data) & 0xFFFFFFFF) + return chunk_len + chunk_type + data + chunk_crc + + png_signature = b"\x89PNG\r\n\x1a\n" + ihdr_chunk = struct.pack(">IIBBBBB", width, height, 8, 6, 0, 0, 0) + idat_data = zlib.compress( + b"".join( + b"\x00" + image_data[y * width * 4 : (y + 1) * width * 4] + for y in range(height) + ), + 9, + ) + idat_chunk = write_chunk(b"IDAT", idat_data) + iend_chunk = write_chunk(b"IEND", b"") + + return png_signature + write_chunk(b"IHDR", ihdr_chunk) + idat_chunk + iend_chunk + + +def create_ico(png_data): + # ICO header: reserved (2 bytes), type (2 bytes), image count (2 bytes) + ico_header = struct.pack(" None: + wf = _get_workflows(workflow_name) # type: List[Workflow.Config] + pem = wf[0].get_secret(Settings.SECRET_GH_APP_PEM_KEY).get_value() + assert pem + app_id = wf[0].get_secret(Settings.SECRET_GH_APP_ID).get_value() + # Generate JWT + jwt_token = cls._generate_jwt(app_id, pem) + # Get Installation ID + installation_id = cls._get_installation_id(jwt_token) + # Get Installation Access Token + access_token = cls._get_access_token(jwt_token, installation_id) + Shell.check(f"echo {access_token} | gh auth login --with-token", strict=True) + + +if __name__ == "__main__": + GHAuth.auth(sys.argv[1]) diff --git a/ci/praktika/hook_cache.py b/ci/praktika/hook_cache.py new file mode 100644 index 00000000000..b1b5c654f20 --- /dev/null +++ b/ci/praktika/hook_cache.py @@ -0,0 +1,124 @@ +from praktika._environment import _Environment +from praktika.cache import Cache +from praktika.mangle import _get_workflows +from praktika.runtime import RunConfig +from praktika.settings import Settings +from praktika.utils import Utils + + +class CacheRunnerHooks: + @classmethod + def configure(cls, _workflow): + workflow_config = RunConfig.from_fs(_workflow.name) + cache = Cache() + assert _Environment.get().WORKFLOW_NAME + workflow = _get_workflows(name=_Environment.get().WORKFLOW_NAME)[0] + print(f"Workflow Configure, workflow [{workflow.name}]") + assert ( + workflow.enable_cache + ), f"Outdated yaml pipelines or BUG. 
Configuration must be run only for workflow with enabled cache, workflow [{workflow.name}]" + artifact_digest_map = {} + job_digest_map = {} + for job in workflow.jobs: + if not job.digest_config: + print( + f"NOTE: job [{job.name}] has no Config.digest_config - skip cache check, always run" + ) + digest = cache.digest.calc_job_digest(job_config=job) + job_digest_map[job.name] = digest + if job.provides: + # assign the job digest also to the artifacts it provides + for artifact in job.provides: + artifact_digest_map[artifact] = digest + for job in workflow.jobs: + digests_combined_list = [] + if job.requires: + # include digest of required artifact to the job digest, so that they affect job state + for artifact_name in job.requires: + if artifact_name not in [ + artifact.name for artifact in workflow.artifacts + ]: + # phony artifact assumed to be not affecting jobs that depend on it + continue + digests_combined_list.append(artifact_digest_map[artifact_name]) + digests_combined_list.append(job_digest_map[job.name]) + final_digest = "-".join(digests_combined_list) + workflow_config.digest_jobs[job.name] = final_digest + + assert ( + workflow_config.digest_jobs + ), f"BUG, Workflow with enabled cache must have job digests after configuration, wf [{workflow.name}]" + + print("Check remote cache") + job_to_cache_record = {} + for job_name, job_digest in workflow_config.digest_jobs.items(): + record = cache.fetch_success(job_name=job_name, job_digest=job_digest) + if record: + assert ( + Utils.normalize_string(job_name) + not in workflow_config.cache_success + ) + workflow_config.cache_success.append(job_name) + workflow_config.cache_success_base64.append(Utils.to_base64(job_name)) + job_to_cache_record[job_name] = record + + print("Check artifacts to reuse") + for job in workflow.jobs: + if job.name in workflow_config.cache_success: + if job.provides: + for artifact_name in job.provides: + workflow_config.cache_artifacts[artifact_name] = ( + job_to_cache_record[job.name] + ) + + print(f"Write config to GH's job output") + with open(_Environment.get().JOB_OUTPUT_STREAM, "a", encoding="utf8") as f: + print( + f"DATA={workflow_config.to_json()}", + file=f, + ) + print(f"WorkflowRuntimeConfig: [{workflow_config.to_json(pretty=True)}]") + print( + "Dump WorkflowConfig to fs, the next hooks in this job might want to see it" + ) + workflow_config.dump() + + return workflow_config + + @classmethod + def pre_run(cls, _workflow, _job, _required_artifacts=None): + path_prefixes = [] + if _job.name == Settings.CI_CONFIG_JOB_NAME: + # SPECIAL handling + return path_prefixes + env = _Environment.get() + runtime_config = RunConfig.from_fs(_workflow.name) + required_artifacts = [] + if _required_artifacts: + required_artifacts = _required_artifacts + for artifact in required_artifacts: + if artifact.name in runtime_config.cache_artifacts: + record = runtime_config.cache_artifacts[artifact.name] + print(f"Reuse artifact [{artifact.name}] from [{record}]") + path_prefixes.append( + env.get_s3_prefix_static( + record.pr_number, record.branch, record.sha + ) + ) + else: + path_prefixes.append(env.get_s3_prefix()) + return path_prefixes + + @classmethod + def run(cls, workflow, job): + pass + + @classmethod + def post_run(cls, workflow, job): + if job.name == Settings.CI_CONFIG_JOB_NAME: + return + if job.digest_config: + # cache is enabled, and it's a job that supposed to be cached (has defined digest config) + workflow_runtime = RunConfig.from_fs(workflow.name) + job_digest = 
workflow_runtime.digest_jobs[job.name] + Cache.push_success_record(job.name, job_digest, workflow_runtime.sha) diff --git a/ci/praktika/hook_html.py b/ci/praktika/hook_html.py new file mode 100644 index 00000000000..f4bd4435511 --- /dev/null +++ b/ci/praktika/hook_html.py @@ -0,0 +1,198 @@ +import dataclasses +import json +import urllib.parse +from pathlib import Path +from typing import List + +from praktika._environment import _Environment +from praktika.gh import GH +from praktika.parser import WorkflowConfigParser +from praktika.result import Result, ResultInfo +from praktika.runtime import RunConfig +from praktika.s3 import S3 +from praktika.settings import Settings +from praktika.utils import Shell, Utils + + +@dataclasses.dataclass +class GitCommit: + date: str + message: str + sha: str + + @staticmethod + def from_json(json_data: str) -> List["GitCommit"]: + commits = [] + try: + data = json.loads(json_data) + + commits = [ + GitCommit( + message=commit["messageHeadline"], + sha=commit["oid"], + date=commit["committedDate"], + ) + for commit in data.get("commits", []) + ] + except Exception as e: + print( + f"ERROR: Failed to deserialize commit's data: [{json_data}], ex: [{e}]" + ) + + return commits + + +class HtmlRunnerHooks: + @classmethod + def configure(cls, _workflow): + + def _get_pr_commits(pr_number): + res = [] + if not pr_number: + return res + output = Shell.get_output(f"gh pr view {pr_number} --json commits") + if output: + res = GitCommit.from_json(output) + return res + + # generate pending Results for all jobs in the workflow + if _workflow.enable_cache: + skip_jobs = RunConfig.from_fs(_workflow.name).cache_success + else: + skip_jobs = [] + + env = _Environment.get() + results = [] + for job in _workflow.jobs: + if job.name not in skip_jobs: + result = Result.generate_pending(job.name) + else: + result = Result.generate_skipped(job.name) + results.append(result) + summary_result = Result.generate_pending(_workflow.name, results=results) + summary_result.aux_links.append(env.CHANGE_URL) + summary_result.aux_links.append(env.RUN_URL) + summary_result.start_time = Utils.timestamp() + page_url = "/".join( + ["https:/", Settings.HTML_S3_PATH, str(Path(Settings.HTML_PAGE_FILE).name)] + ) + for bucket, endpoint in Settings.S3_BUCKET_TO_HTTP_ENDPOINT.items(): + page_url = page_url.replace(bucket, endpoint) + # TODO: add support for non-PRs (use branch?) 
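+            # for now the report link always carries the PR number and points at sha=latest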
+ page_url += f"?PR={env.PR_NUMBER}&sha=latest&name_0={urllib.parse.quote(env.WORKFLOW_NAME, safe='')}" + summary_result.html_link = page_url + + # clean the previous latest results in PR if any + if env.PR_NUMBER: + S3.clean_latest_result() + S3.copy_result_to_s3( + summary_result, + unlock=False, + ) + + print(f"CI Status page url [{page_url}]") + + res1 = GH.post_commit_status( + name=_workflow.name, + status=Result.Status.PENDING, + description="", + url=page_url, + ) + res2 = GH.post_pr_comment( + comment_body=f"Workflow [[{_workflow.name}]({page_url})], commit [{_Environment.get().SHA[:8]}]", + or_update_comment_with_substring=f"Workflow [", + ) + if not (res1 or res2): + Utils.raise_with_error( + "Failed to set both GH commit status and PR comment with Workflow Status, cannot proceed" + ) + + if env.PR_NUMBER: + commits = _get_pr_commits(env.PR_NUMBER) + # TODO: upload commits data to s3 to visualise it on a report page + print(commits) + + @classmethod + def pre_run(cls, _workflow, _job): + result = Result.from_fs(_job.name) + S3.copy_result_from_s3( + Result.file_name_static(_workflow.name), + ) + workflow_result = Result.from_fs(_workflow.name) + workflow_result.update_sub_result(result) + S3.copy_result_to_s3( + workflow_result, + unlock=True, + ) + + @classmethod + def run(cls, _workflow, _job): + pass + + @classmethod + def post_run(cls, _workflow, _job, info_errors): + result = Result.from_fs(_job.name) + env = _Environment.get() + S3.copy_result_from_s3( + Result.file_name_static(_workflow.name), + lock=True, + ) + workflow_result = Result.from_fs(_workflow.name) + print(f"Workflow info [{workflow_result.info}], info_errors [{info_errors}]") + + env_info = env.REPORT_INFO + if env_info: + print( + f"WARNING: some info lines are set in Environment - append to report [{env_info}]" + ) + info_errors += env_info + if info_errors: + info_errors = [f" | {error}" for error in info_errors] + info_str = f"{_job.name}:\n" + info_str += "\n".join(info_errors) + print("Update workflow results with new info") + workflow_result.set_info(info_str) + + old_status = workflow_result.status + + S3.upload_result_files_to_s3(result) + workflow_result.update_sub_result(result) + + skipped_job_results = [] + if not result.is_ok(): + print( + "Current job failed - find dependee jobs in the workflow and set their statuses to skipped" + ) + workflow_config_parsed = WorkflowConfigParser(_workflow).parse() + for dependee_job in workflow_config_parsed.workflow_yaml_config.jobs: + if _job.name in dependee_job.needs: + if _workflow.get_job(dependee_job.name).run_unless_cancelled: + continue + print( + f"NOTE: Set job [{dependee_job.name}] status to [{Result.Status.SKIPPED}] due to current failure" + ) + skipped_job_results.append( + Result( + name=dependee_job.name, + status=Result.Status.SKIPPED, + info=ResultInfo.SKIPPED_DUE_TO_PREVIOUS_FAILURE + + f" [{_job.name}]", + ) + ) + for skipped_job_result in skipped_job_results: + workflow_result.update_sub_result(skipped_job_result) + + S3.copy_result_to_s3( + workflow_result, + unlock=True, + ) + if workflow_result.status != old_status: + print( + f"Update GH commit status [{result.name}]: [{old_status} -> {workflow_result.status}], link [{workflow_result.html_link}]" + ) + GH.post_commit_status( + name=workflow_result.name, + status=GH.convert_to_gh_status(workflow_result.status), + description="", + url=workflow_result.html_link, + ) diff --git a/ci/praktika/hook_interface.py b/ci/praktika/hook_interface.py new file mode 100644 index 
00000000000..762ee62eeb1 --- /dev/null +++ b/ci/praktika/hook_interface.py @@ -0,0 +1,43 @@ +from abc import ABC, abstractmethod + +from praktika import Workflow + + +class HookInterface(ABC): + @abstractmethod + def pre_run(self, _workflow, _job): + """ + runs in pre-run step + :param _workflow: + :param _job: + :return: + """ + pass + + @abstractmethod + def run(self, _workflow, _job): + """ + runs in run step + :param _workflow: + :param _job: + :return: + """ + pass + + @abstractmethod + def post_run(self, _workflow, _job): + """ + runs in post-run step + :param _workflow: + :param _job: + :return: + """ + pass + + @abstractmethod + def configure(self, _workflow: Workflow.Config): + """ + runs in initial WorkflowConfig job in run step + :return: + """ + pass diff --git a/ci/praktika/html_prepare.py b/ci/praktika/html_prepare.py new file mode 100644 index 00000000000..54bee2f6bbf --- /dev/null +++ b/ci/praktika/html_prepare.py @@ -0,0 +1,10 @@ +from praktika.s3 import S3 +from praktika.settings import Settings + + +class Html: + @classmethod + def prepare(cls): + S3.copy_file_to_s3( + s3_path=Settings.HTML_S3_PATH, local_path=Settings.HTML_PAGE_FILE + ) diff --git a/ci/praktika/job.py b/ci/praktika/job.py new file mode 100644 index 00000000000..d0d4232cfa2 --- /dev/null +++ b/ci/praktika/job.py @@ -0,0 +1,102 @@ +import copy +import json +from dataclasses import dataclass, field +from typing import Any, List, Optional + + +class Job: + @dataclass + class Requirements: + python: bool = False + python_requirements_txt: str = "" + + @dataclass + class CacheDigestConfig: + include_paths: List[str] = field(default_factory=list) + exclude_paths: List[str] = field(default_factory=list) + + @dataclass + class Config: + # Job Name + name: str + + # Machine's label to run job on. 
For instance [ubuntu-latest] for free gh runner + runs_on: List[str] + + # Job Run Command + command: str + + # What job requires + # May be phony or physical names + requires: List[str] = field(default_factory=list) + + # What job provides + # May be phony or physical names + provides: List[str] = field(default_factory=list) + + job_requirements: Optional["Job.Requirements"] = None + + timeout: int = 1 * 3600 + + digest_config: Optional["Job.CacheDigestConfig"] = None + + run_in_docker: str = "" + + run_unless_cancelled: bool = False + + allow_merge_on_failure: bool = False + + parameter: Any = None + + def parametrize( + self, + parameter: Optional[List[Any]] = None, + runs_on: Optional[List[List[str]]] = None, + timeout: Optional[List[int]] = None, + ): + assert ( + parameter or runs_on + ), "Either :parameter or :runs_on must be non empty list for parametrisation" + if not parameter: + parameter = [None] * len(runs_on) + if not runs_on: + runs_on = [None] * len(parameter) + if not timeout: + timeout = [None] * len(parameter) + assert ( + len(parameter) == len(runs_on) == len(timeout) + ), "Parametrization lists must be of the same size" + + res = [] + for parameter_, runs_on_, timeout_ in zip(parameter, runs_on, timeout): + obj = copy.deepcopy(self) + if parameter_: + obj.parameter = parameter_ + if runs_on_: + obj.runs_on = runs_on_ + if timeout_: + obj.timeout = timeout_ + obj.name = obj.get_job_name_with_parameter() + res.append(obj) + return res + + def get_job_name_with_parameter(self): + name, parameter, runs_on = self.name, self.parameter, self.runs_on + res = name + name_params = [] + if isinstance(parameter, list) or isinstance(parameter, dict): + name_params.append(json.dumps(parameter)) + elif parameter is not None: + name_params.append(parameter) + if runs_on: + assert isinstance(runs_on, list) + name_params.append(json.dumps(runs_on)) + if name_params: + name_params = [str(param) for param in name_params] + res += f" ({', '.join(name_params)})" + + self.name = res + return res + + def __repr__(self): + return self.name diff --git a/ci/praktika/json.html b/ci/praktika/json.html new file mode 100644 index 00000000000..2f8c3e45d0b --- /dev/null +++ b/ci/praktika/json.html @@ -0,0 +1,745 @@ + + + + + + praktika report + + + + +
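For orientation: the loader in ci/praktika/mangle.py (added below) imports every module found in ./ci/workflows/ and reads its module-level WORKFLOWS list. The following is a minimal illustrative workflow module, not part of this change; the file name, job name, runner label and command are placeholders, and Workflow.Config itself is defined outside this excerpt, so its exact constructor is assumed from how it is used elsewhere in the diff.

# ./ci/workflows/pull_request.py (hypothetical example)
from praktika import Artifact, Job, Workflow

style_job = Job.Config(
    name="Style Check",                      # placeholder job name
    runs_on=["ubuntu-latest"],               # any label your runners carry
    command="python3 -m unittest discover",  # placeholder command
    provides=["style_report"],
)

workflow = Workflow.Config(
    name="PR",
    event=Workflow.Event.PULL_REQUEST,
    base_branches=["main"],
    jobs=[style_job],
    artifacts=[
        Artifact.Config(name="style_report", type=Artifact.Type.S3, path="report.txt")
    ],
    enable_cache=True,
)

# praktika.mangle._get_workflows() looks for this module-level list
WORKFLOWS = [workflow]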
+ + + + diff --git a/ci/praktika/mangle.py b/ci/praktika/mangle.py new file mode 100644 index 00000000000..89fc52cf849 --- /dev/null +++ b/ci/praktika/mangle.py @@ -0,0 +1,137 @@ +import copy +import importlib.util +from pathlib import Path +from typing import Any, Dict + +from praktika import Job +from praktika._settings import _USER_DEFINED_SETTINGS, _Settings +from praktika.utils import ContextManager, Utils + + +def _get_workflows(name=None, file=None): + """ + Gets user's workflow configs + """ + res = [] + + with ContextManager.cd(): + directory = Path(_Settings.WORKFLOWS_DIRECTORY) + for py_file in directory.glob("*.py"): + if file and file not in str(py_file): + continue + module_name = py_file.name.removeprefix(".py") + spec = importlib.util.spec_from_file_location( + module_name, f"{_Settings.WORKFLOWS_DIRECTORY}/{module_name}" + ) + assert spec + foo = importlib.util.module_from_spec(spec) + assert spec.loader + spec.loader.exec_module(foo) + try: + for workflow in foo.WORKFLOWS: + if name: + if name == workflow.name: + print(f"Read workflow [{name}] config from [{module_name}]") + res = [workflow] + break + else: + continue + else: + res += foo.WORKFLOWS + print(f"Read workflow configs from [{module_name}]") + except Exception as e: + print( + f"WARNING: Failed to add WORKFLOWS config from [{module_name}], exception [{e}]" + ) + if not res: + Utils.raise_with_error(f"Failed to find workflow [{name or file}]") + + for workflow in res: + # add native jobs + _update_workflow_with_native_jobs(workflow) + # fill in artifact properties, e.g. _provided_by + _update_workflow_artifacts(workflow) + return res + + +def _update_workflow_artifacts(workflow): + artifact_job = {} + for job in workflow.jobs: + for artifact_name in job.provides: + assert artifact_name not in artifact_job + artifact_job[artifact_name] = job.name + for artifact in workflow.artifacts: + artifact._provided_by = artifact_job[artifact.name] + + +def _update_workflow_with_native_jobs(workflow): + if workflow.dockers: + from praktika.native_jobs import _docker_build_job + + print(f"Enable native job [{_docker_build_job.name}] for [{workflow.name}]") + aux_job = copy.deepcopy(_docker_build_job) + if workflow.enable_cache: + print( + f"Add automatic digest config for [{aux_job.name}] job since cache is enabled" + ) + docker_digest_config = Job.CacheDigestConfig() + for docker_config in workflow.dockers: + docker_digest_config.include_paths.append(docker_config.path) + aux_job.digest_config = docker_digest_config + + workflow.jobs.insert(0, aux_job) + for job in workflow.jobs[1:]: + if not job.requires: + job.requires = [] + job.requires.append(aux_job.name) + + if ( + workflow.enable_cache + or workflow.enable_report + or workflow.enable_merge_ready_status + ): + from praktika.native_jobs import _workflow_config_job + + print(f"Enable native job [{_workflow_config_job.name}] for [{workflow.name}]") + aux_job = copy.deepcopy(_workflow_config_job) + workflow.jobs.insert(0, aux_job) + for job in workflow.jobs[1:]: + if not job.requires: + job.requires = [] + job.requires.append(aux_job.name) + + if workflow.enable_merge_ready_status: + from praktika.native_jobs import _final_job + + print(f"Enable native job [{_final_job.name}] for [{workflow.name}]") + aux_job = copy.deepcopy(_final_job) + for job in workflow.jobs: + aux_job.requires.append(job.name) + workflow.jobs.append(aux_job) + + +def _get_user_settings() -> Dict[str, Any]: + """ + Gets user's settings + """ + res = {} # type: Dict[str, Any] + + directory = 
Path(_Settings.SETTINGS_DIRECTORY) + for py_file in directory.glob("*.py"): + module_name = py_file.name.removeprefix(".py") + spec = importlib.util.spec_from_file_location( + module_name, f"{_Settings.SETTINGS_DIRECTORY}/{module_name}" + ) + assert spec + foo = importlib.util.module_from_spec(spec) + assert spec.loader + spec.loader.exec_module(foo) + for setting in _USER_DEFINED_SETTINGS: + try: + value = getattr(foo, setting) + res[setting] = value + print(f"Apply user defined setting [{setting} = {value}]") + except Exception as e: + pass + + return res diff --git a/ci/praktika/native_jobs.py b/ci/praktika/native_jobs.py new file mode 100644 index 00000000000..f7fd4ca190b --- /dev/null +++ b/ci/praktika/native_jobs.py @@ -0,0 +1,378 @@ +import sys +from typing import Dict + +from praktika import Job, Workflow +from praktika._environment import _Environment +from praktika.cidb import CIDB +from praktika.digest import Digest +from praktika.docker import Docker +from praktika.gh import GH +from praktika.hook_cache import CacheRunnerHooks +from praktika.hook_html import HtmlRunnerHooks +from praktika.mangle import _get_workflows +from praktika.result import Result, ResultInfo +from praktika.runtime import RunConfig +from praktika.s3 import S3 +from praktika.settings import Settings +from praktika.utils import Shell, Utils + +assert Settings.CI_CONFIG_RUNS_ON + +_workflow_config_job = Job.Config( + name=Settings.CI_CONFIG_JOB_NAME, + runs_on=Settings.CI_CONFIG_RUNS_ON, + job_requirements=( + Job.Requirements( + python=Settings.INSTALL_PYTHON_FOR_NATIVE_JOBS, + python_requirements_txt=Settings.INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS, + ) + if Settings.INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS + else None + ), + command=f"{Settings.PYTHON_INTERPRETER} -m praktika.native_jobs '{Settings.CI_CONFIG_JOB_NAME}'", +) + +_docker_build_job = Job.Config( + name=Settings.DOCKER_BUILD_JOB_NAME, + runs_on=Settings.DOCKER_BUILD_RUNS_ON, + job_requirements=Job.Requirements( + python=Settings.INSTALL_PYTHON_FOR_NATIVE_JOBS, + python_requirements_txt="", + ), + timeout=4 * 3600, + command=f"{Settings.PYTHON_INTERPRETER} -m praktika.native_jobs '{Settings.DOCKER_BUILD_JOB_NAME}'", +) + +_final_job = Job.Config( + name=Settings.FINISH_WORKFLOW_JOB_NAME, + runs_on=Settings.CI_CONFIG_RUNS_ON, + job_requirements=Job.Requirements( + python=Settings.INSTALL_PYTHON_FOR_NATIVE_JOBS, + python_requirements_txt="", + ), + command=f"{Settings.PYTHON_INTERPRETER} -m praktika.native_jobs '{Settings.FINISH_WORKFLOW_JOB_NAME}'", + run_unless_cancelled=True, +) + + +def _build_dockers(workflow, job_name): + print(f"Start [{job_name}], workflow [{workflow.name}]") + dockers = workflow.dockers + ready = [] + results = [] + job_status = Result.Status.SUCCESS + job_info = "" + dockers = Docker.sort_in_build_order(dockers) + docker_digests = {} # type: Dict[str, str] + for docker in dockers: + docker_digests[docker.name] = Digest().calc_docker_digest(docker, dockers) + + if not Shell.check( + "docker buildx inspect --bootstrap | grep -q docker-container", verbose=True + ): + print("Install docker container driver") + if not Shell.check( + "docker buildx create --use --name mybuilder --driver docker-container", + verbose=True, + ): + job_status = Result.Status.FAILED + job_info = "Failed to install docker buildx driver" + + if job_status == Result.Status.SUCCESS: + if not Docker.login( + Settings.DOCKERHUB_USERNAME, + user_password=workflow.get_secret(Settings.DOCKERHUB_SECRET).get_value(), + ): + job_status = Result.Status.FAILED + job_info 
= "Failed to login to dockerhub" + + if job_status == Result.Status.SUCCESS: + for docker in dockers: + assert ( + docker.name not in ready + ), f"All docker names must be uniq [{dockers}]" + stopwatch = Utils.Stopwatch() + info = f"{docker.name}:{docker_digests[docker.name]}" + log_file = f"{Settings.OUTPUT_DIR}/docker_{Utils.normalize_string(docker.name)}.log" + files = [] + + code, out, err = Shell.get_res_stdout_stderr( + f"docker manifest inspect {docker.name}:{docker_digests[docker.name]}" + ) + print( + f"Docker inspect results for {docker.name}:{docker_digests[docker.name]}: exit code [{code}], out [{out}], err [{err}]" + ) + if "no such manifest" in err: + ret_code = Docker.build( + docker, log_file=log_file, digests=docker_digests, add_latest=False + ) + if ret_code == 0: + status = Result.Status.SUCCESS + else: + status = Result.Status.FAILED + job_status = Result.Status.FAILED + info += f", failed with exit code: {ret_code}, see log" + files.append(log_file) + else: + print( + f"Docker image [{docker.name}:{docker_digests[docker.name]} exists - skip build" + ) + status = Result.Status.SKIPPED + ready.append(docker.name) + results.append( + Result( + name=docker.name, + status=status, + info=info, + duration=stopwatch.duration, + start_time=stopwatch.start_time, + files=files, + ) + ) + Result.from_fs(job_name).set_status(job_status).set_results(results).set_info( + job_info + ) + + if job_status != Result.Status.SUCCESS: + sys.exit(1) + + +def _config_workflow(workflow: Workflow.Config, job_name): + def _check_yaml_up_to_date(): + print("Check workflows are up to date") + stop_watch = Utils.Stopwatch() + exit_code, output, err = Shell.get_res_stdout_stderr( + f"git diff-index HEAD -- {Settings.WORKFLOW_PATH_PREFIX}" + ) + info = "" + status = Result.Status.SUCCESS + if exit_code != 0: + info = f"workspace has uncommitted files unexpectedly [{output}]" + status = Result.Status.ERROR + print("ERROR: ", info) + else: + Shell.check(f"{Settings.PYTHON_INTERPRETER} -m praktika --generate") + exit_code, output, err = Shell.get_res_stdout_stderr( + f"git diff-index HEAD -- {Settings.WORKFLOW_PATH_PREFIX}" + ) + if exit_code != 0: + info = f"workspace has outdated workflows [{output}] - regenerate with [python -m praktika --generate]" + status = Result.Status.ERROR + print("ERROR: ", info) + + return ( + Result( + name="Check Workflows updated", + status=status, + start_time=stop_watch.start_time, + duration=stop_watch.duration, + info=info, + ), + info, + ) + + def _check_secrets(secrets): + print("Check Secrets") + stop_watch = Utils.Stopwatch() + infos = [] + for secret_config in secrets: + value = secret_config.get_value() + if not value: + info = f"ERROR: Failed to read secret [{secret_config.name}]" + infos.append(info) + print(info) + + info = "\n".join(infos) + return ( + Result( + name="Check Secrets", + status=(Result.Status.FAILED if infos else Result.Status.SUCCESS), + start_time=stop_watch.start_time, + duration=stop_watch.duration, + info=info, + ), + info, + ) + + def _check_db(workflow): + stop_watch = Utils.Stopwatch() + res, info = CIDB( + workflow.get_secret(Settings.SECRET_CI_DB_URL).get_value(), + workflow.get_secret(Settings.SECRET_CI_DB_PASSWORD).get_value(), + ).check() + return ( + Result( + name="Check CI DB", + status=(Result.Status.FAILED if not res else Result.Status.SUCCESS), + start_time=stop_watch.start_time, + duration=stop_watch.duration, + info=info, + ), + info, + ) + + print(f"Start [{job_name}], workflow [{workflow.name}]") + results = [] + files = 
[] + info_lines = [] + job_status = Result.Status.SUCCESS + + workflow_config = RunConfig( + name=workflow.name, + digest_jobs={}, + digest_dockers={}, + sha=_Environment.get().SHA, + cache_success=[], + cache_success_base64=[], + cache_artifacts={}, + ).dump() + + # checks: + result_, info = _check_yaml_up_to_date() + if result_.status != Result.Status.SUCCESS: + print("ERROR: yaml files are outdated - regenerate, commit and push") + job_status = Result.Status.ERROR + info_lines.append(job_name + ": " + info) + results.append(result_) + + if workflow.secrets: + result_, info = _check_secrets(workflow.secrets) + if result_.status != Result.Status.SUCCESS: + print(f"ERROR: Invalid secrets in workflow [{workflow.name}]") + job_status = Result.Status.ERROR + info_lines.append(job_name + ": " + info) + results.append(result_) + + if workflow.enable_cidb: + result_, info = _check_db(workflow) + if result_.status != Result.Status.SUCCESS: + job_status = Result.Status.ERROR + info_lines.append(job_name + ": " + info) + results.append(result_) + + # config: + if workflow.dockers: + print("Calculate docker's digests") + dockers = workflow.dockers + dockers = Docker.sort_in_build_order(dockers) + for docker in dockers: + workflow_config.digest_dockers[docker.name] = Digest().calc_docker_digest( + docker, dockers + ) + workflow_config.dump() + + if workflow.enable_cache: + print("Cache Lookup") + stop_watch = Utils.Stopwatch() + workflow_config = CacheRunnerHooks.configure(workflow) + results.append( + Result( + name="Cache Lookup", + status=Result.Status.SUCCESS, + start_time=stop_watch.start_time, + duration=stop_watch.duration, + ) + ) + files.append(RunConfig.file_name_static(workflow.name)) + + workflow_config.dump() + + if workflow.enable_report: + print("Init report") + stop_watch = Utils.Stopwatch() + HtmlRunnerHooks.configure(workflow) + results.append( + Result( + name="Init Report", + status=Result.Status.SUCCESS, + start_time=stop_watch.start_time, + duration=stop_watch.duration, + ) + ) + files.append(Result.file_name_static(workflow.name)) + + Result.from_fs(job_name).set_status(job_status).set_results(results).set_files( + files + ).set_info("\n".join(info_lines)) + + if job_status != Result.Status.SUCCESS: + sys.exit(1) + + +def _finish_workflow(workflow, job_name): + print(f"Start [{job_name}], workflow [{workflow.name}]") + env = _Environment.get() + + print("Check Actions statuses") + print(env.get_needs_statuses()) + + print("Check Workflow results") + S3.copy_result_from_s3( + Result.file_name_static(workflow.name), + lock=False, + ) + workflow_result = Result.from_fs(workflow.name) + + ready_for_merge_status = Result.Status.SUCCESS + ready_for_merge_description = "" + failed_results = [] + update_final_report = False + for result in workflow_result.results: + if result.name == job_name or result.status in ( + Result.Status.SUCCESS, + Result.Status.SKIPPED, + ): + continue + if not result.is_completed(): + print( + f"ERROR: not finished job [{result.name}] in the workflow - set status to error" + ) + result.status = Result.Status.ERROR + # dump workflow result after update - to have an updated result in post + workflow_result.dump() + # add error into env - should apper in the report + env.add_info(ResultInfo.NOT_FINALIZED + f" [{result.name}]") + update_final_report = True + job = workflow.get_job(result.name) + if not job or not job.allow_merge_on_failure: + print( + f"NOTE: Result for [{result.name}] has not ok status [{result.status}]" + ) + ready_for_merge_status = 
Result.Status.FAILED + failed_results.append(result.name.split("(", maxsplit=1)[0]) # cut name + + if failed_results: + ready_for_merge_description = f"failed: {', '.join(failed_results)}" + + if not GH.post_commit_status( + name=Settings.READY_FOR_MERGE_STATUS_NAME + f" [{workflow.name}]", + status=ready_for_merge_status, + description=ready_for_merge_description, + url="", + ): + print(f"ERROR: failed to set status [{Settings.READY_FOR_MERGE_STATUS_NAME}]") + env.add_info(ResultInfo.GH_STATUS_ERROR) + + if update_final_report: + S3.copy_result_to_s3( + workflow_result, + unlock=False, + ) # no lock - no unlock + + Result.from_fs(job_name).set_status(Result.Status.SUCCESS).set_info( + ready_for_merge_description + ) + + +if __name__ == "__main__": + job_name = sys.argv[1] + assert job_name, "Job name must be provided as input argument" + workflow = _get_workflows(name=_Environment.get().WORKFLOW_NAME)[0] + if job_name == Settings.DOCKER_BUILD_JOB_NAME: + _build_dockers(workflow, job_name) + elif job_name == Settings.CI_CONFIG_JOB_NAME: + _config_workflow(workflow, job_name) + elif job_name == Settings.FINISH_WORKFLOW_JOB_NAME: + _finish_workflow(workflow, job_name) + else: + assert False, f"BUG, job name [{job_name}]" diff --git a/ci/praktika/parser.py b/ci/praktika/parser.py new file mode 100644 index 00000000000..95aa27c4576 --- /dev/null +++ b/ci/praktika/parser.py @@ -0,0 +1,258 @@ +import dataclasses +from typing import Any, Dict, List + +from praktika import Artifact, Workflow +from praktika.mangle import _get_workflows + + +class AddonType: + PY = "py" + + +@dataclasses.dataclass +class WorkflowYaml: + @dataclasses.dataclass + class JobYaml: + name: str + needs: List[str] + runs_on: List[str] + artifacts_gh_requires: List["WorkflowYaml.ArtifactYaml"] + artifacts_gh_provides: List["WorkflowYaml.ArtifactYaml"] + addons: List["WorkflowYaml.JobAddonYaml"] + gh_app_auth: bool + run_unless_cancelled: bool + parameter: Any + + def __repr__(self): + return self.name + + @dataclasses.dataclass + class ArtifactYaml: + name: str + provided_by: str + required_by: List[str] + path: str + type: str + + def __repr__(self): + return self.name + + @dataclasses.dataclass + class JobAddonYaml: + install_python: bool + requirements_txt_path: str + + name: str + event: str + branches: List[str] + jobs: List[JobYaml] + job_to_config: Dict[str, JobYaml] + artifact_to_config: Dict[str, ArtifactYaml] + secret_names_gh: List[str] + enable_cache: bool + + +class WorkflowConfigParser: + def __init__(self, config: Workflow.Config): + self.workflow_name = config.name + self.config = config + self.requires_all = [] # type: List[str] + self.provides_all = [] # type: List[str] + self.job_names_all = [] # type: List[str] + self.artifact_to_providing_job_map = {} # type: Dict[str, List[str]] + self.artifact_to_job_requires_map = {} # type: Dict[str, List[str]] + self.artifact_map = {} # type: Dict[str, List[Artifact.Config]] + + self.job_to_provides_artifacts = {} # type: Dict[str, List[Artifact.Config]] + self.job_to_requires_artifacts = {} # type: Dict[str, List[Artifact.Config]] + + self.workflow_yaml_config = WorkflowYaml( + name=self.workflow_name, + event=config.event, + branches=[], + jobs=[], + secret_names_gh=[], + job_to_config={}, + artifact_to_config={}, + enable_cache=False, + ) + + def parse(self): + self.workflow_yaml_config.enable_cache = self.config.enable_cache + + # populate WorkflowYaml.branches + if self.config.event in (Workflow.Event.PUSH,): + assert ( + self.config.branches + ), 
f'Workflow.Config.branches (e.g. ["main"]) must be set for workflow with event [{self.config.event}], workflow [{self.workflow_name}]' + assert ( + not self.config.base_branches + ), f'Workflow.Config.base_branches (e.g. ["main"]) must not be set for workflow with event [{self.config.event}], workflow [{self.workflow_name}]' + assert isinstance( + self.config.branches, list + ), f'Workflow.Config.branches must be of type list (e.g. ["main"]), workflow [{self.workflow_name}]' + self.workflow_yaml_config.branches = self.config.branches + elif self.config.event in (Workflow.Event.PULL_REQUEST,): + assert ( + self.config.base_branches + ), f'Workflow.Config.base_branches (e.g. ["main"]) must be set for workflow with event [{self.config.event}], workflow [{self.workflow_name}]' + assert ( + not self.config.branches + ), f'Workflow.Config.branches (e.g. ["main"]) must not be set for workflow with event [{self.config.event}], workflow [{self.workflow_name}]' + assert isinstance( + self.config.base_branches, list + ), f'Workflow.Config.base_branches must be of type list (e.g. ["main"]), workflow [{self.workflow_name}]' + self.workflow_yaml_config.branches = self.config.base_branches + + # populate WorkflowYaml.artifact_to_config with phony artifacts + for job in self.config.jobs: + assert ( + job.name not in self.workflow_yaml_config.artifact_to_config + ), f"Not uniq Job name [{job.name}], workflow [{self.workflow_name}]" + self.workflow_yaml_config.artifact_to_config[job.name] = ( + WorkflowYaml.ArtifactYaml( + name=job.name, + provided_by=job.name, + required_by=[], + path="", + type=Artifact.Type.PHONY, + ) + ) + + # populate jobs + for job in self.config.jobs: + job_yaml_config = WorkflowYaml.JobYaml( + name=job.name, + addons=[], + artifacts_gh_requires=[], + artifacts_gh_provides=[], + needs=[], + runs_on=[], + gh_app_auth=False, + run_unless_cancelled=job.run_unless_cancelled, + parameter=None, + ) + self.workflow_yaml_config.jobs.append(job_yaml_config) + assert ( + job.name not in self.workflow_yaml_config.job_to_config + ), f"Job name [{job.name}] is not uniq, workflow [{self.workflow_name}]" + self.workflow_yaml_config.job_to_config[job.name] = job_yaml_config + + # populate WorkflowYaml.artifact_to_config + if self.config.artifacts: + for artifact in self.config.artifacts: + assert ( + artifact.name not in self.workflow_yaml_config.artifact_to_config + ), f"Artifact name [{artifact.name}] is not uniq, workflow [{self.workflow_name}]" + artifact_yaml_config = WorkflowYaml.ArtifactYaml( + name=artifact.name, + provided_by="", + required_by=[], + path=artifact.path, + type=artifact.type, + ) + self.workflow_yaml_config.artifact_to_config[artifact.name] = ( + artifact_yaml_config + ) + + # populate ArtifactYaml.provided_by + for job in self.config.jobs: + if job.provides: + for artifact_name in job.provides: + assert ( + artifact_name in self.workflow_yaml_config.artifact_to_config + ), f"Artifact [{artifact_name}] has no config, job [{job.name}], workflow [{self.workflow_name}]" + assert not self.workflow_yaml_config.artifact_to_config[ + artifact_name + ].provided_by, f"Artifact [{artifact_name}] provided by multiple jobs [{self.workflow_yaml_config.artifact_to_config[artifact_name].provided_by}] and [{job.name}]" + self.workflow_yaml_config.artifact_to_config[ + artifact_name + ].provided_by = job.name + + # populate ArtifactYaml.required_by + for job in self.config.jobs: + if job.requires: + for artifact_name in job.requires: + assert ( + artifact_name in 
self.workflow_yaml_config.artifact_to_config + ), f"Artifact [{artifact_name}] has no config, job [{job.name}], workflow [{self.workflow_name}]" + assert self.workflow_yaml_config.artifact_to_config[ + artifact_name + ].provided_by, f"Artifact [{artifact_name}] has no job providing it, required by job [{job.name}], workflow [{self.workflow_name}]" + self.workflow_yaml_config.artifact_to_config[ + artifact_name + ].required_by.append(job.name) + + # populate JobYaml.addons + for job in self.config.jobs: + if job.job_requirements: + addon_yaml = WorkflowYaml.JobAddonYaml( + requirements_txt_path=job.job_requirements.python_requirements_txt, + install_python=job.job_requirements.python, + ) + self.workflow_yaml_config.job_to_config[job.name].addons.append( + addon_yaml + ) + + if self.config.enable_report: + for job in self.config.jobs: + # auth required for every job with enabled HTML, so that workflow summary status can be updated + self.workflow_yaml_config.job_to_config[job.name].gh_app_auth = True + + # populate JobYaml.runs_on + for job in self.config.jobs: + self.workflow_yaml_config.job_to_config[job.name].runs_on = job.runs_on + + # populate JobYaml.artifacts_gh_requires, JobYaml.artifacts_gh_provides and JobYaml.needs + for ( + artifact_name, + artifact, + ) in self.workflow_yaml_config.artifact_to_config.items(): + # assert ( + # artifact.provided_by + # and artifact.provided_by in self.workflow_yaml_config.job_to_config + # ), f"Artifact [{artifact_name}] has no valid job providing it [{artifact.provided_by}]" + for job_name in artifact.required_by: + if ( + artifact.provided_by + not in self.workflow_yaml_config.job_to_config[job_name].needs + ): + self.workflow_yaml_config.job_to_config[job_name].needs.append( + artifact.provided_by + ) + if artifact.type in (Artifact.Type.GH,): + self.workflow_yaml_config.job_to_config[ + job_name + ].artifacts_gh_requires.append(artifact) + elif artifact.type in (Artifact.Type.PHONY, Artifact.Type.S3): + pass + else: + assert ( + False + ), f"Artifact [{artifact_name}] has unsupported type [{artifact.type}]" + if not artifact.required_by and artifact.type != Artifact.Type.PHONY: + print( + f"WARNING: Artifact [{artifact_name}] provided by job [{artifact.provided_by}] not required by any job in workflow [{self.workflow_name}]" + ) + if artifact.type == Artifact.Type.GH: + self.workflow_yaml_config.job_to_config[ + artifact.provided_by + ].artifacts_gh_provides.append(artifact) + + # populate JobYaml.parametrize + for job in self.config.jobs: + self.workflow_yaml_config.job_to_config[job.name].parameter = job.parameter + + # populate secrets + for secret_config in self.config.secrets: + if secret_config.is_gh(): + self.workflow_yaml_config.secret_names_gh.append(secret_config.name) + + return self + + +if __name__ == "__main__": + # test + workflows = _get_workflows() + for workflow in workflows: + WorkflowConfigParser(workflow).parse() diff --git a/ci/praktika/result.py b/ci/praktika/result.py new file mode 100644 index 00000000000..3d3c986d5f9 --- /dev/null +++ b/ci/praktika/result.py @@ -0,0 +1,354 @@ +import dataclasses +import datetime +import sys +from collections.abc import Container +from pathlib import Path +from typing import Any, Dict, List, Optional + +from praktika._environment import _Environment +from praktika._settings import _Settings +from praktika.utils import ContextManager, MetaClasses, Shell, Utils + + +@dataclasses.dataclass +class Result(MetaClasses.Serializable): + """ + Represents the outcome of a workflow/job/task or 
any operation, along with associated metadata. + + This class supports nesting of results to represent tasks with sub-tasks, and includes + various attributes to track status, timing, files, and links. + + Attributes: + name (str): The name of the task. + status (str): The current status of the task. Should be one of the values defined in the Status class. + start_time (Optional[float]): The start time of the task in Unix timestamp format. None if not started. + duration (Optional[float]): The duration of the task in seconds. None if not completed. + results (List[Result]): A list of sub-results representing nested tasks. + files (List[str]): A list of file paths or names related to the result. + links (List[str]): A list of URLs related to the result (e.g., links to reports or resources). + info (str): Additional information about the result. Free-form text. + # TODO: rename + aux_links (List[str]): A list of auxiliary links that provide additional context for the result. + # TODO: remove + html_link (str): A direct link to an HTML representation of the result (e.g., a detailed report page). + + Inner Class: + Status: Defines possible statuses for the task, such as "success", "failure", etc. + """ + + class Status: + SKIPPED = "skipped" + SUCCESS = "success" + FAILED = "failure" + PENDING = "pending" + RUNNING = "running" + ERROR = "error" + + name: str + status: str + start_time: Optional[float] = None + duration: Optional[float] = None + results: List["Result"] = dataclasses.field(default_factory=list) + files: List[str] = dataclasses.field(default_factory=list) + links: List[str] = dataclasses.field(default_factory=list) + info: str = "" + aux_links: List[str] = dataclasses.field(default_factory=list) + html_link: str = "" + + @staticmethod + def create_from( + name="", + results: List["Result"] = None, + stopwatch: Utils.Stopwatch = None, + status="", + files=None, + info="", + with_info_from_results=True, + ): + if isinstance(status, bool): + status = Result.Status.SUCCESS if status else Result.Status.FAILED + if not results and not status: + print("ERROR: Either .results or .status must be provided") + raise RuntimeError() + if not name: + name = _Environment.get().JOB_NAME + if not name: + print("ERROR: Failed to guess the .name") + raise RuntimeError() + result_status = status or Result.Status.SUCCESS + infos = [] + if info: + # a plain string must be appended as a whole, not iterated character by character + if isinstance(info, Container) and not isinstance(info, str): + infos += info + else: + infos.append(info) + if results and not status: + for result in results: + if result.status not in (Result.Status.SUCCESS, Result.Status.FAILED): + Utils.raise_with_error( + f"Unexpected result status [{result.status}] for Result.create_from call" + ) + if result.status != Result.Status.SUCCESS: + result_status = Result.Status.FAILED + if results: + for result in results: + if result.info and with_info_from_results: + infos.append(f"{result.name}: {result.info}") + return Result( + name=name, + status=result_status, + start_time=stopwatch.start_time if stopwatch else None, + duration=stopwatch.duration if stopwatch else None, + info="\n".join(infos) if infos else "", + results=results or [], + files=files or [], + ) + + @staticmethod + def get(): + return Result.from_fs(_Environment.get().JOB_NAME) + + def is_completed(self): + return self.status not in (Result.Status.PENDING, Result.Status.RUNNING) + + def is_running(self): + return self.status in (Result.Status.RUNNING,) + + def is_ok(self): + return self.status in (Result.Status.SKIPPED, Result.Status.SUCCESS) + + def set_status(self, status) -> "Result": + self.status =
status + self.dump() + return self + + def set_success(self) -> "Result": + return self.set_status(Result.Status.SUCCESS) + + def set_results(self, results: List["Result"]) -> "Result": + self.results = results + self.dump() + return self + + def set_files(self, files) -> "Result": + for file in files: + assert Path( + file + ).is_file(), f"Not valid file [{file}] from file list [{files}]" + if not self.files: + self.files = [] + self.files += files + self.dump() + return self + + def set_info(self, info: str) -> "Result": + if self.info: + self.info += "\n" + self.info += info + self.dump() + return self + + def set_link(self, link) -> "Result": + self.links.append(link) + self.dump() + return self + + @classmethod + def file_name_static(cls, name): + return f"{_Settings.TEMP_DIR}/result_{Utils.normalize_string(name)}.json" + + @classmethod + def from_dict(cls, obj: Dict[str, Any]) -> "Result": + sub_results = [] + for result_dict in obj["results"] or []: + sub_res = cls.from_dict(result_dict) + sub_results.append(sub_res) + obj["results"] = sub_results + return Result(**obj) + + def update_duration(self): + if not self.duration and self.start_time: + self.duration = datetime.datetime.utcnow().timestamp() - self.start_time + else: + if not self.duration: + print( + f"NOTE: start_time is not set for job [{self.name}] Result - do not update duration" + ) + else: + print( + f"NOTE: duration is set for job [{self.name}] Result - do not update by CI" + ) + return self + + def update_sub_result(self, result: "Result"): + assert self.results, "BUG?" + for i, result_ in enumerate(self.results): + if result_.name == result.name: + self.results[i] = result + self._update_status() + return self + + def _update_status(self): + was_pending = False + was_running = False + if self.status == self.Status.PENDING: + was_pending = True + if self.status == self.Status.RUNNING: + was_running = True + + has_pending, has_running, has_failed = False, False, False + for result_ in self.results: + if result_.status in (self.Status.RUNNING,): + has_running = True + if result_.status in (self.Status.PENDING,): + has_pending = True + if result_.status in (self.Status.ERROR, self.Status.FAILED): + has_failed = True + if has_running: + self.status = self.Status.RUNNING + elif has_pending: + self.status = self.Status.PENDING + elif has_failed: + self.status = self.Status.FAILED + else: + self.status = self.Status.SUCCESS + if (was_pending or was_running) and self.status not in ( + self.Status.PENDING, + self.Status.RUNNING, + ): + print("Pipeline finished") + self.update_duration() + + @classmethod + def generate_pending(cls, name, results=None): + return Result( + name=name, + status=Result.Status.PENDING, + start_time=None, + duration=None, + results=results or [], + files=[], + links=[], + info="", + ) + + @classmethod + def generate_skipped(cls, name, results=None): + return Result( + name=name, + status=Result.Status.SKIPPED, + start_time=None, + duration=None, + results=results or [], + files=[], + links=[], + info="from cache", + ) + + @classmethod + def create_from_command_execution( + cls, + name, + command, + with_log=False, + fail_fast=True, + workdir=None, + command_args=None, + command_kwargs=None, + ): + """ + Executes shell commands or Python callables, optionally logging output, and handles errors. + + :param name: Check name + :param command: Shell command (str) or Python callable, or list of them. + :param workdir: Optional working directory. + :param with_log: Boolean flag to log output to a file.
+ :param fail_fast: Boolean flag to stop execution if one command fails. + :param command_args: Positional arguments for the callable command. + :param command_kwargs: Keyword arguments for the callable command. + :return: Result object with status and optional log file. + """ + + # Stopwatch to track execution time + stop_watch_ = Utils.Stopwatch() + command_args = command_args or [] + command_kwargs = command_kwargs or {} + + # Set log file path if logging is enabled + log_file = ( + f"{_Settings.TEMP_DIR}/{Utils.normalize_string(name)}.log" + if with_log + else None + ) + + # Ensure the command is a list for consistent iteration + if not isinstance(command, list): + fail_fast = False + command = [command] + + print(f"> Starting execution for [{name}]") + res = True # Track success/failure status + error_infos = [] + for command_ in command: + if callable(command_): + # If command is a Python function, call it with provided arguments + result = command_(*command_args, **command_kwargs) + if isinstance(result, bool): + res = result + elif result: + error_infos.append(str(result)) + res = False + else: + # Run shell command in a specified directory with logging and verbosity + with ContextManager.cd(workdir): + exit_code = Shell.run(command_, verbose=True, log_file=log_file) + res = exit_code == 0 + + # If fail_fast is enabled, stop on first failure + if not res and fail_fast: + print(f"Execution stopped due to failure in [{command_}]") + break + + # Create and return the result object with status and log file (if any) + return Result.create_from( + name=name, + status=res, + stopwatch=stop_watch_, + info=error_infos, + files=[log_file] if log_file else None, + ) + + def finish_job_accordingly(self): + self.dump() + if not self.is_ok(): + print("ERROR: Job Failed") + for result in self.results: + if not result.is_ok(): + print("Failed checks:") + print(" | ", result) + sys.exit(1) + else: + print("ok") + + +class ResultInfo: + SETUP_ENV_JOB_FAILED = ( + "Failed to set up job env, it's a praktika bug or misconfiguration" + ) + PRE_JOB_FAILED = ( + "Failed to do a job pre-run step, it's a praktika bug or misconfiguration" + ) + KILLED = "Job killed or terminated, no Result provided" + NOT_FOUND_IMPOSSIBLE = ( + "No Result file (bug, or job misbehaviour, must not ever happen)" + ) + SKIPPED_DUE_TO_PREVIOUS_FAILURE = "Skipped due to previous failure" + TIMEOUT = "Timeout" + + GH_STATUS_ERROR = "Failed to set GH commit status" + + NOT_FINALIZED = ( + "Job did not provide a Result: job script bug, dead CI runner, or praktika bug" + ) + + S3_ERROR = "S3 call failure" diff --git a/ci/praktika/runner.py b/ci/praktika/runner.py new file mode 100644 index 00000000000..797a799a74d --- /dev/null +++ b/ci/praktika/runner.py @@ -0,0 +1,348 @@ +import os +import re +import sys +import traceback +from pathlib import Path + +from praktika._environment import _Environment +from praktika.artifact import Artifact +from praktika.cidb import CIDB +from praktika.digest import Digest +from praktika.hook_cache import CacheRunnerHooks +from praktika.hook_html import HtmlRunnerHooks +from praktika.result import Result, ResultInfo +from praktika.runtime import RunConfig +from praktika.s3 import S3 +from praktika.settings import Settings +from praktika.utils import Shell, TeePopen, Utils + + +class Runner: + @staticmethod + def generate_dummy_environment(workflow, job): + print("WARNING: Generate dummy env for local test") + Shell.check( + f"mkdir -p {Settings.TEMP_DIR} {Settings.INPUT_DIR} {Settings.OUTPUT_DIR}" + ) +
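+ # Write a placeholder Environment, workflow RunConfig and a pending Result to the local filesystem so the job can run outside CI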
_Environment( + WORKFLOW_NAME=workflow.name, + JOB_NAME=job.name, + REPOSITORY="", + BRANCH="", + SHA="", + PR_NUMBER=-1, + EVENT_TYPE="", + JOB_OUTPUT_STREAM="", + EVENT_FILE_PATH="", + CHANGE_URL="", + COMMIT_URL="", + BASE_BRANCH="", + RUN_URL="", + RUN_ID="", + INSTANCE_ID="", + INSTANCE_TYPE="", + INSTANCE_LIFE_CYCLE="", + LOCAL_RUN=True, + ).dump() + workflow_config = RunConfig( + name=workflow.name, + digest_jobs={}, + digest_dockers={}, + sha="", + cache_success=[], + cache_success_base64=[], + cache_artifacts={}, + ) + for docker in workflow.dockers: + workflow_config.digest_dockers[docker.name] = Digest().calc_docker_digest( + docker, workflow.dockers + ) + workflow_config.dump() + + Result.generate_pending(job.name).dump() + + def _setup_env(self, _workflow, job): + # source env file to write data into fs (workflow config json, workflow status json) + Shell.check(f". {Settings.ENV_SETUP_SCRIPT}", verbose=True, strict=True) + + # parse the same env script and apply envs from python so that this process sees them + with open(Settings.ENV_SETUP_SCRIPT, "r") as f: + content = f.read() + export_pattern = re.compile( + r"export (\w+)=\$\(cat<<\'EOF\'\n(.*?)EOF\n\)", re.DOTALL + ) + matches = export_pattern.findall(content) + for key, value in matches: + value = value.strip() + os.environ[key] = value + print(f"Set environment variable {key}.") + + print("Read GH Environment") + env = _Environment.from_env() + env.JOB_NAME = job.name + env.PARAMETER = job.parameter + env.dump() + print(env) + + return 0 + + def _pre_run(self, workflow, job): + env = _Environment.get() + + result = Result( + name=job.name, + status=Result.Status.RUNNING, + start_time=Utils.timestamp(), + ) + result.dump() + + if workflow.enable_report and job.name != Settings.CI_CONFIG_JOB_NAME: + print("Update Job and Workflow Report") + HtmlRunnerHooks.pre_run(workflow, job) + + print("Download required artifacts") + required_artifacts = [] + if job.requires and workflow.artifacts: + for requires_artifact_name in job.requires: + for artifact in workflow.artifacts: + if ( + artifact.name == requires_artifact_name + and artifact.type == Artifact.Type.S3 + ): + required_artifacts.append(artifact) + print(f"--- Job requires s3 artifacts [{required_artifacts}]") + if workflow.enable_cache: + prefixes = CacheRunnerHooks.pre_run( + _job=job, _workflow=workflow, _required_artifacts=required_artifacts + ) + else: + prefixes = [env.get_s3_prefix()] * len(required_artifacts) + for artifact, prefix in zip(required_artifacts, prefixes): + s3_path = f"{Settings.S3_ARTIFACT_PATH}/{prefix}/{Utils.normalize_string(artifact._provided_by)}/{Path(artifact.path).name}" + assert S3.copy_file_from_s3(s3_path=s3_path, local_path=Settings.INPUT_DIR) + + return 0 + + def _run(self, workflow, job, docker="", no_docker=False, param=None): + if param: + if not isinstance(param, str): + Utils.raise_with_error( + f"Custom param for local tests must be of type str, got [{type(param)}]" + ) + env = _Environment.get() + env.dump() + + if job.run_in_docker and not no_docker: + # TODO: add support for any image, including not from ci config (e.g. 
ubuntu:latest) + docker_tag = RunConfig.from_fs(workflow.name).digest_dockers[ + job.run_in_docker + ] + docker = docker or f"{job.run_in_docker}:{docker_tag}" + cmd = f"docker run --rm --user \"$(id -u):$(id -g)\" -e PYTHONPATH='{Settings.DOCKER_WD}:{Settings.DOCKER_WD}/ci' --volume ./:{Settings.DOCKER_WD} --volume {Settings.TEMP_DIR}:{Settings.TEMP_DIR} --workdir={Settings.DOCKER_WD} {docker} {job.command}" + else: + cmd = job.command + + if param: + print(f"Custom --param [{param}] will be passed to job's script") + cmd += f" --param {param}" + print(f"--- Run command [{cmd}]") + + with TeePopen(cmd, timeout=job.timeout) as process: + exit_code = process.wait() + + result = Result.from_fs(job.name) + if exit_code != 0: + if not result.is_completed(): + if process.timeout_exceeded: + print( + f"WARNING: Job timed out: [{job.name}], timeout [{job.timeout}], exit code [{exit_code}]" + ) + result.set_status(Result.Status.ERROR).set_info( + ResultInfo.TIMEOUT + ) + elif result.is_running(): + info = f"ERROR: Job terminated with an error, exit code [{exit_code}] - set status to [{Result.Status.ERROR}]" + print(info) + result.set_status(Result.Status.ERROR).set_info(info) + else: + info = f"ERROR: Invalid status [{result.status}] for exit code [{exit_code}] - switch to [{Result.Status.ERROR}]" + print(info) + result.set_status(Result.Status.ERROR).set_info(info) + result.dump() + + return exit_code + + def _post_run( + self, workflow, job, setup_env_exit_code, prerun_exit_code, run_exit_code + ): + info_errors = [] + env = _Environment.get() + result_exist = Result.exist(job.name) + + if setup_env_exit_code != 0: + info = f"ERROR: {ResultInfo.SETUP_ENV_JOB_FAILED}" + print(info) + # set Result with error and logs + Result( + name=job.name, + status=Result.Status.ERROR, + start_time=Utils.timestamp(), + duration=0.0, + info=info, + ).dump() + elif prerun_exit_code != 0: + info = f"ERROR: {ResultInfo.PRE_JOB_FAILED}" + print(info) + # set Result with error and logs + Result( + name=job.name, + status=Result.Status.ERROR, + start_time=Utils.timestamp(), + duration=0.0, + info=info, + ).dump() + elif not result_exist: + info = f"ERROR: {ResultInfo.NOT_FOUND_IMPOSSIBLE}" + print(info) + Result( + name=job.name, + start_time=Utils.timestamp(), + duration=None, + status=Result.Status.ERROR, + info=ResultInfo.NOT_FOUND_IMPOSSIBLE, + ).dump() + + result = Result.from_fs(job.name) + + if not result.is_completed(): + info = f"ERROR: {ResultInfo.KILLED}" + print(info) + result.set_info(info).set_status(Result.Status.ERROR).dump() + + result.set_files(files=[Settings.RUN_LOG]) + result.update_duration().dump() + + if result.info and result.status != Result.Status.SUCCESS: + # provide job info to workflow level + info_errors.append(result.info) + + if run_exit_code == 0: + providing_artifacts = [] + if job.provides and workflow.artifacts: + for provides_artifact_name in job.provides: + for artifact in workflow.artifacts: + if ( + artifact.name == provides_artifact_name + and artifact.type == Artifact.Type.S3 + ): + providing_artifacts.append(artifact) + if providing_artifacts: + print(f"Job provides s3 artifacts [{providing_artifacts}]") + for artifact in providing_artifacts: + try: + assert Shell.check( + f"ls -l {artifact.path}", verbose=True + ), f"Artifact {artifact.path} not found" + s3_path = f"{Settings.S3_ARTIFACT_PATH}/{env.get_s3_prefix()}/{Utils.normalize_string(env.JOB_NAME)}" + link = S3.copy_file_to_s3( + s3_path=s3_path, local_path=artifact.path + ) + result.set_link(link) + except Exception 
as e: + error = ( + f"ERROR: Failed to upload artifact [{artifact}], ex [{e}]" + ) + print(error) + info_errors.append(error) + result.set_status(Result.Status.ERROR) + + if workflow.enable_cidb: + print("Insert results to CIDB") + try: + CIDB( + url=workflow.get_secret(Settings.SECRET_CI_DB_URL).get_value(), + passwd=workflow.get_secret( + Settings.SECRET_CI_DB_PASSWORD + ).get_value(), + ).insert(result) + except Exception as ex: + error = f"ERROR: Failed to insert data into CI DB, exception [{ex}]" + print(error) + info_errors.append(error) + + result.dump() + + # always in the end + if workflow.enable_cache: + print(f"Run CI cache hook") + if result.is_ok(): + CacheRunnerHooks.post_run(workflow, job) + + if workflow.enable_report: + print(f"Run html report hook") + HtmlRunnerHooks.post_run(workflow, job, info_errors) + + return True + + def run( + self, workflow, job, docker="", dummy_env=False, no_docker=False, param=None + ): + res = True + setup_env_code = -10 + prerun_code = -10 + run_code = -10 + + if res and not dummy_env: + print( + f"\n\n=== Setup env script [{job.name}], workflow [{workflow.name}] ===" + ) + try: + setup_env_code = self._setup_env(workflow, job) + # Source the bash script and capture the environment variables + res = setup_env_code == 0 + if not res: + print( + f"ERROR: Setup env script failed with exit code [{setup_env_code}]" + ) + except Exception as e: + print(f"ERROR: Setup env script failed with exception [{e}]") + traceback.print_exc() + print(f"=== Setup env finished ===\n\n") + else: + self.generate_dummy_environment(workflow, job) + + if res and not dummy_env: + res = False + print(f"=== Pre run script [{job.name}], workflow [{workflow.name}] ===") + try: + prerun_code = self._pre_run(workflow, job) + res = prerun_code == 0 + if not res: + print(f"ERROR: Pre-run failed with exit code [{prerun_code}]") + except Exception as e: + print(f"ERROR: Pre-run script failed with exception [{e}]") + traceback.print_exc() + print(f"=== Pre run finished ===\n\n") + + if res: + res = False + print(f"=== Run script [{job.name}], workflow [{workflow.name}] ===") + try: + run_code = self._run( + workflow, job, docker=docker, no_docker=no_docker, param=param + ) + res = run_code == 0 + if not res: + print(f"ERROR: Run failed with exit code [{run_code}]") + except Exception as e: + print(f"ERROR: Run script failed with exception [{e}]") + traceback.print_exc() + print(f"=== Run script finished ===\n\n") + + if not dummy_env: + print(f"=== Post run script [{job.name}], workflow [{workflow.name}] ===") + self._post_run(workflow, job, setup_env_code, prerun_code, run_code) + print(f"=== Post run script finished ===") + + if not res: + sys.exit(1) diff --git a/ci/praktika/runtime.py b/ci/praktika/runtime.py new file mode 100644 index 00000000000..a87b67c2c79 --- /dev/null +++ b/ci/praktika/runtime.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass +from typing import Dict, List + +from praktika.cache import Cache +from praktika.settings import Settings +from praktika.utils import MetaClasses, Utils + + +@dataclass +class RunConfig(MetaClasses.Serializable): + name: str + digest_jobs: Dict[str, str] + digest_dockers: Dict[str, str] + cache_success: List[str] + # there might be issues with special characters in job names if used directly in yaml syntax - create a base64-encoded list to avoid this + cache_success_base64: List[str] + cache_artifacts: Dict[str, Cache.CacheRecord] + sha: str + + @classmethod + def from_dict(cls, obj): + cache_artifacts =
obj["cache_artifacts"] + cache_artifacts_deserialized = {} + for artifact_name, cache_artifact in cache_artifacts.items(): + cache_artifacts_deserialized[artifact_name] = Cache.CacheRecord.from_dict( + cache_artifact + ) + obj["cache_artifacts"] = cache_artifacts_deserialized + return RunConfig(**obj) + + @classmethod + def file_name_static(cls, name): + return ( + f"{Settings.TEMP_DIR}/workflow_config_{Utils.normalize_string(name)}.json" + ) diff --git a/ci/praktika/s3.py b/ci/praktika/s3.py new file mode 100644 index 00000000000..8cfb70a9076 --- /dev/null +++ b/ci/praktika/s3.py @@ -0,0 +1,295 @@ +import dataclasses +import json +import time +from pathlib import Path +from typing import Dict + +from praktika._environment import _Environment +from praktika.settings import Settings +from praktika.utils import Shell, Utils + + +class S3: + @dataclasses.dataclass + class Object: + AcceptRanges: str + Expiration: str + LastModified: str + ContentLength: int + ETag: str + ContentType: str + ServerSideEncryption: str + Metadata: Dict + + def has_tags(self, tags): + meta = self.Metadata + for k, v in tags.items(): + if k not in meta or meta[k] != v: + print(f"tag [{k}={v}] does not match meta [{meta}]") + return False + return True + + @classmethod + def clean_s3_directory(cls, s3_path): + assert len(s3_path.split("/")) > 2, "check to not delete too much" + cmd = f"aws s3 rm s3://{s3_path} --recursive" + cls.run_command_with_retries(cmd, retries=1) + return + + @classmethod + def copy_file_to_s3(cls, s3_path, local_path, text=False): + assert Path(local_path).exists(), f"Path [{local_path}] does not exist" + assert Path(s3_path), f"Invalid S3 Path [{s3_path}]" + assert Path( + local_path + ).is_file(), f"Path [{local_path}] is not file. Only files are supported" + file_name = Path(local_path).name + s3_full_path = s3_path + if not s3_full_path.endswith(file_name): + s3_full_path = f"{s3_path}/{Path(local_path).name}" + cmd = f"aws s3 cp {local_path} s3://{s3_full_path}" + if text: + cmd += " --content-type text/plain" + res = cls.run_command_with_retries(cmd) + if not res: + raise + bucket = s3_path.split("/")[0] + endpoint = Settings.S3_BUCKET_TO_HTTP_ENDPOINT[bucket] + assert endpoint + return f"https://{s3_full_path}".replace(bucket, endpoint) + + @classmethod + def put(cls, s3_path, local_path, text=False, metadata=None): + assert Path(local_path).exists(), f"Path [{local_path}] does not exist" + assert Path(s3_path), f"Invalid S3 Path [{s3_path}]" + assert Path( + local_path + ).is_file(), f"Path [{local_path}] is not file. 
Only files are supported" + file_name = Path(local_path).name + s3_full_path = s3_path + if not s3_full_path.endswith(file_name): + s3_full_path = f"{s3_path}/{Path(local_path).name}" + + s3_full_path = str(s3_full_path).removeprefix("s3://") + bucket, key = s3_full_path.split("/", maxsplit=1) + + command = ( + f"aws s3api put-object --bucket {bucket} --key {key} --body {local_path}" + ) + if metadata: + for k, v in metadata.items(): + command += f" --metadata {k}={v}" + + cmd = f"aws s3 cp {local_path} s3://{s3_full_path}" + if text: + cmd += " --content-type text/plain" + res = cls.run_command_with_retries(command) + assert res + + @classmethod + def run_command_with_retries(cls, command, retries=Settings.MAX_RETRIES_S3): + i = 0 + res = False + while not res and i < retries: + i += 1 + ret_code, stdout, stderr = Shell.get_res_stdout_stderr( + command, verbose=True + ) + if "aws sso login" in stderr: + print("ERROR: aws login expired") + break + elif "does not exist" in stderr: + print("ERROR: requested file does not exist") + break + if ret_code != 0: + print( + f"ERROR: aws s3 cp failed, stdout/stderr err: [{stderr}], out [{stdout}]" + ) + res = ret_code == 0 + return res + + @classmethod + def get_link(cls, s3_path, local_path): + s3_full_path = f"{s3_path}/{Path(local_path).name}" + bucket = s3_path.split("/")[0] + endpoint = Settings.S3_BUCKET_TO_HTTP_ENDPOINT[bucket] + return f"https://{s3_full_path}".replace(bucket, endpoint) + + @classmethod + def copy_file_from_s3(cls, s3_path, local_path): + assert Path(s3_path), f"Invalid S3 Path [{s3_path}]" + if Path(local_path).is_dir(): + local_path = Path(local_path) / Path(s3_path).name + else: + assert Path( + local_path + ).parent.is_dir(), f"Parent path for [{local_path}] does not exist" + cmd = f"aws s3 cp s3://{s3_path} {local_path}" + res = cls.run_command_with_retries(cmd) + return res + + @classmethod + def head_object(cls, s3_path): + s3_path = str(s3_path).removeprefix("s3://") + bucket, key = s3_path.split("/", maxsplit=1) + output = Shell.get_output( + f"aws s3api head-object --bucket {bucket} --key {key}", verbose=True + ) + if not output: + return None + else: + return cls.Object(**json.loads(output)) + + @classmethod + def delete(cls, s3_path): + assert Path(s3_path), f"Invalid S3 Path [{s3_path}]" + return Shell.check( + f"aws s3 rm s3://{s3_path}", + verbose=True, + ) + + # TODO: apparently should be placed into separate file to be used only inside praktika + # keeping this module clean from importing Settings, Environment and etc, making it easy for use externally + @classmethod + def copy_result_to_s3(cls, result, unlock=True): + result.dump() + env = _Environment.get() + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix()}" + s3_path_full = f"{s3_path}/{Path(result.file_name()).name}" + url = S3.copy_file_to_s3(s3_path=s3_path, local_path=result.file_name()) + if env.PR_NUMBER: + print("Duplicate Result for latest commit alias in PR") + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix(latest=True)}" + url = S3.copy_file_to_s3(s3_path=s3_path, local_path=result.file_name()) + if unlock: + if not cls.unlock(s3_path_full): + print(f"ERROR: File [{s3_path_full}] unlock failure") + assert False # TODO: investigate + return url + + @classmethod + def copy_result_from_s3(cls, local_path, lock=True): + env = _Environment.get() + file_name = Path(local_path).name + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix()}/{file_name}" + if lock: + cls.lock(s3_path) + if not S3.copy_file_from_s3(s3_path=s3_path, 
local_path=local_path): + print(f"ERROR: failed to cp file [{s3_path}] from s3") + raise + + @classmethod + def lock(cls, s3_path, level=0): + assert level < 3, "Never" + env = _Environment.get() + s3_path_lock = s3_path + f".lock" + file_path_lock = f"{Settings.TEMP_DIR}/{Path(s3_path_lock).name}" + assert Shell.check( + f"echo '''{env.JOB_NAME}''' > {file_path_lock}", verbose=True + ), "Never" + + i = 20 + meta = S3.head_object(s3_path_lock) + while meta: + print(f"WARNING: Failed to acquire lock, meta [{meta}] - wait") + i -= 5 + if i < 0: + info = f"ERROR: lock acquire failure - unlock forcefully" + print(info) + env.add_info(info) + break + time.sleep(5) + + metadata = {"job": Utils.to_base64(env.JOB_NAME)} + S3.put( + s3_path=s3_path_lock, + local_path=file_path_lock, + metadata=metadata, + ) + time.sleep(1) + obj = S3.head_object(s3_path_lock) + if not obj or not obj.has_tags(tags=metadata): + print(f"WARNING: locked by another job [{obj}]") + env.add_info("S3 lock file failure") + cls.lock(s3_path, level=level + 1) + print("INFO: lock acquired") + + @classmethod + def unlock(cls, s3_path): + s3_path_lock = s3_path + ".lock" + env = _Environment.get() + obj = S3.head_object(s3_path_lock) + if not obj: + print("ERROR: lock file is removed") + assert False # investigate + elif not obj.has_tags({"job": Utils.to_base64(env.JOB_NAME)}): + print("ERROR: lock file was acquired by another job") + assert False # investigate + + if not S3.delete(s3_path_lock): + print(f"ERROR: File [{s3_path_lock}] delete failure") + print("INFO: lock released") + return True + + @classmethod + def get_result_link(cls, result): + env = _Environment.get() + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix(latest=True if env.PR_NUMBER else False)}" + return S3.get_link(s3_path=s3_path, local_path=result.file_name()) + + @classmethod + def clean_latest_result(cls): + env = _Environment.get() + env.SHA = "latest" + assert env.PR_NUMBER + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix()}" + S3.clean_s3_directory(s3_path=s3_path) + + @classmethod + def _upload_file_to_s3( + cls, local_file_path, upload_to_s3: bool, text: bool = False, s3_subprefix="" + ) -> str: + if upload_to_s3: + env = _Environment.get() + s3_path = f"{Settings.HTML_S3_PATH}/{env.get_s3_prefix()}" + if s3_subprefix: + s3_subprefix.removeprefix("/").removesuffix("/") + s3_path += f"/{s3_subprefix}" + html_link = S3.copy_file_to_s3( + s3_path=s3_path, local_path=local_file_path, text=text + ) + return html_link + return f"file://{Path(local_file_path).absolute()}" + + @classmethod + def upload_result_files_to_s3(cls, result): + if result.results: + for result_ in result.results: + cls.upload_result_files_to_s3(result_) + for file in result.files: + if not Path(file).is_file(): + print(f"ERROR: Invalid file [{file}] in [{result.name}] - skip upload") + result.info += f"\nWARNING: Result file [{file}] was not found" + file_link = cls._upload_file_to_s3(file, upload_to_s3=False) + else: + is_text = False + for text_file_suffix in Settings.TEXT_CONTENT_EXTENSIONS: + if file.endswith(text_file_suffix): + print( + f"File [{file}] matches Settings.TEXT_CONTENT_EXTENSIONS [{Settings.TEXT_CONTENT_EXTENSIONS}] - add text attribute for s3 object" + ) + is_text = True + break + file_link = cls._upload_file_to_s3( + file, + upload_to_s3=True, + text=is_text, + s3_subprefix=Utils.normalize_string(result.name), + ) + result.links.append(file_link) + if result.files: + print( + f"Result files [{result.files}] uploaded to s3 
[{result.links[-len(result.files):]}] - clean files list" + ) + result.files = [] + result.dump() diff --git a/ci/praktika/secret.py b/ci/praktika/secret.py new file mode 100644 index 00000000000..9c033d76708 --- /dev/null +++ b/ci/praktika/secret.py @@ -0,0 +1,61 @@ +import dataclasses +import os + +from praktika.utils import Shell + + +class Secret: + class Type: + AWS_SSM_VAR = "aws parameter" + AWS_SSM_SECRET = "aws secret" + GH_SECRET = "gh secret" + + @dataclasses.dataclass + class Config: + name: str + type: str + + def is_gh(self): + return self.type == Secret.Type.GH_SECRET + + def get_value(self): + if self.type == Secret.Type.AWS_SSM_VAR: + return self.get_aws_ssm_var() + if self.type == Secret.Type.AWS_SSM_SECRET: + return self.get_aws_ssm_secret() + elif self.type == Secret.Type.GH_SECRET: + return self.get_gh_secret() + else: + assert False, f"Not supported secret type, secret [{self}]" + + def get_aws_ssm_var(self): + res = Shell.get_output( + f"aws ssm get-parameter --name {self.name} --with-decryption --output text --query Parameter.Value", + ) + if not res: + print(f"ERROR: Failed to get secret [{self.name}]") + raise RuntimeError() + return res + + def get_aws_ssm_secret(self): + name, secret_key_name = self.name, "" + if "." in self.name: + name, secret_key_name = self.name.split(".") + cmd = f"aws secretsmanager get-secret-value --secret-id {name} --query SecretString --output text" + if secret_key_name: + cmd += f" | jq -r '.[\"{secret_key_name}\"]'" + res = Shell.get_output(cmd, verbose=True) + if not res: + print(f"ERROR: Failed to get secret [{self.name}]") + raise RuntimeError() + return res + + def get_gh_secret(self): + res = os.getenv(f"{self.name}") + if not res: + print(f"ERROR: Failed to get secret [{self.name}]") + raise RuntimeError() + return res + + def __repr__(self): + return self.name diff --git a/ci/praktika/settings.py b/ci/praktika/settings.py new file mode 100644 index 00000000000..1a4068d9398 --- /dev/null +++ b/ci/praktika/settings.py @@ -0,0 +1,8 @@ +from praktika._settings import _Settings +from praktika.mangle import _get_user_settings + +Settings = _Settings() + +user_settings = _get_user_settings() +for setting, value in user_settings.items(): + Settings.__setattr__(setting, value) diff --git a/ci/praktika/utils.py b/ci/praktika/utils.py new file mode 100644 index 00000000000..b96c78e4fa7 --- /dev/null +++ b/ci/praktika/utils.py @@ -0,0 +1,597 @@ +import base64 +import dataclasses +import glob +import json +import multiprocessing +import os +import re +import signal +import subprocess +import sys +import time +from abc import ABC, abstractmethod +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from threading import Thread +from types import SimpleNamespace +from typing import Any, Dict, Iterator, List, Optional, Type, TypeVar, Union + +from praktika._settings import _Settings + +T = TypeVar("T", bound="Serializable") + + +class MetaClasses: + class WithIter(type): + def __iter__(cls): + return (v for k, v in cls.__dict__.items() if not k.startswith("_")) + + @dataclasses.dataclass + class Serializable(ABC): + @classmethod + def to_dict(cls, obj): + if dataclasses.is_dataclass(obj): + return {k: cls.to_dict(v) for k, v in dataclasses.asdict(obj).items()} + elif isinstance(obj, SimpleNamespace): + return {k: cls.to_dict(v) for k, v in vars(obj).items()} + elif isinstance(obj, list): + return [cls.to_dict(i) for i in obj] + elif isinstance(obj, dict): + return {k: cls.to_dict(v) for k, v in 
obj.items()} + else: + return obj + + @classmethod + def from_dict(cls: Type[T], obj: Dict[str, Any]) -> T: + return cls(**obj) + + @classmethod + def from_fs(cls: Type[T], name) -> T: + with open(cls.file_name_static(name), "r", encoding="utf8") as f: + try: + return cls.from_dict(json.load(f)) + except json.decoder.JSONDecodeError as ex: + print(f"ERROR: failed to parse json, ex [{ex}]") + print(f"JSON content [{cls.file_name_static(name)}]") + Shell.check(f"cat {cls.file_name_static(name)}") + raise ex + + @classmethod + @abstractmethod + def file_name_static(cls, name): + pass + + def file_name(self): + return self.file_name_static(self.name) + + def dump(self): + with open(self.file_name(), "w", encoding="utf8") as f: + json.dump(self.to_dict(self), f, indent=4) + return self + + @classmethod + def exist(cls, name): + return Path(cls.file_name_static(name)).is_file() + + def to_json(self, pretty=False): + return json.dumps(dataclasses.asdict(self), indent=4 if pretty else None) + + +class ContextManager: + @staticmethod + @contextmanager + def cd(to: Optional[Union[Path, str]] = None) -> Iterator[None]: + """ + changes current working directory to @path or `git root` if @path is None + :param to: + :return: + """ + if not to: + try: + to = Shell.get_output_or_raise("git rev-parse --show-toplevel") + except: + pass + if not to: + if Path(_Settings.DOCKER_WD).is_dir(): + to = _Settings.DOCKER_WD + if not to: + assert False, "FIX IT" + assert to + old_pwd = os.getcwd() + os.chdir(to) + try: + yield + finally: + os.chdir(old_pwd) + + +class Shell: + @classmethod + def get_output_or_raise(cls, command, verbose=False): + return cls.get_output(command, verbose=verbose, strict=True).strip() + + @classmethod + def get_output(cls, command, strict=False, verbose=False): + if verbose: + print(f"Run command [{command}]") + res = subprocess.run( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if res.stderr: + print(f"WARNING: stderr: {res.stderr.strip()}") + if strict and res.returncode != 0: + raise RuntimeError(f"command failed with {res.returncode}") + return res.stdout.strip() + + @classmethod + def get_res_stdout_stderr(cls, command, verbose=True): + if verbose: + print(f"Run command [{command}]") + res = subprocess.run( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + return res.returncode, res.stdout.strip(), res.stderr.strip() + + @classmethod + def check( + cls, + command, + log_file=None, + strict=False, + verbose=False, + dry_run=False, + stdin_str=None, + timeout=None, + retries=0, + **kwargs, + ): + return ( + cls.run( + command, + log_file, + strict, + verbose, + dry_run, + stdin_str, + retries=retries, + timeout=timeout, + **kwargs, + ) + == 0 + ) + + @classmethod + def run( + cls, + command, + log_file=None, + strict=False, + verbose=False, + dry_run=False, + stdin_str=None, + timeout=None, + retries=0, + **kwargs, + ): + def _check_timeout(timeout, process) -> None: + if not timeout: + return + time.sleep(timeout) + print( + f"WARNING: Timeout exceeded [{timeout}], sending SIGTERM to process group [{process.pid}]" + ) + try: + os.killpg(process.pid, signal.SIGTERM) + except ProcessLookupError: + print("Process already terminated.") + return + + time_wait = 0 + wait_interval = 5 + + # Wait for process to terminate + while process.poll() is None and time_wait < 100: + print("Waiting for process to exit...") + time.sleep(wait_interval) + time_wait += wait_interval + + # Force kill if still 
running + if process.poll() is None: + print(f"WARNING: Process still running after SIGTERM, sending SIGKILL") + try: + os.killpg(process.pid, signal.SIGKILL) + except ProcessLookupError: + print("Process already terminated.") + + # Dry-run + if dry_run: + print(f"Dry-run. Would run command [{command}]") + return 0 # Return success for dry-run + + if verbose: + print(f"Run command: [{command}]") + + log_file = log_file or "/dev/null" + proc = None + for retry in range(retries + 1): + try: + with open(log_file, "w") as log_fp: + proc = subprocess.Popen( + command, + shell=True, + stderr=subprocess.STDOUT, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE if stdin_str else None, + universal_newlines=True, + start_new_session=True, # Start a new process group for signal handling + bufsize=1, # Line-buffered + errors="backslashreplace", + **kwargs, + ) + + # Start the timeout thread if specified + if timeout: + t = Thread(target=_check_timeout, args=(timeout, proc)) + t.daemon = True + t.start() + + # Write stdin if provided + if stdin_str: + proc.stdin.write(stdin_str) + proc.stdin.close() + + # Process output in real-time + if proc.stdout: + for line in proc.stdout: + sys.stdout.write(line) + log_fp.write(line) + + proc.wait() # Wait for the process to finish + + if proc.returncode == 0: + break # Exit retry loop if success + else: + if verbose: + print( + f"ERROR: command [{command}] failed, exit code: {proc.returncode}, retry: {retry}/{retries}" + ) + except Exception as e: + if verbose: + print( + f"ERROR: command failed, exception: {e}, retry: {retry}/{retries}" + ) + if proc: + proc.kill() + + # Handle strict mode (ensure process success or fail) + if strict: + assert ( + proc and proc.returncode == 0 + ), f"Command failed with return code {proc.returncode}" + + return proc.returncode if proc else 1 # Return 1 if process never started + + @classmethod + def run_async( + cls, + command, + stdin_str=None, + verbose=False, + suppress_output=False, + **kwargs, + ): + if verbose: + print(f"Run command in background [{command}]") + proc = subprocess.Popen( + command, + shell=True, + stderr=subprocess.STDOUT if not suppress_output else subprocess.DEVNULL, + stdout=subprocess.PIPE if not suppress_output else subprocess.DEVNULL, + stdin=subprocess.PIPE if stdin_str else None, + universal_newlines=True, + start_new_session=True, + bufsize=1, + errors="backslashreplace", + **kwargs, + ) + if proc.stdout: + for line in proc.stdout: + print(line, end="") + return proc + + +class Utils: + @staticmethod + def terminate_process_group(pid, force=False): + if not force: + os.killpg(os.getpgid(pid), signal.SIGTERM) + else: + os.killpg(os.getpgid(pid), signal.SIGKILL) + + @staticmethod + def set_env(key, val): + os.environ[key] = val + + @staticmethod + def print_formatted_error(error_message, stdout="", stderr=""): + stdout_lines = stdout.splitlines() if stdout else [] + stderr_lines = stderr.splitlines() if stderr else [] + print(f"ERROR: {error_message}") + if stdout_lines: + print(" Out:") + for line in stdout_lines: + print(f" | {line}") + if stderr_lines: + print(" Err:") + for line in stderr_lines: + print(f" | {line}") + + @staticmethod + def sleep(seconds): + time.sleep(seconds) + + @staticmethod + def cwd(): + return Path.cwd() + + @staticmethod + def cpu_count(): + return multiprocessing.cpu_count() + + @staticmethod + def raise_with_error(error_message, stdout="", stderr="", ex=None): + Utils.print_formatted_error(error_message, stdout, stderr) + raise ex or RuntimeError() + + @staticmethod + 
def timestamp(): + return datetime.utcnow().timestamp() + + @staticmethod + def timestamp_to_str(timestamp): + return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S") + + @staticmethod + def get_failed_tests_number(description: str) -> Optional[int]: + description = description.lower() + + pattern = r"fail:\s*(\d+)\s*(?=,|$)" + match = re.search(pattern, description) + if match: + return int(match.group(1)) + return None + + @staticmethod + def is_killed_with_oom(): + if Shell.check( + "sudo dmesg -T | grep -q -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE'" + ): + return True + return False + + @staticmethod + def clear_dmesg(): + Shell.check("sudo dmesg --clear", verbose=True) + + @staticmethod + def to_base64(value): + assert isinstance(value, str), f"TODO: not supported for {type(value)}" + string_bytes = value.encode("utf-8") + base64_bytes = base64.b64encode(string_bytes) + base64_string = base64_bytes.decode("utf-8") + return base64_string + + @staticmethod + def is_hex(s): + try: + int(s, 16) + return True + except ValueError: + return False + + @staticmethod + def normalize_string(string: str) -> str: + res = string.lower() + for r in ( + (" ", "_"), + ("(", ""), + (")", ""), + ("{", ""), + ("}", ""), + ("'", ""), + ("[", ""), + ("]", ""), + (",", ""), + ("/", "_"), + ("-", "_"), + (":", ""), + ('"', ""), + ): + res = res.replace(*r) + return res + + @staticmethod + def traverse_path(path, file_suffixes=None, sorted=False, not_exists_ok=False): + res = [] + + def is_valid_file(file): + if file_suffixes is None: + return True + return any(file.endswith(suffix) for suffix in file_suffixes) + + if os.path.isfile(path): + if is_valid_file(path): + res.append(path) + elif os.path.isdir(path): + for root, dirs, files in os.walk(path): + for file in files: + full_path = os.path.join(root, file) + if is_valid_file(full_path): + res.append(full_path) + elif "*" in str(path): + res.extend( + [ + f + for f in glob.glob(path, recursive=True) + if os.path.isfile(f) and is_valid_file(f) + ] + ) + else: + if not_exists_ok: + pass + else: + assert False, f"File does not exist or not valid [{path}]" + + if sorted: + res.sort(reverse=True) + + return res + + @classmethod + def traverse_paths( + cls, + include_paths, + exclude_paths, + file_suffixes=None, + sorted=False, + not_exists_ok=False, + ) -> List["str"]: + included_files_ = set() + for path in include_paths: + included_files_.update(cls.traverse_path(path, file_suffixes=file_suffixes)) + + excluded_files = set() + for path in exclude_paths: + res = cls.traverse_path(path, not_exists_ok=not_exists_ok) + if not res: + print( + f"WARNING: Utils.traverse_paths excluded 0 files by path [{path}] in exclude_paths" + ) + else: + excluded_files.update(res) + res = [f for f in included_files_ if f not in excluded_files] + if sorted: + res.sort(reverse=True) + return res + + @classmethod + def add_to_PATH(cls, path): + path_cur = os.getenv("PATH", "") + if path_cur: + path += ":" + path_cur + os.environ["PATH"] = path + + class Stopwatch: + def __init__(self): + self.start_time = datetime.utcnow().timestamp() + + @property + def duration(self) -> float: + return datetime.utcnow().timestamp() - self.start_time + + +class TeePopen: + def __init__( + self, + command: str, + log_file: Union[str, Path] = "", + env: Optional[dict] = None, + timeout: Optional[int] = None, + ): + self.command = command + self.log_file_name = log_file + self.log_file = None + self.env = env or 
os.environ.copy() + self.process = None # type: Optional[subprocess.Popen] + self.timeout = timeout + self.timeout_exceeded = False + self.terminated_by_sigterm = False + self.terminated_by_sigkill = False + + def _check_timeout(self) -> None: + if self.timeout is None: + return + time.sleep(self.timeout) + print( + f"WARNING: Timeout exceeded [{self.timeout}], send SIGTERM to [{self.process.pid}] and give a chance for graceful termination" + ) + self.send_signal(signal.SIGTERM) + time_wait = 0 + self.terminated_by_sigterm = True + self.timeout_exceeded = True + while self.process.poll() is None and time_wait < 100: + print("wait...") + wait = 5 + time.sleep(wait) + time_wait += wait + while self.process.poll() is None: + print(f"WARNING: Still running, send SIGKILL to [{self.process.pid}]") + self.send_signal(signal.SIGKILL) + self.terminated_by_sigkill = True + time.sleep(2) + + def __enter__(self) -> "TeePopen": + if self.log_file_name: + self.log_file = open(self.log_file_name, "w", encoding="utf-8") + self.process = subprocess.Popen( + self.command, + shell=True, + universal_newlines=True, + env=self.env, + start_new_session=True, # signall will be sent to all children + stderr=subprocess.STDOUT, + stdout=subprocess.PIPE, + bufsize=1, + errors="backslashreplace", + ) + time.sleep(1) + print(f"Subprocess started, pid [{self.process.pid}]") + if self.timeout is not None and self.timeout > 0: + t = Thread(target=self._check_timeout) + t.daemon = True # does not block the program from exit + t.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.wait() + if self.log_file: + self.log_file.close() + + def wait(self) -> int: + if self.process.stdout is not None: + for line in self.process.stdout: + sys.stdout.write(line) + if self.log_file: + self.log_file.write(line) + + return self.process.wait() + + def poll(self): + return self.process.poll() + + def send_signal(self, signal_num): + os.killpg(self.process.pid, signal_num) + + +if __name__ == "__main__": + + @dataclasses.dataclass + class Test(MetaClasses.Serializable): + name: str + + @staticmethod + def file_name_static(name): + return f"/tmp/{Utils.normalize_string(name)}.json" + + Test(name="dsada").dump() + t = Test.from_fs("dsada") + print(t) diff --git a/ci/praktika/validator.py b/ci/praktika/validator.py new file mode 100644 index 00000000000..29edc0a27ed --- /dev/null +++ b/ci/praktika/validator.py @@ -0,0 +1,208 @@ +import glob +import sys +from itertools import chain +from pathlib import Path + +from praktika import Workflow +from praktika._settings import GHRunners +from praktika.mangle import _get_workflows +from praktika.settings import Settings +from praktika.utils import ContextManager + + +class Validator: + @classmethod + def validate(cls): + print("---Start validating Pipeline and settings---") + workflows = _get_workflows() + for workflow in workflows: + print(f"Validating workflow [{workflow.name}]") + + cls.validate_file_paths_in_run_command(workflow) + cls.validate_file_paths_in_digest_configs(workflow) + cls.validate_requirements_txt_files(workflow) + cls.validate_dockers(workflow) + + if workflow.artifacts: + for artifact in workflow.artifacts: + if artifact.is_s3_artifact(): + assert ( + Settings.S3_ARTIFACT_PATH + ), "Provide S3_ARTIFACT_PATH setting in any .py file in ./ci/settings/* to be able to use s3 for artifacts" + + for job in workflow.jobs: + if job.requires and workflow.artifacts: + for require in job.requires: + if ( + require in workflow.artifacts + and 
workflow.artifacts[require].is_s3_artifact() + ): + assert not any( + [r in GHRunners for r in job.runs_on] + ), f"GH runners [{job.name}:{job.runs_on}] must not be used with S3 as artifact storage" + + if job.allow_merge_on_failure: + assert ( + workflow.enable_merge_ready_status + ), f"Job property allow_merge_on_failure must be used only with enabled workflow.enable_merge_ready_status, workflow [{workflow.name}], job [{job.name}]" + + if workflow.enable_cache: + assert ( + Settings.CI_CONFIG_RUNS_ON + ), f"Runner label to run workflow config job must be provided via CACHE_CONFIG_RUNS_ON setting if enable_cache=True, workflow [{workflow.name}]" + + assert ( + Settings.CACHE_S3_PATH + ), f"CACHE_S3_PATH Setting must be defined if enable_cache=True, workflow [{workflow.name}]" + + if workflow.dockers: + cls.evaluate_check( + Settings.DOCKER_BUILD_RUNS_ON, + f"DOCKER_BUILD_RUNS_ON settings must be defined if workflow has dockers", + workflow_name=workflow.name, + ) + + if workflow.enable_report: + assert ( + Settings.HTML_S3_PATH + ), f"HTML_S3_PATH Setting must be defined if enable_html=True, workflow [{workflow.name}]" + assert ( + Settings.S3_BUCKET_TO_HTTP_ENDPOINT + ), f"S3_BUCKET_TO_HTTP_ENDPOINT Setting must be defined if enable_html=True, workflow [{workflow.name}]" + assert ( + Settings.HTML_S3_PATH.split("/")[0] + in Settings.S3_BUCKET_TO_HTTP_ENDPOINT + ), f"S3_BUCKET_TO_HTTP_ENDPOINT Setting must include bucket name [{Settings.HTML_S3_PATH}] from HTML_S3_PATH, workflow [{workflow.name}]" + + if workflow.enable_cache: + for artifact in workflow.artifacts or []: + assert ( + artifact.is_s3_artifact() + ), f"All artifacts must be of S3 type if enable_cache|enable_html=True, artifact [{artifact.name}], type [{artifact.type}], workflow [{workflow.name}]" + + if workflow.dockers: + assert ( + Settings.DOCKERHUB_USERNAME + ), f"Settings.DOCKERHUB_USERNAME must be provided if workflow has dockers, workflow [{workflow.name}]" + assert ( + Settings.DOCKERHUB_SECRET + ), f"Settings.DOCKERHUB_SECRET must be provided if workflow has dockers, workflow [{workflow.name}]" + assert workflow.get_secret( + Settings.DOCKERHUB_SECRET + ), f"Secret [{Settings.DOCKERHUB_SECRET}] must have configuration in workflow.secrets, workflow [{workflow.name}]" + + if ( + workflow.enable_cache + or workflow.enable_report + or workflow.enable_merge_ready_status + ): + for job in workflow.jobs: + assert not any( + job in ("ubuntu-latest",) for job in job.runs_on + ), f"GitHub Runners must not be used for workflow with enabled: workflow.enable_cache, workflow.enable_html or workflow.enable_merge_ready_status as s3 access is required, workflow [{workflow.name}], job [{job.name}]" + + if workflow.enable_cidb: + assert ( + Settings.SECRET_CI_DB_URL + ), f"Settings.CI_DB_URL_SECRET must be provided if workflow.enable_cidb=True, workflow [{workflow.name}]" + assert ( + Settings.SECRET_CI_DB_PASSWORD + ), f"Settings.CI_DB_PASSWORD_SECRET must be provided if workflow.enable_cidb=True, workflow [{workflow.name}]" + assert ( + Settings.CI_DB_DB_NAME + ), f"Settings.CI_DB_DB_NAME must be provided if workflow.enable_cidb=True, workflow [{workflow.name}]" + assert ( + Settings.CI_DB_TABLE_NAME + ), f"Settings.CI_DB_TABLE_NAME must be provided if workflow.enable_cidb=True, workflow [{workflow.name}]" + + @classmethod + def validate_file_paths_in_run_command(cls, workflow: Workflow.Config) -> None: + if not Settings.VALIDATE_FILE_PATHS: + return + with ContextManager.cd(): + for job in workflow.jobs: + run_command = job.command 
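+ # Best-effort path validation: stop at the first redirection token; otherwise any token containing '/' must exist as a file or directory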
+ command_parts = run_command.split(" ") + for part in command_parts: + if ">" in part: + return + if "/" in part: + assert ( + Path(part).is_file() or Path(part).is_dir() + ), f"Apparently run command [{run_command}] for job [{job}] has an invalid path [{part}]. Setting to disable check: VALIDATE_FILE_PATHS" + + @classmethod + def validate_file_paths_in_digest_configs(cls, workflow: Workflow.Config) -> None: + if not Settings.VALIDATE_FILE_PATHS: + return + with ContextManager.cd(): + for job in workflow.jobs: + if not job.digest_config: + continue + for include_path in chain( + job.digest_config.include_paths, job.digest_config.exclude_paths + ): + if "*" in include_path: + assert glob.glob( + include_path, recursive=True + ), f"Apparently file glob [{include_path}] in job [{job.name}] digest_config [{job.digest_config}] is invalid, workflow [{workflow.name}]. Setting to disable check: VALIDATE_FILE_PATHS" + else: + assert ( + Path(include_path).is_file() or Path(include_path).is_dir() + ), f"Apparently file path [{include_path}] in job [{job.name}] digest_config [{job.digest_config}] is invalid, workflow [{workflow.name}]. Setting to disable check: VALIDATE_FILE_PATHS" + + @classmethod + def validate_requirements_txt_files(cls, workflow: Workflow.Config) -> None: + with ContextManager.cd(): + for job in workflow.jobs: + if job.job_requirements: + if job.job_requirements.python_requirements_txt: + path = Path(job.job_requirements.python_requirements_txt) + message = f"Python requirements file [{path}] does not exist" + if job.name in ( + Settings.DOCKER_BUILD_JOB_NAME, + Settings.CI_CONFIG_JOB_NAME, + Settings.FINISH_WORKFLOW_JOB_NAME, + ): + message += "\n If all requirements are already installed on your runners - add the setting INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS" + message += "\n If requirements need to be installed - add a requirements file (Settings.INSTALL_PYTHON_REQS_FOR_NATIVE_JOBS):" + message += "\n echo jwt==1.3.1 > ./ci/requirements.txt" + message += ( + "\n echo requests==2.32.3 >> ./ci/requirements.txt" + ) + message += "\n echo https://clickhouse-builds.s3.amazonaws.com/packages/praktika-0.1-py3-none-any.whl >> ./ci/requirements.txt" + cls.evaluate_check( + path.is_file(), message, workflow_name=workflow.name, job_name=job.name + ) + + @classmethod + def validate_dockers(cls, workflow: Workflow.Config): + names = [] + for docker in workflow.dockers: + cls.evaluate_check( + docker.name not in names, + f"Non-unique docker name [{docker.name}]", + workflow_name=workflow.name, + ) + names.append(docker.name) + for docker in workflow.dockers: + for docker_dep in docker.depends_on: + cls.evaluate_check( + docker_dep in names, + f"Docker [{docker.name}] has an invalid dependency [{docker_dep}]", + workflow_name=workflow.name, + ) + + @classmethod + def evaluate_check(cls, check_ok, message, workflow_name, job_name=""): + messages = message.split("\n") if isinstance(message, str) else message + if check_ok: + return + print( + f"ERROR: Config validation failed: workflow [{workflow_name}], job [{job_name}]:" + ) + for msg in messages: + print(" || " + msg) + sys.exit(1) diff --git a/ci/praktika/version.py b/ci/praktika/version.py new file mode 100644 index 00000000000..b71dad9b794 --- /dev/null +++ b/ci/praktika/version.py @@ -0,0 +1 @@ +VERSION = 1 diff --git a/ci/praktika/workflow.py b/ci/praktika/workflow.py new file mode 100644 index 00000000000..41e8056f9ef --- /dev/null +++ b/ci/praktika/workflow.py @@ -0,0 +1,68 @@ +from dataclasses import dataclass, field +from typing
import List, Optional + +from praktika import Artifact, Job +from praktika.docker import Docker +from praktika.secret import Secret +from praktika.utils import Utils + + +class Workflow: + class Event: + PULL_REQUEST = "pull_request" + PUSH = "push" + + @dataclass + class Config: + """ + branches - List of branch names or patterns, for push trigger only + base_branches - List of base branches (target branch), for pull_request trigger only + """ + + name: str + event: str + jobs: List[Job.Config] + branches: List[str] = field(default_factory=list) + base_branches: List[str] = field(default_factory=list) + artifacts: List[Artifact.Config] = field(default_factory=list) + dockers: List[Docker.Config] = field(default_factory=list) + secrets: List[Secret.Config] = field(default_factory=list) + enable_cache: bool = False + enable_report: bool = False + enable_merge_ready_status: bool = False + enable_cidb: bool = False + + def is_event_pull_request(self): + return self.event == Workflow.Event.PULL_REQUEST + + def is_event_push(self): + return self.event == Workflow.Event.PUSH + + def get_job(self, name): + job = self.find_job(name) + if not job: + Utils.raise_with_error( + f"Failed to find job [{name}], workflow [{self.name}]" + ) + return job + + def find_job(self, name, lazy=False): + name = str(name) + for job in self.jobs: + if lazy: + if name.lower() in job.name.lower(): + return job + else: + if job.name == name: + return job + return None + + def get_secret(self, name) -> Optional[Secret.Config]: + name = str(name) + names = [] + for secret in self.secrets: + if secret.name == name: + return secret + names.append(secret.name) + print(f"ERROR: Failed to find secret [{name}], workflow secrets [{names}]") + raise diff --git a/ci/praktika/yaml_generator.py b/ci/praktika/yaml_generator.py new file mode 100644 index 00000000000..00c469fec0c --- /dev/null +++ b/ci/praktika/yaml_generator.py @@ -0,0 +1,350 @@ +import dataclasses +from typing import List + +from praktika import Artifact, Job, Workflow +from praktika.mangle import _get_workflows +from praktika.parser import WorkflowConfigParser +from praktika.runtime import RunConfig +from praktika.settings import Settings +from praktika.utils import ContextManager, Shell, Utils + + +class YamlGenerator: + class Templates: + TEMPLATE_PULL_REQUEST_0 = """\ +# generated by praktika + +name: {NAME} + +on: + {EVENT}: + branches: [{BRANCHES}] + +# Cancel the previous wf run in PRs. 
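+# (Editor's note: the concurrency group below evaluates to "<workflow name>-<git ref>", so a new push to the same PR cancels the in-flight run; the braces in this template are escaped for the two str.format passes the generator applies before writing the YAML.)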
+concurrency: + group: ${{{{{{{{ github.workflow }}}}}}}}-${{{{{{{{ github.ref }}}}}}}} + cancel-in-progress: true + +env: + # Force the stdout and stderr streams to be unbuffered + PYTHONUNBUFFERED: 1 + GH_TOKEN: ${{{{{{{{ github.token }}}}}}}} + +# Allow updating GH commit statuses and PR comments to post an actual job reports link +permissions: write-all + +jobs: +{JOBS}\ +""" + + TEMPLATE_CALLABLE_WORKFLOW = """\ +# generated by praktika + +name: {NAME} +on: + workflow_call: + inputs: + config: + type: string + required: false + default: '' + secrets: +{SECRETS} + +env: + PYTHONUNBUFFERED: 1 + +jobs: +{JOBS}\ +""" + + TEMPLATE_SECRET_CONFIG = """\ + {SECRET_NAME}: + required: true +""" + + TEMPLATE_MATRIX = """ + strategy: + fail-fast: false + matrix: + params: {PARAMS_LIST}\ +""" + + TEMPLATE_JOB_0 = """ + {JOB_NAME_NORMALIZED}: + runs-on: [{RUNS_ON}] + needs: [{NEEDS}]{IF_EXPRESSION} + name: "{JOB_NAME_GH}" + outputs: + data: ${{{{ steps.run.outputs.DATA }}}} + steps: + - name: Checkout code + uses: actions/checkout@v4 +{JOB_ADDONS} + - name: Prepare env script + run: | + cat > {ENV_SETUP_SCRIPT} << 'ENV_SETUP_SCRIPT_EOF' + export PYTHONPATH=./ci:. +{SETUP_ENVS} + cat > {WORKFLOW_CONFIG_FILE} << 'EOF' + ${{{{ needs.{WORKFLOW_CONFIG_JOB_NAME}.outputs.data }}}} + EOF + cat > {WORKFLOW_STATUS_FILE} << 'EOF' + ${{{{ toJson(needs) }}}} + EOF + ENV_SETUP_SCRIPT_EOF + + rm -rf {INPUT_DIR} {OUTPUT_DIR} {TEMP_DIR} + mkdir -p {TEMP_DIR} {INPUT_DIR} {OUTPUT_DIR} +{DOWNLOADS_GITHUB} + - name: Run + id: run + run: | + . /tmp/praktika_setup_env.sh + set -o pipefail + {PYTHON} -m praktika run --job '''{JOB_NAME}''' --workflow "{WORKFLOW_NAME}" --ci |& tee {RUN_LOG} +{UPLOADS_GITHUB}\ +""" + + TEMPLATE_SETUP_ENV_SECRETS = """\ + export {SECRET_NAME}=$(cat<<'EOF' + ${{{{ secrets.{SECRET_NAME} }}}} + EOF + )\ +""" + + TEMPLATE_PY_INSTALL = """ + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: {PYTHON_VERSION} +""" + + TEMPLATE_PY_WITH_REQUIREMENTS = """ + - name: Install dependencies + run: | + sudo apt-get update && sudo apt install -y python3-pip + # TODO: --break-system-packages? 
otherwise ubuntu's apt/apt-get complains + {PYTHON} -m pip install --upgrade pip --break-system-packages + {PIP} install -r {REQUIREMENT_PATH} --break-system-packages +""" + + TEMPLATE_GH_UPLOAD = """ + - name: Upload artifact {NAME} + uses: actions/upload-artifact@v4 + with: + name: {NAME} + path: {PATH} +""" + + TEMPLATE_GH_DOWNLOAD = """ + - name: Download artifact {NAME} + uses: actions/download-artifact@v4 + with: + name: {NAME} + path: {PATH} +""" + + TEMPLATE_IF_EXPRESSION = """ + if: ${{{{ !failure() && !cancelled() && !contains(fromJson(needs.{WORKFLOW_CONFIG_JOB_NAME}.outputs.data).cache_success_base64, '{JOB_NAME_BASE64}') }}}}\ +""" + + TEMPLATE_IF_EXPRESSION_SKIPPED_OR_SUCCESS = """ + if: ${{ !failure() && !cancelled() }}\ +""" + + TEMPLATE_IF_EXPRESSION_NOT_CANCELLED = """ + if: ${{ !cancelled() }}\ +""" + + def __init__(self): + self.py_workflows = [] # type: List[Workflow.Config] + + @classmethod + def _get_workflow_file_name(cls, workflow_name): + return f"{Settings.WORKFLOW_PATH_PREFIX}/{Utils.normalize_string(workflow_name)}.yaml" + + def generate(self, workflow_file="", workflow_config=None): + print("---Start generating yaml pipelines---") + if workflow_config: + self.py_workflows = [workflow_config] + else: + self.py_workflows = _get_workflows(file=workflow_file) + assert self.py_workflows + for workflow_config in self.py_workflows: + print(f"Generate workflow [{workflow_config.name}]") + parser = WorkflowConfigParser(workflow_config).parse() + if ( + workflow_config.is_event_pull_request() + or workflow_config.is_event_push() + ): + yaml_workflow_str = PullRequestPushYamlGen(parser).generate() + else: + assert ( + False + ), f"Workflow event not yet supported [{workflow_config.event}]" + + with ContextManager.cd(): + with open(self._get_workflow_file_name(workflow_config.name), "w") as f: + f.write(yaml_workflow_str) + + with ContextManager.cd(): + Shell.check("git add ./.github/workflows/*.yaml") + + +class PullRequestPushYamlGen: + def __init__(self, parser: WorkflowConfigParser): + self.workflow_config = parser.workflow_yaml_config + self.parser = parser + + def generate(self): + job_items = [] + for i, job in enumerate(self.workflow_config.jobs): + job_name_normalized = Utils.normalize_string(job.name) + needs = ", ".join(map(Utils.normalize_string, job.needs)) + job_name = job.name + job_addons = [] + for addon in job.addons: + if addon.install_python: + job_addons.append( + YamlGenerator.Templates.TEMPLATE_PY_INSTALL.format( + PYTHON_VERSION=Settings.PYTHON_VERSION + ) + ) + if addon.requirements_txt_path: + job_addons.append( + YamlGenerator.Templates.TEMPLATE_PY_WITH_REQUIREMENTS.format( + PYTHON=Settings.PYTHON_INTERPRETER, + PIP=Settings.PYTHON_PACKET_MANAGER, + PYTHON_VERSION=Settings.PYTHON_VERSION, + REQUIREMENT_PATH=addon.requirements_txt_path, + ) + ) + uploads_github = [] + for artifact in job.artifacts_gh_provides: + uploads_github.append( + YamlGenerator.Templates.TEMPLATE_GH_UPLOAD.format( + NAME=artifact.name, PATH=artifact.path + ) + ) + downloads_github = [] + for artifact in job.artifacts_gh_requires: + downloads_github.append( + YamlGenerator.Templates.TEMPLATE_GH_DOWNLOAD.format( + NAME=artifact.name, PATH=Settings.INPUT_DIR + ) + ) + + config_job_name_normalized = Utils.normalize_string( + Settings.CI_CONFIG_JOB_NAME + ) + + if_expression = "" + if ( + self.workflow_config.enable_cache + and job_name_normalized != config_job_name_normalized + ): + if_expression = YamlGenerator.Templates.TEMPLATE_IF_EXPRESSION.format( + 
WORKFLOW_CONFIG_JOB_NAME=config_job_name_normalized, + JOB_NAME_BASE64=Utils.to_base64(job_name), + ) + if job.run_unless_cancelled: + if_expression = ( + YamlGenerator.Templates.TEMPLATE_IF_EXPRESSION_NOT_CANCELLED + ) + + secrets_envs = [] + for secret in self.workflow_config.secret_names_gh: + secrets_envs.append( + YamlGenerator.Templates.TEMPLATE_SETUP_ENV_SECRETS.format( + SECRET_NAME=secret + ) + ) + + job_item = YamlGenerator.Templates.TEMPLATE_JOB_0.format( + JOB_NAME_NORMALIZED=job_name_normalized, + WORKFLOW_CONFIG_JOB_NAME=config_job_name_normalized, + IF_EXPRESSION=if_expression, + RUNS_ON=", ".join(job.runs_on), + NEEDS=needs, + JOB_NAME_GH=job_name.replace('"', '\\"'), + JOB_NAME=job_name.replace( + "'", "'\\''" + ), # ' must be escaped so that yaml commands are properly parsed + WORKFLOW_NAME=self.workflow_config.name, + ENV_SETUP_SCRIPT=Settings.ENV_SETUP_SCRIPT, + SETUP_ENVS="\n".join(secrets_envs), + WORKFLOW_CONFIG_FILE=RunConfig.file_name_static( + self.workflow_config.name + ), + JOB_ADDONS="".join(job_addons), + DOWNLOADS_GITHUB="\n".join(downloads_github), + UPLOADS_GITHUB="\n".join(uploads_github), + RUN_LOG=Settings.RUN_LOG, + PYTHON=Settings.PYTHON_INTERPRETER, + WORKFLOW_STATUS_FILE=Settings.WORKFLOW_STATUS_FILE, + TEMP_DIR=Settings.TEMP_DIR, + INPUT_DIR=Settings.INPUT_DIR, + OUTPUT_DIR=Settings.OUTPUT_DIR, + ) + job_items.append(job_item) + + base_template = YamlGenerator.Templates.TEMPLATE_PULL_REQUEST_0 + template_1 = base_template.strip().format( + NAME=self.workflow_config.name, + BRANCHES=", ".join( + [f"'{branch}'" for branch in self.workflow_config.branches] + ), + EVENT=self.workflow_config.event, + JOBS="{}" * len(job_items), + ) + res = template_1.format(*job_items) + + return res + + +@dataclasses.dataclass +class AuxConfig: + # defines aux step to install dependencies + addon: Job.Requirements + # defines aux step(s) to upload GH artifacts + uploads_gh: List[Artifact.Config] + # defines aux step(s) to download GH artifacts + downloads_gh: List[Artifact.Config] + + def get_aux_workflow_name(self): + suffix = "" + if self.addon.python_requirements_txt: + suffix += "_py" + for _ in self.uploads_gh: + suffix += "_uplgh" + for _ in self.downloads_gh: + suffix += "_dnlgh" + return f"{Settings.WORKFLOW_PATH_PREFIX}/aux_job{suffix}.yaml" + + def get_aux_workflow_input(self): + res = "" + if self.addon.python_requirements_txt: + res += f" requirements_txt: {self.addon.python_requirements_txt}" + return res + + +if __name__ == "__main__": + WFS = [ + Workflow.Config( + name="PR", + event=Workflow.Event.PULL_REQUEST, + jobs=[ + Job.Config( + name="Hello World", + runs_on=["foo"], + command="bar", + job_requirements=Job.Requirements( + python_requirements_txt="./requirement.txt" + ), + ) + ], + enable_cache=True, + ) + ] + YamlGenerator().generate(workflow_config=WFS) diff --git a/ci_v2/settings/definitions.py b/ci/settings/definitions.py similarity index 69% rename from ci_v2/settings/definitions.py rename to ci/settings/definitions.py index 87669cdcf25..176e865e6f3 100644 --- a/ci_v2/settings/definitions.py +++ b/ci/settings/definitions.py @@ -7,6 +7,7 @@ S3_BUCKET_HTTP_ENDPOINT = "clickhouse-builds.s3.amazonaws.com" class RunnerLabels: CI_SERVICES = "ci_services" CI_SERVICES_EBS = "ci_services_ebs" + BUILDER = "builder" BASE_BRANCH = "master" @@ -29,155 +30,134 @@ SECRETS = [ DOCKERS = [ # Docker.Config( # name="clickhouse/binary-builder", - # path="./docker/packager/binary-builder", - # arm64=True, - # amd64=True, + # 
path="./ci/docker/packager/binary-builder", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), # Docker.Config( # name="clickhouse/cctools", - # path="./docker/packager/cctools", - # arm64=True, - # amd64=True, + # path="./ci/docker/packager/cctools", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), # Docker.Config( # name="clickhouse/test-old-centos", - # path="./docker/test/compatibility/centos", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/compatibility/centos", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), # Docker.Config( # name="clickhouse/test-old-ubuntu", - # path="./docker/test/compatibility/ubuntu", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/compatibility/ubuntu", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), # Docker.Config( # name="clickhouse/test-util", - # path="./docker/test/util", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/util", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), # Docker.Config( # name="clickhouse/integration-test", - # path="./docker/test/integration/base", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/integration/base", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/fuzzer", - # path="./docker/test/fuzzer", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/fuzzer", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/performance-comparison", - # path="./docker/test/performance-comparison", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/performance-comparison", + # platforms=Docker.Platforms.arm_amd, # depends_on=[], # ), - # Docker.Config( - # name="clickhouse/fasttest", - # path="./docker/test/fasttest", - # arm64=True, - # amd64=True, - # depends_on=["clickhouse/test-util"], - # ), + Docker.Config( + name="clickhouse/fasttest", + path="./ci/docker/fasttest", + platforms=Docker.Platforms.arm_amd, + depends_on=[], + ), # Docker.Config( # name="clickhouse/test-base", - # path="./docker/test/base", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/base", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-util"], # ), # Docker.Config( # name="clickhouse/clickbench", - # path="./docker/test/clickbench", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/clickbench", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/keeper-jepsen-test", - # path="./docker/test/keeper-jepsen", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/keeper-jepsen", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/server-jepsen-test", - # path="./docker/test/server-jepsen", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/server-jepsen", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/sqllogic-test", - # path="./docker/test/sqllogic", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/sqllogic", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/sqltest", - # path="./docker/test/sqltest", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/sqltest", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # 
name="clickhouse/stateless-test", - # path="./docker/test/stateless", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/stateless", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/stateful-test", - # path="./docker/test/stateful", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/stateful", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/stateless-test"], # ), # Docker.Config( # name="clickhouse/stress-test", - # path="./docker/test/stress", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/stress", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/stateful-test"], # ), # Docker.Config( # name="clickhouse/unit-test", - # path="./docker/test/unit", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/unit", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), # Docker.Config( # name="clickhouse/integration-tests-runner", - # path="./docker/test/integration/runner", - # arm64=True, - # amd64=True, + # path="./ci/docker/test/integration/runner", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), Docker.Config( name="clickhouse/style-test", - path="./ci_v2/docker/style-test", + path="./ci/docker/style-test", platforms=Docker.Platforms.arm_amd, depends_on=[], ), # Docker.Config( # name="clickhouse/docs-builder", - # path="./docker/docs/builder", - # arm64=True, - # amd64=True, + # path="./ci/docker/docs/builder", + # platforms=Docker.Platforms.arm_amd, # depends_on=["clickhouse/test-base"], # ), ] @@ -249,3 +229,5 @@ DOCKERS = [ class JobNames: STYLE_CHECK = "Style Check" + FAST_TEST = "Fast test" + BUILD_AMD_DEBUG = "Build amd64 debug" diff --git a/ci_v2/settings/settings.py b/ci/settings/settings.py similarity index 93% rename from ci_v2/settings/settings.py rename to ci/settings/settings.py index 153aab93506..8d5e7bc3c87 100644 --- a/ci_v2/settings/settings.py +++ b/ci/settings/settings.py @@ -1,4 +1,4 @@ -from ci_v2.settings.definitions import ( +from ci.settings.definitions import ( S3_BUCKET_HTTP_ENDPOINT, S3_BUCKET_NAME, RunnerLabels, diff --git a/ci/workflows/pull_request.py b/ci/workflows/pull_request.py new file mode 100644 index 00000000000..74129177efb --- /dev/null +++ b/ci/workflows/pull_request.py @@ -0,0 +1,94 @@ +from typing import List + +from praktika import Artifact, Job, Workflow +from praktika.settings import Settings + +from ci.settings.definitions import ( + BASE_BRANCH, + DOCKERS, + SECRETS, + JobNames, + RunnerLabels, +) + + +class ArtifactNames: + ch_debug_binary = "clickhouse_debug_binary" + + +style_check_job = Job.Config( + name=JobNames.STYLE_CHECK, + runs_on=[RunnerLabels.CI_SERVICES], + command="python3 ./ci/jobs/check_style.py", + run_in_docker="clickhouse/style-test", +) + +fast_test_job = Job.Config( + name=JobNames.FAST_TEST, + runs_on=[RunnerLabels.BUILDER], + command="python3 ./ci/jobs/fast_test.py", + run_in_docker="clickhouse/fasttest", + digest_config=Job.CacheDigestConfig( + include_paths=[ + "./ci/jobs/fast_test.py", + "./tests/queries/0_stateless/", + "./src", + ], + ), +) + +job_build_amd_debug = Job.Config( + name=JobNames.BUILD_AMD_DEBUG, + runs_on=[RunnerLabels.BUILDER], + command="python3 ./ci/jobs/build_clickhouse.py amd_debug", + run_in_docker="clickhouse/fasttest", + digest_config=Job.CacheDigestConfig( + include_paths=[ + "./src", + "./contrib/", + "./CMakeLists.txt", + "./PreLoad.cmake", + "./cmake", + "./base", + "./programs", + 
"./docker/packager/packager", + "./rust", + "./tests/ci/version_helper.py", + ], + ), + provides=[ArtifactNames.ch_debug_binary], +) + +workflow = Workflow.Config( + name="PR", + event=Workflow.Event.PULL_REQUEST, + base_branches=[BASE_BRANCH], + jobs=[ + style_check_job, + fast_test_job, + job_build_amd_debug, + ], + artifacts=[ + Artifact.Config( + name=ArtifactNames.ch_debug_binary, + type=Artifact.Type.S3, + path=f"{Settings.TEMP_DIR}/build/programs/clickhouse", + ) + ], + dockers=DOCKERS, + secrets=SECRETS, + enable_cache=True, + enable_report=True, + enable_merge_ready_status=True, +) + +WORKFLOWS = [ + workflow, +] # type: List[Workflow.Config] + + +if __name__ == "__main__": + # local job test inside praktika environment + from praktika.runner import Runner + + Runner().run(workflow, fast_test_job, docker="fasttest", dummy_env=True) diff --git a/ci_v2/docker/style-test/requirements.txt b/ci_v2/docker/style-test/requirements.txt deleted file mode 100644 index 987b014d9ba..00000000000 --- a/ci_v2/docker/style-test/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -requests==2.32.3 -yamllint==1.26.3 -codespell==2.2.1 -https://clickhouse-builds.s3.amazonaws.com/packages/praktika-0.1-py3-none-any.whl diff --git a/ci_v2/workflows/pull_request.py b/ci_v2/workflows/pull_request.py deleted file mode 100644 index 226455c77f2..00000000000 --- a/ci_v2/workflows/pull_request.py +++ /dev/null @@ -1,44 +0,0 @@ -from typing import List - -from ci_v2.settings.definitions import ( - BASE_BRANCH, - DOCKERS, - SECRETS, - JobNames, - RunnerLabels, -) -from praktika import Job, Workflow - -style_check_job = Job.Config( - name=JobNames.STYLE_CHECK, - runs_on=[RunnerLabels.CI_SERVICES], - command="python3 ./ci_v2/jobs/check_style.py", - run_in_docker="clickhouse/style-test", -) - -workflow = Workflow.Config( - name="PR", - event=Workflow.Event.PULL_REQUEST, - base_branches=[BASE_BRANCH], - jobs=[ - style_check_job, - ], - dockers=DOCKERS, - secrets=SECRETS, - enable_cache=True, - enable_report=True, - enable_merge_ready_status=True, -) - -WORKFLOWS = [ - workflow, -] # type: List[Workflow.Config] - - -if __name__ == "__main__": - # example: local job test inside praktika environment - from praktika.runner import Runner - - Runner.generate_dummy_environment(workflow, style_check_job) - - Runner().run(workflow, style_check_job) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 91a7e976aaf..99141510248 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. 
-SET(VERSION_REVISION 54491) +SET(VERSION_REVISION 54492) SET(VERSION_MAJOR 24) -SET(VERSION_MINOR 10) +SET(VERSION_MINOR 11) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH b12a367741812f9e5fe754d19ebae600e2a2614c) -SET(VERSION_DESCRIBE v24.10.1.1-testing) -SET(VERSION_STRING 24.10.1.1) +SET(VERSION_GITHASH c82cf25b3e5864bcc153cbe45adb8c6527e1ec6e) +SET(VERSION_DESCRIBE v24.11.1.1-testing) +SET(VERSION_STRING 24.11.1.1) # end of autochange diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b102b2919d9..fa0f95245f2 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -178,35 +178,13 @@ add_contrib (sqlite-cmake sqlite-amalgamation) add_contrib (s2geometry-cmake s2geometry) add_contrib (c-ares-cmake c-ares) -if (OS_LINUX AND ARCH_AMD64 AND ENABLE_SSE42) - option (ENABLE_QPL "Enable Intel® Query Processing Library (QPL)" ${ENABLE_LIBRARIES}) -elseif(ENABLE_QPL) - message (${RECONFIGURE_MESSAGE_LEVEL} "QPL library is only supported on x86_64 with SSE 4.2 or higher") -endif() -if (ENABLE_QPL) - add_contrib (idxd-config-cmake idxd-config) - add_contrib (qpl-cmake qpl) # requires: idxd-config -else() - message(STATUS "Not using QPL") -endif () - if (OS_LINUX AND ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER) option (ENABLE_QATLIB "Enable Intel® QuickAssist Technology Library (QATlib)" ${ENABLE_LIBRARIES}) elseif(ENABLE_QATLIB) message (${RECONFIGURE_MESSAGE_LEVEL} "QATLib is only supported on x86_64") endif() if (ENABLE_QATLIB) - option (ENABLE_QAT_USDM_DRIVER "A User Space DMA-able Memory (USDM) component which allocates/frees DMA-able memory" OFF) - option (ENABLE_QAT_OUT_OF_TREE_BUILD "Using out-of-tree driver, user needs to customize ICP_ROOT variable" OFF) - set(ICP_ROOT "" CACHE STRING "ICP_ROOT variable to define the path of out-of-tree driver package") - if (ENABLE_QAT_OUT_OF_TREE_BUILD) - if (ICP_ROOT STREQUAL "") - message(FATAL_ERROR "Please define the path of out-of-tree driver package with -DICP_ROOT=xxx or disable out-of-tree build with -DENABLE_QAT_OUT_OF_TREE_BUILD=OFF; \ - If you want out-of-tree build but have no package available, please download and build ICP package from: https://www.intel.com/content/www/us/en/download/765501.html") - endif () - else() - add_contrib (qatlib-cmake qatlib) # requires: isa-l - endif () + add_contrib (qatlib-cmake qatlib) # requires: isa-l add_contrib (QAT-ZSTD-Plugin-cmake QAT-ZSTD-Plugin) else() message(STATUS "Not using QATLib") diff --git a/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt b/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt index fc18092f574..5d1cfa2af14 100644 --- a/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt +++ b/contrib/QAT-ZSTD-Plugin-cmake/CMakeLists.txt @@ -1,85 +1,53 @@ # Intel® QuickAssist Technology ZSTD Plugin (QAT ZSTD Plugin) is a plugin to Zstandard*(ZSTD*) for accelerating compression by QAT. -# ENABLE_QAT_OUT_OF_TREE_BUILD = 1 means kernel don't have native support, user will build and install driver from external package: https://www.intel.com/content/www/us/en/download/765501.html -# meanwhile, user need to set ICP_ROOT environment variable which point to the root directory of QAT driver source tree. -# ENABLE_QAT_OUT_OF_TREE_BUILD = 0 means kernel has built-in qat driver, QAT-ZSTD-PLUGIN just has dependency on qatlib. 
-if (ENABLE_QAT_OUT_OF_TREE_BUILD) - message(STATUS "Intel QATZSTD out-of-tree build, ICP_ROOT:${ICP_ROOT}") +message(STATUS "Intel QATZSTD in-tree build") +set(QATZSTD_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/QAT-ZSTD-Plugin/src") +set(QATZSTD_SRC "${QATZSTD_SRC_DIR}/qatseqprod.c") +set(ZSTD_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/zstd/lib") - set(QATZSTD_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/QAT-ZSTD-Plugin/src") - set(QATZSTD_SRC "${QATZSTD_SRC_DIR}/qatseqprod.c") - set(ZSTD_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/zstd/lib") - set(QAT_INCLUDE_DIR "${ICP_ROOT}/quickassist/include") - set(QAT_DC_INCLUDE_DIR "${ICP_ROOT}/quickassist/include/dc") - set(QAT_AL_INCLUDE_DIR "${ICP_ROOT}/quickassist/lookaside/access_layer/include") - set(QAT_USDM_INCLUDE_DIR "${ICP_ROOT}/quickassist/utilities/libusdm_drv") - set(USDM_LIBRARY "${ICP_ROOT}/build/libusdm_drv_s.so") - set(QAT_S_LIBRARY "${ICP_ROOT}/build/libqat_s.so") - if (ENABLE_QAT_USDM_DRIVER) - add_definitions(-DENABLE_USDM_DRV) - endif() - add_library(_qatzstd_plugin ${QATZSTD_SRC}) - target_link_libraries (_qatzstd_plugin PUBLIC ${USDM_LIBRARY} ${QAT_S_LIBRARY}) - target_include_directories(_qatzstd_plugin - SYSTEM PUBLIC "${QATZSTD_SRC_DIR}" - PRIVATE ${QAT_INCLUDE_DIR} - ${QAT_DC_INCLUDE_DIR} - ${QAT_AL_INCLUDE_DIR} - ${QAT_USDM_INCLUDE_DIR} - ${ZSTD_LIBRARY_DIR}) - target_compile_definitions(_qatzstd_plugin PRIVATE -DDEBUGLEVEL=0) - add_library (ch_contrib::qatzstd_plugin ALIAS _qatzstd_plugin) -else () # In-tree build - message(STATUS "Intel QATZSTD in-tree build") - set(QATZSTD_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/QAT-ZSTD-Plugin/src") - set(QATZSTD_SRC "${QATZSTD_SRC_DIR}/qatseqprod.c") - set(ZSTD_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/zstd/lib") +# please download&build ICP package from: https://www.intel.com/content/www/us/en/download/765501.html +set(ICP_ROOT "${ClickHouse_SOURCE_DIR}/contrib/qatlib") +set(QAT_INCLUDE_DIR "${ICP_ROOT}/quickassist/include") +set(QAT_DC_INCLUDE_DIR "${ICP_ROOT}/quickassist/include/dc") +set(QAT_AL_INCLUDE_DIR "${ICP_ROOT}/quickassist/lookaside/access_layer/include") +set(QAT_USDM_INCLUDE_DIR "${ICP_ROOT}/quickassist/utilities/libusdm_drv") +set(USDM_LIBRARY "${ICP_ROOT}/build/libusdm_drv_s.so") +set(QAT_S_LIBRARY "${ICP_ROOT}/build/libqat_s.so") +set(LIBQAT_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/qatlib") +set(LIBQAT_HEADER_DIR "${CMAKE_CURRENT_BINARY_DIR}/include") - # please download&build ICP package from: https://www.intel.com/content/www/us/en/download/765501.html - set(ICP_ROOT "${ClickHouse_SOURCE_DIR}/contrib/qatlib") - set(QAT_INCLUDE_DIR "${ICP_ROOT}/quickassist/include") - set(QAT_DC_INCLUDE_DIR "${ICP_ROOT}/quickassist/include/dc") - set(QAT_AL_INCLUDE_DIR "${ICP_ROOT}/quickassist/lookaside/access_layer/include") - set(QAT_USDM_INCLUDE_DIR "${ICP_ROOT}/quickassist/utilities/libusdm_drv") - set(USDM_LIBRARY "${ICP_ROOT}/build/libusdm_drv_s.so") - set(QAT_S_LIBRARY "${ICP_ROOT}/build/libqat_s.so") - set(LIBQAT_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/qatlib") - set(LIBQAT_HEADER_DIR "${CMAKE_CURRENT_BINARY_DIR}/include") +file(MAKE_DIRECTORY + "${LIBQAT_HEADER_DIR}/qat" +) +file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/cpa.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" +) +file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/dc/cpa_dc.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" +) +file(COPY "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_poll.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" +) +file(COPY 
"${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_user.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" +) +file(COPY "${LIBQAT_ROOT_DIR}/quickassist/utilities/libusdm_drv/qae_mem.h" + DESTINATION "${LIBQAT_HEADER_DIR}/qat/" +) - file(MAKE_DIRECTORY - "${LIBQAT_HEADER_DIR}/qat" - ) - file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/cpa.h" - DESTINATION "${LIBQAT_HEADER_DIR}/qat/" - ) - file(COPY "${LIBQAT_ROOT_DIR}/quickassist/include/dc/cpa_dc.h" - DESTINATION "${LIBQAT_HEADER_DIR}/qat/" - ) - file(COPY "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_poll.h" - DESTINATION "${LIBQAT_HEADER_DIR}/qat/" - ) - file(COPY "${LIBQAT_ROOT_DIR}/quickassist/lookaside/access_layer/include/icp_sal_user.h" - DESTINATION "${LIBQAT_HEADER_DIR}/qat/" - ) - file(COPY "${LIBQAT_ROOT_DIR}/quickassist/utilities/libusdm_drv/qae_mem.h" - DESTINATION "${LIBQAT_HEADER_DIR}/qat/" - ) - - if (ENABLE_QAT_USDM_DRIVER) - add_definitions(-DENABLE_USDM_DRV) - endif() - - add_library(_qatzstd_plugin ${QATZSTD_SRC}) - target_link_libraries (_qatzstd_plugin PUBLIC ch_contrib::qatlib ch_contrib::usdm) - target_include_directories(_qatzstd_plugin PRIVATE - ${QAT_INCLUDE_DIR} - ${QAT_DC_INCLUDE_DIR} - ${QAT_AL_INCLUDE_DIR} - ${QAT_USDM_INCLUDE_DIR} - ${ZSTD_LIBRARY_DIR} - ${LIBQAT_HEADER_DIR}) - target_compile_definitions(_qatzstd_plugin PRIVATE -DDEBUGLEVEL=0 PUBLIC -DINTREE) - target_include_directories(_qatzstd_plugin SYSTEM PUBLIC $ $) - add_library (ch_contrib::qatzstd_plugin ALIAS _qatzstd_plugin) -endif () +if (ENABLE_QAT_USDM_DRIVER) + add_definitions(-DENABLE_USDM_DRV) +endif() +add_library(_qatzstd_plugin ${QATZSTD_SRC}) +target_link_libraries (_qatzstd_plugin PUBLIC ch_contrib::qatlib ch_contrib::usdm) +target_include_directories(_qatzstd_plugin PRIVATE + ${QAT_INCLUDE_DIR} + ${QAT_DC_INCLUDE_DIR} + ${QAT_AL_INCLUDE_DIR} + ${QAT_USDM_INCLUDE_DIR} + ${ZSTD_LIBRARY_DIR} + ${LIBQAT_HEADER_DIR}) +target_compile_definitions(_qatzstd_plugin PRIVATE -DDEBUGLEVEL=0 PUBLIC -DINTREE) +target_include_directories(_qatzstd_plugin SYSTEM PUBLIC $ $) +add_library (ch_contrib::qatzstd_plugin ALIAS _qatzstd_plugin) diff --git a/contrib/SimSIMD b/contrib/SimSIMD index ff51434d90c..ee3c9c9c00b 160000 --- a/contrib/SimSIMD +++ b/contrib/SimSIMD @@ -1 +1 @@ -Subproject commit ff51434d90c66f916e94ff05b24530b127aa4cff +Subproject commit ee3c9c9c00b51645f62a1a9e99611b78c0052a21 diff --git a/contrib/SimSIMD-cmake/CMakeLists.txt b/contrib/SimSIMD-cmake/CMakeLists.txt index f5dc4d63604..8350417479a 100644 --- a/contrib/SimSIMD-cmake/CMakeLists.txt +++ b/contrib/SimSIMD-cmake/CMakeLists.txt @@ -1,4 +1,8 @@ -set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD") - -add_library(_simsimd INTERFACE) -target_include_directories(_simsimd SYSTEM INTERFACE "${SIMSIMD_PROJECT_DIR}/include") +# See contrib/usearch-cmake/CMakeLists.txt, why only enabled on x86 +if (ARCH_AMD64) + set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD") + set(SIMSIMD_SRCS ${SIMSIMD_PROJECT_DIR}/c/lib.c) + add_library(_simsimd ${SIMSIMD_SRCS}) + target_include_directories(_simsimd SYSTEM PUBLIC "${SIMSIMD_PROJECT_DIR}/include") + target_compile_definitions(_simsimd PUBLIC SIMSIMD_DYNAMIC_DISPATCH) +endif() diff --git a/contrib/arrow b/contrib/arrow index 5cfccd8ea65..6e2574f5013 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 5cfccd8ea65f33d4517e7409815d761c7650b45d +Subproject commit 6e2574f5013a005c050c9a7787d341aef09d0063 diff --git a/contrib/arrow-cmake/CMakeLists.txt 
b/contrib/arrow-cmake/CMakeLists.txt index 96d1f4adda7..208d48df178 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -213,13 +213,19 @@ target_include_directories(_orc SYSTEM PRIVATE set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow") # arrow/cpp/src/arrow/CMakeLists.txt (ARROW_SRCS + ARROW_COMPUTE + ARROW_IPC) +# find . \( -iname \*.cc -o -iname \*.cpp -o -iname \*.c \) | sort | awk '{print "\"${LIBRARY_DIR}" substr($1,2) "\"" }' | grep -v 'test.cc' | grep -v 'json' | grep -v 'flight' \| +# grep -v 'csv' | grep -v 'acero' | grep -v 'dataset' | grep -v 'testing' | grep -v 'gpu' | grep -v 'engine' | grep -v 'filesystem' | grep -v 'benchmark.cc' set(ARROW_SRCS + "${LIBRARY_DIR}/adapters/orc/adapter.cc" + "${LIBRARY_DIR}/adapters/orc/options.cc" + "${LIBRARY_DIR}/adapters/orc/util.cc" "${LIBRARY_DIR}/array/array_base.cc" "${LIBRARY_DIR}/array/array_binary.cc" "${LIBRARY_DIR}/array/array_decimal.cc" "${LIBRARY_DIR}/array/array_dict.cc" "${LIBRARY_DIR}/array/array_nested.cc" "${LIBRARY_DIR}/array/array_primitive.cc" + "${LIBRARY_DIR}/array/array_run_end.cc" "${LIBRARY_DIR}/array/builder_adaptive.cc" "${LIBRARY_DIR}/array/builder_base.cc" "${LIBRARY_DIR}/array/builder_binary.cc" @@ -227,124 +233,26 @@ set(ARROW_SRCS "${LIBRARY_DIR}/array/builder_dict.cc" "${LIBRARY_DIR}/array/builder_nested.cc" "${LIBRARY_DIR}/array/builder_primitive.cc" - "${LIBRARY_DIR}/array/builder_union.cc" "${LIBRARY_DIR}/array/builder_run_end.cc" - "${LIBRARY_DIR}/array/array_run_end.cc" + "${LIBRARY_DIR}/array/builder_union.cc" "${LIBRARY_DIR}/array/concatenate.cc" "${LIBRARY_DIR}/array/data.cc" "${LIBRARY_DIR}/array/diff.cc" "${LIBRARY_DIR}/array/util.cc" "${LIBRARY_DIR}/array/validate.cc" - "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/buffer.cc" - "${LIBRARY_DIR}/chunked_array.cc" - "${LIBRARY_DIR}/chunk_resolver.cc" - "${LIBRARY_DIR}/compare.cc" - "${LIBRARY_DIR}/config.cc" - "${LIBRARY_DIR}/datum.cc" - "${LIBRARY_DIR}/device.cc" - "${LIBRARY_DIR}/extension_type.cc" - "${LIBRARY_DIR}/memory_pool.cc" - "${LIBRARY_DIR}/pretty_print.cc" - "${LIBRARY_DIR}/record_batch.cc" - "${LIBRARY_DIR}/result.cc" - "${LIBRARY_DIR}/scalar.cc" - "${LIBRARY_DIR}/sparse_tensor.cc" - "${LIBRARY_DIR}/status.cc" - "${LIBRARY_DIR}/table.cc" - "${LIBRARY_DIR}/table_builder.cc" - "${LIBRARY_DIR}/tensor.cc" - "${LIBRARY_DIR}/tensor/coo_converter.cc" - "${LIBRARY_DIR}/tensor/csf_converter.cc" - "${LIBRARY_DIR}/tensor/csx_converter.cc" - "${LIBRARY_DIR}/type.cc" - "${LIBRARY_DIR}/visitor.cc" + "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/c/bridge.cc" - "${LIBRARY_DIR}/io/buffered.cc" - "${LIBRARY_DIR}/io/caching.cc" - "${LIBRARY_DIR}/io/compressed.cc" - "${LIBRARY_DIR}/io/file.cc" - "${LIBRARY_DIR}/io/hdfs.cc" - "${LIBRARY_DIR}/io/hdfs_internal.cc" - "${LIBRARY_DIR}/io/interfaces.cc" - "${LIBRARY_DIR}/io/memory.cc" - "${LIBRARY_DIR}/io/slow.cc" - "${LIBRARY_DIR}/io/stdio.cc" - "${LIBRARY_DIR}/io/transform.cc" - "${LIBRARY_DIR}/util/async_util.cc" - "${LIBRARY_DIR}/util/basic_decimal.cc" - "${LIBRARY_DIR}/util/bit_block_counter.cc" - "${LIBRARY_DIR}/util/bit_run_reader.cc" - "${LIBRARY_DIR}/util/bit_util.cc" - "${LIBRARY_DIR}/util/bitmap.cc" - "${LIBRARY_DIR}/util/bitmap_builders.cc" - "${LIBRARY_DIR}/util/bitmap_ops.cc" - "${LIBRARY_DIR}/util/bpacking.cc" - "${LIBRARY_DIR}/util/cancel.cc" - "${LIBRARY_DIR}/util/compression.cc" - "${LIBRARY_DIR}/util/counting_semaphore.cc" - "${LIBRARY_DIR}/util/cpu_info.cc" - "${LIBRARY_DIR}/util/decimal.cc" - "${LIBRARY_DIR}/util/delimiting.cc" - 
"${LIBRARY_DIR}/util/formatting.cc" - "${LIBRARY_DIR}/util/future.cc" - "${LIBRARY_DIR}/util/int_util.cc" - "${LIBRARY_DIR}/util/io_util.cc" - "${LIBRARY_DIR}/util/logging.cc" - "${LIBRARY_DIR}/util/key_value_metadata.cc" - "${LIBRARY_DIR}/util/memory.cc" - "${LIBRARY_DIR}/util/mutex.cc" - "${LIBRARY_DIR}/util/string.cc" - "${LIBRARY_DIR}/util/string_builder.cc" - "${LIBRARY_DIR}/util/task_group.cc" - "${LIBRARY_DIR}/util/tdigest.cc" - "${LIBRARY_DIR}/util/thread_pool.cc" - "${LIBRARY_DIR}/util/time.cc" - "${LIBRARY_DIR}/util/trie.cc" - "${LIBRARY_DIR}/util/unreachable.cc" - "${LIBRARY_DIR}/util/uri.cc" - "${LIBRARY_DIR}/util/utf8.cc" - "${LIBRARY_DIR}/util/value_parsing.cc" - "${LIBRARY_DIR}/util/byte_size.cc" - "${LIBRARY_DIR}/util/debug.cc" - "${LIBRARY_DIR}/util/tracing.cc" - "${LIBRARY_DIR}/util/atfork_internal.cc" - "${LIBRARY_DIR}/util/crc32.cc" - "${LIBRARY_DIR}/util/hashing.cc" - "${LIBRARY_DIR}/util/ree_util.cc" - "${LIBRARY_DIR}/util/union_util.cc" - "${LIBRARY_DIR}/vendored/base64.cpp" - "${LIBRARY_DIR}/vendored/datetime/tz.cpp" - "${LIBRARY_DIR}/vendored/musl/strptime.c" - "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c" - "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c" - "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c" - "${LIBRARY_DIR}/vendored/uriparser/UriFile.c" - "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c" - "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c" - "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c" - "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c" - "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c" - "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c" - "${LIBRARY_DIR}/vendored/uriparser/UriParse.c" - "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c" - "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c" - "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c" - "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c" - "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc" - "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc" - "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc" - "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc" - "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc" - + "${LIBRARY_DIR}/c/dlpack.cc" + "${LIBRARY_DIR}/chunk_resolver.cc" + "${LIBRARY_DIR}/chunked_array.cc" + "${LIBRARY_DIR}/compare.cc" "${LIBRARY_DIR}/compute/api_aggregate.cc" "${LIBRARY_DIR}/compute/api_scalar.cc" "${LIBRARY_DIR}/compute/api_vector.cc" "${LIBRARY_DIR}/compute/cast.cc" "${LIBRARY_DIR}/compute/exec.cc" + "${LIBRARY_DIR}/compute/expression.cc" "${LIBRARY_DIR}/compute/function.cc" "${LIBRARY_DIR}/compute/function_internal.cc" "${LIBRARY_DIR}/compute/kernel.cc" @@ -355,6 +263,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc" "${LIBRARY_DIR}/compute/kernels/codegen_internal.cc" "${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc" + "${LIBRARY_DIR}/compute/kernels/ree_util_internal.cc" "${LIBRARY_DIR}/compute/kernels/row_encoder.cc" "${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc" "${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc" @@ -382,30 +291,139 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc" "${LIBRARY_DIR}/compute/kernels/vector_hash.cc" "${LIBRARY_DIR}/compute/kernels/vector_nested.cc" + "${LIBRARY_DIR}/compute/kernels/vector_pairwise.cc" "${LIBRARY_DIR}/compute/kernels/vector_rank.cc" 
"${LIBRARY_DIR}/compute/kernels/vector_replace.cc" + "${LIBRARY_DIR}/compute/kernels/vector_run_end_encode.cc" "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection.cc" - "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" - "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc" + "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc" - "${LIBRARY_DIR}/compute/light_array.cc" - "${LIBRARY_DIR}/compute/registry.cc" - "${LIBRARY_DIR}/compute/expression.cc" + "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" + "${LIBRARY_DIR}/compute/key_hash_internal.cc" + "${LIBRARY_DIR}/compute/key_map_internal.cc" + "${LIBRARY_DIR}/compute/light_array_internal.cc" "${LIBRARY_DIR}/compute/ordering.cc" + "${LIBRARY_DIR}/compute/registry.cc" "${LIBRARY_DIR}/compute/row/compare_internal.cc" "${LIBRARY_DIR}/compute/row/encode_internal.cc" "${LIBRARY_DIR}/compute/row/grouper.cc" "${LIBRARY_DIR}/compute/row/row_internal.cc" - + "${LIBRARY_DIR}/compute/util.cc" + "${LIBRARY_DIR}/config.cc" + "${LIBRARY_DIR}/datum.cc" + "${LIBRARY_DIR}/device.cc" + "${LIBRARY_DIR}/extension_type.cc" + "${LIBRARY_DIR}/integration/c_data_integration_internal.cc" + "${LIBRARY_DIR}/io/buffered.cc" + "${LIBRARY_DIR}/io/caching.cc" + "${LIBRARY_DIR}/io/compressed.cc" + "${LIBRARY_DIR}/io/file.cc" + "${LIBRARY_DIR}/io/hdfs.cc" + "${LIBRARY_DIR}/io/hdfs_internal.cc" + "${LIBRARY_DIR}/io/interfaces.cc" + "${LIBRARY_DIR}/io/memory.cc" + "${LIBRARY_DIR}/io/slow.cc" + "${LIBRARY_DIR}/io/stdio.cc" + "${LIBRARY_DIR}/io/transform.cc" "${LIBRARY_DIR}/ipc/dictionary.cc" "${LIBRARY_DIR}/ipc/feather.cc" + "${LIBRARY_DIR}/ipc/file_to_stream.cc" "${LIBRARY_DIR}/ipc/message.cc" "${LIBRARY_DIR}/ipc/metadata_internal.cc" "${LIBRARY_DIR}/ipc/options.cc" "${LIBRARY_DIR}/ipc/reader.cc" + "${LIBRARY_DIR}/ipc/stream_to_file.cc" "${LIBRARY_DIR}/ipc/writer.cc" + "${LIBRARY_DIR}/memory_pool.cc" + "${LIBRARY_DIR}/pretty_print.cc" + "${LIBRARY_DIR}/record_batch.cc" + "${LIBRARY_DIR}/result.cc" + "${LIBRARY_DIR}/scalar.cc" + "${LIBRARY_DIR}/sparse_tensor.cc" + "${LIBRARY_DIR}/status.cc" + "${LIBRARY_DIR}/table.cc" + "${LIBRARY_DIR}/table_builder.cc" + "${LIBRARY_DIR}/tensor.cc" + "${LIBRARY_DIR}/tensor/coo_converter.cc" + "${LIBRARY_DIR}/tensor/csf_converter.cc" + "${LIBRARY_DIR}/tensor/csx_converter.cc" + "${LIBRARY_DIR}/type.cc" + "${LIBRARY_DIR}/type_traits.cc" + "${LIBRARY_DIR}/util/align_util.cc" + "${LIBRARY_DIR}/util/async_util.cc" + "${LIBRARY_DIR}/util/atfork_internal.cc" + "${LIBRARY_DIR}/util/basic_decimal.cc" + "${LIBRARY_DIR}/util/bit_block_counter.cc" + "${LIBRARY_DIR}/util/bit_run_reader.cc" + "${LIBRARY_DIR}/util/bit_util.cc" + "${LIBRARY_DIR}/util/bitmap.cc" + "${LIBRARY_DIR}/util/bitmap_builders.cc" + "${LIBRARY_DIR}/util/bitmap_ops.cc" + "${LIBRARY_DIR}/util/bpacking.cc" + "${LIBRARY_DIR}/util/byte_size.cc" + "${LIBRARY_DIR}/util/cancel.cc" + "${LIBRARY_DIR}/util/compression.cc" + "${LIBRARY_DIR}/util/counting_semaphore.cc" + "${LIBRARY_DIR}/util/cpu_info.cc" + "${LIBRARY_DIR}/util/crc32.cc" + "${LIBRARY_DIR}/util/debug.cc" + "${LIBRARY_DIR}/util/decimal.cc" + "${LIBRARY_DIR}/util/delimiting.cc" + "${LIBRARY_DIR}/util/dict_util.cc" + "${LIBRARY_DIR}/util/float16.cc" + "${LIBRARY_DIR}/util/formatting.cc" + "${LIBRARY_DIR}/util/future.cc" + "${LIBRARY_DIR}/util/hashing.cc" + "${LIBRARY_DIR}/util/int_util.cc" + "${LIBRARY_DIR}/util/io_util.cc" + 
"${LIBRARY_DIR}/util/key_value_metadata.cc" + "${LIBRARY_DIR}/util/list_util.cc" + "${LIBRARY_DIR}/util/logging.cc" + "${LIBRARY_DIR}/util/memory.cc" + "${LIBRARY_DIR}/util/mutex.cc" + "${LIBRARY_DIR}/util/ree_util.cc" + "${LIBRARY_DIR}/util/string.cc" + "${LIBRARY_DIR}/util/string_builder.cc" + "${LIBRARY_DIR}/util/task_group.cc" + "${LIBRARY_DIR}/util/tdigest.cc" + "${LIBRARY_DIR}/util/thread_pool.cc" + "${LIBRARY_DIR}/util/time.cc" + "${LIBRARY_DIR}/util/tracing.cc" + "${LIBRARY_DIR}/util/trie.cc" + "${LIBRARY_DIR}/util/union_util.cc" + "${LIBRARY_DIR}/util/unreachable.cc" + "${LIBRARY_DIR}/util/uri.cc" + "${LIBRARY_DIR}/util/utf8.cc" + "${LIBRARY_DIR}/util/value_parsing.cc" + "${LIBRARY_DIR}/vendored/base64.cpp" + "${LIBRARY_DIR}/vendored/datetime/tz.cpp" + "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc" + "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc" + "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc" + "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc" + "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc" + "${LIBRARY_DIR}/vendored/musl/strptime.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c" + "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c" + "${LIBRARY_DIR}/vendored/uriparser/UriFile.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c" + "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParse.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c" + "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c" + "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c" + "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c" + "${LIBRARY_DIR}/visitor.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc" @@ -465,22 +483,38 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/arrow/schema.cc" "${LIBRARY_DIR}/arrow/schema_internal.cc" "${LIBRARY_DIR}/arrow/writer.cc" + "${LIBRARY_DIR}/benchmark_util.cc" "${LIBRARY_DIR}/bloom_filter.cc" + "${LIBRARY_DIR}/bloom_filter_reader.cc" "${LIBRARY_DIR}/column_reader.cc" "${LIBRARY_DIR}/column_scanner.cc" "${LIBRARY_DIR}/column_writer.cc" "${LIBRARY_DIR}/encoding.cc" + "${LIBRARY_DIR}/encryption/crypto_factory.cc" "${LIBRARY_DIR}/encryption/encryption.cc" "${LIBRARY_DIR}/encryption/encryption_internal.cc" + "${LIBRARY_DIR}/encryption/encryption_internal_nossl.cc" + "${LIBRARY_DIR}/encryption/file_key_unwrapper.cc" + "${LIBRARY_DIR}/encryption/file_key_wrapper.cc" + "${LIBRARY_DIR}/encryption/file_system_key_material_store.cc" "${LIBRARY_DIR}/encryption/internal_file_decryptor.cc" "${LIBRARY_DIR}/encryption/internal_file_encryptor.cc" + "${LIBRARY_DIR}/encryption/key_material.cc" + "${LIBRARY_DIR}/encryption/key_metadata.cc" + "${LIBRARY_DIR}/encryption/key_toolkit.cc" + "${LIBRARY_DIR}/encryption/key_toolkit_internal.cc" + "${LIBRARY_DIR}/encryption/kms_client.cc" + "${LIBRARY_DIR}/encryption/local_wrap_kms_client.cc" + "${LIBRARY_DIR}/encryption/openssl_internal.cc" "${LIBRARY_DIR}/exception.cc" "${LIBRARY_DIR}/file_reader.cc" "${LIBRARY_DIR}/file_writer.cc" - "${LIBRARY_DIR}/page_index.cc" - 
"${LIBRARY_DIR}/level_conversion.cc" "${LIBRARY_DIR}/level_comparison.cc" + "${LIBRARY_DIR}/level_comparison_avx2.cc" + "${LIBRARY_DIR}/level_conversion.cc" + "${LIBRARY_DIR}/level_conversion_bmi2.cc" "${LIBRARY_DIR}/metadata.cc" + "${LIBRARY_DIR}/page_index.cc" "${LIBRARY_DIR}/platform.cc" "${LIBRARY_DIR}/printer.cc" "${LIBRARY_DIR}/properties.cc" @@ -489,7 +523,6 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/stream_reader.cc" "${LIBRARY_DIR}/stream_writer.cc" "${LIBRARY_DIR}/types.cc" - "${LIBRARY_DIR}/bloom_filter_reader.cc" "${LIBRARY_DIR}/xxhasher.cc" "${GEN_LIBRARY_DIR}/parquet_constants.cpp" @@ -520,6 +553,9 @@ endif () add_definitions(-DPARQUET_THRIFT_VERSION_MAJOR=0) add_definitions(-DPARQUET_THRIFT_VERSION_MINOR=16) +# As per https://github.com/apache/arrow/pull/35672 you need to enable it explicitly. +add_definitions(-DARROW_ENABLE_THREADING) + # === tools set(TOOLS_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/tools/parquet") diff --git a/contrib/flatbuffers b/contrib/flatbuffers index eb3f8279482..0100f6a5779 160000 --- a/contrib/flatbuffers +++ b/contrib/flatbuffers @@ -1 +1 @@ -Subproject commit eb3f827948241ce0e701516f16cd67324802bce9 +Subproject commit 0100f6a5779831fa7a651e4b67ef389a8752bd9b diff --git a/contrib/idxd-config b/contrib/idxd-config deleted file mode 160000 index a836ce0e420..00000000000 --- a/contrib/idxd-config +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a836ce0e42052a69bffbbc14239ab4097f3b77f1 diff --git a/contrib/idxd-config-cmake/CMakeLists.txt b/contrib/idxd-config-cmake/CMakeLists.txt deleted file mode 100644 index 030252ec8e6..00000000000 --- a/contrib/idxd-config-cmake/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -## accel_config is the utility library required by QPL-Deflate codec for controlling and configuring Intel® In-Memory Analytics Accelerator (Intel® IAA). -set (LIBACCEL_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/idxd-config") -set (UUID_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl-cmake") -set (LIBACCEL_HEADER_DIR "${ClickHouse_SOURCE_DIR}/contrib/idxd-config-cmake/include") -set (SRCS - "${LIBACCEL_SOURCE_DIR}/accfg/lib/libaccfg.c" - "${LIBACCEL_SOURCE_DIR}/util/log.c" - "${LIBACCEL_SOURCE_DIR}/util/sysfs.c" -) - -add_library(_accel-config ${SRCS}) - -target_compile_options(_accel-config PRIVATE "-D_GNU_SOURCE") - -target_include_directories(_accel-config BEFORE - PRIVATE ${UUID_DIR} - PRIVATE ${LIBACCEL_HEADER_DIR} - PRIVATE ${LIBACCEL_SOURCE_DIR}) - -target_include_directories(_accel-config SYSTEM BEFORE - PUBLIC ${LIBACCEL_SOURCE_DIR}/accfg) - -add_library(ch_contrib::accel-config ALIAS _accel-config) diff --git a/contrib/idxd-config-cmake/include/config.h b/contrib/idxd-config-cmake/include/config.h deleted file mode 100644 index f03b0eac0b0..00000000000 --- a/contrib/idxd-config-cmake/include/config.h +++ /dev/null @@ -1,159 +0,0 @@ -/* config.h. Generated from config.h.in by configure. */ -/* config.h.in. Generated from configure.ac by autoheader. */ - -/* Define if building universal (internal helper macro) */ -/* #undef AC_APPLE_UNIVERSAL_BUILD */ - -/* Debug messages. */ -/* #undef ENABLE_DEBUG */ - -/* Documentation / man pages. */ -/* #define ENABLE_DOCS */ - -/* System logging. */ -#define ENABLE_LOGGING 1 - -/* accfg test support */ -/* #undef ENABLE_TEST */ - -/* Define to 1 if big-endian-arch */ -/* #undef HAVE_BIG_ENDIAN */ - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_LINUX_VERSION_H 1 - -/* Define to 1 if little-endian-arch */ -#define HAVE_LITTLE_ENDIAN 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the `secure_getenv' function. */ -#define HAVE_SECURE_GETENV 1 - -/* Define to 1 if you have statement expressions. */ -#define HAVE_STATEMENT_EXPR 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if typeof works with your compiler. */ -#define HAVE_TYPEOF 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to 1 if using libuuid */ -#define HAVE_UUID 1 - -/* Define to 1 if you have the `__secure_getenv' function. */ -/* #undef HAVE___SECURE_GETENV */ - -/* Define to the sub-directory where libtool stores uninstalled libraries. */ -#define LT_OBJDIR ".libs/" - -/* Name of package */ -#define PACKAGE "accel-config" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "linux-dsa@lists.01.org" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "accel-config" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "accel-config 3.5.2.gitf6605c41" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "accel-config" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "https://github.com/xxx/accel-config" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "3.5.2.gitf6605c41" - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Enable extensions on AIX 3, Interix. */ -#ifndef _ALL_SOURCE -# define _ALL_SOURCE 1 -#endif -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif -/* Enable threading extensions on Solaris. */ -#ifndef _POSIX_PTHREAD_SEMANTICS -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif -/* Enable extensions on HP NonStop. */ -#ifndef _TANDEM_SOURCE -# define _TANDEM_SOURCE 1 -#endif -/* Enable general extensions on Solaris. */ -#ifndef __EXTENSIONS__ -# define __EXTENSIONS__ 1 -#endif - - -/* Version number of package */ -#define VERSION "3.5.2.gitf6605c41" - -/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most - significant byte first (like Motorola and SPARC, unlike Intel). */ -#if defined AC_APPLE_UNIVERSAL_BUILD -# if defined __BIG_ENDIAN__ -# define WORDS_BIGENDIAN 1 -# endif -#else -# ifndef WORDS_BIGENDIAN -/* # undef WORDS_BIGENDIAN */ -# endif -#endif - -/* Enable large inode numbers on Mac OS X 10.5. */ -#ifndef _DARWIN_USE_64_BIT_INODE -# define _DARWIN_USE_64_BIT_INODE 1 -#endif - -/* Number of bits in a file offset, on hosts where this is settable. */ -/* #undef _FILE_OFFSET_BITS */ - -/* Define for large files, on AIX-style hosts. */ -/* #undef _LARGE_FILES */ - -/* Define to 1 if on MINIX. */ -/* #undef _MINIX */ - -/* Define to 2 if the system does not provide POSIX.1 features except with - this defined. */ -/* #undef _POSIX_1_SOURCE */ - -/* Define to 1 if you need to in order for `stat' and other things to work. 
*/ -/* #undef _POSIX_SOURCE */ - -/* Define to __typeof__ if your compiler spells it that way. */ -/* #undef typeof */ diff --git a/contrib/krb5 b/contrib/krb5 index 71b06c22760..c5b4b994c18 160000 --- a/contrib/krb5 +++ b/contrib/krb5 @@ -1 +1 @@ -Subproject commit 71b06c2276009ae649c7703019f3b4605f66fd3d +Subproject commit c5b4b994c18db86933255907a97eee5993fd18fe diff --git a/contrib/numactl b/contrib/numactl index 8d13d63a05f..ff32c618d63 160000 --- a/contrib/numactl +++ b/contrib/numactl @@ -1 +1 @@ -Subproject commit 8d13d63a05f0c3cd88bf777cbb61541202b7da08 +Subproject commit ff32c618d63ca7ac48cce366c5a04bb3563683a0 diff --git a/contrib/qpl b/contrib/qpl deleted file mode 160000 index c2ced94c53c..00000000000 --- a/contrib/qpl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c2ced94c53c1ee22191201a59878e9280bc9b9b8 diff --git a/contrib/qpl-cmake/CMakeLists.txt b/contrib/qpl-cmake/CMakeLists.txt deleted file mode 100644 index 89332ae0f7a..00000000000 --- a/contrib/qpl-cmake/CMakeLists.txt +++ /dev/null @@ -1,738 +0,0 @@ -## The Intel® QPL provides high performance implementations of data processing functions for existing hardware accelerator, and/or software path in case if hardware accelerator is not available. -set (UUID_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl-cmake") -set (QPL_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl") -set (QPL_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/qpl/sources") -set (QPL_BINARY_DIR "${ClickHouse_BINARY_DIR}/build/contrib/qpl") -set (EFFICIENT_WAIT OFF) -set (LOG_HW_INIT OFF) -set (SANITIZE_MEMORY OFF) -set (SANITIZE_THREADS OFF) -set (LIB_FUZZING_ENGINE OFF) -set (DYNAMIC_LOADING_LIBACCEL_CONFIG OFF) - -function(GetLibraryVersion _content _outputVar) - string(REGEX MATCHALL "QPL VERSION (.+) LANGUAGES" VERSION_REGEX "${_content}") - SET(${_outputVar} ${CMAKE_MATCH_1} PARENT_SCOPE) -endfunction() - -set (QPL_VERSION 1.6.0) - -message(STATUS "Intel QPL version: ${QPL_VERSION}") - -# There are 5 source subdirectories under $QPL_SRC_DIR: c_api, core-iaa, core-sw, middle-layer and isal. -# Generate 8 library targets: qpl_c_api, core_iaa, qplcore_px, qplcore_avx512, qplcore_sw_dispatcher, middle_layer_lib, isal and isal_asm, -# which are then combined into static or shared qpl. -# Output ch_contrib::qpl by linking with 8 library targets. - -# Note, QPL has integrated a customized version of ISA-L to meet specific needs. -# This version has been significantly modified and there are no plans to maintain compatibility with the upstream version -# or upgrade the current copy. 
- -## cmake/CompileOptions.cmake and automatic wrappers generation - -# ========================================================================== -# Copyright (C) 2022 Intel Corporation -# -# SPDX-License-Identifier: MIT -# ========================================================================== - -set(QPL_LINUX_TOOLCHAIN_CPP_EMBEDDED_FLAGS "-fno-exceptions;-fno-rtti") - -function(modify_standard_language_flag) - # Declaring function parameters - set(OPTIONS "") - set(ONE_VALUE_ARGS - LANGUAGE_NAME - FLAG_NAME - NEW_FLAG_VALUE) - set(MULTI_VALUE_ARGS "") - - # Parsing function parameters - cmake_parse_arguments(MODIFY - "${OPTIONS}" - "${ONE_VALUE_ARGS}" - "${MULTI_VALUE_ARGS}" - ${ARGN}) - - # Variables - set(FLAG_REGULAR_EXPRESSION "${MODIFY_FLAG_NAME}.*[ ]*") - set(NEW_VALUE "${MODIFY_FLAG_NAME}${MODIFY_NEW_FLAG_VALUE}") - - # Replacing specified flag with new value - string(REGEX REPLACE - ${FLAG_REGULAR_EXPRESSION} ${NEW_VALUE} - NEW_COMPILE_FLAGS - "${CMAKE_${MODIFY_LANGUAGE_NAME}_FLAGS}") - - # Returning the value - set(CMAKE_${MODIFY_LANGUAGE_NAME}_FLAGS ${NEW_COMPILE_FLAGS} PARENT_SCOPE) -endfunction() - -function(get_function_name_with_default_bit_width in_function_name bit_width out_function_name) - - if(in_function_name MATCHES ".*_i") - - string(REPLACE "_i" "" in_function_name ${in_function_name}) - - set(${out_function_name} "${in_function_name}_${bit_width}_i" PARENT_SCOPE) - - else() - - set(${out_function_name} "${in_function_name}_${bit_width}" PARENT_SCOPE) - - endif() - -endfunction() - -macro(get_list_of_supported_optimizations PLATFORMS_LIST) - list(APPEND PLATFORMS_LIST "") - list(APPEND PLATFORMS_LIST "px") - list(APPEND PLATFORMS_LIST "avx512") -endmacro(get_list_of_supported_optimizations) - -function(generate_unpack_kernel_arrays current_directory PLATFORMS_LIST) - list(APPEND UNPACK_POSTFIX_LIST "") - list(APPEND UNPACK_PRLE_POSTFIX_LIST "") - list(APPEND PACK_POSTFIX_LIST "") - list(APPEND PACK_INDEX_POSTFIX_LIST "") - list(APPEND SCAN_POSTFIX_LIST "") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "") - list(APPEND DEFAULT_BIT_WIDTH_LIST "") - - #create list of functions that use only 8u 16u 32u postfixes - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "unpack_prle") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "extract") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "extract_i") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "select") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "select_i") - list(APPEND DEFAULT_BIT_WIDTH_FUNCTIONS_LIST "expand") - - #create default bit width list - list(APPEND DEFAULT_BIT_WIDTH_LIST "8u") - list(APPEND DEFAULT_BIT_WIDTH_LIST "16u") - list(APPEND DEFAULT_BIT_WIDTH_LIST "32u") - - #create scan kernel postfixes - list(APPEND SCAN_COMPARATOR_LIST "") - - list(APPEND SCAN_COMPARATOR_LIST "eq") - list(APPEND SCAN_COMPARATOR_LIST "ne") - list(APPEND SCAN_COMPARATOR_LIST "lt") - list(APPEND SCAN_COMPARATOR_LIST "le") - list(APPEND SCAN_COMPARATOR_LIST "gt") - list(APPEND SCAN_COMPARATOR_LIST "ge") - list(APPEND SCAN_COMPARATOR_LIST "range") - list(APPEND SCAN_COMPARATOR_LIST "not_range") - - foreach(SCAN_COMPARATOR IN LISTS SCAN_COMPARATOR_LIST) - list(APPEND SCAN_POSTFIX_LIST "_${SCAN_COMPARATOR}_8u") - list(APPEND SCAN_POSTFIX_LIST "_${SCAN_COMPARATOR}_16u8u") - list(APPEND SCAN_POSTFIX_LIST "_${SCAN_COMPARATOR}_32u8u") - endforeach() - - # create unpack kernel postfixes - foreach(input_width RANGE 1 32 1) - if(input_width LESS 8 OR input_width EQUAL 8) - list(APPEND UNPACK_POSTFIX_LIST "_${input_width}u8u") - - 
elseif(input_width LESS 16 OR input_width EQUAL 16) - list(APPEND UNPACK_POSTFIX_LIST "_${input_width}u16u") - - else() - list(APPEND UNPACK_POSTFIX_LIST "_${input_width}u32u") - endif() - endforeach() - - # create pack kernel postfixes - foreach(output_width RANGE 1 8 1) - list(APPEND PACK_POSTFIX_LIST "_8u${output_width}u") - endforeach() - - foreach(output_width RANGE 9 16 1) - list(APPEND PACK_POSTFIX_LIST "_16u${output_width}u") - endforeach() - - foreach(output_width RANGE 17 32 1) - list(APPEND PACK_POSTFIX_LIST "_32u${output_width}u") - endforeach() - - list(APPEND PACK_POSTFIX_LIST "_8u16u") - list(APPEND PACK_POSTFIX_LIST "_8u32u") - list(APPEND PACK_POSTFIX_LIST "_16u32u") - - # create pack index kernel postfixes - list(APPEND PACK_INDEX_POSTFIX_LIST "_nu") - list(APPEND PACK_INDEX_POSTFIX_LIST "_8u") - list(APPEND PACK_INDEX_POSTFIX_LIST "_8u16u") - list(APPEND PACK_INDEX_POSTFIX_LIST "_8u32u") - - # write to file - file(MAKE_DIRECTORY ${current_directory}/generated) - - foreach(PLATFORM_VALUE IN LISTS PLATFORMS_LIST) - set(directory "${current_directory}/generated") - set(PLATFORM_PREFIX "${PLATFORM_VALUE}_") - - # - # Write unpack table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}unpack.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "unpack_table_t ${PLATFORM_PREFIX}unpack_table = {\n") - - #write LE kernels - foreach(UNPACK_POSTFIX IN LISTS UNPACK_POSTFIX_LIST) - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "\t${PLATFORM_PREFIX}qplc_unpack${UNPACK_POSTFIX},\n") - endforeach() - - #write BE kernels - - #get last element of the list - set(LAST_ELEMENT "") - list(GET UNPACK_POSTFIX_LIST -1 LAST_ELEMENT) - - foreach(UNPACK_POSTFIX IN LISTS UNPACK_POSTFIX_LIST) - - if(UNPACK_POSTFIX STREQUAL LAST_ELEMENT) - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "\t${PLATFORM_PREFIX}qplc_unpack_be${UNPACK_POSTFIX}};\n") - else() - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "\t${PLATFORM_PREFIX}qplc_unpack_be${UNPACK_POSTFIX},\n") - endif() - endforeach() - - file(APPEND ${directory}/${PLATFORM_PREFIX}unpack.cpp "}\n") - - # - # Write pack table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}pack.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "pack_table_t ${PLATFORM_PREFIX}pack_table = {\n") - - #write LE kernels - foreach(PACK_POSTFIX IN LISTS PACK_POSTFIX_LIST) - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "\t${PLATFORM_PREFIX}qplc_pack${PACK_POSTFIX},\n") - endforeach() - - #write BE kernels - - #get last element of the list - set(LAST_ELEMENT "") - list(GET PACK_POSTFIX_LIST -1 LAST_ELEMENT) - - foreach(PACK_POSTFIX IN LISTS PACK_POSTFIX_LIST) - - if(PACK_POSTFIX STREQUAL LAST_ELEMENT) - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "\t${PLATFORM_PREFIX}qplc_pack_be${PACK_POSTFIX}};\n") - else() - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "\t${PLATFORM_PREFIX}qplc_pack_be${PACK_POSTFIX},\n") - endif() - endforeach() - - file(APPEND ${directory}/${PLATFORM_PREFIX}pack.cpp "}\n") - - # - # Write scan table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}scan.cpp 
"#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "scan_table_t ${PLATFORM_PREFIX}scan_table = {\n") - - #get last element of the list - set(LAST_ELEMENT "") - list(GET SCAN_POSTFIX_LIST -1 LAST_ELEMENT) - - foreach(SCAN_POSTFIX IN LISTS SCAN_POSTFIX_LIST) - - if(SCAN_POSTFIX STREQUAL LAST_ELEMENT) - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "\t${PLATFORM_PREFIX}qplc_scan${SCAN_POSTFIX}};\n") - else() - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "\t${PLATFORM_PREFIX}qplc_scan${SCAN_POSTFIX},\n") - endif() - endforeach() - - file(APPEND ${directory}/${PLATFORM_PREFIX}scan.cpp "}\n") - - # - # Write scan_i table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}scan_i.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "scan_i_table_t ${PLATFORM_PREFIX}scan_i_table = {\n") - - #get last element of the list - set(LAST_ELEMENT "") - list(GET SCAN_POSTFIX_LIST -1 LAST_ELEMENT) - - foreach(SCAN_POSTFIX IN LISTS SCAN_POSTFIX_LIST) - - if(SCAN_POSTFIX STREQUAL LAST_ELEMENT) - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "\t${PLATFORM_PREFIX}qplc_scan${SCAN_POSTFIX}_i};\n") - else() - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "\t${PLATFORM_PREFIX}qplc_scan${SCAN_POSTFIX}_i,\n") - endif() - endforeach() - - file(APPEND ${directory}/${PLATFORM_PREFIX}scan_i.cpp "}\n") - - # - # Write pack_index table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}pack_index.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "pack_index_table_t ${PLATFORM_PREFIX}pack_index_table = {\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_bits_nu,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_8u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_8u16u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_8u32u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_bits_be_nu,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_8u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_be_8u16u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "\t${PLATFORM_PREFIX}qplc_pack_index_be_8u32u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}pack_index.cpp "}\n") - - # - # Write default bit width functions - # - foreach(DEAULT_BIT_WIDTH_FUNCTION IN LISTS DEFAULT_BIT_WIDTH_FUNCTIONS_LIST) - file(WRITE ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND 
${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "${DEAULT_BIT_WIDTH_FUNCTION}_table_t ${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}_table = {\n") - - #get last element of the list - set(LAST_ELEMENT "") - list(GET DEFAULT_BIT_WIDTH_LIST -1 LAST_ELEMENT) - - foreach(BIT_WIDTH IN LISTS DEFAULT_BIT_WIDTH_LIST) - - set(FUNCTION_NAME "") - get_function_name_with_default_bit_width(${DEAULT_BIT_WIDTH_FUNCTION} ${BIT_WIDTH} FUNCTION_NAME) - - if(BIT_WIDTH STREQUAL LAST_ELEMENT) - file(APPEND ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "\t${PLATFORM_PREFIX}qplc_${FUNCTION_NAME}};\n") - else() - file(APPEND ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "\t${PLATFORM_PREFIX}qplc_${FUNCTION_NAME},\n") - endif() - endforeach() - - file(APPEND ${directory}/${PLATFORM_PREFIX}${DEAULT_BIT_WIDTH_FUNCTION}.cpp "}\n") - endforeach() - - # - # Write aggregates table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}aggregates.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "aggregates_table_t ${PLATFORM_PREFIX}aggregates_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "\t${PLATFORM_PREFIX}qplc_bit_aggregates_8u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "\t${PLATFORM_PREFIX}qplc_aggregates_8u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "\t${PLATFORM_PREFIX}qplc_aggregates_16u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "\t${PLATFORM_PREFIX}qplc_aggregates_32u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}aggregates.cpp "}\n") - - # - # Write mem_copy functions table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "memory_copy_table_t ${PLATFORM_PREFIX}memory_copy_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "\t${PLATFORM_PREFIX}qplc_copy_8u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "\t${PLATFORM_PREFIX}qplc_copy_16u,\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "\t${PLATFORM_PREFIX}qplc_copy_32u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}memory_copy.cpp "}\n") - - # - # Write mem_copy functions table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}zero.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}zero.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}zero.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}zero.cpp "zero_table_t ${PLATFORM_PREFIX}zero_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}zero.cpp "\t${PLATFORM_PREFIX}qplc_zero_8u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}zero.cpp "}\n") - - # - # Write move functions table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}move.cpp "#include \"qplc_api.h\"\n") - file(APPEND 
${directory}/${PLATFORM_PREFIX}move.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}move.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}move.cpp "move_table_t ${PLATFORM_PREFIX}move_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}move.cpp "\t${PLATFORM_PREFIX}qplc_move_8u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}move.cpp "}\n") - - # - # Write crc64 function table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}crc64.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}crc64.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}crc64.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}crc64.cpp "crc64_table_t ${PLATFORM_PREFIX}crc64_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}crc64.cpp "\t${PLATFORM_PREFIX}qplc_crc64};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}crc64.cpp "}\n") - - # - # Write xor_checksum function table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "#include \"qplc_api.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "xor_checksum_table_t ${PLATFORM_PREFIX}xor_checksum_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "\t${PLATFORM_PREFIX}qplc_xor_checksum_8u};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}xor_checksum.cpp "}\n") - - # - # Write deflate functions table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}deflate.cpp "#include \"deflate_slow_icf.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "#include \"deflate_hash_table.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "#include \"deflate_histogram.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "deflate_table_t ${PLATFORM_PREFIX}deflate_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "\t reinterpret_cast(&${PLATFORM_PREFIX}slow_deflate_icf_body),\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "\t reinterpret_cast(&${PLATFORM_PREFIX}deflate_histogram_reset),\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "\t reinterpret_cast(&${PLATFORM_PREFIX}deflate_hash_table_reset)};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate.cpp "}\n") - - # - # Write deflate fix functions table - # - file(WRITE ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "#include \"deflate_slow.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "deflate_fix_table_t ${PLATFORM_PREFIX}deflate_fix_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "\t reinterpret_cast(&${PLATFORM_PREFIX}slow_deflate_body)};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}deflate_fix.cpp "}\n") - - # - # Write setup_dictionary functions table - # - file(WRITE 
${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "#include \"deflate_slow_utils.h\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "#include \"dispatcher/dispatcher.hpp\"\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "namespace qpl::core_sw::dispatcher\n{\n") - file(APPEND ${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "setup_dictionary_table_t ${PLATFORM_PREFIX}setup_dictionary_table = {\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "\t reinterpret_cast(&${PLATFORM_PREFIX}setup_dictionary)};\n") - - file(APPEND ${directory}/${PLATFORM_PREFIX}setup_dictionary.cpp "}\n") - - endforeach() -endfunction() - -# [SUBDIR]isal - -enable_language(ASM_NASM) - -set(ISAL_C_SRC ${QPL_SRC_DIR}/isal/igzip/adler32_base.c - ${QPL_SRC_DIR}/isal/igzip/huff_codes.c - ${QPL_SRC_DIR}/isal/igzip/hufftables_c.c - ${QPL_SRC_DIR}/isal/igzip/igzip.c - ${QPL_SRC_DIR}/isal/igzip/igzip_base.c - ${QPL_SRC_DIR}/isal/igzip/flatten_ll.c - ${QPL_SRC_DIR}/isal/igzip/encode_df.c - ${QPL_SRC_DIR}/isal/igzip/igzip_icf_base.c - ${QPL_SRC_DIR}/isal/igzip/igzip_inflate.c - ${QPL_SRC_DIR}/isal/igzip/igzip_icf_body.c - ${QPL_SRC_DIR}/isal/crc/crc_base.c - ${QPL_SRC_DIR}/isal/crc/crc64_base.c) - -set(ISAL_ASM_SRC ${QPL_SRC_DIR}/isal/igzip/igzip_body.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_gen_icf_map_lh1_04.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_gen_icf_map_lh1_06.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_decode_block_stateless_04.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_finish.asm - ${QPL_SRC_DIR}/isal/igzip/encode_df_04.asm - ${QPL_SRC_DIR}/isal/igzip/encode_df_06.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_decode_block_stateless_01.asm - ${QPL_SRC_DIR}/isal/igzip/proc_heap.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_icf_body_h1_gr_bt.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_icf_finish.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_inflate_multibinary.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_update_histogram_01.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_update_histogram_04.asm - ${QPL_SRC_DIR}/isal/igzip/rfc1951_lookup.asm - ${QPL_SRC_DIR}/isal/igzip/adler32_sse.asm - ${QPL_SRC_DIR}/isal/igzip/adler32_avx2_4.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_deflate_hash.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_set_long_icf_fg_04.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_set_long_icf_fg_06.asm - ${QPL_SRC_DIR}/isal/igzip/igzip_multibinary.asm - ${QPL_SRC_DIR}/isal/crc/crc_multibinary.asm - ${QPL_SRC_DIR}/isal/crc/crc32_gzip_refl_by8.asm - ${QPL_SRC_DIR}/isal/crc/crc32_gzip_refl_by8_02.asm - ${QPL_SRC_DIR}/isal/crc/crc32_gzip_refl_by16_10.asm - ${QPL_SRC_DIR}/isal/crc/crc32_ieee_01.asm - ${QPL_SRC_DIR}/isal/crc/crc32_ieee_02.asm - ${QPL_SRC_DIR}/isal/crc/crc32_ieee_by4.asm - ${QPL_SRC_DIR}/isal/crc/crc32_ieee_by16_10.asm - ${QPL_SRC_DIR}/isal/crc/crc32_iscsi_00.asm - ${QPL_SRC_DIR}/isal/crc/crc32_iscsi_01.asm - ${QPL_SRC_DIR}/isal/crc/crc32_iscsi_by16_10.asm) - -# Adding ISA-L library target -add_library(isal OBJECT ${ISAL_C_SRC}) -add_library(isal_asm OBJECT ${ISAL_ASM_SRC}) - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -# Setting external and internal interfaces for ISA-L library -target_include_directories(isal - PUBLIC $ - PUBLIC ${QPL_SRC_DIR}/isal/igzip) - -set_target_properties(isal PROPERTIES - CXX_STANDARD 11 - C_STANDARD 99) - -# AS_FEATURE_LEVEL=10 means "Check SIMD capabilities of the target system at runtime and use up to AVX512 if available". -# HAVE_KNOWS_AVX512 means rely on AVX512 being available on the target system. 
-target_compile_options(isal_asm PRIVATE "-I${QPL_SRC_DIR}/isal/include/" - PRIVATE "-I${QPL_SRC_DIR}/isal/igzip/" - PRIVATE "-I${QPL_SRC_DIR}/isal/crc/" - PRIVATE "-DHAVE_AS_KNOWS_AVX512" - PRIVATE "-DAS_FEATURE_LEVEL=10" - PRIVATE "-DQPL_LIB") - -# Here must remove "-fno-sanitize=undefined" from COMPILE_OPTIONS. -# Otherwise nasm compiler would fail to proceed due to unrecognition of "-fno-sanitize=undefined" -if (SANITIZE STREQUAL "undefined") - get_target_property(target_options isal_asm COMPILE_OPTIONS) - list(REMOVE_ITEM target_options "-fno-sanitize=undefined") - set_property(TARGET isal_asm PROPERTY COMPILE_OPTIONS ${target_options}) -endif() - -target_compile_definitions(isal PUBLIC - QPL_LIB - NDEBUG) - -# [SUBDIR]core-sw -# Create set of libraries corresponding to supported platforms for SW fallback which are implemented by AVX512 and non-AVX512 instructions respectively. -# The upper level QPL API will check SIMD capabilities of the target system at runtime and decide to call AVX512 function or non-AVX512 function. -# Hence, here we don't need put ENABLE_AVX512 CMake switch. - -get_list_of_supported_optimizations(PLATFORMS_LIST) - -foreach(PLATFORM_ID IN LISTS PLATFORMS_LIST) - # Find Core Sources - file(GLOB SOURCES - ${QPL_SRC_DIR}/core-sw/src/checksums/*.c - ${QPL_SRC_DIR}/core-sw/src/filtering/*.c - ${QPL_SRC_DIR}/core-sw/src/other/*.c - ${QPL_SRC_DIR}/core-sw/src/compression/*.c) - - file(GLOB DATA_SOURCES - ${QPL_SRC_DIR}/core-sw/src/data/*.c) - - # Create library - add_library(qplcore_${PLATFORM_ID} OBJECT ${SOURCES}) - - set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - - target_include_directories(qplcore_${PLATFORM_ID} - PUBLIC $ - PUBLIC $ - PUBLIC $ - PUBLIC $ - PRIVATE $) - - # Set specific compiler options and/or definitions based on a platform - if (${PLATFORM_ID} MATCHES "avx512") - target_compile_definitions(qplcore_${PLATFORM_ID} PRIVATE PLATFORM=2) - target_compile_options(qplcore_${PLATFORM_ID} PRIVATE -march=skylake-avx512) - else() # Create default px library - target_compile_definitions(qplcore_${PLATFORM_ID} PRIVATE PLATFORM=0) - endif() - - target_link_libraries(qplcore_${PLATFORM_ID} isal) -endforeach() - -# -# Create dispatcher between platforms and auto-generated wrappers -# -file(GLOB SW_DISPATCHER_SOURCES ${QPL_SRC_DIR}/core-sw/dispatcher/*.cpp) - -add_library(qplcore_sw_dispatcher OBJECT ${SW_DISPATCHER_SOURCES}) - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -target_include_directories(qplcore_sw_dispatcher - PUBLIC $) - -# Generate kernel wrappers -generate_unpack_kernel_arrays(${QPL_BINARY_DIR} "${PLATFORMS_LIST}") - -foreach(PLATFORM_ID IN LISTS PLATFORMS_LIST) - file(GLOB GENERATED_${PLATFORM_ID}_TABLES_SRC ${QPL_BINARY_DIR}/generated/${PLATFORM_ID}_*.cpp) - - target_sources(qplcore_sw_dispatcher PRIVATE ${GENERATED_${PLATFORM_ID}_TABLES_SRC}) - - # Set specific compiler options and/or definitions based on a platform - if (${PLATFORM_ID} MATCHES "avx512") - set_source_files_properties(${GENERATED_${PLATFORM_ID}_TABLES_SRC} PROPERTIES COMPILE_DEFINITIONS PLATFORM=2) - else() - set_source_files_properties(${GENERATED_${PLATFORM_ID}_TABLES_SRC} PROPERTIES COMPILE_DEFINITIONS PLATFORM=0) - endif() - - target_include_directories(qplcore_sw_dispatcher - PUBLIC $) -endforeach() - -set_target_properties(qplcore_sw_dispatcher PROPERTIES CXX_STANDARD 17) - -# w/a for build compatibility with ISAL codebase -target_compile_definitions(qplcore_sw_dispatcher PUBLIC -DQPL_LIB) - -target_compile_options(qplcore_sw_dispatcher - PRIVATE 
${QPL_LINUX_TOOLCHAIN_CPP_EMBEDDED_FLAGS}) - -# [SUBDIR]core-iaa -file(GLOB HW_PATH_SRC ${QPL_SRC_DIR}/core-iaa/sources/aecs/*.c - ${QPL_SRC_DIR}/core-iaa/sources/driver_loader/*.c - ${QPL_SRC_DIR}/core-iaa/sources/descriptors/*.c - ${QPL_SRC_DIR}/core-iaa/sources/*.c) - -# Create library -add_library(core_iaa OBJECT ${HW_PATH_SRC}) - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -target_include_directories(core_iaa - PRIVATE ${UUID_DIR} - PUBLIC $ - PUBLIC $ - PRIVATE $ # status.h in own_checkers.h - PRIVATE $ # for own_checkers.h - PRIVATE $) - -target_compile_features(core_iaa PRIVATE c_std_11) - -target_compile_definitions(core_iaa PRIVATE QPL_BADARG_CHECK - PRIVATE $<$:LOG_HW_INIT> - PRIVATE $<$:DYNAMIC_LOADING_LIBACCEL_CONFIG>) - -# [SUBDIR]middle-layer -file(GLOB MIDDLE_LAYER_SRC - ${QPL_SRC_DIR}/middle-layer/accelerator/*.cpp - ${QPL_SRC_DIR}/middle-layer/analytics/*.cpp - ${QPL_SRC_DIR}/middle-layer/common/*.cpp - ${QPL_SRC_DIR}/middle-layer/compression/*.cpp - ${QPL_SRC_DIR}/middle-layer/compression/*/*.cpp - ${QPL_SRC_DIR}/middle-layer/compression/*/*/*.cpp - ${QPL_SRC_DIR}/middle-layer/dispatcher/*.cpp - ${QPL_SRC_DIR}/middle-layer/other/*.cpp - ${QPL_SRC_DIR}/middle-layer/util/*.cpp) - -add_library(middle_layer_lib OBJECT - ${MIDDLE_LAYER_SRC}) - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -target_compile_options(middle_layer_lib - PRIVATE $<$:$<$:-O3;-U_FORTIFY_SOURCE;-D_FORTIFY_SOURCE=2>> - PRIVATE ${QPL_LINUX_TOOLCHAIN_CPP_EMBEDDED_FLAGS}) - -target_compile_definitions(middle_layer_lib - PUBLIC QPL_VERSION="${QPL_VERSION}" - PUBLIC $<$:LOG_HW_INIT> - PUBLIC $<$:QPL_EFFICIENT_WAIT> - PUBLIC QPL_BADARG_CHECK - PUBLIC $<$:DYNAMIC_LOADING_LIBACCEL_CONFIG>) - -set_target_properties(middle_layer_lib PROPERTIES CXX_STANDARD 17) - -target_include_directories(middle_layer_lib - PRIVATE ${UUID_DIR} - PUBLIC $ - PUBLIC $ - PRIVATE $ - PUBLIC $ - PUBLIC $ - PUBLIC $) - -target_compile_definitions(middle_layer_lib PUBLIC -DQPL_LIB) - -# [SUBDIR]c_api -file(GLOB QPL_C_API_SRC - ${QPL_SRC_DIR}/c_api/compression_operations/*.c - ${QPL_SRC_DIR}/c_api/compression_operations/*.cpp - ${QPL_SRC_DIR}/c_api/filter_operations/*.cpp - ${QPL_SRC_DIR}/c_api/legacy_hw_path/*.c - ${QPL_SRC_DIR}/c_api/legacy_hw_path/*.cpp - ${QPL_SRC_DIR}/c_api/other_operations/*.cpp - ${QPL_SRC_DIR}/c_api/serialization/*.cpp - ${QPL_SRC_DIR}/c_api/*.cpp) - -add_library(qpl_c_api OBJECT ${QPL_C_API_SRC}) - -target_include_directories(qpl_c_api - PUBLIC $ - PUBLIC $ $ - PRIVATE $) - -set_target_properties(qpl_c_api PROPERTIES - $<$:C_STANDARD 17 - CXX_STANDARD 17) - -target_compile_options(qpl_c_api - PRIVATE $<$:$<$:-O3;-U_FORTIFY_SOURCE;-D_FORTIFY_SOURCE=2>> - PRIVATE $<$:${QPL_LINUX_TOOLCHAIN_CPP_EMBEDDED_FLAGS}>) - -target_compile_definitions(qpl_c_api - PUBLIC -DQPL_BADARG_CHECK # own_checkers.h - PUBLIC -DQPL_LIB # needed for middle_layer_lib - PUBLIC $<$:LOG_HW_INIT>) # needed for middle_layer_lib - -set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS - $) - -# Final _qpl target - -get_property(LIB_DEPS GLOBAL PROPERTY QPL_LIB_DEPS) - -add_library(_qpl STATIC ${LIB_DEPS}) - -target_include_directories(_qpl - PUBLIC $ $) - -target_link_libraries(_qpl - PRIVATE ch_contrib::accel-config) - -target_include_directories(_qpl SYSTEM BEFORE - PUBLIC "${QPL_PROJECT_DIR}/include" - PUBLIC ${UUID_DIR}) - -add_library (ch_contrib::qpl ALIAS _qpl) diff --git a/contrib/qpl-cmake/uuid/uuid.h b/contrib/qpl-cmake/uuid/uuid.h deleted file mode 100644 index bf108ba0d29..00000000000 --- 
a/contrib/qpl-cmake/uuid/uuid.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _QPL_UUID_UUID_H -#define _QPL_UUID_UUID_H -typedef unsigned char uuid_t[16]; -#endif /* _QPL_UUID_UUID_H */ diff --git a/contrib/usearch b/contrib/usearch index 1706420acaf..7efe8b710c9 160000 --- a/contrib/usearch +++ b/contrib/usearch @@ -1 +1 @@ -Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48 +Subproject commit 7efe8b710c9831bfe06573b1df0fad001b04a2b5 diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt index 25f6ca82a74..fda061bf467 100644 --- a/contrib/usearch-cmake/CMakeLists.txt +++ b/contrib/usearch-cmake/CMakeLists.txt @@ -6,12 +6,63 @@ target_include_directories(_usearch SYSTEM INTERFACE ${USEARCH_PROJECT_DIR}/incl target_link_libraries(_usearch INTERFACE _fp16) target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB) -# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD) -# ^^ simsimd is not enabled at the moment. Reasons: -# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW. -# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD -# kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of -# the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86 -# and ARM machines ... certain conbinations of quantizations / distance functions / SIMD instructions are not implemented at the moment. +# Only x86 for now. On ARM, the linker goes down in flames. To make SimSIMD compile, I had to remove a macro checks in SimSIMD +# for AVX512 (x86, worked nicely) and __ARM_BF16_FORMAT_ALTERNATIVE. It is probably because of that. +if (ARCH_AMD64) + target_link_libraries(_usearch INTERFACE _simsimd) + target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD) + + target_compile_definitions(_usearch INTERFACE USEARCH_CAN_COMPILE_FLOAT16) + target_compile_definitions(_usearch INTERFACE USEARCH_CAN_COMPILE_BF16) +endif () add_library(ch_contrib::usearch ALIAS _usearch) + + +# Cf. https://github.com/llvm/llvm-project/issues/107810 (though it is not 100% the same stack) +# +# LLVM ERROR: Cannot select: 0x7996e7a73150: f32,ch = load<(load (s16) from %ir.22, !tbaa !54231), anyext from bf16> 0x79961cb737c0, 0x7996e7a1a500, undef:i64, ./contrib/SimSIMD/include/simsimd/dot.h:215:1 +# 0x7996e7a1a500: i64 = add 0x79961e770d00, Constant:i64<-16>, ./contrib/SimSIMD/include/simsimd/dot.h:215:1 +# 0x79961e770d00: i64,ch = CopyFromReg 0x79961cb737c0, Register:i64 %4, ./contrib/SimSIMD/include/simsimd/dot.h:215:1 +# 0x7996e7a1ae10: i64 = Register %4 +# 0x7996e7a1b5f0: i64 = Constant<-16> +# 0x7996e7a1a730: i64 = undef +# In function: _ZL23simsimd_dot_bf16_serialPKu6__bf16S0_yPd +# PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. +# Stack dump: +# 0. Running pass 'Function Pass Manager' on module 'src/libdbms.a(MergeTreeIndexVectorSimilarity.cpp.o at 2312737440)'. +# 1. 
Running pass 'AArch64 Instruction Selection' on function '@_ZL23simsimd_dot_bf16_serialPKu6__bf16S0_yPd' +# #0 0x00007999e83a63bf llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda63bf) +# #1 0x00007999e83a44f9 llvm::sys::RunSignalHandlers() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda44f9) +# #2 0x00007999e83a6b00 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda6b00) +# #3 0x00007999e6e45320 (/lib/x86_64-linux-gnu/libc.so.6+0x45320) +# #4 0x00007999e6e9eb1c pthread_kill (/lib/x86_64-linux-gnu/libc.so.6+0x9eb1c) +# #5 0x00007999e6e4526e raise (/lib/x86_64-linux-gnu/libc.so.6+0x4526e) +# #6 0x00007999e6e288ff abort (/lib/x86_64-linux-gnu/libc.so.6+0x288ff) +# #7 0x00007999e82fe0c2 llvm::report_fatal_error(llvm::Twine const&, bool) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xcfe0c2) +# #8 0x00007999e8c2f8e3 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162f8e3) +# #9 0x00007999e8c2ed76 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162ed76) +# #10 0x00007999ea1adbcb (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x2badbcb) +# #11 0x00007999e8c2611f llvm::SelectionDAGISel::DoInstructionSelection() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162611f) +# #12 0x00007999e8c25790 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x1625790) +# #13 0x00007999e8c248de llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x16248de) +# #14 0x00007999e8c22934 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x1622934) +# #15 0x00007999e87826b9 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x11826b9) +# #16 0x00007999e84f7772 llvm::FPPassManager::runOnFunction(llvm::Function&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xef7772) +# #17 0x00007999e84fd2f4 llvm::FPPassManager::runOnModule(llvm::Module&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xefd2f4) +# #18 0x00007999e84f7e9f llvm::legacy::PassManagerImpl::run(llvm::Module&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xef7e9f) +# #19 0x00007999e99f7d61 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f7d61) +# #20 0x00007999e99f8c91 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f8c91) +# #21 0x00007999e99f8b10 llvm::lto::thinBackend(llvm::lto::Config const&, unsigned int, std::function>> (unsigned int, llvm::Twine const&)>, llvm::Module&, llvm::ModuleSummaryIndex const&, llvm::DenseMap, std::equal_to, std::allocator>, llvm::DenseMapInfo, llvm::detail::DenseMapPair, std::equal_to, std::allocator>>> const&, llvm::DenseMap, llvm::detail::DenseMapPair> const&, llvm::MapVector, llvm::detail::DenseMapPair>, llvm::SmallVector, 0u>>*, std::vector> const&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f8b10) +# #22 0x00007999e99f248d (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f248d) +# #23 0x00007999e99f1cd6 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f1cd6) +# #24 0x00007999e82c9beb (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xcc9beb) +# #25 0x00007999e834ebe3 llvm::ThreadPool::processTasks(llvm::ThreadPoolTaskGroup*) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xd4ebe3) +# #26 0x00007999e834f704 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xd4f704) +# #27 0x00007999e6e9ca94 (/lib/x86_64-linux-gnu/libc.so.6+0x9ca94) +# #28 0x00007999e6f29c3c 
(/lib/x86_64-linux-gnu/libc.so.6+0x129c3c) +# clang++-18: error: unable to execute command: Aborted (core dumped) +# clang++-18: error: linker command failed due to signal (use -v to see invocation) +# ^[[A^Cninja: build stopped: interrupted by user. diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index dfe6a420260..4ecc087afb4 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -1,7 +1,7 @@ # The Dockerfile.ubuntu exists for the tests/ci/docker_server.py script # If the image is built from Dockerfile.alpine, then the `-alpine` suffix is added automatically, # so the only purpose of Dockerfile.ubuntu is to push `latest`, `head` and so on w/o suffixes -FROM ubuntu:20.04 AS glibc-donor +FROM ubuntu:22.04 AS glibc-donor ARG TARGETARCH RUN arch=${TARGETARCH:-amd64} \ @@ -9,7 +9,11 @@ RUN arch=${TARGETARCH:-amd64} \ amd64) rarch=x86_64 ;; \ arm64) rarch=aarch64 ;; \ esac \ - && ln -s "${rarch}-linux-gnu" /lib/linux-gnu + && ln -s "${rarch}-linux-gnu" /lib/linux-gnu \ + && case $arch in \ + amd64) ln /lib/linux-gnu/ld-linux-x86-64.so.2 /lib/linux-gnu/ld-2.35.so ;; \ + arm64) ln /lib/linux-gnu/ld-linux-aarch64.so.1 /lib/linux-gnu/ld-2.35.so ;; \ + esac FROM alpine @@ -20,7 +24,7 @@ ENV LANG=en_US.UTF-8 \ TZ=UTC \ CLICKHOUSE_CONFIG=/etc/clickhouse-server/config.xml -COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/ +COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.35.so /lib/ COPY --from=glibc-donor /etc/nsswitch.conf /etc/ COPY entrypoint.sh /entrypoint.sh @@ -34,7 +38,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.9.2.42" +ARG VERSION="24.10.1.2812" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 991c25ad142..93acf1a5773 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -35,7 +35,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.9.2.42" +ARG VERSION="24.10.1.2812" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 5dc88b49e31..0d5c983f5e6 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:22.04 # see https://github.com/moby/moby/issues/4032#issuecomment-192327844 # It could be removed after we move on a version 23:04+ @@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.9.2.42" +ARG VERSION="24.10.1.2812" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" 
#docker-official-library:off diff --git a/docker/server/README.md index 65239126790..1dc636414ac 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -20,6 +20,7 @@ For more information and documentation see https://clickhouse.com/. - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A) and additionally the Load-Acquire RCpc register. The register is optional in version ARMv8.2-A and mandatory in [ARMv8.3-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.3-A). Supported in Graviton >=2, Azure and GCP instances. Examples for unsupported devices are Raspberry Pi 4 (ARMv8.0-A) and Jetson AGX Xavier/Orin (ARMv8.2-A). +- Since ClickHouse 24.11, the Ubuntu images use `ubuntu:22.04` as their base image. It requires a Docker version >= `20.10.10` that contains this [patch](https://github.com/moby/moby/commit/977283509f75303bc6612665a04abf76ff1d2468). As a workaround, you could use `docker run [--privileged | --security-opt seccomp=unconfined]` instead; however, that has security implications. ## How to use this image diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index a39f96867be..12f1cc4d357 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -25,7 +25,7 @@ EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> d # coverage_log needs more columns for symbolization, but only symbol names (the line numbers are too heavy to calculate) EXTRA_COLUMNS_COVERAGE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), " -EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), coverage)::Array(LowCardinality(String)) AS symbols" +EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayDistinct(arrayMap(x -> demangle(addressToSymbol(x)), coverage))::Array(LowCardinality(String)) AS symbols" function __set_connection_args diff --git a/docker/test/integration/runner/requirements.txt b/docker/test/integration/runner/requirements.txt index 4802623abd6..bb0c4d001e6 100644 --- a/docker/test/integration/runner/requirements.txt +++ b/docker/test/integration/runner/requirements.txt @@ -23,6 +23,7 @@ charset-normalizer==3.3.2 click==8.1.7 confluent-kafka==2.3.0 cryptography==42.0.0 +datacompy==0.7.3 dbus-python==1.2.18 delta-spark==2.3.0 deltalake==0.16.0 @@ -60,6 +61,7 @@ oauthlib==3.2.0 packaging==24.0 paramiko==3.4.0 pika==1.2.0 +pandas==2.2.3 pip==24.1.1 pluggy==1.5.0 protobuf==4.25.2 diff --git a/docker/test/libfuzzer/Dockerfile b/docker/test/libfuzzer/Dockerfile index 3ffae0cd921..46e305c90ab 100644 --- a/docker/test/libfuzzer/Dockerfile +++ b/docker/test/libfuzzer/Dockerfile @@ -33,8 +33,6 @@ RUN apt-get update \ COPY requirements.txt / RUN pip3 install --no-cache-dir -r /requirements.txt -ENV FUZZER_ARGS="-max_total_time=60" - SHELL ["/bin/bash", "-c"] # docker run --network=host --volume :/workspace -e PR_TO_TEST=<> -e SHA_TO_TEST=<> clickhouse/libfuzzer diff --git a/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile b/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile deleted file mode 100644 index a9802f6f1da..00000000000 --- a/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -# Since right now we can't
set volumes to the docker during build, we split building container in stages: -# 1. build base container -# 2. run base conatiner with mounted volumes -# 3. commit container as image -FROM ubuntu:20.04 as clickhouse-test-runner-base - -# A volume where directory with clickhouse packages to be mounted, -# for later installing. -VOLUME /packages - -CMD apt-get update ;\ - DEBIAN_FRONTEND=noninteractive \ - apt install -y /packages/clickhouse-common-static_*.deb \ - /packages/clickhouse-client_*.deb \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index cdc1d1fa095..564301f447c 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -16,6 +16,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ libxml2-utils \ locales \ moreutils \ + ripgrep \ python3-pip \ yamllint \ zstd \ @@ -27,7 +28,7 @@ COPY requirements.txt / RUN pip3 install --no-cache-dir -r requirements.txt RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 # Architecture of the image when BuildKit/buildx is used ARG TARGETARCH diff --git a/docker/test/style/requirements.txt b/docker/test/style/requirements.txt index cc87f6e548d..aab20b5bee0 100644 --- a/docker/test/style/requirements.txt +++ b/docker/test/style/requirements.txt @@ -12,6 +12,7 @@ charset-normalizer==3.3.2 click==8.1.7 codespell==2.2.1 cryptography==43.0.1 +datacompy==0.7.3 Deprecated==1.2.14 dill==0.3.8 flake8==4.0.1 @@ -23,6 +24,7 @@ mccabe==0.6.1 multidict==6.0.5 mypy==1.8.0 mypy-extensions==1.0.0 +pandas==2.2.3 packaging==24.1 pathspec==0.9.0 pip==24.1.1 diff --git a/docs/changelogs/v24.10.1.2812-stable.md b/docs/changelogs/v24.10.1.2812-stable.md new file mode 100644 index 00000000000..c26bbf706ff --- /dev/null +++ b/docs/changelogs/v24.10.1.2812-stable.md @@ -0,0 +1,412 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.10.1.2812-stable (9cd0a3738d5) FIXME as compared to v24.10.1.1-new (b12a3677418) + +#### Backward Incompatible Change +* Allow to write `SETTINGS` before `FORMAT` in a chain of queries with `UNION` when subqueries are inside parentheses. This closes [#39712](https://github.com/ClickHouse/ClickHouse/issues/39712). Change the behavior when a query has the SETTINGS clause specified twice in a sequence. The closest SETTINGS clause will have a preference for the corresponding subquery. In the previous versions, the outermost SETTINGS clause could take a preference over the inner one. [#68614](https://github.com/ClickHouse/ClickHouse/pull/68614) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Reordering of filter conditions from `[PRE]WHERE` clause is now allowed by default. It could be disabled by setting `allow_reorder_prewhere_conditions` to `false`. [#70657](https://github.com/ClickHouse/ClickHouse/pull/70657) ([Nikita Taranov](https://github.com/nickitat)). +* Fix `optimize_functions_to_subcolumns` optimization (previously could lead to `Invalid column type for ColumnUnique::insertRangeFrom. Expected String, got LowCardinality(String)` error), by preserving `LowCardinality` type in `mapKeys`/`mapValues`. [#70716](https://github.com/ClickHouse/ClickHouse/pull/70716) ([Azat Khuzhin](https://github.com/azat)). +* Remove the `idxd-config` library, which has an incompatible license. This also removes the experimental Intel DeflateQPL codec. 
[#70987](https://github.com/ClickHouse/ClickHouse/pull/70987) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### New Feature +* MongoDB integration refactored: migration to the new mongocxx driver from the deprecated Poco::MongoDB, removal of support for the deprecated old protocol, support for connection by URI, support for all MongoDB types, support for WHERE and ORDER BY statements on the MongoDB side, and restriction of expressions unsupported by MongoDB. [#63279](https://github.com/ClickHouse/ClickHouse/pull/63279) ([Kirill Nikiforov](https://github.com/allmazz)). +* A new `--progress-table` option in clickhouse-client prints a table with metrics changing during query execution; a new `--enable-progress-table-toggle` is associated with the `--progress-table` option, and toggles the rendering of the progress table by pressing the control key (Space). [#63689](https://github.com/ClickHouse/ClickHouse/pull/63689) ([Maria Khristenko](https://github.com/mariaKhr)). +* Allow granting access to wildcard prefixes: `GRANT SELECT ON db.table_prefix_* TO user`. [#65311](https://github.com/ClickHouse/ClickHouse/pull/65311) ([pufit](https://github.com/pufit)). +* Add `system.query_metric_log`, which contains the history of memory and metric values from table `system.events` for individual queries, periodically flushed to disk. [#66532](https://github.com/ClickHouse/ClickHouse/pull/66532) ([Pablo Marcos](https://github.com/pamarcos)). +* A simple SELECT query can be written with implicit SELECT to enable calculator-style expressions, e.g., `ch "1 + 2"`. This is controlled by a new setting, `implicit_select`. [#68502](https://github.com/ClickHouse/ClickHouse/pull/68502) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support the `--copy` mode for clickhouse-local as a shortcut for format conversion [#68503](https://github.com/ClickHouse/ClickHouse/issues/68503). [#68583](https://github.com/ClickHouse/ClickHouse/pull/68583) ([Denis Hananein](https://github.com/denis-hananein)). +* Add support for the `arrayUnion` function. [#68989](https://github.com/ClickHouse/ClickHouse/pull/68989) ([Peter Nguyen](https://github.com/petern48)). +* Support the aggregate function `quantileExactWeightedInterpolated`, which is an interpolated version based on quantileExactWeighted. Some people may wonder why we need a new `quantileExactWeightedInterpolated` since we already have `quantileExactInterpolatedWeighted`. The reason is that the new one is more accurate than the old one. It is needed for Spark compatibility in Apache Gluten. [#69619](https://github.com/ClickHouse/ClickHouse/pull/69619) ([李扬](https://github.com/taiyang-li)). +* Support the function `arrayElementOrNull`. It returns NULL if the array index is out of range or the map key is not found. [#69646](https://github.com/ClickHouse/ClickHouse/pull/69646) ([李扬](https://github.com/taiyang-li)). +* Allow users to specify regular expressions through the new `message_regexp` and `message_regexp_negative` fields in the `config.xml` file to filter out log messages. The filtering is applied to the formatted, un-colored text for the most intuitive developer experience. [#69657](https://github.com/ClickHouse/ClickHouse/pull/69657) ([Peter Nguyen](https://github.com/petern48)). +* Support the Dynamic type in most functions by executing them on the internal types inside Dynamic. [#69691](https://github.com/ClickHouse/ClickHouse/pull/69691) ([Pavel Kruglov](https://github.com/Avogar)). +* Re-added the `RIPEMD160` function, which computes the RIPEMD-160 cryptographic hash of a string.
Example: `SELECT HEX(RIPEMD160('The quick brown fox jumps over the lazy dog'))` returns `37F332F68DB77BD9D7EDD4969571AD671CF9DD3B`. [#70087](https://github.com/ClickHouse/ClickHouse/pull/70087) ([Dergousov Maxim](https://github.com/m7kss1)). +* Allow to cache read files for object storage table engines and data lakes using hash from ETag + file path as cache key. [#70135](https://github.com/ClickHouse/ClickHouse/pull/70135) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support reading Iceberg tables on HDFS. [#70268](https://github.com/ClickHouse/ClickHouse/pull/70268) ([flynn](https://github.com/ucasfl)). +* Allow to read/write JSON type as binary string in RowBinary format under settings `input_format_binary_read_json_as_string/output_format_binary_write_json_as_string`. [#70288](https://github.com/ClickHouse/ClickHouse/pull/70288) ([Pavel Kruglov](https://github.com/Avogar)). +* Allow to serialize/deserialize JSON column as single String column in Native format. For output use setting `output_format_native_write_json_as_string`. For input, use serialization version `1` before the column data. [#70312](https://github.com/ClickHouse/ClickHouse/pull/70312) ([Pavel Kruglov](https://github.com/Avogar)). +* Supports standard CTE, `with insert`, as previously only supports `insert ... with ...`. [#70593](https://github.com/ClickHouse/ClickHouse/pull/70593) ([Shichao Jin](https://github.com/jsc0218)). + +#### Performance Improvement +* Support minmax index for `pointInPolygon`. [#62085](https://github.com/ClickHouse/ClickHouse/pull/62085) ([JackyWoo](https://github.com/JackyWoo)). +* Add support for parquet bloom filters. [#62966](https://github.com/ClickHouse/ClickHouse/pull/62966) ([Arthur Passos](https://github.com/arthurpassos)). +* Lock-free parts rename to avoid INSERT affect SELECT (due to parts lock) (under normal circumstances with `fsync_part_directory`, QPS of SELECT with INSERT in parallel, increased 2x, under heavy load the effect is even bigger). Note, this only includes `ReplicatedMergeTree` for now. [#64955](https://github.com/ClickHouse/ClickHouse/pull/64955) ([Azat Khuzhin](https://github.com/azat)). +* Respect `ttl_only_drop_parts` on `materialize ttl`; only read necessary columns to recalculate TTL and drop parts by replacing them with an empty one. [#65488](https://github.com/ClickHouse/ClickHouse/pull/65488) ([Andrey Zvonov](https://github.com/zvonand)). +* Refactor `IDisk` and `IObjectStorage` for better performance. Tables from `plain` and `plain_rewritable` object storages will initialize faster. [#68146](https://github.com/ClickHouse/ClickHouse/pull/68146) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Optimized thread creation in the ThreadPool to minimize lock contention. Thread creation is now performed outside of the critical section to avoid delays in job scheduling and thread management under high load conditions. This leads to a much more responsive ClickHouse under heavy concurrent load. [#68694](https://github.com/ClickHouse/ClickHouse/pull/68694) ([filimonov](https://github.com/filimonov)). +* Enable reading LowCardinality string columns from ORC. [#69481](https://github.com/ClickHouse/ClickHouse/pull/69481) ([李扬](https://github.com/taiyang-li)). +* Added an ability to parse data directly into sparse columns. [#69828](https://github.com/ClickHouse/ClickHouse/pull/69828) ([Anton Popov](https://github.com/CurtizJ)). +* Supports parallel reading of parquet row groups and prefetching of row groups in single-threaded mode. 
[#69862](https://github.com/ClickHouse/ClickHouse/pull/69862) ([LiuNeng](https://github.com/liuneng1994)). +* Improved performance of parsing formats with high number of missed values (e.g. `JSONEachRow`). [#69875](https://github.com/ClickHouse/ClickHouse/pull/69875) ([Anton Popov](https://github.com/CurtizJ)). +* Use `LowCardinality` for `ProfileEvents` in system logs such as `part_log`, `query_views_log`, `filesystem_cache_log`. [#70152](https://github.com/ClickHouse/ClickHouse/pull/70152) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve performance of FromUnixTimestamp/ToUnixTimestamp functions. [#71042](https://github.com/ClickHouse/ClickHouse/pull/71042) ([kevinyhzou](https://github.com/KevinyhZou)). + +#### Improvement +* Allow parametrised SQL aliases. [#50665](https://github.com/ClickHouse/ClickHouse/pull/50665) ([Anton Kozlov](https://github.com/tonickkozlov)). +* Fixed [#57616](https://github.com/ClickHouse/ClickHouse/issues/57616) this problem occurs because all positive number arguments are automatically identified as `uint64` type, leading to an inability to match int type data in `summapfiltered`. the issue of non-matching is indeed confusing, as the `uint64` parameters are not specified by the user. additionally, if the arguments are `[1,2,3,toint8(-3)]`, due to the `getleastsupertype()`, these parameters will be uniformly treated as `int` type, causing `'1,2,3'` to also fail in matching the `uint` type data in `summapfiltered`. [#58408](https://github.com/ClickHouse/ClickHouse/pull/58408) ([Chen768959](https://github.com/Chen768959)). +* `ALTER TABLE .. REPLACE PARTITION` doesn't wait anymore for mutations/merges that happen in other partitions. [#59138](https://github.com/ClickHouse/ClickHouse/pull/59138) ([Vasily Nemkov](https://github.com/Enmk)). +* Refreshable materialized views are now supported in Replicated databases. [#60669](https://github.com/ClickHouse/ClickHouse/pull/60669) ([Michael Kolupaev](https://github.com/al13n321)). +* Symbolic links for tables in the `data/database_name/` directory are created for the actual paths to the table's data, depending on the storage policy, instead of the `store/...` directory on the default disk. [#61777](https://github.com/ClickHouse/ClickHouse/pull/61777) ([Kirill](https://github.com/kirillgarbar)). +* Apply configuration updates in global context object. It fixes issues like [#62308](https://github.com/ClickHouse/ClickHouse/issues/62308). [#62944](https://github.com/ClickHouse/ClickHouse/pull/62944) ([Amos Bird](https://github.com/amosbird)). +* Reworked settings that control the behavior of parallel replicas algorithms. A quick recap: ClickHouse has four different algorithms for parallel reading involving multiple replicas, which is reflected in the setting `parallel_replicas_mode`, the default value for it is `read_tasks` Additionally, the toggle-switch setting `enable_parallel_replicas` has been added. [#63151](https://github.com/ClickHouse/ClickHouse/pull/63151) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix `ReadSettings` not using user set values, because defaults were only used. [#65625](https://github.com/ClickHouse/ClickHouse/pull/65625) ([Kseniia Sumarokova](https://github.com/kssenii)). +* While parsing an Enum field from JSON, a string containing an integer will be interpreted as the corresponding Enum element. This closes [#65119](https://github.com/ClickHouse/ClickHouse/issues/65119). 
[#66801](https://github.com/ClickHouse/ClickHouse/pull/66801) ([scanhex12](https://github.com/scanhex12)). +* Allow `TRIM`-ing a `LEADING` or `TRAILING` empty string as a no-op. Closes [#67792](https://github.com/ClickHouse/ClickHouse/issues/67792). [#68455](https://github.com/ClickHouse/ClickHouse/pull/68455) ([Peter Nguyen](https://github.com/petern48)). +* Support creating a table with a query: `CREATE TABLE ... CLONE AS ...`. It clones the source table's schema and then attaches all partitions to the newly created table. This feature is only supported with tables of the `MergeTree` family. Closes [#65015](https://github.com/ClickHouse/ClickHouse/issues/65015). [#69091](https://github.com/ClickHouse/ClickHouse/pull/69091) ([tuanpach](https://github.com/tuanpach)). +* In Gluten ClickHouse, Spark's timestamp type is mapped to ClickHouse's datetime64(6) type. When casting timestamp '2012-01-01 00:11:22' as a string, Spark returns '2012-01-01 00:11:22', while Gluten ClickHouse returns '2012-01-01 00:11:22.000000'. [#69179](https://github.com/ClickHouse/ClickHouse/pull/69179) ([Wenzheng Liu](https://github.com/lwz9103)). +* Always use the new analyzer to calculate constant expressions when `enable_analyzer` is set to `true`. Support calculation of `executable()` table function arguments without using a `SELECT` query for the constant expression. [#69292](https://github.com/ClickHouse/ClickHouse/pull/69292) ([Dmitry Novik](https://github.com/novikd)). +* Add `enable_secure_identifiers` to disallow insecure identifiers. [#69411](https://github.com/ClickHouse/ClickHouse/pull/69411) ([tuanpach](https://github.com/tuanpach)). +* Add `show_create_query_identifier_quoting_rule` to define the identifier quoting behavior of the `SHOW CREATE` query result. Possible values: - `user_display`: When the identifier is a keyword. - `when_necessary`: When the identifier is one of `{"distinct", "all", "table"}`, or it can cause ambiguity: column names, dictionary attribute names. - `always`: Always quote identifiers. [#69448](https://github.com/ClickHouse/ClickHouse/pull/69448) ([tuanpach](https://github.com/tuanpach)). +* Follow-up to https://github.com/ClickHouse/ClickHouse/pull/69346: point 4 described there now works as well. [#69563](https://github.com/ClickHouse/ClickHouse/pull/69563) ([Vitaly Baranov](https://github.com/vitlibar)). +* Implement generic SerDe between the Avro Union and ClickHouse Variant types. Resolves [#69713](https://github.com/ClickHouse/ClickHouse/issues/69713). [#69712](https://github.com/ClickHouse/ClickHouse/pull/69712) ([Jiří Kozlovský](https://github.com/jirislav)). +* `CREATE TABLE AS` now copies `PRIMARY KEY`, `ORDER BY`, and similar clauses; currently this is supported only for the MergeTree family of table engines. Previously, statements where the destination table did not provide an `ORDER BY` or `PRIMARY KEY` expression in the table definition would throw an exception; now those clauses are copied from the source table (see the sketch below). [#69739](https://github.com/ClickHouse/ClickHouse/pull/69739) ([sakulali](https://github.com/sakulali)). +* Added user-level settings `min_free_disk_bytes_to_throw_insert` and `min_free_disk_ratio_to_throw_insert` to prevent insertions on disks that are almost full. [#69755](https://github.com/ClickHouse/ClickHouse/pull/69755) ([Marco Vilas Boas](https://github.com/marco-vb)).
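To make the `CREATE TABLE ... AS` change above concrete, here is a minimal sketch based only on that entry's description of copying `ORDER BY`/`PRIMARY KEY` from the source table; the table and column names are hypothetical.

```sql
-- The source table defines the sorting key.
CREATE TABLE events_src
(
    ts DateTime,
    user_id UInt64,
    value Float64
)
ENGINE = MergeTree
ORDER BY (user_id, ts);

-- Previously this failed because the destination specifies ENGINE = MergeTree
-- without its own ORDER BY / PRIMARY KEY; now the clauses are copied from events_src.
CREATE TABLE events_copy ENGINE = MergeTree AS events_src;
```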
+* If you run `clickhouse-client` or another CLI application and it starts up slowly due to an overloaded server, and you start typing your query, such as `SELECT`, previous versions would display the remainder of the terminal echo contents before printing the greeting message, such as `SELECTClickHouse local version 24.10.1.1.` instead of `ClickHouse local version 24.10.1.1.`. Now it is fixed. This closes [#31696](https://github.com/ClickHouse/ClickHouse/issues/31696). [#69856](https://github.com/ClickHouse/ClickHouse/pull/69856) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a new column `readonly_duration` to the `system.replicas` table. It is needed to distinguish actual read-only replicas from sentinel ones in alerts. [#69871](https://github.com/ClickHouse/ClickHouse/pull/69871) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Change the type of the join-to-sort settings to unsigned int. [#69886](https://github.com/ClickHouse/ClickHouse/pull/69886) ([kevinyhzou](https://github.com/KevinyhZou)). +* Support 64-bit XID in Keeper. It can be enabled with the `use_xid_64` config. [#69908](https://github.com/ClickHouse/ClickHouse/pull/69908) ([Antonio Andelic](https://github.com/antonio2368)). +* Added a new function `getSettingOrDefault()` that returns a provided default value and avoids an exception if a custom setting is not found in the current profile (see the example below). [#69917](https://github.com/ClickHouse/ClickHouse/pull/69917) ([Shankar](https://github.com/shiyer7474)). +* Allow an empty needle in the function `replace`, matching PostgreSQL behavior. [#69918](https://github.com/ClickHouse/ClickHouse/pull/69918) ([zhanglistar](https://github.com/zhanglistar)). +* Enhance OpenTelemetry span logging to include query settings. [#70011](https://github.com/ClickHouse/ClickHouse/pull/70011) ([sharathks118](https://github.com/sharathks118)). +* Allow an empty needle in the functions `replaceRegexp*`, like https://github.com/ClickHouse/ClickHouse/pull/69918. [#70053](https://github.com/ClickHouse/ClickHouse/pull/70053) ([zhanglistar](https://github.com/zhanglistar)). +* Add info to higher-order array functions if the lambda result type is unexpected. [#70093](https://github.com/ClickHouse/ClickHouse/pull/70093) ([ttanay](https://github.com/ttanay)). +* Keeper improvement: less blocking during cluster changes. [#70275](https://github.com/ClickHouse/ClickHouse/pull/70275) ([Antonio Andelic](https://github.com/antonio2368)). +* Embedded documentation for settings will be strictly more detailed and complete than the documentation on the website. This is the first step before making the website documentation always auto-generated from the source code. This has lasting implications: it will be guaranteed to have every setting; there is no chance of default values becoming obsolete; we can generate this documentation for each ClickHouse version; and the documentation can be displayed by the server itself even without Internet access. Generate the docs on the website from the source code. [#70289](https://github.com/ClickHouse/ClickHouse/pull/70289) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `WITH IMPLICIT` and `FINAL` keywords to the `SHOW GRANTS` command. Fix a minor bug with implicit grants: [#70094](https://github.com/ClickHouse/ClickHouse/issues/70094). [#70293](https://github.com/ClickHouse/ClickHouse/pull/70293) ([pufit](https://github.com/pufit)). +* Don't disable nonblocking read from the page cache for the entire server when reading from blocking I/O.
[#70299](https://github.com/ClickHouse/ClickHouse/pull/70299) ([Antonio Andelic](https://github.com/antonio2368)). +* Respect `compatibility` for MergeTree settings. The `compatibility` value is taken from the `default` profile on server startup, and default MergeTree settings are changed accordingly. Further changes of the `compatibility` setting do not affect MergeTree settings. [#70322](https://github.com/ClickHouse/ClickHouse/pull/70322) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* clickhouse-client real-time metrics follow-up: restore the cursor when Ctrl+C cancels a query; immediately stop intercepting keystrokes when the query is canceled; display the metrics table if `--progress-table` is on and toggling is disabled. [#70423](https://github.com/ClickHouse/ClickHouse/pull/70423) ([Julia Kartseva](https://github.com/jkartseva)). +* Command-line arguments for Bool settings are set to true when no value is provided for the argument (e.g. `clickhouse-client --optimize_aggregation_in_order --query "SELECT 1"`). [#70459](https://github.com/ClickHouse/ClickHouse/pull/70459) ([davidtsuk](https://github.com/davidtsuk)). +* Avoid spamming the logs with large HTTP response bodies in case of errors during inter-server communication. [#70487](https://github.com/ClickHouse/ClickHouse/pull/70487) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Added a new setting `max_parts_to_move` to control the maximum number of parts that can be moved at once. [#70520](https://github.com/ClickHouse/ClickHouse/pull/70520) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Limit the frequency of certain log messages. [#70601](https://github.com/ClickHouse/ClickHouse/pull/70601) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Don't do validation when synchronizing `user_directories` from Keeper. [#70644](https://github.com/ClickHouse/ClickHouse/pull/70644) ([Raúl Marín](https://github.com/Algunenano)). +* Introduced a special (experimental) mode of the merge selector for MergeTree tables which makes it more aggressive for the partitions that are close to the limit by the number of parts. It is controlled by the `merge_selector_use_blurry_base` MergeTree-level setting. [#70645](https://github.com/ClickHouse/ClickHouse/pull/70645) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* `CHECK TABLE` with the `PART` qualifier was incorrectly formatted in the client. [#70660](https://github.com/ClickHouse/ClickHouse/pull/70660) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support writing the column index and offset index using the native Parquet writer. [#70669](https://github.com/ClickHouse/ClickHouse/pull/70669) ([LiuNeng](https://github.com/liuneng1994)). +* Support parsing `DateTime64` with microseconds and time zones in Joda syntax. [#70737](https://github.com/ClickHouse/ClickHouse/pull/70737) ([kevinyhzou](https://github.com/KevinyhZou)). +* Changed the approach used to figure out whether a cloud storage supports [batch delete](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) or not. [#70786](https://github.com/ClickHouse/ClickHouse/pull/70786) ([Vitaly Baranov](https://github.com/vitlibar)). +* Support Parquet page V2 in the native reader. [#70807](https://github.com/ClickHouse/ClickHouse/pull/70807) ([Arthur Passos](https://github.com/arthurpassos)). +* Add an HTML page for visualizing merges. [#70821](https://github.com/ClickHouse/ClickHouse/pull/70821) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
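A short, hedged example of the `getSettingOrDefault` function mentioned in the entries above. The setting name `custom_a` is made up for illustration and assumes a `custom_` prefix is allowed by the server's `custom_settings_prefixes` configuration.

```sql
-- Returns the value of the custom setting when it is defined in the current
-- profile, and falls back to the provided default instead of throwing otherwise.
SELECT getSettingOrDefault('custom_a', 100) AS value_or_default;
```

In contrast, `getSetting('custom_a')` would throw an exception if the setting is not defined.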
+* Backported in [#71234](https://github.com/ClickHouse/ClickHouse/issues/71234): Do not call the object storage API when listing directories, as this may be cost-inefficient. Instead, store the list of filenames in memory. The trade-offs are increased initial load time and the memory required to store the filenames. [#70823](https://github.com/ClickHouse/ClickHouse/pull/70823) ([Julia Kartseva](https://github.com/jkartseva)). +* Added a check for whether a table has both `storage_policy` and `disk` set after an ALTER query, and a check that a new storage policy is compatible with the old one when the `disk` setting is used. [#70839](https://github.com/ClickHouse/ClickHouse/pull/70839) ([Kirill](https://github.com/kirillgarbar)). +* Add the `system.s3_queue_settings` and `system.azure_queue_settings` tables. [#70841](https://github.com/ClickHouse/ClickHouse/pull/70841) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Functions `base58Encode` and `base58Decode` now accept arguments of type `FixedString`. Example: `SELECT base58Encode(toFixedString('plaintext', 9));` (see also the round-trip sketch below). [#70846](https://github.com/ClickHouse/ClickHouse/pull/70846) ([Faizan Patel](https://github.com/faizan2786)). +* Add the `partition` column to every entry type of the part log. Previously, it was set only for some entries. This closes [#70819](https://github.com/ClickHouse/ClickHouse/issues/70819). [#70848](https://github.com/ClickHouse/ClickHouse/pull/70848) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add merge start and mutate start events into `system.part_log`, which helps with merge analysis and visualization. [#70850](https://github.com/ClickHouse/ClickHouse/pull/70850) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Do not call the LIST object storage API when determining if a file or directory exists on the plain rewritable disk, as it can be cost-inefficient. [#70852](https://github.com/ClickHouse/ClickHouse/pull/70852) ([Julia Kartseva](https://github.com/jkartseva)). +* Add a profile event about the number of merged source parts. It allows monitoring of the fanout of the merge tree in production. [#70908](https://github.com/ClickHouse/ClickHouse/pull/70908) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Reduce the number of object storage HEAD API requests in the plain_rewritable disk. [#70915](https://github.com/ClickHouse/ClickHouse/pull/70915) ([Julia Kartseva](https://github.com/jkartseva)). +* Background downloads to the filesystem cache have been enabled again. [#70929](https://github.com/ClickHouse/ClickHouse/pull/70929) ([Nikita Taranov](https://github.com/nickitat)). +* Add a new merge selector algorithm, named `Trivial`, for professional usage only. It is worse than the `Simple` merge selector. [#70969](https://github.com/ClickHouse/ClickHouse/pull/70969) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in an official stable release) +* Fix the monotonicity of toHour-like conversion functions when the optional time zone argument is passed. [#60264](https://github.com/ClickHouse/ClickHouse/pull/60264) ([Amos Bird](https://github.com/amosbird)). +* Relax the `supportsPrewhere` check for StorageMerge. This fixes [#61064](https://github.com/ClickHouse/ClickHouse/issues/61064). It was hardened unnecessarily in [#60082](https://github.com/ClickHouse/ClickHouse/issues/60082). [#61091](https://github.com/ClickHouse/ClickHouse/pull/61091) ([Amos Bird](https://github.com/amosbird)).
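A small round-trip illustration of the `base58Encode`/`base58Decode` support for `FixedString` arguments from the Improvement entries above; a sketch only, built around the example already given in the entry.

```sql
-- base58Encode/base58Decode now also accept FixedString arguments.
SELECT
    base58Encode(toFixedString('plaintext', 9)) AS encoded,
    base58Decode(base58Encode(toFixedString('plaintext', 9))) AS round_trip;
```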
+* Fix `use_concurrency_control` setting handling so that the `concurrent_threads_soft_limit_num` limit is enforced properly. This enables concurrency control by default, because previously it was broken. [#61473](https://github.com/ClickHouse/ClickHouse/pull/61473) ([Sergei Trifonov](https://github.com/serxa)). +* Fix incorrect JOIN ON section optimization in case of an `IS NULL` check under any other function (like `NOT`) that may lead to wrong results. Closes [#67915](https://github.com/ClickHouse/ClickHouse/issues/67915). [#68049](https://github.com/ClickHouse/ClickHouse/pull/68049) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Prevent `ALTER` queries that would make the `CREATE` query of tables invalid. [#68574](https://github.com/ClickHouse/ClickHouse/pull/68574) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix inconsistent AST formatting for `negate` (`-`) and `NOT` functions with tuples and arrays. [#68600](https://github.com/ClickHouse/ClickHouse/pull/68600) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix insertion of an incomplete type into Dynamic during deserialization. It could lead to `Parameter out of bound` errors. [#69291](https://github.com/ClickHouse/ClickHouse/pull/69291) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix an infinite loop after `RESTORE REPLICA` in the replicated merge tree with zero-copy replication. [#69293](https://github.com/ClickHouse/ClickHouse/pull/69293) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Restore the default value of `processing_threads_num` to the number of CPU cores in the `S3Queue` storage. [#69384](https://github.com/ClickHouse/ClickHouse/pull/69384) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Bypass the try/catch flow when de/serializing nested repeated protobuf to nested columns (fixes [#41971](https://github.com/ClickHouse/ClickHouse/issues/41971)). [#69556](https://github.com/ClickHouse/ClickHouse/pull/69556) ([Eliot Hautefeuille](https://github.com/hileef)). +* Fix a crash during insertion into a FixedString column in the PostgreSQL engine. [#69584](https://github.com/ClickHouse/ClickHouse/pull/69584) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix a crash when executing `create view t as (with recursive 42 as ttt select ttt);`. [#69676](https://github.com/ClickHouse/ClickHouse/pull/69676) ([Han Fei](https://github.com/hanfei1991)). +* Added a `strict_once` mode to the aggregate function `windowFunnel` to avoid counting one event several times in case it matches multiple conditions (see the sketch below); close [#21835](https://github.com/ClickHouse/ClickHouse/issues/21835). [#69738](https://github.com/ClickHouse/ClickHouse/pull/69738) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fixed `maxMapState` throwing 'Bad get' if the value type is DateTime64. [#69787](https://github.com/ClickHouse/ClickHouse/pull/69787) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix `getSubcolumn` with `LowCardinality` columns by overriding `useDefaultImplementationForLowCardinalityColumns` to return `true`. [#69831](https://github.com/ClickHouse/ClickHouse/pull/69831) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Fix permanently blocked distributed sends if a DROP of a distributed table fails. [#69843](https://github.com/ClickHouse/ClickHouse/pull/69843) ([Azat Khuzhin](https://github.com/azat)). +* Fix non-cancellable queries containing WITH FILL with NaN keys. This closes [#69261](https://github.com/ClickHouse/ClickHouse/issues/69261).
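A hedged sketch of the new `strict_once` mode for `windowFunnel` described above. The `events` table, its columns, and the condition values are hypothetical.

```sql
-- With 'strict_once', an event that matches several conditions is counted
-- for at most one step of the funnel.
SELECT
    user_id,
    windowFunnel(3600, 'strict_once')(
        event_time,
        event = 'page_view',
        event = 'add_to_cart',
        event = 'checkout'
    ) AS funnel_depth
FROM events
GROUP BY user_id;
```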
[#69845](https://github.com/ClickHouse/ClickHouse/pull/69845) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix the analyzer default with an old compatibility value. [#69895](https://github.com/ClickHouse/ClickHouse/pull/69895) ([Raúl Marín](https://github.com/Algunenano)). +* Don't check dependencies during `CREATE OR REPLACE VIEW` while dropping the old table. Previously, a `CREATE OR REPLACE` query failed when there were tables dependent on the recreated view. [#69907](https://github.com/ClickHouse/ClickHouse/pull/69907) ([Pavel Kruglov](https://github.com/Avogar)). +* Implement missing decimal cases for `zeroField`. Fixes [#69730](https://github.com/ClickHouse/ClickHouse/issues/69730). [#69978](https://github.com/ClickHouse/ClickHouse/pull/69978) ([Arthur Passos](https://github.com/arthurpassos)). +* Now SQL security will work with parameterized views correctly. [#69984](https://github.com/ClickHouse/ClickHouse/pull/69984) ([pufit](https://github.com/pufit)). +* Closes [#69752](https://github.com/ClickHouse/ClickHouse/issues/69752). [#69985](https://github.com/ClickHouse/ClickHouse/pull/69985) ([pufit](https://github.com/pufit)). +* Fixed a bug where the time zone could change the result of a query with `Date` or `Date32` arguments. [#70036](https://github.com/ClickHouse/ClickHouse/pull/70036) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fixes `Block structure mismatch` for queries with nested views and a `WHERE` condition. Fixes [#66209](https://github.com/ClickHouse/ClickHouse/issues/66209). [#70054](https://github.com/ClickHouse/ClickHouse/pull/70054) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Avoid reusing columns among different named tuples when evaluating `tuple` functions. This fixes [#70022](https://github.com/ClickHouse/ClickHouse/issues/70022). [#70103](https://github.com/ClickHouse/ClickHouse/pull/70103) ([Amos Bird](https://github.com/amosbird)). +* Fix a wrong LOGICAL_ERROR when replacing literals in ranges. [#70122](https://github.com/ClickHouse/ClickHouse/pull/70122) ([Pablo Marcos](https://github.com/pamarcos)). +* Check for the Nullable(Nothing) type during ALTER TABLE MODIFY COLUMN/QUERY to prevent creating tables with such a data type. [#70123](https://github.com/ClickHouse/ClickHouse/pull/70123) ([Pavel Kruglov](https://github.com/Avogar)). +* Proper error message for the illegal query `JOIN ... ON *`, close [#68650](https://github.com/ClickHouse/ClickHouse/issues/68650). [#70124](https://github.com/ClickHouse/ClickHouse/pull/70124) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix a wrong result with a skipping index. [#70127](https://github.com/ClickHouse/ClickHouse/pull/70127) ([Raúl Marín](https://github.com/Algunenano)). +* Fix a data race in the ColumnObject/ColumnTuple decompress method that could lead to heap use after free. [#70137](https://github.com/ClickHouse/ClickHouse/pull/70137) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix a possible hang in ALTER COLUMN with the Dynamic type. [#70144](https://github.com/ClickHouse/ClickHouse/pull/70144) ([Pavel Kruglov](https://github.com/Avogar)). +* Now ClickHouse will consider more errors as retriable and will not mark data parts as broken in case of such errors. [#70145](https://github.com/ClickHouse/ClickHouse/pull/70145) ([alesapin](https://github.com/alesapin)). +* Use the correct `max_types` parameter during Dynamic type creation for a JSON subcolumn. [#70147](https://github.com/ClickHouse/ClickHouse/pull/70147) ([Pavel Kruglov](https://github.com/Avogar)).
+* Fix the password being displayed in `system.query_log` for users with bcrypt password authentication method. [#70148](https://github.com/ClickHouse/ClickHouse/pull/70148) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix event counter for native interface (InterfaceNativeSendBytes). [#70153](https://github.com/ClickHouse/ClickHouse/pull/70153) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix possible crash in JSON column. [#70172](https://github.com/ClickHouse/ClickHouse/pull/70172) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix multiple issues with arrayMin and arrayMax. [#70207](https://github.com/ClickHouse/ClickHouse/pull/70207) ([Raúl Marín](https://github.com/Algunenano)). +* Respect setting allow_simdjson in JSON type parser. [#70218](https://github.com/ClickHouse/ClickHouse/pull/70218) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix server segfault on creating a materialized view with two selects and an `INTERSECT`, e.g. `CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1);`. [#70264](https://github.com/ClickHouse/ClickHouse/pull/70264) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Don't modify global settings with startup scripts. Previously, changing a setting in a startup script would change it globally. [#70310](https://github.com/ClickHouse/ClickHouse/pull/70310) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix ALTER of Dynamic type with reducing max_types parameter that could lead to server crash. [#70328](https://github.com/ClickHouse/ClickHouse/pull/70328) ([Pavel Kruglov](https://github.com/Avogar)). +* Fix crash when using WITH FILL incorrectly. [#70338](https://github.com/ClickHouse/ClickHouse/pull/70338) ([Raúl Marín](https://github.com/Algunenano)). +* Fix possible use-after-free in `SYSTEM DROP FORMAT SCHEMA CACHE FOR Protobuf`. [#70358](https://github.com/ClickHouse/ClickHouse/pull/70358) ([Azat Khuzhin](https://github.com/azat)). +* Fix crash during GROUP BY JSON sub-object subcolumn. [#70374](https://github.com/ClickHouse/ClickHouse/pull/70374) ([Pavel Kruglov](https://github.com/Avogar)). +* Don't prefetch parts for vertical merges if part has no rows. [#70452](https://github.com/ClickHouse/ClickHouse/pull/70452) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix crash in WHERE with lambda functions. [#70464](https://github.com/ClickHouse/ClickHouse/pull/70464) ([Raúl Marín](https://github.com/Algunenano)). +* Fix table creation with `CREATE ... AS table_function()` with database `Replicated` and unavailable table function source on secondary replica. [#70511](https://github.com/ClickHouse/ClickHouse/pull/70511) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Ignore all output on async insert with `wait_for_async_insert=1`. Closes [#62644](https://github.com/ClickHouse/ClickHouse/issues/62644). [#70530](https://github.com/ClickHouse/ClickHouse/pull/70530) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Ignore frozen_metadata.txt while traversing shadow directory from system.remote_data_paths. [#70590](https://github.com/ClickHouse/ClickHouse/pull/70590) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Fix creation of stateful window functions on misaligned memory. [#70631](https://github.com/ClickHouse/ClickHouse/pull/70631) ([Raúl Marín](https://github.com/Algunenano)). +* Fixed rare crashes in `SELECT`-s and merges after adding a column of `Array` type with non-empty default expression. 
[#70695](https://github.com/ClickHouse/ClickHouse/pull/70695) ([Anton Popov](https://github.com/CurtizJ)). +* Inserts into the `s3` table function now respect query settings. [#70696](https://github.com/ClickHouse/ClickHouse/pull/70696) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix infinite recursion when inferring a Protobuf schema with skipping of unsupported fields enabled. [#70697](https://github.com/ClickHouse/ClickHouse/pull/70697) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71122](https://github.com/ClickHouse/ClickHouse/issues/71122): `GroupArraySortedData` uses a `PODArray` with non-POD elements, manually calling constructors and destructors for the elements as needed. But it wasn't careful enough: in two places it forgot to call the destructor, and in one place it left elements uninitialized if an exception was thrown while deserializing previous elements. `GroupArraySortedData`'s destructor then called destructors on uninitialized elements and crashed with a segmentation fault during merges (the full stack trace is omitted here; see the linked pull request). [#70820](https://github.com/ClickHouse/ClickHouse/pull/70820) ([Michael Kolupaev](https://github.com/al13n321)). +* Disable `enable_named_columns_in_function_tuple` by default. [#70833](https://github.com/ClickHouse/ClickHouse/pull/70833) ([Raúl Marín](https://github.com/Algunenano)). +* Fix the S3Queue table engine setting `processing_threads_num` not being effective in case it was deduced from the number of CPU cores on the server. [#70837](https://github.com/ClickHouse/ClickHouse/pull/70837) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Normalize named tuple arguments in aggregation states. This fixes [#69732](https://github.com/ClickHouse/ClickHouse/issues/69732). [#70853](https://github.com/ClickHouse/ClickHouse/pull/70853) ([Amos Bird](https://github.com/amosbird)). +* Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#71214](https://github.com/ClickHouse/ClickHouse/issues/71214): Fix logical error in `StorageS3Queue` "Cannot create a persistent node in /processed since it already exists". [#70984](https://github.com/ClickHouse/ClickHouse/pull/70984) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#71243](https://github.com/ClickHouse/ClickHouse/issues/71243): Fixed named sessions not being closed and hanging on forever under certain circumstances.
[#70998](https://github.com/ClickHouse/ClickHouse/pull/70998) ([Márcio Martins](https://github.com/marcio-absmartly)). +* Backported in [#71157](https://github.com/ClickHouse/ClickHouse/issues/71157): Fix a bug where the `_row_exists` column was not considered in the rebuild option of lightweight delete with projections. [#71089](https://github.com/ClickHouse/ClickHouse/pull/71089) ([Shichao Jin](https://github.com/jsc0218)). +* Backported in [#71265](https://github.com/ClickHouse/ClickHouse/issues/71265): Fix a wrong value in `system.query_metric_log` due to an unexpected race condition. [#71124](https://github.com/ClickHouse/ClickHouse/pull/71124) ([Pablo Marcos](https://github.com/pamarcos)). +* Backported in [#71331](https://github.com/ClickHouse/ClickHouse/issues/71331): Fix async inserts with empty blocks via the native protocol. [#71312](https://github.com/ClickHouse/ClickHouse/pull/71312) ([Anton Popov](https://github.com/CurtizJ)). + +#### Build/Testing/Packaging Improvement +* Docker in the integration tests runner is updated to the latest version. It was previously pinned until patch release 24.0.3 was out (https://github.com/moby/moby/issues/45770#issuecomment-1618255130). The HDFS image was deprecated and not running with the current Docker version, so we switched to a newer version of a derivative image based on Ubuntu. HDFS tests were hardened to allow them to run with python-repeat. [#66867](https://github.com/ClickHouse/ClickHouse/pull/66867) ([Ilya Yatsishin](https://github.com/qoega)). +* Alpine Docker images now use Ubuntu 22.04 as the glibc donor, which results in an upgrade of the glibc version delivered with Alpine images from 2.31 to 2.35. [#69033](https://github.com/ClickHouse/ClickHouse/pull/69033) ([filimonov](https://github.com/filimonov)). +* Make `dbms` independent from `clickhouse_functions`. [#69914](https://github.com/ClickHouse/ClickHouse/pull/69914) ([Raúl Marín](https://github.com/Algunenano)). +* Fix FreeBSD compilation of the MariaDB connector. [#70007](https://github.com/ClickHouse/ClickHouse/pull/70007) ([Raúl Marín](https://github.com/Algunenano)). +* Building on Apple Mac OS X Darwin does not produce strange warnings anymore. [#70411](https://github.com/ClickHouse/ClickHouse/pull/70411) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix building with the ARCH_NATIVE CMake flag. [#70585](https://github.com/ClickHouse/ClickHouse/pull/70585) ([Daniil Gentili](https://github.com/danog)). +* The universal installer will download the Musl build on Alpine Linux. Some Docker containers are using Alpine Linux, but it was not possible to install ClickHouse there with `curl https://clickhouse.com/ | sh`. [#70767](https://github.com/ClickHouse/ClickHouse/pull/70767) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NO CL CATEGORY + +* Backported in [#71259](https://github.com/ClickHouse/ClickHouse/issues/71259):. [#71220](https://github.com/ClickHouse/ClickHouse/pull/71220) ([Raúl Marín](https://github.com/Algunenano)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "JSONCompactWithProgress query output format"'. [#69989](https://github.com/ClickHouse/ClickHouse/pull/69989) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert "Support CREATE OR REPLACE VIEW atomically"'. [#70535](https://github.com/ClickHouse/ClickHouse/pull/70535) ([Raúl Marín](https://github.com/Algunenano)). +* NO CL ENTRY: 'Revert "Revert "Support CREATE OR REPLACE VIEW atomically""'. [#70536](https://github.com/ClickHouse/ClickHouse/pull/70536) ([Raúl Marín](https://github.com/Algunenano)).
+* NO CL ENTRY: 'Revert "Add projections size to system.projections"'. [#70858](https://github.com/ClickHouse/ClickHouse/pull/70858) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Allow writing arguments of `has`, `hasAny`, or `hasAll` as string values if the array element type is `Enum` (see the sketch below). [#56555](https://github.com/ClickHouse/ClickHouse/pull/56555) ([Duc Canh Le](https://github.com/canhld94)). +* Rename FileSegmentKind::Ephemeral and other changes. [#66600](https://github.com/ClickHouse/ClickHouse/pull/66600) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Closes [#67345](https://github.com/ClickHouse/ClickHouse/issues/67345). [#67346](https://github.com/ClickHouse/ClickHouse/pull/67346) ([KrJin](https://github.com/jincong8973)). +* Because it is too complicated to support. [#68410](https://github.com/ClickHouse/ClickHouse/pull/68410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix 01600_parts_states_metrics_long flakiness. [#68521](https://github.com/ClickHouse/ClickHouse/pull/68521) ([Azat Khuzhin](https://github.com/azat)). +* Reduce client start time in debug/sanitizer mode. [#68980](https://github.com/ClickHouse/ClickHouse/pull/68980) ([Raúl Marín](https://github.com/Algunenano)). +* Closes [#69038](https://github.com/ClickHouse/ClickHouse/issues/69038). [#69040](https://github.com/ClickHouse/ClickHouse/pull/69040) ([Nikolay Degterinsky](https://github.com/evillique)). +* Better exception for an unsupported full_text index with non-full parts. [#69067](https://github.com/ClickHouse/ClickHouse/pull/69067) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Catch additional ZooKeeper connection errors while creating a table and make sure to clean up directories if necessary for retries. [#69093](https://github.com/ClickHouse/ClickHouse/pull/69093) ([Sumit](https://github.com/sum12)). +* Update version_date.tsv and changelog after v24.7.5.37-stable. [#69185](https://github.com/ClickHouse/ClickHouse/pull/69185) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* DOCS: Replace live view with refreshable since the former is deprecated. [#69392](https://github.com/ClickHouse/ClickHouse/pull/69392) ([Damian Kula](https://github.com/heavelock)). +* Update ORC to the current HEAD. [#69473](https://github.com/ClickHouse/ClickHouse/pull/69473) ([Nikita Taranov](https://github.com/nickitat)). +* Make a test ready for the flaky check. [#69586](https://github.com/ClickHouse/ClickHouse/pull/69586) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Support the ANTLR parser parsing SQL with some keywords as aliases, making the behavior the same as clickhouse-server; remove a redundant `for` in the `keyword` field. [#69614](https://github.com/ClickHouse/ClickHouse/pull/69614) ([Z.H.](https://github.com/onlyacat)). +* Allow default implementations for NULL in the function `mapFromArrays` for Spark compatibility in Apache Gluten. The current change doesn't have any side effects on ClickHouse in theory. [#69715](https://github.com/ClickHouse/ClickHouse/pull/69715) ([李扬](https://github.com/taiyang-li)). +* Fix the exception message in AzureBlobStorage. [#69728](https://github.com/ClickHouse/ClickHouse/pull/69728) ([Pavel Kruglov](https://github.com/Avogar)). +* Add a test parsing an s3 URL with a bucket name including a dot. [#69743](https://github.com/ClickHouse/ClickHouse/pull/69743) ([Kaushik Iska](https://github.com/iskakaushik)). +* Make `clang-tidy` happy.
[#69765](https://github.com/ClickHouse/ClickHouse/pull/69765) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Prepare to enable `clang-tidy` `readability-else-after-return`. [#69768](https://github.com/ClickHouse/ClickHouse/pull/69768) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* S3Queue: support having deprecated settings to not fail server startup. [#69769](https://github.com/ClickHouse/ClickHouse/pull/69769) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Use only adaptive heuristic to choose task sizes for remote reading. [#69778](https://github.com/ClickHouse/ClickHouse/pull/69778) ([Nikita Taranov](https://github.com/nickitat)). +* Remove unused buggy code. [#69780](https://github.com/ClickHouse/ClickHouse/pull/69780) ([Raúl Marín](https://github.com/Algunenano)). +* Fix bugfix check. [#69789](https://github.com/ClickHouse/ClickHouse/pull/69789) ([Antonio Andelic](https://github.com/antonio2368)). +* Followup for [#63279](https://github.com/ClickHouse/ClickHouse/issues/63279). [#69790](https://github.com/ClickHouse/ClickHouse/pull/69790) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Update version after release. [#69816](https://github.com/ClickHouse/ClickHouse/pull/69816) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update ext-dict-functions.md. [#69819](https://github.com/ClickHouse/ClickHouse/pull/69819) ([kurikuQwQ](https://github.com/kurikuQwQ)). +* Allow cyrillic characters in generated contributor names. [#69820](https://github.com/ClickHouse/ClickHouse/pull/69820) ([Raúl Marín](https://github.com/Algunenano)). +* CI: praktika integration 1. [#69822](https://github.com/ClickHouse/ClickHouse/pull/69822) ([Max Kainov](https://github.com/maxknv)). +* Fix `test_delayed_replica_failover`. [#69826](https://github.com/ClickHouse/ClickHouse/pull/69826) ([Antonio Andelic](https://github.com/antonio2368)). +* minor change, less conflicts. [#69830](https://github.com/ClickHouse/ClickHouse/pull/69830) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Improve error message DDLWorker.cpp. [#69835](https://github.com/ClickHouse/ClickHouse/pull/69835) ([Denny Crane](https://github.com/den-crane)). +* Fix typo in description: mutation_sync -> mutations_sync. [#69838](https://github.com/ClickHouse/ClickHouse/pull/69838) ([Alexander Gololobov](https://github.com/davenger)). +* Fix changelog. [#69841](https://github.com/ClickHouse/ClickHouse/pull/69841) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* This closes [#49940](https://github.com/ClickHouse/ClickHouse/issues/49940). [#69842](https://github.com/ClickHouse/ClickHouse/pull/69842) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* This closes [#51036](https://github.com/ClickHouse/ClickHouse/issues/51036). [#69844](https://github.com/ClickHouse/ClickHouse/pull/69844) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update README.md - Update meetups. [#69849](https://github.com/ClickHouse/ClickHouse/pull/69849) ([Tanya Bragin](https://github.com/tbragin)). +* Revert [#69790](https://github.com/ClickHouse/ClickHouse/issues/69790) and [#63279](https://github.com/ClickHouse/ClickHouse/issues/63279). [#69850](https://github.com/ClickHouse/ClickHouse/pull/69850) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* See [#63279](https://github.com/ClickHouse/ClickHouse/issues/63279). [#69851](https://github.com/ClickHouse/ClickHouse/pull/69851) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
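A hedged sketch of the `has`/`hasAny`/`hasAll` change for `Enum` array elements referenced a few entries above ([#56555](https://github.com/ClickHouse/ClickHouse/pull/56555)); the table and enum values are illustrative only.

```sql
CREATE TABLE tags_demo
(
    tags Array(Enum8('click' = 1, 'view' = 2, 'purchase' = 3))
)
ENGINE = MergeTree
ORDER BY tuple();

INSERT INTO tags_demo VALUES (['click', 'view']);

-- The needle can now be written as a plain string instead of an Enum value.
SELECT
    has(tags, 'click')              AS has_click,
    hasAny(tags, ['purchase'])      AS has_any_purchase,
    hasAll(tags, ['click', 'view']) AS has_all
FROM tags_demo;
```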
+* Add a test for [#50928](https://github.com/ClickHouse/ClickHouse/issues/50928). [#69852](https://github.com/ClickHouse/ClickHouse/pull/69852) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#55981](https://github.com/ClickHouse/ClickHouse/issues/55981). [#69853](https://github.com/ClickHouse/ClickHouse/pull/69853) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#56823](https://github.com/ClickHouse/ClickHouse/issues/56823). [#69854](https://github.com/ClickHouse/ClickHouse/pull/69854) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* This closes [#62350](https://github.com/ClickHouse/ClickHouse/issues/62350). [#69855](https://github.com/ClickHouse/ClickHouse/pull/69855) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Refactor functions and variables in statistics code. [#69860](https://github.com/ClickHouse/ClickHouse/pull/69860) ([Robert Schulze](https://github.com/rschu1ze)). +* Resubmit [#63279](https://github.com/ClickHouse/ClickHouse/issues/63279). [#69861](https://github.com/ClickHouse/ClickHouse/pull/69861) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Improve the stateless test runner. [#69864](https://github.com/ClickHouse/ClickHouse/pull/69864) ([Alexey Katsman](https://github.com/alexkats)). +* Adjust the fast test time limit a bit. [#69874](https://github.com/ClickHouse/ClickHouse/pull/69874) ([Raúl Marín](https://github.com/Algunenano)). +* Add initial 24.9 CHANGELOG. [#69876](https://github.com/ClickHouse/ClickHouse/pull/69876) ([Raúl Marín](https://github.com/Algunenano)). +* Fix test `01278_random_string_utf8`. [#69878](https://github.com/ClickHouse/ClickHouse/pull/69878) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix a minor fuzzer issue with experimental statistics. [#69881](https://github.com/ClickHouse/ClickHouse/pull/69881) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix linking after settings refactoring. [#69882](https://github.com/ClickHouse/ClickHouse/pull/69882) ([Robert Schulze](https://github.com/rschu1ze)). +* Add Proj Obsolete Setting. [#69883](https://github.com/ClickHouse/ClickHouse/pull/69883) ([Shichao Jin](https://github.com/jsc0218)). +* Improve remote queries startup time. [#69884](https://github.com/ClickHouse/ClickHouse/pull/69884) ([Igor Nikonov](https://github.com/devcrafter)). +* Revert "Merge pull request [#69032](https://github.com/ClickHouse/ClickHouse/issues/69032) from alexon1234/include_real_time_execution_in_http_header". [#69885](https://github.com/ClickHouse/ClickHouse/pull/69885) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Dedicated commits from https://github.com/ClickHouse/ClickHouse/pull/61473. [#69896](https://github.com/ClickHouse/ClickHouse/pull/69896) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Added aliases `time_bucket` (from TimescaleDB) and `date_bin` (from PostgreSQL) for `toStartOfInterval`. [#69900](https://github.com/ClickHouse/ClickHouse/pull/69900) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* RIPE is an acronym and thus should be capitalized. RIPE stands for **R**ACE **I**ntegrity **P**rimitives **E**valuation, and RACE stands for **R**esearch and Development in **A**dvanced **C**ommunications Technologies in **E**urope. [#69901](https://github.com/ClickHouse/ClickHouse/pull/69901) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Replace error codes with error names in stateless tests.
[#69906](https://github.com/ClickHouse/ClickHouse/pull/69906) ([Dmitry Novik](https://github.com/novikd)). +* Move the setting to 24.10. [#69913](https://github.com/ClickHouse/ClickHouse/pull/69913) ([Raúl Marín](https://github.com/Algunenano)). +* Minor: Reduce diff between public and private repo. [#69928](https://github.com/ClickHouse/ClickHouse/pull/69928) ([Robert Schulze](https://github.com/rschu1ze)). +* Followup for [#69861](https://github.com/ClickHouse/ClickHouse/issues/69861). [#69930](https://github.com/ClickHouse/ClickHouse/pull/69930) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix test_dictionaries_all_layouts_separate_sources. [#69962](https://github.com/ClickHouse/ClickHouse/pull/69962) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Fix test_keeper_mntr_data_size. [#69965](https://github.com/ClickHouse/ClickHouse/pull/69965) ([Antonio Andelic](https://github.com/antonio2368)). +* This closes [#49823](https://github.com/ClickHouse/ClickHouse/issues/49823). [#69981](https://github.com/ClickHouse/ClickHouse/pull/69981) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add changelog for 24.9. [#69982](https://github.com/ClickHouse/ClickHouse/pull/69982) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#45303](https://github.com/ClickHouse/ClickHouse/issues/45303). [#69987](https://github.com/ClickHouse/ClickHouse/pull/69987) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update CHANGELOG.md. [#69988](https://github.com/ClickHouse/ClickHouse/pull/69988) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update README.md. [#69991](https://github.com/ClickHouse/ClickHouse/pull/69991) ([Tyler Hannan](https://github.com/tylerhannan)). +* Disable `03215_parallel_replicas_crash_after_refactoring.sql` for Azure. [#69992](https://github.com/ClickHouse/ClickHouse/pull/69992) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Update CHANGELOG.md. [#69993](https://github.com/ClickHouse/ClickHouse/pull/69993) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update CHANGELOG.md. [#70004](https://github.com/ClickHouse/ClickHouse/pull/70004) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Revert "Add RIPEMD160 function". [#70005](https://github.com/ClickHouse/ClickHouse/pull/70005) ([Robert Schulze](https://github.com/rschu1ze)). +* Update CHANGELOG.md. [#70009](https://github.com/ClickHouse/ClickHouse/pull/70009) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update CHANGELOG.md. [#70010](https://github.com/ClickHouse/ClickHouse/pull/70010) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Make pylint stricter. [#70013](https://github.com/ClickHouse/ClickHouse/pull/70013) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Added a setting `restore_replace_external_dictionary_source_to_null` which enables replacing the dictionary source with Null on restore for external dictionaries (useful for testing). [#70032](https://github.com/ClickHouse/ClickHouse/pull/70032) ([Alexander Tokmakov](https://github.com/tavplubix)). +* `isort` is a simple import sorter for Python to comply with [PEP 8](https://peps.python.org/pep-0008/#imports) requirements. It will help decrease conflicts during sync and beautify the code. The import block is divided into sub-blocks: `standard library` -> `third-party libraries` -> `local imports` -> `.local imports`. Each sub-block is ordered alphabetically with sub-sub-blocks `import X` -> `from X import Y`.
[#70038](https://github.com/ClickHouse/ClickHouse/pull/70038) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update version_date.tsv and changelog after v24.9.1.3278-stable. [#70049](https://github.com/ClickHouse/ClickHouse/pull/70049) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Despite the fact that we set the org-level workflow parameter `PYTHONUNBUFFERED`, it's not inherited in workflows. [#70050](https://github.com/ClickHouse/ClickHouse/pull/70050) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix a UBSan issue in the function sqid. [#70061](https://github.com/ClickHouse/ClickHouse/pull/70061) ([Robert Schulze](https://github.com/rschu1ze)). +* Delete a setting change. [#70071](https://github.com/ClickHouse/ClickHouse/pull/70071) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix `test_distributed_ddl`. [#70075](https://github.com/ClickHouse/ClickHouse/pull/70075) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Remove an unused placeholder from an exception message string. [#70086](https://github.com/ClickHouse/ClickHouse/pull/70086) ([Alsu Giliazova](https://github.com/alsugiliazova)). +* Better exception message when some of the permissions are missing. [#70088](https://github.com/ClickHouse/ClickHouse/pull/70088) ([pufit](https://github.com/pufit)). +* Make vector similarity indexes work with adaptive granularity. [#70101](https://github.com/ClickHouse/ClickHouse/pull/70101) ([Robert Schulze](https://github.com/rschu1ze)). +* Add missing columns `total_rows`, `data_compressed_bytes`, and `data_uncompressed_bytes` to `system.projections`. Part of https://github.com/ClickHouse/ClickHouse/pull/68901. [#70106](https://github.com/ClickHouse/ClickHouse/pull/70106) ([Jordi Villar](https://github.com/jrdi)). +* Make `00938_fix_rwlock_segfault_long` non-flaky. [#70109](https://github.com/ClickHouse/ClickHouse/pull/70109) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove a TODO. [#70110](https://github.com/ClickHouse/ClickHouse/pull/70110) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Change the default threshold to enable hyper-threading. [#70111](https://github.com/ClickHouse/ClickHouse/pull/70111) ([Jiebin Sun](https://github.com/jiebinn)). +* Fixed [#69092](https://github.com/ClickHouse/ClickHouse/issues/69092): if `materialized_postgresql_tables_list=table1(id, code),table(id,name)` (`table1` has a name that is a substring of `table`), the `getTableAllowedColumns` method returned `[id, code]` for `table` before this fix. [#70114](https://github.com/ClickHouse/ClickHouse/pull/70114) ([Kruglov Kirill](https://github.com/1on)). +* Reduce log level. [#70117](https://github.com/ClickHouse/ClickHouse/pull/70117) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Rename `getNumberOfPhysicalCPUCores` and fix its description. [#70130](https://github.com/ClickHouse/ClickHouse/pull/70130) ([Nikita Taranov](https://github.com/nickitat)). +* Adding 24.10. [#70132](https://github.com/ClickHouse/ClickHouse/pull/70132) ([Tyler Hannan](https://github.com/tylerhannan)). +* (Re?)-enable libcxx asserts for debug builds. [#70134](https://github.com/ClickHouse/ClickHouse/pull/70134) ([Robert Schulze](https://github.com/rschu1ze)). +* Refactor reading from object storage. [#70141](https://github.com/ClickHouse/ClickHouse/pull/70141) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Silence UBSAN for integer overflows in some datetime functions.
[#70142](https://github.com/ClickHouse/ClickHouse/pull/70142) ([Michael Kolupaev](https://github.com/al13n321)). +* Improve pipdeptree generator for docker images. - Update requirements.txt for the integration tests runner container - Remove some small dependencies, improve `helpers/retry_decorator.py` - Upgrade docker-compose from EOL version 1 to version 2. [#70146](https://github.com/ClickHouse/ClickHouse/pull/70146) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix 'QueryPlan was not initialized' in 'loop' with empty MergeTree. [#70149](https://github.com/ClickHouse/ClickHouse/pull/70149) ([Michael Kolupaev](https://github.com/al13n321)). +* Remove QueryPlan DataStream. [#70158](https://github.com/ClickHouse/ClickHouse/pull/70158) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Update test_storage_s3_queue/test.py. [#70159](https://github.com/ClickHouse/ClickHouse/pull/70159) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Small docs fix. [#70160](https://github.com/ClickHouse/ClickHouse/pull/70160) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Test: PR local plan, non-constant in source stream. [#70173](https://github.com/ClickHouse/ClickHouse/pull/70173) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix performance checks. [#70175](https://github.com/ClickHouse/ClickHouse/pull/70175) ([Antonio Andelic](https://github.com/antonio2368)). +* Simplify test 03246_range_literal_replacement_works. [#70176](https://github.com/ClickHouse/ClickHouse/pull/70176) ([Pablo Marcos](https://github.com/pamarcos)). +* Update 01079_parallel_alter_add_drop_column_zookeeper.sh. [#70196](https://github.com/ClickHouse/ClickHouse/pull/70196) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Require bugfix job for a set of labels. [#70197](https://github.com/ClickHouse/ClickHouse/pull/70197) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* CI: Praktika integration, fast test. [#70239](https://github.com/ClickHouse/ClickHouse/pull/70239) ([Max Kainov](https://github.com/maxknv)). +* Avoid `Cannot schedule a task` error when loading parts. [#70257](https://github.com/ClickHouse/ClickHouse/pull/70257) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Bump usearch to v2.15.2 and SimSIMD to v5.0.0. [#70270](https://github.com/ClickHouse/ClickHouse/pull/70270) ([Robert Schulze](https://github.com/rschu1ze)). +* Instead of balancing tests by `crc32(file_name)` we'll use `add tests to a group with a minimal number of tests`. [#70272](https://github.com/ClickHouse/ClickHouse/pull/70272) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Closes [#70263](https://github.com/ClickHouse/ClickHouse/issues/70263). [#70273](https://github.com/ClickHouse/ClickHouse/pull/70273) ([flynn](https://github.com/ucasfl)). +* Hide MergeTreeSettings implementation. [#70285](https://github.com/ClickHouse/ClickHouse/pull/70285) ([Raúl Marín](https://github.com/Algunenano)). +* CI: Remove await feature from release branches. [#70294](https://github.com/ClickHouse/ClickHouse/pull/70294) ([Max Kainov](https://github.com/maxknv)). +* Fix `test_keeper_four_word_command`. [#70298](https://github.com/ClickHouse/ClickHouse/pull/70298) ([Antonio Andelic](https://github.com/antonio2368)). +* Update version_date.tsv and changelog after v24.9.2.42-stable. [#70301](https://github.com/ClickHouse/ClickHouse/pull/70301) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Synchronize settings with private. 
[#70320](https://github.com/ClickHouse/ClickHouse/pull/70320) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add Ignore Option In DeduplicateMergeProjectionMode. [#70327](https://github.com/ClickHouse/ClickHouse/pull/70327) ([Shichao Jin](https://github.com/jsc0218)). +* CI: Enable Integration Tests for backport PRs. [#70329](https://github.com/ClickHouse/ClickHouse/pull/70329) ([Max Kainov](https://github.com/maxknv)). +* There is [a failed CI job](https://s3.amazonaws.com/clickhouse-test-reports/69778/2d81c38874958bd9d54a25524173bdb1ddf2b75c/stateless_tests__release_.html) which is triggered by [03237_create_or_replace_view_atomically_with_atomic_engine](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/03237_create_or_replace_view_atomically_with_atomic_engine.sh). [#70330](https://github.com/ClickHouse/ClickHouse/pull/70330) ([tuanpach](https://github.com/tuanpach)). +* Fix flaky test `03237_insert_sparse_columns_mem`. [#70333](https://github.com/ClickHouse/ClickHouse/pull/70333) ([Anton Popov](https://github.com/CurtizJ)). +* Rename enable_secure_identifiers -> enforce_strict_identifier_format. [#70335](https://github.com/ClickHouse/ClickHouse/pull/70335) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Attempt to fix flaky RabbitMQ tests. Maybe closes [#45160](https://github.com/ClickHouse/ClickHouse/issues/45160). [#70336](https://github.com/ClickHouse/ClickHouse/pull/70336) ([filimonov](https://github.com/filimonov)). +* Don't fail the stateless check script if we can't collect minio logs. [#70350](https://github.com/ClickHouse/ClickHouse/pull/70350) ([Raúl Marín](https://github.com/Algunenano)). +* Fix tiny mistake, responsible for some of kafka test flaps. Example [report](https://s3.amazonaws.com/clickhouse-test-reports/0/3198aafac59c368993e7b5f49d95674cc1b1be18/integration_tests__release__[2_4].html). [#70352](https://github.com/ClickHouse/ClickHouse/pull/70352) ([filimonov](https://github.com/filimonov)). +* Closes [#69634](https://github.com/ClickHouse/ClickHouse/issues/69634). [#70354](https://github.com/ClickHouse/ClickHouse/pull/70354) ([pufit](https://github.com/pufit)). +* Fix 02346_fulltext_index_bug52019. [#70357](https://github.com/ClickHouse/ClickHouse/pull/70357) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Use new JSON for collecting minio logs. [#70359](https://github.com/ClickHouse/ClickHouse/pull/70359) ([Antonio Andelic](https://github.com/antonio2368)). +* Update comments in VectorSimilarityCondition (WHERE is not supported). [#70360](https://github.com/ClickHouse/ClickHouse/pull/70360) ([Azat Khuzhin](https://github.com/azat)). +* Remove 02492_clickhouse_local_context_uaf test. [#70363](https://github.com/ClickHouse/ClickHouse/pull/70363) ([Azat Khuzhin](https://github.com/azat)). +* Fix `clang-19` build issues. [#70412](https://github.com/ClickHouse/ClickHouse/pull/70412) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Ignore "Invalid multibyte data detected" error during completion. [#70422](https://github.com/ClickHouse/ClickHouse/pull/70422) ([Azat Khuzhin](https://github.com/azat)). +* Make QueryPlan explain methods const. [#70444](https://github.com/ClickHouse/ClickHouse/pull/70444) ([Alexander Gololobov](https://github.com/davenger)). +* Fix 0.1 second delay for interactive queries (due to keystroke interceptor). [#70445](https://github.com/ClickHouse/ClickHouse/pull/70445) ([Azat Khuzhin](https://github.com/azat)). +* Increase lock timeout in attempt to fix 02125_many_mutations. 
[#70448](https://github.com/ClickHouse/ClickHouse/pull/70448) ([Azat Khuzhin](https://github.com/azat)). +* Fix order in 03249_dynamic_alter_consistency. [#70453](https://github.com/ClickHouse/ClickHouse/pull/70453) ([Alexander Gololobov](https://github.com/davenger)). +* Fix refreshable MV in system database breaking server startup. [#70460](https://github.com/ClickHouse/ClickHouse/pull/70460) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix flaky test_refreshable_mv_in_replicated_db. [#70462](https://github.com/ClickHouse/ClickHouse/pull/70462) ([Michael Kolupaev](https://github.com/al13n321)). +* Update version_date.tsv and changelog after v24.8.5.115-lts. [#70463](https://github.com/ClickHouse/ClickHouse/pull/70463) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Decrease probability of "Server died" due to 00913_many_threads. [#70473](https://github.com/ClickHouse/ClickHouse/pull/70473) ([Azat Khuzhin](https://github.com/azat)). +* Fixes for killing leftovers in clickhouse-test. [#70474](https://github.com/ClickHouse/ClickHouse/pull/70474) ([Azat Khuzhin](https://github.com/azat)). +* Update version_date.tsv and changelog after v24.3.12.75-lts. [#70485](https://github.com/ClickHouse/ClickHouse/pull/70485) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Use logging instead of print. [#70505](https://github.com/ClickHouse/ClickHouse/pull/70505) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Remove slow poll() logs in keeper. [#70508](https://github.com/ClickHouse/ClickHouse/pull/70508) ([Raúl Marín](https://github.com/Algunenano)). +* Add timeouts for retry loops in test_storage_rabbitmq. It should prevent cascading failures of the whole test suite caused by a dead loop in one of the test scenarios. Also added small sleeps in 'tight' loops to make retries a bit less aggressive. [#70510](https://github.com/ClickHouse/ClickHouse/pull/70510) ([filimonov](https://github.com/filimonov)). +* CI: Fix for canceled Sync workflow. [#70521](https://github.com/ClickHouse/ClickHouse/pull/70521) ([Max Kainov](https://github.com/maxknv)). +* Debug build failed with clang-18 after https://github.com/ClickHouse/ClickHouse/pull/70412; it is unclear why the release build is fine, but simply changing `_` to `_1` works for both release and debug builds. [#70532](https://github.com/ClickHouse/ClickHouse/pull/70532) ([Chang chen](https://github.com/baibaichen)). +* Refreshable materialized views are not experimental anymore. [#70550](https://github.com/ClickHouse/ClickHouse/pull/70550) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix 24.9 setting compatibility `database_replicated_allow_explicit_uuid`. [#70565](https://github.com/ClickHouse/ClickHouse/pull/70565) ([Nikita Fomichev](https://github.com/fm4v)). +* Fix typos. [#70588](https://github.com/ClickHouse/ClickHouse/pull/70588) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Vector search: allow to specify HNSW parameter `ef_search` at query time. [#70616](https://github.com/ClickHouse/ClickHouse/pull/70616) ([Robert Schulze](https://github.com/rschu1ze)). +* Increase max_rows_to_read limit in some tests. [#70617](https://github.com/ClickHouse/ClickHouse/pull/70617) ([Raúl Marín](https://github.com/Algunenano)). +* Reduce sync efforts with private. [#70634](https://github.com/ClickHouse/ClickHouse/pull/70634) ([Raúl Marín](https://github.com/Algunenano)). +* Fix parsing of some formats into sparse columns. 
[#70635](https://github.com/ClickHouse/ClickHouse/pull/70635) ([Anton Popov](https://github.com/CurtizJ)). +* Fix typos. [#70637](https://github.com/ClickHouse/ClickHouse/pull/70637) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Try fix 00180_no_seek_avoiding_when_reading_from_cache. [#70640](https://github.com/ClickHouse/ClickHouse/pull/70640) ([Kseniia Sumarokova](https://github.com/kssenii)). +* When the `PR Check` status is set, it's a valid RunConfig job failure. [#70643](https://github.com/ClickHouse/ClickHouse/pull/70643) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix timeout in materialized pg tests. [#70646](https://github.com/ClickHouse/ClickHouse/pull/70646) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Introduced MergeTree setting which allow to change merge selecting algorithm. However we still have only one algorithm and it's mostly for future experiments. [#70647](https://github.com/ClickHouse/ClickHouse/pull/70647) ([alesapin](https://github.com/alesapin)). +* Docs: Follow-up for [#70585](https://github.com/ClickHouse/ClickHouse/issues/70585). [#70654](https://github.com/ClickHouse/ClickHouse/pull/70654) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove strange file. [#70662](https://github.com/ClickHouse/ClickHouse/pull/70662) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Locally I had lots of errors like `'AllocList' does not refer to a value` around places which used `offsetof`. Changing it to `__builtin_offsetof ` helped and I didn't debug any further. [#70671](https://github.com/ClickHouse/ClickHouse/pull/70671) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Adding the report link to a test result and files' list. [#70677](https://github.com/ClickHouse/ClickHouse/pull/70677) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* materialized postgres: minor fixes. [#70710](https://github.com/ClickHouse/ClickHouse/pull/70710) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Probably fix flaky test_refreshable_mv_in_replicated_db. [#70714](https://github.com/ClickHouse/ClickHouse/pull/70714) ([Michael Kolupaev](https://github.com/al13n321)). +* Move more setting structs to pImpl. [#70739](https://github.com/ClickHouse/ClickHouse/pull/70739) ([Raúl Marín](https://github.com/Algunenano)). +* Reduce sync effort. [#70747](https://github.com/ClickHouse/ClickHouse/pull/70747) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71198](https://github.com/ClickHouse/ClickHouse/issues/71198): Check number of arguments for function with Dynamic argument. [#70749](https://github.com/ClickHouse/ClickHouse/pull/70749) ([Nikita Taranov](https://github.com/nickitat)). +* Add s3queue settings check for cloud. [#70750](https://github.com/ClickHouse/ClickHouse/pull/70750) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix readiness/health check for OpenLDAP container. [#70755](https://github.com/ClickHouse/ClickHouse/pull/70755) ([Julian Maicher](https://github.com/jmaicher)). +* Allow update plan headers for all the steps. [#70761](https://github.com/ClickHouse/ClickHouse/pull/70761) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Autogenerate documentation for settings. [#70768](https://github.com/ClickHouse/ClickHouse/pull/70768) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Not a logical error. [#70770](https://github.com/ClickHouse/ClickHouse/pull/70770) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* CI: Aarch64 build with Asan. 
[#70778](https://github.com/ClickHouse/ClickHouse/pull/70778) ([Max Kainov](https://github.com/maxknv)). +* Minor fix. [#70783](https://github.com/ClickHouse/ClickHouse/pull/70783) ([Anton Popov](https://github.com/CurtizJ)). +* The docs for settings should be located in the source code. Now, the CI supports that. [#70784](https://github.com/ClickHouse/ClickHouse/pull/70784) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update style-test image. [#70785](https://github.com/ClickHouse/ClickHouse/pull/70785) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Avoid double finalization of `WriteBuffer` in library bridge. [#70799](https://github.com/ClickHouse/ClickHouse/pull/70799) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Make Array Field serialization consistent. [#70803](https://github.com/ClickHouse/ClickHouse/pull/70803) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* A follow-up for [#70785](https://github.com/ClickHouse/ClickHouse/issues/70785), [jwt](https://pypi.org/project/jwt/#history) looks very outdated, and we have an issue with conflicting paths. [#70815](https://github.com/ClickHouse/ClickHouse/pull/70815) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove inefficient code. [#70816](https://github.com/ClickHouse/ClickHouse/pull/70816) ([Raúl Marín](https://github.com/Algunenano)). +* Allow large object files if OMIT_HEAVY_DEBUG_SYMBOLS = 0. [#70818](https://github.com/ClickHouse/ClickHouse/pull/70818) ([Michael Kolupaev](https://github.com/al13n321)). +* Add test with distributed queries for 15768. [#70834](https://github.com/ClickHouse/ClickHouse/pull/70834) ([Nikita Taranov](https://github.com/nickitat)). +* More setting structs to pImpl and reuse code. [#70840](https://github.com/ClickHouse/ClickHouse/pull/70840) ([Raúl Marín](https://github.com/Algunenano)). +* Update default HNSW parameter settings. [#70873](https://github.com/ClickHouse/ClickHouse/pull/70873) ([Robert Schulze](https://github.com/rschu1ze)). +* Limiting logging some lines about configs. [#70879](https://github.com/ClickHouse/ClickHouse/pull/70879) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fix `limit by`, `limit with ties` for distributed and parallel replicas. [#70880](https://github.com/ClickHouse/ClickHouse/pull/70880) ([Nikita Taranov](https://github.com/nickitat)). +* Fix darwin build. [#70894](https://github.com/ClickHouse/ClickHouse/pull/70894) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add dots for consistency. [#70909](https://github.com/ClickHouse/ClickHouse/pull/70909) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Logical error fix for substrings, found by fuzzer. [#70914](https://github.com/ClickHouse/ClickHouse/pull/70914) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* More setting structs to pImpl. [#70942](https://github.com/ClickHouse/ClickHouse/pull/70942) ([Raúl Marín](https://github.com/Algunenano)). +* Add logging for mock HTTP servers used in minio integration tests. [#70943](https://github.com/ClickHouse/ClickHouse/pull/70943) ([Vitaly Baranov](https://github.com/vitlibar)). +* Minor fixups of [#70011](https://github.com/ClickHouse/ClickHouse/issues/70011) and [#69918](https://github.com/ClickHouse/ClickHouse/issues/69918). [#70959](https://github.com/ClickHouse/ClickHouse/pull/70959) ([Robert Schulze](https://github.com/rschu1ze)). +* CI: Do not skip Build report and status fix. 
[#70965](https://github.com/ClickHouse/ClickHouse/pull/70965) ([Max Kainov](https://github.com/maxknv)). +* Fix Keeper entry serialization compatibility. [#70972](https://github.com/ClickHouse/ClickHouse/pull/70972) ([Antonio Andelic](https://github.com/antonio2368)). +* Update exception message. [#70975](https://github.com/ClickHouse/ClickHouse/pull/70975) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix `utils/c++expr` option `-b`. [#70978](https://github.com/ClickHouse/ClickHouse/pull/70978) ([Sergei Trifonov](https://github.com/serxa)). +* Fix `test_keeper_broken_logs`. [#70982](https://github.com/ClickHouse/ClickHouse/pull/70982) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix `01039_test_setting_parse`. [#70986](https://github.com/ClickHouse/ClickHouse/pull/70986) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Tests for languages support for Embedded Dictionaries. [#71004](https://github.com/ClickHouse/ClickHouse/pull/71004) ([Max Vostrikov](https://github.com/max-vostrikov)). +* Required for internal test runs with the same image build in public CI. [#71008](https://github.com/ClickHouse/ClickHouse/pull/71008) ([Ilya Yatsishin](https://github.com/qoega)). +* Move remaining settings objects to pImpl and start simplification. [#71019](https://github.com/ClickHouse/ClickHouse/pull/71019) ([Raúl Marín](https://github.com/Algunenano)). +* CI: Rearrange directories for praktika ci. [#71029](https://github.com/ClickHouse/ClickHouse/pull/71029) ([Max Kainov](https://github.com/maxknv)). +* Fix assert in RemoteSource::onAsyncJobReady(). [#71034](https://github.com/ClickHouse/ClickHouse/pull/71034) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix showing error message in ReadBufferFromS3 when retrying. Without this PR information about a retryable failure in `ReadBufferFromS3` could look like this:. [#71038](https://github.com/ClickHouse/ClickHouse/pull/71038) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix `test_truncate_database`. [#71057](https://github.com/ClickHouse/ClickHouse/pull/71057) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix clickhouse-test useless 5 second delay in case of multiple threads are used. [#71069](https://github.com/ClickHouse/ClickHouse/pull/71069) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#71142](https://github.com/ClickHouse/ClickHouse/issues/71142): Followup [#70520](https://github.com/ClickHouse/ClickHouse/issues/70520). [#71129](https://github.com/ClickHouse/ClickHouse/pull/71129) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Backported in [#71189](https://github.com/ClickHouse/ClickHouse/issues/71189): Update compatibility setting for `hnsw_candidate_list_size_for_search`. [#71133](https://github.com/ClickHouse/ClickHouse/pull/71133) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#71222](https://github.com/ClickHouse/ClickHouse/issues/71222): Fixes for interactive metrics. [#71173](https://github.com/ClickHouse/ClickHouse/pull/71173) ([Julia Kartseva](https://github.com/jkartseva)). +* Backported in [#71205](https://github.com/ClickHouse/ClickHouse/issues/71205): Maybe not GWPAsan by default. [#71174](https://github.com/ClickHouse/ClickHouse/pull/71174) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#71277](https://github.com/ClickHouse/ClickHouse/issues/71277): Fix LOGICAL_ERROR on wrong scalar subquery argument to table functions. 
[#71216](https://github.com/ClickHouse/ClickHouse/pull/71216) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71253](https://github.com/ClickHouse/ClickHouse/issues/71253): Disable enable_named_columns_in_function_tuple for 24.10. [#71219](https://github.com/ClickHouse/ClickHouse/pull/71219) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71303](https://github.com/ClickHouse/ClickHouse/issues/71303): Improve system.query_metric_log to remove flakiness. [#71295](https://github.com/ClickHouse/ClickHouse/pull/71295) ([Pablo Marcos](https://github.com/pamarcos)). +* Backported in [#71317](https://github.com/ClickHouse/ClickHouse/issues/71317): Fix debug log timestamp. [#71311](https://github.com/ClickHouse/ClickHouse/pull/71311) ([Pablo Marcos](https://github.com/pamarcos)). + +#### Not for changelog + +* Reverted. [#69812](https://github.com/ClickHouse/ClickHouse/pull/69812) ([tuanpach](https://github.com/tuanpach)). + diff --git a/docs/changelogs/v24.3.13.40-lts.md b/docs/changelogs/v24.3.13.40-lts.md new file mode 100644 index 00000000000..bce45e88710 --- /dev/null +++ b/docs/changelogs/v24.3.13.40-lts.md @@ -0,0 +1,31 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.3.13.40-lts (7acabd77389) FIXME as compared to v24.3.12.75-lts (7cb5dff8019) + +#### Bug Fix (user-visible misbehavior in an official stable release) +* Backported in [#63976](https://github.com/ClickHouse/ClickHouse/issues/63976): Fix intersect parts when restart after drop range. [#63202](https://github.com/ClickHouse/ClickHouse/pull/63202) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#71482](https://github.com/ClickHouse/ClickHouse/issues/71482): Fix `Content-Encoding` not sent in some compressed responses. [#64802](https://github.com/ClickHouse/ClickHouse/issues/64802). [#68975](https://github.com/ClickHouse/ClickHouse/pull/68975) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#70451](https://github.com/ClickHouse/ClickHouse/issues/70451): Fix crash during insertion into FixedString column in PostgreSQL engine. [#69584](https://github.com/ClickHouse/ClickHouse/pull/69584) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70619](https://github.com/ClickHouse/ClickHouse/issues/70619): Fix server segfault on creating a materialized view with two selects and an `INTERSECT`, e.g. `CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1);`. [#70264](https://github.com/ClickHouse/ClickHouse/pull/70264) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#70877](https://github.com/ClickHouse/ClickHouse/issues/70877): Fix table creation with `CREATE ... AS table_function()` with database `Replicated` and unavailable table function source on secondary replica. [#70511](https://github.com/ClickHouse/ClickHouse/pull/70511) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#70571](https://github.com/ClickHouse/ClickHouse/issues/70571): Ignore all output on async insert with `wait_for_async_insert=1`. Closes [#62644](https://github.com/ClickHouse/ClickHouse/issues/62644). [#70530](https://github.com/ClickHouse/ClickHouse/pull/70530) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#71146](https://github.com/ClickHouse/ClickHouse/issues/71146): Ignore frozen_metadata.txt while traversing shadow directory from system.remote_data_paths. 
[#70590](https://github.com/ClickHouse/ClickHouse/pull/70590) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Backported in [#70682](https://github.com/ClickHouse/ClickHouse/issues/70682): Fix creation of stateful window functions on misaligned memory. [#70631](https://github.com/ClickHouse/ClickHouse/pull/70631) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71113](https://github.com/ClickHouse/ClickHouse/issues/71113): Fix a crash and a leak in AggregateFunctionGroupArraySorted. [#70820](https://github.com/ClickHouse/ClickHouse/pull/70820) ([Michael Kolupaev](https://github.com/al13n321)). +* Backported in [#70990](https://github.com/ClickHouse/ClickHouse/issues/70990): Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#71246](https://github.com/ClickHouse/ClickHouse/issues/71246): Fixed named sessions not being closed and hanging on forever under certain circumstances. [#70998](https://github.com/ClickHouse/ClickHouse/pull/70998) ([Márcio Martins](https://github.com/marcio-absmartly)). +* Backported in [#71371](https://github.com/ClickHouse/ClickHouse/issues/71371): Add try/catch to data parts destructors to avoid terminate. [#71364](https://github.com/ClickHouse/ClickHouse/pull/71364) ([alesapin](https://github.com/alesapin)). +* Backported in [#71594](https://github.com/ClickHouse/ClickHouse/issues/71594): Prevent crash in SortCursor with 0 columns (old analyzer). [#71494](https://github.com/ClickHouse/ClickHouse/pull/71494) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#71022](https://github.com/ClickHouse/ClickHouse/issues/71022): Fix dropping of file cache in CHECK query in case of enabled transactions. [#69256](https://github.com/ClickHouse/ClickHouse/pull/69256) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#70384](https://github.com/ClickHouse/ClickHouse/issues/70384): CI: Enable Integration Tests for backport PRs. [#70329](https://github.com/ClickHouse/ClickHouse/pull/70329) ([Max Kainov](https://github.com/maxknv)). +* Backported in [#70538](https://github.com/ClickHouse/ClickHouse/issues/70538): Remove slow poll() logs in keeper. [#70508](https://github.com/ClickHouse/ClickHouse/pull/70508) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#70971](https://github.com/ClickHouse/ClickHouse/issues/70971): Limiting logging some lines about configs. [#70879](https://github.com/ClickHouse/ClickHouse/pull/70879) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). + diff --git a/docs/changelogs/v24.8.6.70-lts.md b/docs/changelogs/v24.8.6.70-lts.md new file mode 100644 index 00000000000..81fa4db1458 --- /dev/null +++ b/docs/changelogs/v24.8.6.70-lts.md @@ -0,0 +1,50 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.8.6.70-lts (ddb8c219771) FIXME as compared to v24.8.5.115-lts (8c4cb00a384) + +#### Backward Incompatible Change +* Backported in [#71359](https://github.com/ClickHouse/ClickHouse/issues/71359): Fix possible error `No such file or directory` due to unescaped special symbols in files for JSON subcolumns. [#71182](https://github.com/ClickHouse/ClickHouse/pull/71182) ([Pavel Kruglov](https://github.com/Avogar)). 
+ +#### Improvement +* Backported in [#70680](https://github.com/ClickHouse/ClickHouse/issues/70680): Don't do validation when synchronizing user_directories from keeper. [#70644](https://github.com/ClickHouse/ClickHouse/pull/70644) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71395](https://github.com/ClickHouse/ClickHouse/issues/71395): Do not call the object storage API when listing directories, as this may be cost-inefficient. Instead, store the list of filenames in memory. The trade-offs are increased initial load time and memory required to store filenames. [#70823](https://github.com/ClickHouse/ClickHouse/pull/70823) ([Julia Kartseva](https://github.com/jkartseva)). +* Backported in [#71287](https://github.com/ClickHouse/ClickHouse/issues/71287): Reduce the number of object storage HEAD API requests in the plain_rewritable disk. [#70915](https://github.com/ClickHouse/ClickHouse/pull/70915) ([Julia Kartseva](https://github.com/jkartseva)). + +#### Bug Fix (user-visible misbehavior in an official stable release) +* Backported in [#70934](https://github.com/ClickHouse/ClickHouse/issues/70934): Fix incorrect JOIN ON section optimization in case of `IS NULL` check under any other function (like `NOT`) that may lead to wrong results. Closes [#67915](https://github.com/ClickHouse/ClickHouse/issues/67915). [#68049](https://github.com/ClickHouse/ClickHouse/pull/68049) ([Vladimir Cherkasov](https://github.com/vdimir)). +* Backported in [#70735](https://github.com/ClickHouse/ClickHouse/issues/70735): Fix unexpected exception when passing empty tuple in array. This fixes [#68618](https://github.com/ClickHouse/ClickHouse/issues/68618). [#68848](https://github.com/ClickHouse/ClickHouse/pull/68848) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#71138](https://github.com/ClickHouse/ClickHouse/issues/71138): Fix propagating structure argument in s3Cluster. Previously the `DEFAULT` expression of the column could be lost when sending the query to the replicas in s3Cluster. [#69147](https://github.com/ClickHouse/ClickHouse/pull/69147) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70561](https://github.com/ClickHouse/ClickHouse/issues/70561): Fix `getSubcolumn` with `LowCardinality` columns by overriding `useDefaultImplementationForLowCardinalityColumns` to return `true`. [#69831](https://github.com/ClickHouse/ClickHouse/pull/69831) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Backported in [#70903](https://github.com/ClickHouse/ClickHouse/issues/70903): Avoid reusing columns among different named tuples when evaluating `tuple` functions. This fixes [#70022](https://github.com/ClickHouse/ClickHouse/issues/70022). [#70103](https://github.com/ClickHouse/ClickHouse/pull/70103) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#70623](https://github.com/ClickHouse/ClickHouse/issues/70623): Fix server segfault on creating a materialized view with two selects and an `INTERSECT`, e.g. `CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1);`. [#70264](https://github.com/ClickHouse/ClickHouse/pull/70264) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#70688](https://github.com/ClickHouse/ClickHouse/issues/70688): Fix possible use-after-free in `SYSTEM DROP FORMAT SCHEMA CACHE FOR Protobuf`. [#70358](https://github.com/ClickHouse/ClickHouse/pull/70358) ([Azat Khuzhin](https://github.com/azat)). 
+* Backported in [#70494](https://github.com/ClickHouse/ClickHouse/issues/70494): Fix crash during GROUP BY JSON sub-object subcolumn. [#70374](https://github.com/ClickHouse/ClickHouse/pull/70374) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70482](https://github.com/ClickHouse/ClickHouse/issues/70482): Don't prefetch parts for vertical merges if part has no rows. [#70452](https://github.com/ClickHouse/ClickHouse/pull/70452) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#70556](https://github.com/ClickHouse/ClickHouse/issues/70556): Fix crash in WHERE with lambda functions. [#70464](https://github.com/ClickHouse/ClickHouse/pull/70464) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#70878](https://github.com/ClickHouse/ClickHouse/issues/70878): Fix table creation with `CREATE ... AS table_function()` with database `Replicated` and unavailable table function source on secondary replica. [#70511](https://github.com/ClickHouse/ClickHouse/pull/70511) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#70575](https://github.com/ClickHouse/ClickHouse/issues/70575): Ignore all output on async insert with `wait_for_async_insert=1`. Closes [#62644](https://github.com/ClickHouse/ClickHouse/issues/62644). [#70530](https://github.com/ClickHouse/ClickHouse/pull/70530) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#71052](https://github.com/ClickHouse/ClickHouse/issues/71052): Ignore frozen_metadata.txt while traversing shadow directory from system.remote_data_paths. [#70590](https://github.com/ClickHouse/ClickHouse/pull/70590) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Backported in [#70651](https://github.com/ClickHouse/ClickHouse/issues/70651): Fix creation of stateful window functions on misaligned memory. [#70631](https://github.com/ClickHouse/ClickHouse/pull/70631) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#70757](https://github.com/ClickHouse/ClickHouse/issues/70757): Fixed rare crashes in `SELECT`-s and merges after adding a column of `Array` type with non-empty default expression. [#70695](https://github.com/ClickHouse/ClickHouse/pull/70695) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#70763](https://github.com/ClickHouse/ClickHouse/issues/70763): Fix infinite recursion when infering a proto schema with skip unsupported fields enabled. [#70697](https://github.com/ClickHouse/ClickHouse/pull/70697) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#71118](https://github.com/ClickHouse/ClickHouse/issues/71118): `GroupArraySortedData` uses a PODArray with non-POD elements, manually calling constructors and destructors for the elements as needed. But it wasn't careful enough: in two places it forgot to call destructor, in one place it left elements uninitialized if an exception is thrown when deserializing previous elements. Then `GroupArraySortedData`'s destructor called destructors on uninitialized elements and crashed: ``` 2024.10.17 22:58:23.523790 [ 5233 ] {} BaseDaemon: ########## Short fault info ############ 2024.10.17 22:58:23.523834 [ 5233 ] {} BaseDaemon: (version 24.6.1.4609 (official build), build id: 5423339A6571004018D55BBE05D464AFA35E6718, git hash: fa6cdfda8a94890eb19bc7f22f8b0b56292f7a26) (from thread 682) Received signal 11 2024.10.17 22:58:23.523862 [ 5233 ] {} BaseDaemon: Signal description: Segmentation fault 2024.10.17 22:58:23.523883 [ 5233 ] {} BaseDaemon: Address: 0x8f. Access: . 
Address not mapped to object. 2024.10.17 22:58:23.523908 [ 5233 ] {} BaseDaemon: Stack trace: 0x0000aaaac4b78308 0x0000ffffb7701850 0x0000aaaac0104855 0x0000aaaac01048a0 0x0000aaaac501e84c 0x0000aaaac7c510d0 0x0000aaaac7c4ba20 0x0000aaaac968bbfc 0x0000aaaac968fab0 0x0000aaaac969bf50 0x0000aaaac9b7520c 0x0000aaaac9b74c74 0x0000aaaac9b8a150 0x0000aaaac9b809f0 0x0000aaaac9b80574 0x0000aaaac9b8e364 0x0000aaaac9b8e4fc 0x0000aaaac94f4328 0x0000aaaac94f428c 0x0000aaaac94f7df0 0x0000aaaac98b5a3c 0x0000aaaac950b234 0x0000aaaac49ae264 0x0000aaaac49b1dd0 0x0000aaaac49b0a80 0x0000ffffb755d5c8 0x0000ffffb75c5edc 2024.10.17 22:58:23.523936 [ 5233 ] {} BaseDaemon: ######################################## 2024.10.17 22:58:23.523959 [ 5233 ] {} BaseDaemon: (version 24.6.1.4609 (official build), build id: 5423339A6571004018D55BBE05D464AFA35E6718, git hash: fa6cdfda8a94890eb19bc7f22f8b0b56292f7a26) (from thread 682) (query_id: 6c8a33a2-f45a-4a3b-bd71-ded6a1c9ccd3::202410_534066_534078_2) (query: ) Received signal Segmentation fault (11) 2024.10.17 22:58:23.523977 [ 5233 ] {} BaseDaemon: Address: 0x8f. Access: . Address not mapped to object. 2024.10.17 22:58:23.523993 [ 5233 ] {} BaseDaemon: Stack trace: 0x0000aaaac4b78308 0x0000ffffb7701850 0x0000aaaac0104855 0x0000aaaac01048a0 0x0000aaaac501e84c 0x0000aaaac7c510d0 0x0000aaaac7c4ba20 0x0000aaaac968bbfc 0x0000aaaac968fab0 0x0000aaaac969bf50 0x0000aaaac9b7520c 0x0000aaaac9b74c74 0x0000aaaac9b8a150 0x0000aaaac9b809f0 0x0000aaaac9b80574 0x0000aaaac9b8e364 0x0000aaaac9b8e4fc 0x0000aaaac94f4328 0x0000aaaac94f428c 0x0000aaaac94f7df0 0x0000aaaac98b5a3c 0x0000aaaac950b234 0x0000aaaac49ae264 0x0000aaaac49b1dd0 0x0000aaaac49b0a80 0x0000ffffb755d5c8 0x0000ffffb75c5edc 2024.10.17 22:58:23.524817 [ 5233 ] {} BaseDaemon: 0. signalHandler(int, siginfo_t*, void*) @ 0x000000000c6f8308 2024.10.17 22:58:23.524917 [ 5233 ] {} BaseDaemon: 1. ? @ 0x0000ffffb7701850 2024.10.17 22:58:23.524962 [ 5233 ] {} BaseDaemon: 2. DB::Field::~Field() @ 0x0000000007c84855 2024.10.17 22:58:23.525012 [ 5233 ] {} BaseDaemon: 3. DB::Field::~Field() @ 0x0000000007c848a0 2024.10.17 22:58:23.526626 [ 5233 ] {} BaseDaemon: 4. DB::IAggregateFunctionDataHelper, DB::(anonymous namespace)::GroupArraySorted, DB::Field>>::destroy(char*) const (.5a6a451027f732f9fd91c13f4a13200c) @ 0x000000000cb9e84c 2024.10.17 22:58:23.527322 [ 5233 ] {} BaseDaemon: 5. DB::SerializationAggregateFunction::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const @ 0x000000000f7d10d0 2024.10.17 22:58:23.528470 [ 5233 ] {} BaseDaemon: 6. DB::ISerialization::deserializeBinaryBulkWithMultipleStreams(COW::immutable_ptr&, unsigned long, DB::ISerialization::DeserializeBinaryBulkSettings&, std::shared_ptr&, std::unordered_map::immutable_ptr, std::hash, std::equal_to, std::allocator::immutable_ptr>>>*) const @ 0x000000000f7cba20 2024.10.17 22:58:23.529213 [ 5233 ] {} BaseDaemon: 7. DB::MergeTreeReaderCompact::readData(DB::NameAndTypePair const&, COW::immutable_ptr&, unsigned long, std::function const&) @ 0x000000001120bbfc 2024.10.17 22:58:23.529277 [ 5233 ] {} BaseDaemon: 8. DB::MergeTreeReaderCompactSingleBuffer::readRows(unsigned long, unsigned long, bool, unsigned long, std::vector::immutable_ptr, std::allocator::immutable_ptr>>&) @ 0x000000001120fab0 2024.10.17 22:58:23.529319 [ 5233 ] {} BaseDaemon: 9. DB::MergeTreeSequentialSource::generate() @ 0x000000001121bf50 2024.10.17 22:58:23.529346 [ 5233 ] {} BaseDaemon: 10. 
DB::ISource::tryGenerate() @ 0x00000000116f520c 2024.10.17 22:58:23.529653 [ 5233 ] {} BaseDaemon: 11. DB::ISource::work() @ 0x00000000116f4c74 2024.10.17 22:58:23.529679 [ 5233 ] {} BaseDaemon: 12. DB::ExecutionThreadContext::executeTask() @ 0x000000001170a150 2024.10.17 22:58:23.529733 [ 5233 ] {} BaseDaemon: 13. DB::PipelineExecutor::executeStepImpl(unsigned long, std::atomic*) @ 0x00000000117009f0 2024.10.17 22:58:23.529763 [ 5233 ] {} BaseDaemon: 14. DB::PipelineExecutor::executeStep(std::atomic*) @ 0x0000000011700574 2024.10.17 22:58:23.530089 [ 5233 ] {} BaseDaemon: 15. DB::PullingPipelineExecutor::pull(DB::Chunk&) @ 0x000000001170e364 2024.10.17 22:58:23.530277 [ 5233 ] {} BaseDaemon: 16. DB::PullingPipelineExecutor::pull(DB::Block&) @ 0x000000001170e4fc 2024.10.17 22:58:23.530295 [ 5233 ] {} BaseDaemon: 17. DB::MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl() @ 0x0000000011074328 2024.10.17 22:58:23.530318 [ 5233 ] {} BaseDaemon: 18. DB::MergeTask::ExecuteAndFinalizeHorizontalPart::execute() @ 0x000000001107428c 2024.10.17 22:58:23.530339 [ 5233 ] {} BaseDaemon: 19. DB::MergeTask::execute() @ 0x0000000011077df0 2024.10.17 22:58:23.530362 [ 5233 ] {} BaseDaemon: 20. DB::SharedMergeMutateTaskBase::executeStep() @ 0x0000000011435a3c 2024.10.17 22:58:23.530384 [ 5233 ] {} BaseDaemon: 21. DB::MergeTreeBackgroundExecutor::threadFunction() @ 0x000000001108b234 2024.10.17 22:58:23.530410 [ 5233 ] {} BaseDaemon: 22. ThreadPoolImpl>::worker(std::__list_iterator, void*>) @ 0x000000000c52e264 2024.10.17 22:58:23.530448 [ 5233 ] {} BaseDaemon: 23. void std::__function::__policy_invoker::__call_impl::ThreadFromGlobalPoolImpl>::scheduleImpl(std::function, Priority, std::optional, bool)::'lambda0'()>(void&&)::'lambda'(), void ()>>(std::__function::__policy_storage const*) @ 0x000000000c531dd0 2024.10.17 22:58:23.530476 [ 5233 ] {} BaseDaemon: 24. void* std::__thread_proxy[abi:v15000]>, void ThreadPoolImpl::scheduleImpl(std::function, Priority, std::optional, bool)::'lambda0'()>>(void*) @ 0x000000000c530a80 2024.10.17 22:58:23.530514 [ 5233 ] {} BaseDaemon: 25. ? @ 0x000000000007d5c8 2024.10.17 22:58:23.530534 [ 5233 ] {} BaseDaemon: 26. ? @ 0x00000000000e5edc 2024.10.17 22:58:23.530551 [ 5233 ] {} BaseDaemon: Integrity check of the executable skipped because the reference checksum could not be read. 
2024.10.17 22:58:23.531083 [ 5233 ] {} BaseDaemon: Report this error to https://github.com/ClickHouse/ClickHouse/issues 2024.10.17 22:58:23.531294 [ 5233 ] {} BaseDaemon: Changed settings: max_insert_threads = 4, max_threads = 42, use_hedged_requests = false, distributed_foreground_insert = true, alter_sync = 0, enable_memory_bound_merging_of_aggregation_results = true, cluster_for_parallel_replicas = 'default', do_not_merge_across_partitions_select_final = false, log_queries = true, log_queries_probability = 1., max_http_get_redirects = 10, enable_deflate_qpl_codec = false, enable_zstd_qat_codec = false, query_profiler_real_time_period_ns = 0, query_profiler_cpu_time_period_ns = 0, max_bytes_before_external_group_by = 90194313216, max_bytes_before_external_sort = 90194313216, max_memory_usage = 180388626432, backup_restore_keeper_retry_max_backoff_ms = 60000, cancel_http_readonly_queries_on_client_close = true, max_table_size_to_drop = 1000000000000, max_partition_size_to_drop = 1000000000000, default_table_engine = 'ReplicatedMergeTree', mutations_sync = 0, optimize_trivial_insert_select = false, database_replicated_allow_only_replicated_engine = true, cloud_mode = true, cloud_mode_engine = 2, distributed_ddl_output_mode = 'none_only_active', distributed_ddl_entry_format_version = 6, async_insert_max_data_size = 10485760, async_insert_busy_timeout_max_ms = 1000, enable_filesystem_cache_on_write_operations = true, load_marks_asynchronously = true, allow_prefetched_read_pool_for_remote_filesystem = true, filesystem_prefetch_max_memory_usage = 18038862643, filesystem_prefetches_limit = 200, compatibility = '24.6', insert_keeper_max_retries = 20, allow_experimental_materialized_postgresql_table = false, date_time_input_format = 'best_effort' ```. [#70820](https://github.com/ClickHouse/ClickHouse/pull/70820) ([Michael Kolupaev](https://github.com/al13n321)). +* Backported in [#70896](https://github.com/ClickHouse/ClickHouse/issues/70896): Disable enable_named_columns_in_function_tuple by default. [#70833](https://github.com/ClickHouse/ClickHouse/pull/70833) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#70994](https://github.com/ClickHouse/ClickHouse/issues/70994): Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#71210](https://github.com/ClickHouse/ClickHouse/issues/71210): Fix logical error in `StorageS3Queue` "Cannot create a persistent node in /processed since it already exists". [#70984](https://github.com/ClickHouse/ClickHouse/pull/70984) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#71248](https://github.com/ClickHouse/ClickHouse/issues/71248): Fixed named sessions not being closed and hanging on forever under certain circumstances. [#70998](https://github.com/ClickHouse/ClickHouse/pull/70998) ([Márcio Martins](https://github.com/marcio-absmartly)). +* Backported in [#71375](https://github.com/ClickHouse/ClickHouse/issues/71375): Add try/catch to data parts destructors to avoid terminate. [#71364](https://github.com/ClickHouse/ClickHouse/pull/71364) ([alesapin](https://github.com/alesapin)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#71026](https://github.com/ClickHouse/ClickHouse/issues/71026): Fix dropping of file cache in CHECK query in case of enabled transactions. 
[#69256](https://github.com/ClickHouse/ClickHouse/pull/69256) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#70388](https://github.com/ClickHouse/ClickHouse/issues/70388): CI: Enable Integration Tests for backport PRs. [#70329](https://github.com/ClickHouse/ClickHouse/pull/70329) ([Max Kainov](https://github.com/maxknv)). +* Backported in [#70701](https://github.com/ClickHouse/ClickHouse/issues/70701): Fix order in 03249_dynamic_alter_consistency. [#70453](https://github.com/ClickHouse/ClickHouse/pull/70453) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#70542](https://github.com/ClickHouse/ClickHouse/issues/70542): Remove slow poll() logs in keeper. [#70508](https://github.com/ClickHouse/ClickHouse/pull/70508) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#70804](https://github.com/ClickHouse/ClickHouse/issues/70804): When the `PR Check` status is set, it's a valid RunConfig job failure. [#70643](https://github.com/ClickHouse/ClickHouse/pull/70643) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#71229](https://github.com/ClickHouse/ClickHouse/issues/71229): Maybe not GWPAsan by default. [#71174](https://github.com/ClickHouse/ClickHouse/pull/71174) ([Antonio Andelic](https://github.com/antonio2368)). + diff --git a/docs/en/development/building_and_benchmarking_deflate_qpl.md b/docs/en/development/building_and_benchmarking_deflate_qpl.md deleted file mode 100644 index b9d39b8cc2d..00000000000 --- a/docs/en/development/building_and_benchmarking_deflate_qpl.md +++ /dev/null @@ -1,327 +0,0 @@ ---- -slug: /en/development/building_and_benchmarking_deflate_qpl -sidebar_position: 73 -sidebar_label: Building and Benchmarking DEFLATE_QPL -description: How to build Clickhouse and run benchmark with DEFLATE_QPL Codec ---- - -# Build Clickhouse with DEFLATE_QPL - -- Make sure your host machine meet the QPL required [prerequisites](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#prerequisites) -- deflate_qpl is enabled by default during cmake build. In case you accidentally change it, please double-check build flag: ENABLE_QPL=1 - -- For generic requirements, please refer to Clickhouse generic [build instructions](/docs/en/development/build.md) - -# Run Benchmark with DEFLATE_QPL - -## Files list - -The folders `benchmark_sample` under [qpl-cmake](https://github.com/ClickHouse/ClickHouse/tree/master/contrib/qpl-cmake) give example to run benchmark with python scripts: - -`client_scripts` contains python scripts for running typical benchmark, for example: -- `client_stressing_test.py`: The python script for query stress test with [1~4] server instances. -- `queries_ssb.sql`: The file lists all queries for [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema/) -- `allin1_ssb.sh`: This shell script executes benchmark workflow all in one automatically. - -`database_files` means it will store database files according to lz4/deflate/zstd codec. - -## Run benchmark automatically for Star Schema: - -``` bash -$ cd ./benchmark_sample/client_scripts -$ sh run_ssb.sh -``` - -After complete, please check all the results in this folder:`./output/` - -In case you run into failure, please manually run benchmark as below sections. - -## Definition - -[CLICKHOUSE_EXE] means the path of clickhouse executable program. 
- -## Environment - -- CPU: Sapphire Rapid -- OS Requirements refer to [System Requirements for QPL](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#system-requirements) -- IAA Setup refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) -- Install python modules: - -``` bash -pip3 install clickhouse_driver numpy -``` - -[Self-check for IAA] - -``` bash -$ accel-config list | grep -P 'iax|state' -``` - -Expected output like this: -``` bash - "dev":"iax1", - "state":"enabled", - "state":"enabled", -``` - -If you see nothing output, it means IAA is not ready to work. Please check IAA setup again. - -## Generate raw data - -``` bash -$ cd ./benchmark_sample -$ mkdir rawdata_dir && cd rawdata_dir -``` - -Use [`dbgen`](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema) to generate 100 million rows data with the parameters: --s 20 - -The files like `*.tbl` are expected to output under `./benchmark_sample/rawdata_dir/ssb-dbgen`: - -## Database setup - -Set up database with LZ4 codec - -``` bash -$ cd ./database_dir/lz4 -$ [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& -$ [CLICKHOUSE_EXE] client -``` - -Here you should see the message `Connected to ClickHouse server` from console which means client successfully setup connection with server. - -Complete below three steps mentioned in [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema) -- Creating tables in ClickHouse -- Inserting data. Here should use `./benchmark_sample/rawdata_dir/ssb-dbgen/*.tbl` as input data. -- Converting “star schema” to de-normalized “flat schema” - -Set up database with IAA Deflate codec - -``` bash -$ cd ./database_dir/deflate -$ [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& -$ [CLICKHOUSE_EXE] client -``` -Complete three steps same as lz4 above - -Set up database with ZSTD codec - -``` bash -$ cd ./database_dir/zstd -$ [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& -$ [CLICKHOUSE_EXE] client -``` -Complete three steps same as lz4 above - -[self-check] -For each codec(lz4/zstd/deflate), please execute below query to make sure the databases are created successfully: -```sql -select count() from lineorder_flat -``` -You are expected to see below output: -```sql -┌───count()─┐ -│ 119994608 │ -└───────────┘ -``` -[Self-check for IAA Deflate codec] - -At the first time you execute insertion or query from client, clickhouse server console is expected to print this log: -```text -Hardware-assisted DeflateQpl codec is ready! -``` -If you never find this, but see another log as below: -```text -Initialization of hardware-assisted DeflateQpl codec failed -``` -That means IAA devices is not ready, you need check IAA setup again. - -## Benchmark with single instance - -- Before start benchmark, Please disable C6 and set CPU frequency governor to be `performance` - -``` bash -$ cpupower idle-set -d 3 -$ cpupower frequency-set -g performance -``` - -- To eliminate impact of memory bound on cross sockets, we use `numactl` to bind server on one socket and client on another socket. 
-- Single instance means single server connected with single client - -Now run benchmark for LZ4/Deflate/ZSTD respectively: - -LZ4: - -``` bash -$ cd ./database_dir/lz4 -$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > lz4.log -``` - -IAA deflate: - -``` bash -$ cd ./database_dir/deflate -$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > deflate.log -``` - -ZSTD: - -``` bash -$ cd ./database_dir/zstd -$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > zstd.log -``` - -Now three logs should be output as expected: -```text -lz4.log -deflate.log -zstd.log -``` - -How to check performance metrics: - -We focus on QPS, please search the keyword: `QPS_Final` and collect statistics - -## Benchmark with multi-instances - -- To reduce impact of memory bound on too much threads, We recommend run benchmark with multi-instances. -- Multi-instance means multiple(2 or 4)servers connected with respective client. -- The cores of one socket need to be divided equally and assigned to the servers respectively. -- For multi-instances, must create new folder for each codec and insert dataset by following the similar steps as single instance. - -There are 2 differences: -- For client side, you need launch clickhouse with the assigned port during table creation and data insertion. -- For server side, you need launch clickhouse with the specific xml config file in which port has been assigned. All customized xml config files for multi-instances has been provided under ./server_config. - -Here we assume there are 60 cores per socket and take 2 instances for example. -Launch server for first instance -LZ4: - -``` bash -$ cd ./database_dir/lz4 -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& -``` - -ZSTD: - -``` bash -$ cd ./database_dir/zstd -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& -``` - -IAA Deflate: - -``` bash -$ cd ./database_dir/deflate -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& -``` - -[Launch server for second instance] - -LZ4: - -``` bash -$ cd ./database_dir && mkdir lz4_s2 && cd lz4_s2 -$ cp ../../server_config/config_lz4_s2.xml ./ -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_lz4_s2.xml >&/dev/null& -``` - -ZSTD: - -``` bash -$ cd ./database_dir && mkdir zstd_s2 && cd zstd_s2 -$ cp ../../server_config/config_zstd_s2.xml ./ -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_zstd_s2.xml >&/dev/null& -``` - -IAA Deflate: - -``` bash -$ cd ./database_dir && mkdir deflate_s2 && cd deflate_s2 -$ cp ../../server_config/config_deflate_s2.xml ./ -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_deflate_s2.xml >&/dev/null& -``` - -Creating tables && Inserting data for second instance - -Creating tables: - -``` bash -$ [CLICKHOUSE_EXE] client -m --port=9001 -``` - -Inserting data: - -``` bash -$ [CLICKHOUSE_EXE] client --query "INSERT INTO [TBL_FILE_NAME] FORMAT CSV" < [TBL_FILE_NAME].tbl --port=9001 -``` - -- [TBL_FILE_NAME] represents the name of a file named with the regular expression: *. tbl under `./benchmark_sample/rawdata_dir/ssb-dbgen`. 
-- `--port=9001` stands for the assigned port for server instance which is also defined in config_lz4_s2.xml/config_zstd_s2.xml/config_deflate_s2.xml. For even more instances, you need replace it with the value: 9002/9003 which stand for s3/s4 instance respectively. If you don't assign it, the port is 9000 by default which has been used by first instance. - -Benchmarking with 2 instances - -LZ4: - -``` bash -$ cd ./database_dir/lz4 -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& -$ cd ./database_dir/lz4_s2 -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_lz4_s2.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > lz4_2insts.log -``` - -ZSTD: - -``` bash -$ cd ./database_dir/zstd -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& -$ cd ./database_dir/zstd_s2 -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_zstd_s2.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > zstd_2insts.log -``` - -IAA deflate - -``` bash -$ cd ./database_dir/deflate -$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& -$ cd ./database_dir/deflate_s2 -$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_deflate_s2.xml >&/dev/null& -$ cd ./client_scripts -$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > deflate_2insts.log -``` - -Here the last argument: `2` of client_stressing_test.py stands for the number of instances. For more instances, you need replace it with the value: 3 or 4. This script support up to 4 instances/ - -Now three logs should be output as expected: - -``` text -lz4_2insts.log -deflate_2insts.log -zstd_2insts.log -``` -How to check performance metrics: - -We focus on QPS, please search the keyword: `QPS_Final` and collect statistics - -Benchmark setup for 4 instances is similar with 2 instances above. -We recommend use 2 instances benchmark data as final report for review. - -## Tips - -Each time before launch new clickhouse server, please make sure no background clickhouse process running, please check and kill old one: - -``` bash -$ ps -aux| grep clickhouse -$ kill -9 [PID] -``` -By comparing the query list in ./client_scripts/queries_ssb.sql with official [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema), you will find 3 queries are not included: Q1.2/Q1.3/Q3.4 . This is because cpu utilization% is very low <10% for these queries which means cannot demonstrate performance differences. diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index c49492c1cb4..aac322f05eb 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -18,7 +18,7 @@ SELECT library_name, license_type, license_path FROM system.licenses ORDER BY li Note that the listed libraries are the ones located in the `contrib/` directory of the ClickHouse repository. Depending on the build options, some of the libraries may have not been compiled, and, as a result, their functionality may not be available at runtime. 
-[Example](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) +[Example](https://sql.clickhouse.com?query_id=478GCPU7LRTSZJBNY3EJT3) ## Adding and maintaining third-party libraries diff --git a/docs/en/engines/table-engines/integrations/azure-queue.md b/docs/en/engines/table-engines/integrations/azure-queue.md index b5259336a8b..2e5889c7485 100644 --- a/docs/en/engines/table-engines/integrations/azure-queue.md +++ b/docs/en/engines/table-engines/integrations/azure-queue.md @@ -36,6 +36,7 @@ SETTINGS ## Settings {#settings} The set of supported settings is the same as for `S3Queue` table engine, but without `s3queue_` prefix. See [full list of settings settings](../../../engines/table-engines/integrations/s3queue.md#settings). +To get a list of settings, configured for the table, use `system.s3_queue_settings` table. Available from `24.10`. ## Description {#description} diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 1958250ed73..41c4e8fc4a9 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -4,9 +4,13 @@ sidebar_position: 50 sidebar_label: EmbeddedRocksDB --- +import CloudNotSupportedBadge from '@theme/badges/CloudNotSupportedBadge'; + # EmbeddedRocksDB Engine -This engine allows integrating ClickHouse with [rocksdb](http://rocksdb.org/). + + +This engine allows integrating ClickHouse with [RocksDB](http://rocksdb.org/). ## Creating a Table {#creating-a-table} diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index fb759b948a5..fd27d4b6ed9 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -290,6 +290,7 @@ The following settings can be specified in configuration file for given endpoint - `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`. - `no_sign_request` - Ignore all the credentials so requests are not signed. Useful for accessing public buckets. - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. +- `access_header` - Adds specified HTTP header to a request to given endpoint, in cases where there are no other credentials from another source. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. - `server_side_encryption_kms_key_id` - If specified, required headers for accessing S3 objects with [SSE-KMS encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) will be set. If an empty string is specified, the AWS managed S3 key will be used. Optional. - `server_side_encryption_kms_encryption_context` - If specified alongside `server_side_encryption_kms_key_id`, the given encryption context header for SSE-KMS will be set. Optional. 
@@ -320,6 +321,32 @@ The following settings can be specified in configuration file for given endpoint ``` +## Working with archives + +Suppose that we have several archive files with following URIs on S3: + +- 'https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-2018-01-10.csv.zip' +- 'https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-2018-01-11.csv.zip' +- 'https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-2018-01-12.csv.zip' + +Extracting data from these archives is possible using ::. Globs can be used both in the url part as well as in the part after :: (responsible for the name of a file inside the archive). + +``` sql +SELECT * +FROM s3( + 'https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m-2018-01-1{0..2}.csv.zip :: *.csv' +); +``` + +:::note +ClickHouse supports three archive formats: +ZIP +TAR +7Z +While ZIP and TAR archives can be accessed from any supported storage location, 7Z archives can only be read from the local filesystem where ClickHouse is installed. +::: + + ## Accessing public buckets ClickHouse tries to fetch credentials from many different types of sources. @@ -331,6 +358,10 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/aapl_stock.csv', NOSIGN, 'CSVWithNames'); ``` +## Optimizing performance + +For details on optimizing the performance of the s3 function see [our detailed guide](/docs/en/integrations/s3/performance). + ## See also - [s3 table function](../../../sql-reference/table-functions/s3.md) diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index 1916c33272e..11fc357d222 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -69,6 +69,8 @@ SETTINGS ## Settings {#settings} +To get a list of settings, configured for the table, use `system.s3_queue_settings` table. Available from `24.10`. + ### mode {#mode} Possible values: diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index 7a449f400fd..819038ee32c 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -37,7 +37,7 @@ For a description of request parameters, see [request description](../../../sql- **Query clauses** -When creating an `AggregatingMergeTree` table the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required, as when creating a `MergeTree` table. +When creating an `AggregatingMergeTree` table, the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required as when creating a `MergeTree` table.
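+For illustration, a minimal `AggregatingMergeTree` definition might look like the following sketch (the table and column names here are illustrative only and are not taken from the original documentation):
+
+``` sql
+CREATE TABLE agg_example
+(
+    key UInt64,
+    -- columns of type AggregateFunction store intermediate aggregation states
+    total AggregateFunction(sum, UInt64),
+    users AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+ORDER BY key;
+```
+
+Such `AggregateFunction` columns are filled with `-State` combinators on insert and read back with `-Merge` combinators, as described in the next section.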
@@ -62,19 +62,19 @@ All of the parameters have the same meaning as in `MergeTree`. ## SELECT and INSERT {#select-and-insert} To insert data, use [INSERT SELECT](../../../sql-reference/statements/insert-into.md) query with aggregate -State- functions. -When selecting data from `AggregatingMergeTree` table, use `GROUP BY` clause and the same aggregate functions as when inserting data, but using `-Merge` suffix. +When selecting data from `AggregatingMergeTree` table, use `GROUP BY` clause and the same aggregate functions as when inserting data, but using the `-Merge` suffix. -In the results of `SELECT` query, the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. If dump data into, for example, `TabSeparated` format with `SELECT` query then this dump can be loaded back using `INSERT` query. +In the results of `SELECT` query, the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. For example, if you dump data into `TabSeparated` format with a `SELECT` query, then this dump can be loaded back using an `INSERT` query. ## Example of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view} -The following examples assumes that you have a database named `test` so make sure you create that if it doesn't already exist: +The following example assumes that you have a database named `test`, so create it if it doesn't already exist: ```sql CREATE DATABASE test; ``` -We will create the table `test.visits` that contain the raw data: +Now create the table `test.visits` that contains the raw data: ``` sql CREATE TABLE test.visits @@ -86,9 +86,9 @@ CREATE TABLE test.visits ) ENGINE = MergeTree ORDER BY (StartDate, CounterID); ``` -Next, we need to create an `AggregatingMergeTree` table that will store `AggregationFunction`s that keep track of the total number of visits and the number of unique users. +Next, you need an `AggregatingMergeTree` table that will store `AggregationFunction`s that keep track of the total number of visits and the number of unique users. -`AggregatingMergeTree` materialized view that watches the `test.visits` table, and use the `AggregateFunction` type: +Create an `AggregatingMergeTree` materialized view that watches the `test.visits` table, and uses the `AggregateFunction` type: ``` sql CREATE TABLE test.agg_visits ( @@ -100,7 +100,7 @@ CREATE TABLE test.agg_visits ( ENGINE = AggregatingMergeTree() ORDER BY (StartDate, CounterID); ``` -And then let's create a materialized view that populates `test.agg_visits` from `test.visits` : +Create a materialized view that populates `test.agg_visits` from `test.visits`: ```sql CREATE MATERIALIZED VIEW test.visits_mv TO test.agg_visits @@ -113,7 +113,7 @@ FROM test.visits GROUP BY StartDate, CounterID; ``` -Inserting data into the `test.visits` table. +Insert data into the `test.visits` table: ``` sql INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) @@ -122,7 +122,7 @@ INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) The data is inserted in both `test.visits` and `test.agg_visits`. -To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the materialized view `test.mv_visits`: +To get the aggregated data, execute a query such as `SELECT ... 
GROUP BY ...` from the materialized view `test.mv_visits`: ```sql SELECT @@ -140,14 +140,14 @@ ORDER BY StartDate; └─────────────────────────┴────────┴───────┘ ``` -And how about if we add another couple of records to `test.visits`, but this time we'll use a different timestamp for one of the records: +Add another couple of records to `test.visits`, but this time try using a different timestamp for one of the records: ```sql INSERT INTO test.visits (StartDate, CounterID, Sign, UserID) VALUES (1669446031000, 2, 5, 10), (1667446031000, 3, 7, 5); ``` -If we then run the `SELECT` query again, we'll see the following output: +Run the `SELECT` query again, which will return the following output: ```text ┌───────────────StartDate─┬─Visits─┬─Users─┐ diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index dc12a60e8ef..fcdc16637e6 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -54,7 +54,7 @@ Parameters: - `distance_function`: either `L2Distance` (the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) - the length of a line between two points in Euclidean space), or `cosineDistance` (the [cosine distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors). -- `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`) +- `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing vectors with reduced precision (optional, default: `bf16`) - `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 32) - `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as @@ -92,8 +92,8 @@ Vector similarity indexes currently support two distance functions: - `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). -Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. -If no scalar kind was specified during index creation, `f16` is used as default. +Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16`, `bf16`, +and `i8`. If no scalar kind was specified during index creation, `bf16` is used as default. For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no distance function was specified during index creation, `L2Distance` is used as default. diff --git a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md index 5a0a2691a9e..3670c763da6 100644 --- a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md @@ -12,6 +12,10 @@ Data deduplication occurs only during a merge. 
Merging occurs in the background Thus, `ReplacingMergeTree` is suitable for clearing out duplicate data in the background in order to save space, but it does not guarantee the absence of duplicates. +:::note +A detailed guide on ReplacingMergeTree, including best practices and how to optimize performance, is available [here](/docs/en/guides/replacing-merge-tree). +::: + ## Creating a Table {#creating-a-table} ``` sql @@ -162,3 +166,51 @@ All of the parameters excepting `ver` have the same meaning as in `MergeTree`. - `ver` - column with the version. Optional parameter. For a description, see the text above.
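+
+As a minimal sketch of how `ver` behaves (the table and values below are hypothetical), the row with the highest `ver` for a given sorting key is the one that survives a merge:
+
+``` sql
+CREATE TABLE ver_example
+(
+    key UInt32,
+    value String,
+    ver UInt32
+)
+ENGINE = ReplacingMergeTree(ver)
+ORDER BY key;
+
+INSERT INTO ver_example VALUES (1, 'first', 1), (1, 'second', 2);
+
+-- Force an unscheduled merge so the collapse happens now (normally it runs in the background).
+OPTIMIZE TABLE ver_example FINAL;
+
+SELECT * FROM ver_example;   -- returns the single row (1, 'second', 2)
+```
+
+In practice you would rarely run `OPTIMIZE ... FINAL` like this; the `FINAL` query modifier described in the next section is the usual way to get correct answers before merges have happened.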
+
+## Query time de-duplication & FINAL
+
+At merge time, ReplacingMergeTree identifies duplicate rows, using the values of the `ORDER BY` columns (used to create the table) as a unique identifier, and retains only the highest version. This, however, offers eventual correctness only - it does not guarantee rows will be deduplicated, and you should not rely on it. Queries can therefore produce incorrect answers, because updated and deleted rows are still considered at query time.
+
+To obtain correct answers, users need to complement background merges with query time deduplication and deletion removal. This can be achieved using the `FINAL` operator. Consider the following example:
+
+```sql
+CREATE TABLE rmt_example
+(
+    `number` UInt16
+)
+ENGINE = ReplacingMergeTree
+ORDER BY number;
+
+INSERT INTO rmt_example SELECT floor(randUniform(0, 100)) AS number
+FROM numbers(1000000000);
+
+0 rows in set. Elapsed: 19.958 sec. Processed 1.00 billion rows, 8.00 GB (50.11 million rows/s., 400.84 MB/s.)
+```
+Querying without `FINAL` produces an incorrect count (the exact result will vary depending on merges):
+
+```sql
+SELECT count()
+FROM rmt_example
+
+┌─count()─┐
+│     200 │
+└─────────┘
+
+1 row in set. Elapsed: 0.002 sec.
+```
+
+Adding `FINAL` produces the correct result:
+
+```sql
+SELECT count()
+FROM rmt_example
+FINAL
+
+┌─count()─┐
+│     100 │
+└─────────┘
+
+1 row in set. Elapsed: 0.002 sec.
+```
+
+For further details on `FINAL`, including how to optimize `FINAL` performance, we recommend reading our [detailed guide on ReplacingMergeTree](/docs/en/guides/replacing-merge-tree).
diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md
index 3fbbe2376e8..6233a7e80ad 100644
--- a/docs/en/getting-started/example-datasets/brown-benchmark.md
+++ b/docs/en/getting-started/example-datasets/brown-benchmark.md
@@ -453,4 +453,4 @@ ORDER BY yr,
          mo;
 ```
 
-The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
+The data is also available for interactive queries in the [Playground](https://sql.clickhouse.com), [example](https://sql.clickhouse.com?query_id=1MXMHASDLEQIP4P1D1STND).
diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md
index 94fa6998f5d..ecfd21e9d2c 100644
--- a/docs/en/getting-started/example-datasets/cell-towers.md
+++ b/docs/en/getting-started/example-datasets/cell-towers.md
@@ -360,9 +360,9 @@ This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. 
The ![Dashboard of cell towers by radio type in mcc 204](@site/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png) :::tip -The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play). +The data is also available for interactive queries in the [Playground](https://sql.clickhouse.com). -This [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=) will populate the username and even the query for you. +This [example](https://sql.clickhouse.com?query_id=UV8M4MAGS2PWAUOAYAAARM) will populate the username and even the query for you. Although you cannot create tables in the Playground, you can run all of the queries and even use Superset (adjust the host name and port number). ::: diff --git a/docs/en/getting-started/example-datasets/github.md b/docs/en/getting-started/example-datasets/github.md index e5ffb15bb9a..26a91eee34d 100644 --- a/docs/en/getting-started/example-datasets/github.md +++ b/docs/en/getting-started/example-datasets/github.md @@ -244,13 +244,13 @@ FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhou The tool suggests several queries via its help output. We have answered these in addition to some additional supplementary questions of interest. These queries are of approximately increasing complexity vs. the tool's arbitrary order. -This dataset is available in [play.clickhouse.com](https://play.clickhouse.com/play?user=play#U0hPVyBUQUJMRVMgSU4gZ2l0X2NsaWNraG91c2U=) in the `git_clickhouse` databases. We provide a link to this environment for all queries, adapting the database name as required. Note that play results may vary from the those presented here due to differences in time of data collection. +This dataset is available in [play.clickhouse.com](https://sql.clickhouse.com?query_id=DCQPNPAIMAQXRLHYURLKVJ) in the `git_clickhouse` databases. We provide a link to this environment for all queries, adapting the database name as required. Note that play results may vary from the those presented here due to differences in time of data collection. ## History of a single file The simplest of queries. Here we look at all commit messages for the `StorageReplicatedMergeTree.cpp`. Since these are likely more interesting, we sort by the most recent messages first. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgb2xkX3BhdGgsCiAgICBsaW5lc19hZGRlZCwKICAgIGxpbmVzX2RlbGV0ZWQsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoID0gJ3NyYy9TdG9yYWdlcy9TdG9yYWdlUmVwbGljYXRlZE1lcmdlVHJlZS5jcHAnCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=COAZRFX2YFULDBXRQTCQ1S) ```sql SELECT @@ -287,7 +287,7 @@ LIMIT 10 We can also review the line changes, excluding renames i.e. 
we won't show changes before a rename event when the file existed under a different name: -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgc2lnbiwKICAgIGxpbmVfbnVtYmVyX29sZCwKICAgIGxpbmVfbnVtYmVyX25ldywKICAgIGF1dGhvciwKICAgIGxpbmUKRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKV0hFUkUgcGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJwpPUkRFUiBCWSBsaW5lX251bWJlcl9uZXcgQVNDCkxJTUlUIDEw) +[play](https://sql.clickhouse.com?query_id=AKS9SYLARFMZCHGAAQNEBN) ```sql SELECT @@ -327,7 +327,7 @@ This is important for later analysis when we only want to consider the current f **Note there appears to have been a broken commit history in relation to files under the `dbms`, `libs`, `tests/testflows/` directories during their renames. We also thus exclude these.** -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHBhdGgKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIEdST1VQIEJZIG9sZF9wYXRoCiAgICBVTklPTiBBTEwKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZIHBhdGgKSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIE5PVCBtYXRjaChwYXRoLCAnKF5kYm1zLyl8KF5saWJzLyl8KF50ZXN0cy90ZXN0Zmxvd3MvKXwoXnByb2dyYW1zL3NlcnZlci9zdG9yZS8pJykgT1JERVIgQlkgcGF0aApMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=2HNFWPCFWEEY92WTAPMA7W) ```sql SELECT path @@ -369,7 +369,7 @@ LIMIT 10 Note that this allows for files to be renamed and then re-renamed to their original values. First we aggregate `old_path` for a list of deleted files as a result of renaming. We union this with the last operation for every `path`. Finally, we filter this list to those where the final event is not a `Delete`. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHVuaXEocGF0aCkKRlJPTQooCiAgICBTRUxFQ1QgcGF0aAogICAgRlJPTQogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAyIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgIFVOSU9OIEFMTAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICApCiAgICBHUk9VUCBCWSBwYXRoCiAgICBIQVZJTkcgKGFyZ01heChjaGFuZ2VfdHlwZSwgbGFzdF90aW1lKSAhPSAyKSBBTkQgTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSBPUkRFUiBCWSBwYXRoCikK) +[play](https://sql.clickhouse.com?query_id=1OXCKMOH2JVMSHD3NS2WW6) ```sql SELECT uniq(path) @@ -419,7 +419,7 @@ The difference here is caused by a few factors: - A rename can occur alongside other modifications to the file. These are listed as separate events in file_changes but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union so modified events can be selected. 
For example, below the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change causing `src/Functions/geometryFromColumn.h` to be retained. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICAgIGNoYW5nZV90eXBlLAogICAgICBwYXRoLAogICAgICBvbGRfcGF0aCwKICAgICAgdGltZSwKICAgICAgY29tbWl0X2hhc2gKICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogIFdIRVJFIChwYXRoID0gJ3NyYy9GdW5jdGlvbnMvZ2VvbWV0cnlGcm9tQ29sdW1uLmgnKSBPUiAob2xkX3BhdGggPSAnc3JjL0Z1bmN0aW9ucy9nZW9tZXRyeUZyb21Db2x1bW4uaCcpCg==) +[play](https://sql.clickhouse.com?query_id=SCXWMR9GBMJ9UNZYQXQBFA) ```sql SELECT @@ -454,7 +454,7 @@ These differences shouldn't meaningfully impact our analysis. **We welcome impro Limiting to current files, we consider the number of modifications to be the sum of deletes and additions. -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgKyBzdW0obGluZXNfZGVsZXRlZCkgQVMgbW9kaWZpY2F0aW9ucwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBtb2RpZmljYXRpb25zIERFU0MKTElNSVQgMTA=) +[play](https://sql.clickhouse.com?query_id=MHXPSBNPTDMJYR3OYSXVR7) ```sql WITH current_files AS @@ -507,7 +507,7 @@ LIMIT 10 ## What day of the week do commits usually occur? -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrCg==) +[play](https://sql.clickhouse.com?query_id=GED2STFSYJDRAA59H8RLIV) ```sql SELECT @@ -534,7 +534,7 @@ This makes sense with some productivity drop-off on Fridays. Great to see people This would produce a large query result that is unrealistic to show or visualize if unfiltered. We, therefore, allow a file or subdirectory to be filtered in the following example. Here we group by week using the `toStartOfWeek` function - adapt as required. 
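+
+As a standalone illustration (not part of the analysis below), `toStartOfWeek` simply truncates a timestamp to the first day of its week:
+
+```sql
+-- 2022-11-03 is a Thursday; with the default mode the week starts on Sunday
+SELECT toStartOfWeek(toDateTime('2022-11-03 12:00:00')) AS week;   -- 2022-10-30
+```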
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB3ZWVrLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkLAogICAgdW5pcShjb21taXRfaGFzaCkgQVMgbnVtX2NvbW1pdHMsCiAgICB1bmlxKGF1dGhvcikgQVMgYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIExJS0UgJ3NyYy9TdG9yYWdlcyUnCkdST1VQIEJZIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawpPUkRFUiBCWSB3ZWVrIEFTQwpMSU1JVCAxMAo=) +[play](https://sql.clickhouse.com?query_id=REZRXDVU7CAWT5WKNJSTNY) ```sql SELECT @@ -578,7 +578,7 @@ This data visualizes well. Below we use Superset. Limit to current files only. -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHVuaXEoYXV0aG9yKSBBUyBudW1fYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIG51bV9hdXRob3JzIERFU0MKTElNSVQgMTA=) +[play](https://sql.clickhouse.com?query_id=CYQFNQNK9TAMPU2OZ8KG5Y) ```sql WITH current_files AS @@ -633,7 +633,7 @@ LIMIT 10 Limited to current files only. -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgYW55KHBhdGgpIEFTIGZpbGVfcGF0aCwKICAgIGxpbmUsCiAgICBtYXgodGltZSkgQVMgbGF0ZXN0X2NoYW5nZSwKICAgIGFueShmaWxlX2NoYW5nZV90eXBlKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBsaW5lCk9SREVSIEJZIGxhdGVzdF9jaGFuZ2UgQVNDCkxJTUlUIDEw) +[play](https://sql.clickhouse.com?query_id=VWPBPGRZVGTHOCQYWNQZNT) ```sql WITH current_files AS @@ -690,7 +690,7 @@ LIMIT 10 Limited to current files only. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY291bnQoKSBBUyBjLAogICAgcGF0aCwKICAgIG1heCh0aW1lKSBBUyBsYXRlc3RfY2hhbmdlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEw) +[play](https://sql.clickhouse.com?query_id=VWPBPGRZVGTHOCQYWNQZNT) ```sql WITH current_files AS @@ -750,7 +750,7 @@ Our core data structure, the Merge Tree, is obviously under constant evolution w Do we write more docs at certain times of the month e.g., around release dates? We can use the `countIf` function to compute a simple ratio, visualizing the result using the `bar` function. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXksCiAgICBiYXIoZG9jc19yYXRpbyAqIDEwMDAsIDAsIDEwMCwgMTAwKSBBUyBiYXIKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXksCiAgICAgICAgY291bnRJZihmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBUyBjb2RlLAogICAgICAgIGNvdW50SWYoZmlsZV9leHRlbnNpb24gPSAnbWQnKSBBUyBkb2NzLAogICAgICAgIGRvY3MgLyAoY29kZSArIGRvY3MpIEFTIGRvY3NfcmF0aW8KICAgIEZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCiAgICBXSEVSRSAoc2lnbiA9IDEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnbWQnKSkKICAgIEdST1VQIEJZIGRheU9mTW9udGgodGltZSkgQVMgZGF5CikK) +[play](https://sql.clickhouse.com?query_id=BA4RZUXUHNQBH9YK7F2T9J) ```sql SELECT @@ -811,7 +811,7 @@ Maybe a little more near the end of the month, but overall we keep a good even d We consider diversity here to be the number of unique files an author has contributed to. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICB1bmlxKHBhdGgpIEFTIG51bV9maWxlcwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYXV0aG9yCk9SREVSIEJZIG51bV9maWxlcyBERVNDCkxJTUlUIDEw) +[play](https://sql.clickhouse.com?query_id=MT8WBABUKYBYSBA78W5TML) ```sql SELECT @@ -841,7 +841,7 @@ LIMIT 10 Let's see who has the most diverse commits in their recent work. 
Rather than limit by date, we'll restrict to an author's last N commits (in this case, we've used 3 but feel free to modify): -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBzdW0obnVtX2ZpbGVzX2NvbW1pdCkgQVMgbnVtX2ZpbGVzCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzX2NvbW1pdCwKICAgICAgICBtYXgodGltZSkgQVMgY29tbWl0X3RpbWUKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoCiAgICBPUkRFUiBCWQogICAgICAgIGF1dGhvciBBU0MsCiAgICAgICAgY29tbWl0X3RpbWUgREVTQwogICAgTElNSVQgMyBCWSBhdXRob3IKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbnVtX2ZpbGVzIERFU0MKTElNSVQgMTA=) +[play](https://sql.clickhouse.com?query_id=4Q3D67FWRIVWTY8EIDDE5U) ```sql SELECT @@ -888,7 +888,7 @@ LIMIT 10 Here we select our founder [Alexey Milovidov](https://github.com/alexey-milovidov) and limit our analysis to current files. -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoYXV0aG9yID0gJ0FsZXhleSBNaWxvdmlkb3YnKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=OKGZBACRHVGCRAGCZAJKMF) ```sql WITH current_files AS @@ -941,7 +941,7 @@ LIMIT 10 This makes sense because Alexey has been responsible for maintaining the Change log. But what if we use the base name of the file to identify his popular files - this allows for renames and should focus on code contributions. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBiYXNlLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIChhdXRob3IgPSAnQWxleGV5IE1pbG92aWRvdicpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYmFzZW5hbWUocGF0aCkgQVMgYmFzZQpPUkRFUiBCWSBjIERFU0MKTElNSVQgMTA=) +[play](https://sql.clickhouse.com?query_id=P9PBDZGOSVTKXEXU73ZNAJ) ```sql SELECT @@ -976,7 +976,7 @@ For this, we first need to identify the largest files. Estimating this via a ful To estimate, assuming we restrict to current files, we sum line additions and subtract deletions. We can then compute a ratio of length to the number of authors. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiAoY3VycmVudF9maWxlcykKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=PVSDOHZYUMRDDUZFEYJC7J) ```sql WITH current_files AS @@ -1031,7 +1031,7 @@ LIMIT 10 Text dictionaries aren't maybe realistic, so lets restrict to code only via a file extension filter! -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgbGluZXNfYXV0aG9yX3JhdGlvIERFU0MKTElNSVQgMTA=) +[play](https://sql.clickhouse.com?query_id=BZHGWUIZMPZZUHS5XRBK2M) ```sql WITH current_files AS @@ -1085,7 +1085,7 @@ LIMIT 10 There is some recency bias in this - newer files have fewer opportunities for commits. What about if we restrict to files at least 1 yr old? 
-[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgbWluKHRpbWUpIEFTIG1pbl9kYXRlLAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKSEFWSU5HIG1pbl9kYXRlIDw9IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=RMHHZEDHFUCBGRQVQA2732) ```sql WITH current_files AS @@ -1144,7 +1144,7 @@ LIMIT 10 We interpret this as the number of lines added and removed by the day of the week. In this case, we focus on the [Functions directory](https://github.com/ClickHouse/ClickHouse/tree/master/src/Functions) -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlPZldlZWssCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvRGF5T2ZXZWVrKHRpbWUpIEFTIGRheU9mV2Vlaw==) +[play](https://sql.clickhouse.com?query_id=PF3KEMYG5CVLJGCFYQEGB1) ```sql SELECT @@ -1171,7 +1171,7 @@ GROUP BY toDayOfWeek(time) AS dayOfWeek And by time of day, -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXk=) +[play](https://sql.clickhouse.com?query_id=Q4VDVKEGHHRBCUJHNCVTF1) ```sql SELECT @@ -1215,7 +1215,7 @@ GROUP BY toHour(time) AS hourOfDay This distribution makes sense given most of our development team is in Amsterdam. 
The `bar` functions helps us visualize these distributions: -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICBiYXIoY29tbWl0cywgMCwgNDAwLCA1MCkgQVMgY29tbWl0cywKICAgIGJhcihsaW5lc19hZGRlZCwgMCwgMzAwMDAsIDUwKSBBUyBsaW5lc19hZGRlZCwKICAgIGJhcihsaW5lc19kZWxldGVkLCAwLCAxNTAwMCwgNTApIEFTIGxpbmVzX2RlbGV0ZWQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBob3VyT2ZEYXksCiAgICAgICAgdW5pcShjb21taXRfaGFzaCkgQVMgY29tbWl0cywKICAgICAgICBzdW0obGluZXNfYWRkZWQpIEFTIGxpbmVzX2FkZGVkLAogICAgICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgV0hFUkUgcGF0aCBMSUtFICdzcmMvRnVuY3Rpb25zJScKICAgIEdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXkKKQ==) +[play](https://sql.clickhouse.com?query_id=9AZ8CENV8N91YGW7T6IB68) ```sql SELECT @@ -1269,7 +1269,7 @@ FROM The `sign = -1` indicates a code deletion. We exclude punctuation and the insertion of empty lines. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBwcmV2X2F1dGhvciB8fCAnKGEpJyBhcyBhZGRfYXV0aG9yLAogICAgYXV0aG9yICB8fCAnKGQpJyBhcyBkZWxldGVfYXV0aG9yLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCldIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikgQU5EIChwcmV2X2F1dGhvciAhPSAnJykKR1JPVVAgQlkKICAgIHByZXZfYXV0aG9yLAogICAgYXV0aG9yCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxIEJZIHByZXZfYXV0aG9yCkxJTUlUIDEwMA==) +[play](https://sql.clickhouse.com?query_id=448O8GWAHY3EM6ZZ7AGLAM) ```sql SELECT @@ -1325,7 +1325,7 @@ Alexey clearly likes removing other peoples code. Lets exclude him for a more ba If we consider by just number of commits: -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkKICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlaywKICAgIGF1dGhvcgpPUkRFUiBCWQogICAgZGF5X29mX3dlZWsgQVNDLAogICAgYyBERVNDCkxJTUlUIDEgQlkgZGF5X29mX3dlZWs=) +[play](https://sql.clickhouse.com?query_id=WXPKFJCAHOKYKEVTWNFVCY) ```sql SELECT @@ -1356,7 +1356,7 @@ LIMIT 1 BY day_of_week OK, some possible advantages here to the longest contributor - our founder Alexey. Lets limit our analysis to the last year. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKV0hFUkUgdGltZSA+IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpHUk9VUCBCWQogICAgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrLAogICAgYXV0aG9yCk9SREVSIEJZCiAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICBjIERFU0MKTElNSVQgMSBCWSBkYXlfb2Zfd2Vlaw==) +[play](https://sql.clickhouse.com?query_id=8YRJGHFTNJAWJ96XCJKKEH) ```sql SELECT @@ -1390,7 +1390,7 @@ This is still a little simple and doesn't reflect people's work. A better metric might be who is the top contributor each day as a fraction of the total work performed in the last year. Note that we treat the deletion and adding code equally. 
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0b3BfYXV0aG9yLmRheV9vZl93ZWVrLAogICAgdG9wX2F1dGhvci5hdXRob3IsCiAgICB0b3BfYXV0aG9yLmF1dGhvcl93b3JrIC8gYWxsX3dvcmsudG90YWxfd29yayBBUyB0b3BfYXV0aG9yX3BlcmNlbnQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBhdXRob3IsCiAgICAgICAgc3VtKGxpbmVzX2FkZGVkKSArIHN1bShsaW5lc19kZWxldGVkKSBBUyBhdXRob3Jfd29yawogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIHRpbWUgPiAobm93KCkgLSB0b0ludGVydmFsWWVhcigxKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlawogICAgT1JERVIgQlkKICAgICAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICAgICAgYXV0aG9yX3dvcmsgREVTQwogICAgTElNSVQgMSBCWSBkYXlfb2Zfd2VlawopIEFTIHRvcF9hdXRob3IKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBzdW0obGluZXNfYWRkZWQpICsgc3VtKGxpbmVzX2RlbGV0ZWQpIEFTIHRvdGFsX3dvcmsKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSB0aW1lID4gKG5vdygpIC0gdG9JbnRlcnZhbFllYXIoMSkpCiAgICBHUk9VUCBCWSBkYXlPZldlZWsodGltZSkgQVMgZGF5X29mX3dlZWsKKSBBUyBhbGxfd29yayBVU0lORyAoZGF5X29mX3dlZWsp) +[play](https://sql.clickhouse.com?query_id=VQF4KMRDSUEXGS1JFVDJHV) ```sql SELECT @@ -1440,7 +1440,7 @@ INNER JOIN We limit the analysis to the current files. For brevity, we restrict the results to a depth of 2 with 5 files per root folder. Adjust as required. -[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY29uY2F0KHJvb3QsICcvJywgc3ViX2ZvbGRlcikgQVMgZm9sZGVyLAogICAgcm91bmQoYXZnKGRheXNfcHJlc2VudCkpIEFTIGF2Z19hZ2Vfb2ZfZmlsZXMsCiAgICBtaW4oZGF5c19wcmVzZW50KSBBUyBtaW5fYWdlX2ZpbGVzLAogICAgbWF4KGRheXNfcHJlc2VudCkgQVMgbWF4X2FnZV9maWxlcywKICAgIGNvdW50KCkgQVMgYwpGUk9NCigKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgZGF0ZURpZmYoJ2RheScsIG1pbih0aW1lKSwgdG9EYXRlKCcyMDIyLTExLTAzJykpIEFTIGRheXNfcHJlc2VudAogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIChwYXRoIElOIChjdXJyZW50X2ZpbGVzKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzFdIEFTIHJvb3QsCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzJdIEFTIHN1Yl9mb2xkZXIKT1JERVIgQlkKICAgIHJvb3QgQVNDLAogICAgYyBERVNDCkxJTUlUIDUgQlkgcm9vdAo=) +[play](https://sql.clickhouse.com?query_id=6YWAUQYPZINZDJGBEZBNWG) ```sql WITH current_files AS @@ -1523,7 +1523,7 @@ LIMIT 5 BY root For this question, we need the number of lines written by an author divided by the total number of lines they have had removed by another contributor. 
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBrLAogICAgd3JpdHRlbl9jb2RlLmMsCiAgICByZW1vdmVkX2NvZGUuYywKICAgIHJlbW92ZWRfY29kZS5jIC8gd3JpdHRlbl9jb2RlLmMgQVMgcmVtb3ZlX3JhdGlvCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yIEFTIGssCiAgICAgICAgY291bnQoKSBBUyBjCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnKSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgIEdST1VQIEJZIGsKKSBBUyB3cml0dGVuX2NvZGUKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBwcmV2X2F1dGhvciBBUyBrLAogICAgICAgIGNvdW50KCkgQVMgYwogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikKICAgIEdST1VQIEJZIGsKKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKGspCldIRVJFIHdyaXR0ZW5fY29kZS5jID4gMTAwMApPUkRFUiBCWSByZW1vdmVfcmF0aW8gREVTQwpMSU1JVCAxMAo=) +[play](https://sql.clickhouse.com?query_id=T4DTWTB36WFSEYAZLMGRNF) ```sql SELECT @@ -1627,7 +1627,7 @@ This doesn't capture the notion of a "re-write" however, where a large portion o The query is limited to the current files only. We list all file changes by grouping by `path` and `commit_hash`, returning the number of lines added and removed. Using a window function, we estimate the file's total size at any moment in time by performing a cumulative sum and estimating the impact of any change on file size as `lines added - lines removed`. Using this statistic, we can calculate the percentage of the file that has been added or removed for each change. Finally, we count the number of file changes that constitute a rewrite per file i.e. `(percent_add >= 0.5) AND (percent_delete >= 0.5) AND current_size > 50`. Note we require files to be more than 50 lines to avoid early contributions to a file being counted as a rewrite. This also avoids a bias to very small files, which may be more likely to be rewritten. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGNoYW5nZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgIGFueShsaW5lc19kZWxldGVkKSBBUyBudW1fZGVsZXRlZCwKICAgICAgICAgICAgYW55KGNoYW5nZV90eXBlKSBBUyB0eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgKQpTRUxFQ1QKICAgIHBhdGgsCiAgICBjb3VudCgpIEFTIG51bV9yZXdyaXRlcwpGUk9NIHJld3JpdGVzCldIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBudW1fcmV3cml0ZXMgREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=5PL1QLNSH6QQTR8H9HINNP) ```sql WITH @@ -1719,7 +1719,7 @@ We query for lines added, joining this with the lines removed - filtering to cas Finally, we aggregate across this dataset to compute the average number of days lines stay in the repository by the day of the week. 
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2Vla19hZGRlZCwKICAgIGNvdW50KCkgQVMgbnVtLAogICAgYXZnKGRheXNfcHJlc2VudCkgQVMgYXZnX2RheXNfcHJlc2VudApGUk9NCigKICAgIFNFTEVDVAogICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICBhZGRlZF9jb2RlLnRpbWUgQVMgYWRkZWRfZGF5LAogICAgICAgIGRhdGVEaWZmKCdkYXknLCBhZGRlZF9jb2RlLnRpbWUsIHJlbW92ZWRfY29kZS50aW1lKSBBUyBkYXlzX3ByZXNlbnQKICAgIEZST00KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBsaW5lCiAgICApIEFTIGFkZGVkX2NvZGUKICAgIElOTkVSIEpPSU4KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAobGluZV90eXBlIE5PVCBJTiAoJ1B1bmN0JywgJ0VtcHR5JykpCiAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZQogICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICBXSEVSRSByZW1vdmVkX2NvZGUudGltZSA+IGFkZGVkX2NvZGUudGltZQopCkdST1VQIEJZIGRheU9mV2VlayhhZGRlZF9kYXkpIEFTIGRheV9vZl93ZWVrX2FkZGVk) +[play](https://sql.clickhouse.com?query_id=GVF23LEZTNZI22BT8LZBBE) ```sql SELECT @@ -1778,7 +1778,7 @@ GROUP BY dayOfWeek(added_day) AS day_of_week_added This query uses the same principle as [What weekday does the code have the highest chance to stay in the repository](#what-weekday-does-the-code-have-the-highest-chance-to-stay-in-the-repository) - by aiming to uniquely identify a line of code using the path and line contents. This allows us to identify the time between when a line was added and removed. We filter to current files and code only, however, and average the time for each file across lines. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGxpbmVzX3JlbW92ZWQgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgYWRkZWRfY29kZS5wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICAgICAgYWRkZWRfY29kZS50aW1lIEFTIGFkZGVkX2RheSwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIGFkZGVkX2NvZGUudGltZSwgcmVtb3ZlZF9jb2RlLnRpbWUpIEFTIGRheXNfcHJlc2VudAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgdGltZSwKICAgICAgICAgICAgICAgIGFueShmaWxlX2V4dGVuc2lvbikgQVMgZmlsZV9leHRlbnNpb24KICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUKICAgICAgICApIEFTIGFkZGVkX2NvZGUKICAgICAgICBJTk5FUiBKT0lOCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAtMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lCiAgICAgICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICAgICAgV0hFUkUgKHJlbW92ZWRfY29kZS50aW1lID4gYWRkZWRfY29kZS50aW1lKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGF2ZyhkYXlzX3ByZXNlbnQpIEFTIGF2Z19jb2RlX2FnZQpGUk9NIGxpbmVzX3JlbW92ZWQKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBhdmdfY29kZV9hZ2UgREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=3CYYT7HEHWRFHVCM9JCKSU) ```sql WITH @@ -1869,7 +1869,7 @@ There are a few ways we can address this question. Focusing on the code to test Note we limit to users with more than 20 changes to focus on regular committers and avoid a bias to one-off contributions. 
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcsICdzaCcsICdweScsICdleHBlY3QnKSkgQU5EIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkgQVMgdGVzdCwKICAgIGNvdW50SWYoKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpIEFORCAoTk9UIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkpIEFTIGNvZGUsCiAgICBjb2RlIC8gKGNvZGUgKyB0ZXN0KSBBUyByYXRpb19jb2RlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCkdST1VQIEJZIGF1dGhvcgpIQVZJTkcgY29kZSA+IDIwCk9SREVSIEJZIGNvZGUgREVTQwpMSU1JVCAyMA==) +[play](https://sql.clickhouse.com?query_id=JGKZSEQDPDTDKZXD3ZCGLE) ```sql SELECT @@ -1911,7 +1911,7 @@ LIMIT 20 We can plot this distribution as a histogram. -[play](https://play.clickhouse.com/play?user=play#V0lUSCAoCiAgICAgICAgU0VMRUNUIGhpc3RvZ3JhbSgxMCkocmF0aW9fY29kZSkgQVMgaGlzdAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgY291bnRJZigoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnc2gnLCAncHknLCAnZXhwZWN0JykpIEFORCAocGF0aCBMSUtFICcldGVzdHMlJykpIEFTIHRlc3QsCiAgICAgICAgICAgICAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBTkQgKE5PVCAocGF0aCBMSUtFICcldGVzdHMlJykpKSBBUyBjb2RlLAogICAgICAgICAgICAgICAgY29kZSAvIChjb2RlICsgdGVzdCkgQVMgcmF0aW9fY29kZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWSBjb2RlIERFU0MKICAgICAgICAgICAgTElNSVQgMjAKICAgICAgICApCiAgICApIEFTIGhpc3QKU0VMRUNUCiAgICBhcnJheUpvaW4oaGlzdCkuMSBBUyBsb3dlciwKICAgIGFycmF5Sm9pbihoaXN0KS4yIEFTIHVwcGVyLAogICAgYmFyKGFycmF5Sm9pbihoaXN0KS4zLCAwLCAxMDAsIDUwMCkgQVMgYmFy) +[play](https://sql.clickhouse.com?query_id=S5AJIIRGSUAY1JXEVHQDAK) ```sql WITH ( @@ -1954,7 +1954,7 @@ Most contributors write more code than tests, as you'd expect. What about who adds the most comments when contributing code? -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBhdmcocmF0aW9fY29tbWVudHMpIEFTIGF2Z19yYXRpb19jb21tZW50cywKICAgIHN1bShjb2RlKSBBUyBjb2RlCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIGNvdW50SWYobGluZV90eXBlID0gJ0NvbW1lbnQnKSBBUyBjb21tZW50cywKICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICBpZihjb21tZW50cyA+IDAsIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSksIDApIEFTIHJhdGlvX2NvbW1lbnRzCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgR1JPVVAgQlkKICAgICAgICBhdXRob3IsCiAgICAgICAgY29tbWl0X2hhc2gKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgY29kZSBERVNDCkxJTUlUIDEwCg==) +[play](https://sql.clickhouse.com?query_id=EXPHDIURBTOXXOK1TGNNYD) ```sql SELECT @@ -2038,7 +2038,7 @@ To compute this, we first work out each author's comments ratio over time - simi After calculating the average by-week offset across all authors, we sample these results by selecting every 10th week. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSCBhdXRob3JfcmF0aW9zX2J5X29mZnNldCBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRhdGVEaWZmKCd3ZWVrJywgc3RhcnRfZGF0ZXMuc3RhcnRfZGF0ZSwgY29udHJpYnV0aW9ucy53ZWVrKSBBUyB3ZWVrX29mZnNldCwKICAgICAgICAgICAgcmF0aW9fY29kZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mV2VlayhtaW4odGltZSkpIEFTIHN0YXJ0X2RhdGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKQogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IgQVMgc3RhcnRfZGF0ZXMKICAgICAgICApIEFTIHN0YXJ0X2RhdGVzCiAgICAgICAgSU5ORVIgSk9JTgogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICAgICAgICAgIGNvdW50SWYoKGxpbmVfdHlwZSA9ICdDb21tZW50JykgT1IgKGxpbmVfdHlwZSA9ICdQdW5jdCcpKSBBUyBjb21tZW50cywKICAgICAgICAgICAgICAgIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSkgQVMgcmF0aW9fY29kZSwKICAgICAgICAgICAgICAgIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkgQU5EIChzaWduID0gMSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHRpbWUsCiAgICAgICAgICAgICAgICBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIHRpbWUgQVNDCiAgICAgICAgKSBBUyBjb250cmlidXRpb25zIFVTSU5HIChhdXRob3IpCiAgICApClNFTEVDVAogICAgd2Vla19vZmZzZXQsCiAgICBhdmcocmF0aW9fY29kZSkgQVMgYXZnX2NvZGVfcmF0aW8KRlJPTSBhdXRob3JfcmF0aW9zX2J5X29mZnNldApHUk9VUCBCWSB3ZWVrX29mZnNldApIQVZJTkcgKHdlZWtfb2Zmc2V0ICUgMTApID0gMApPUkRFUiBCWSB3ZWVrX29mZnNldCBBU0MKTElNSVQgMjAK) +[play](https://sql.clickhouse.com?query_id=SBHEWR8XC4PRHY13HPPKCN) ```sql WITH author_ratios_by_offset AS @@ -2116,7 +2116,7 @@ Encouragingly, our comment % is pretty constant and doesn't degrade the longer a We can use the same principle as [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors) to identify rewrites but consider all files. A window function is used to compute the time between rewrites for each file. From this, we can calculate an average and median across all files. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgICAgICBhbnkobGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGNoYW5nZV90eXBlIElOICgnQWRkJywgJ01vZGlmeScpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICAgICAgICAgIEdST1VQIEJZCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gKICAgICAgICAgICAgT1JERVIgQlkKICAgICAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICAgICAgbWF4X3RpbWUgQVNDCiAgICAgICAgKQogICAgKSwKICAgIHJld3JpdGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICosCiAgICAgICAgICAgIGFueShtYXhfdGltZSkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX3Jld3JpdGUsCiAgICAgICAgICAgIGRhdGVEaWZmKCdkYXknLCBwcmV2aW91c19yZXdyaXRlLCBtYXhfdGltZSkgQVMgcmV3cml0ZV9kYXlzCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIGF2Z0lmKHJld3JpdGVfZGF5cywgcmV3cml0ZV9kYXlzID4gMCkgQVMgYXZnX3Jld3JpdGVfdGltZSwKICAgIHF1YW50aWxlc1RpbWluZ0lmKDAuNSkocmV3cml0ZV9kYXlzLCByZXdyaXRlX2RheXMgPiAwKSBBUyBoYWxmX2xpZmUKRlJPTSByZXdyaXRlcw==) +[play](https://sql.clickhouse.com?query_id=WSHUEPJP9TNJUH7QITWWOR) ```sql WITH @@ -2176,7 +2176,7 @@ FROM rewrites Similar to [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) and [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors), except we aggregate by day of week. Adjust as required e.g. month of year. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfYWRkZWQpIEFTIG51bV9hZGRlZCwKICAgICAgICAgICAgICAgIGFueShmaWxlX2xpbmVzX2RlbGV0ZWQpIEFTIG51bV9kZWxldGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGZpbGVfY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgcGF0aCBBU0MsCiAgICAgICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICAgICApCiAgICApLAogICAgcmV3cml0ZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QgYW55KG1heF90aW1lKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MgUk9XUyBCRVRXRUVOIDEgUFJFQ0VESU5HIEFORCBDVVJSRU5UIFJPVykgQVMgcHJldmlvdXNfcmV3cml0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgICAgIFdIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKICAgICkKU0VMRUNUCiAgICBkYXlPZldlZWsocHJldmlvdXNfcmV3cml0ZSkgQVMgZGF5T2ZXZWVrLAogICAgY291bnQoKSBBUyBudW1fcmVfd3JpdGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgZGF5T2ZXZWVr) +[play](https://sql.clickhouse.com?query_id=8PQNWEWHAJTGN6FTX59KH2) ```sql WITH @@ -2240,7 +2240,7 @@ GROUP BY dayOfWeek We define "sticky" as how long does an author's code stay before its rewritten. Similar to the previous question [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) - using the same metric for rewrites i.e. 50% additions and 50% deletions to the file. We compute the average rewrite time per author and only consider contributors with more than two files. 
-[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICBtYXhfdGltZSwKICAgICAgICAgICAgdHlwZSwKICAgICAgICAgICAgbnVtX2FkZGVkLAogICAgICAgICAgICBudW1fZGVsZXRlZCwKICAgICAgICAgICAgc3VtKG51bV9hZGRlZCAtIG51bV9kZWxldGVkKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MpIEFTIGN1cnJlbnRfc2l6ZSwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2FkZGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2FkZCwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2RlbGV0ZWQgLyBjdXJyZW50X3NpemUsIDApIEFTIHBlcmNlbnRfZGVsZXRlCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgYW55KGF1dGhvcikgQVMgYXV0aG9yLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9saW5lc19hZGRlZCkgQVMgbnVtX2FkZGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9jaGFuZ2VfdHlwZSkgQVMgdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9jaGFuZ2VfdHlwZSBJTiAoJ0FkZCcsICdNb2RpZnknKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoCiAgICAgICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgICAgICBwYXRoIEFTQywKICAgICAgICAgICAgICAgIG1heF90aW1lIEFTQwogICAgICAgICkKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICAqLAogICAgICAgICAgICBhbnkobWF4X3RpbWUpIE9WRVIgKFBBUlRJVElPTiBCWSBwYXRoIE9SREVSIEJZIG1heF90aW1lIEFTQyBST1dTIEJFVFdFRU4gMSBQUkVDRURJTkcgQU5EIENVUlJFTlQgUk9XKSBBUyBwcmV2aW91c19yZXdyaXRlLAogICAgICAgICAgICBkYXRlRGlmZignZGF5JywgcHJldmlvdXNfcmV3cml0ZSwgbWF4X3RpbWUpIEFTIHJld3JpdGVfZGF5cywKICAgICAgICAgICAgYW55KGF1dGhvcikgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZfYXV0aG9yCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIHByZXZfYXV0aG9yLAogICAgYXZnKHJld3JpdGVfZGF5cykgQVMgYywKICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgcHJldl9hdXRob3IKSEFWSU5HIG51bV9maWxlcyA+IDIKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEwCg==) +[play](https://sql.clickhouse.com?query_id=BKHLVVWN5SET1VTIFQ8JVK) ```sql WITH @@ -2319,7 +2319,7 @@ This query first requires us to calculate the days when an author has committed. Our subsequent array functions compute each author's longest sequence of consecutive ones. First, the `groupArray` function is used to collate all `consecutive_day` values for an author. This array of 1s and 0s, is then split on 0 values into subarrays. Finally, we calculate the longest subarray. 
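As a quick, self-contained sketch of that array pipeline, applied here to a hypothetical 0/1 sequence instead of `groupArray(consecutive_day)`:

```sql
-- Split the sequence at every 0 and take the length of the longest piece.
-- For this input the longest streak of consecutive 1s is 3, and the expression returns 3.
SELECT arrayMax(arrayMap(x -> length(x), arraySplit(x -> (x = 0), [1, 1, 1, 0, 0, 1, 1]))) AS max_consecutive_days
```

The full query below applies exactly the same expression per author.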
-[play](https://play.clickhouse.com/play?user=play#V0lUSCBjb21taXRfZGF5cyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRheSwKICAgICAgICAgICAgYW55KGRheSkgT1ZFUiAoUEFSVElUSU9OIEJZIGF1dGhvciBPUkRFUiBCWSBkYXkgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX2NvbW1pdCwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIHByZXZpb3VzX2NvbW1pdCwgZGF5KSBBUyBkYXlzX3NpbmNlX2xhc3QsCiAgICAgICAgICAgIGlmKGRheXNfc2luY2VfbGFzdCA9IDEsIDEsIDApIEFTIGNvbnNlY3V0aXZlX2RheQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mRGF5KHRpbWUpIEFTIGRheQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIGF1dGhvciwKICAgICAgICAgICAgICAgIGRheQogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIGRheSBBU0MKICAgICAgICApCiAgICApClNFTEVDVAogICAgYXV0aG9yLAogICAgYXJyYXlNYXgoYXJyYXlNYXAoeCAtPiBsZW5ndGgoeCksIGFycmF5U3BsaXQoeCAtPiAoeCA9IDApLCBncm91cEFycmF5KGNvbnNlY3V0aXZlX2RheSkpKSkgQVMgbWF4X2NvbnNlY3V0aXZlX2RheXMKRlJPTSBjb21taXRfZGF5cwpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbWF4X2NvbnNlY3V0aXZlX2RheXMgREVTQwpMSU1JVCAxMA==) +[play](https://sql.clickhouse.com?query_id=S3E64UYCAMDAYJRSXINVFR) ```sql WITH commit_days AS @@ -2372,7 +2372,7 @@ LIMIT 10 Files can be renamed. When this occurs, we get a rename event, where the `path` column is set to the new path of the file and the `old_path` represents the previous location e.g. -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgcGF0aCwKICAgIG9sZF9wYXRoLAogICAgY29tbWl0X2hhc2gsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQU5EIChjaGFuZ2VfdHlwZSA9ICdSZW5hbWUnKQ==) +[play](https://sql.clickhouse.com?query_id=AKTW3Z8JZAPQ4H9BH2ZFRX) ```sql SELECT @@ -2410,8 +2410,6 @@ By calling `file_path_history('src/Storages/StorageReplicatedMergeTree.cpp')` we For example, -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQVMgcGF0aHMK) - ```sql SELECT file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') AS paths @@ -2424,8 +2422,6 @@ SELECT file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') AS paths We can use this capability to now assemble the commits for the entire history of a file. In this example, we show one commit for each of the `path` values. 
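The full query keeps one commit per `path` with ClickHouse's `LIMIT 1 BY path` clause, which retains the first row for each distinct `path` after sorting. A tiny sketch on made-up rows:

```sql
SELECT path, commit
FROM values('path String, commit UInt32', ('a.cpp', 1), ('a.cpp', 2), ('b.cpp', 3))
ORDER BY commit DESC
LIMIT 1 BY path
```

This returns a single row per path: `('b.cpp', 3)` and `('a.cpp', 2)`.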
-[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgY29tbWl0X21lc3NhZ2UKRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiBmaWxlX3BhdGhfaGlzdG9yeSgnc3JjL1N0b3JhZ2VzL1N0b3JhZ2VSZXBsaWNhdGVkTWVyZ2VUcmVlLmNwcCcpCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxIEJZIHBhdGgKRk9STUFUIFByZXR0eUNvbXBhY3RNb25vQmxvY2s=) - ```sql SELECT time, @@ -2457,8 +2453,6 @@ This is particularly difficult to get an exact result due to the inability to cu An approximate solution, sufficient for a high-level analysis, may look something like this: -[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBsaW5lX251bWJlcl9uZXcsCiAgICBhcmdNYXgoYXV0aG9yLCB0aW1lKSwKICAgIGFyZ01heChsaW5lLCB0aW1lKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykKR1JPVVAgQlkgbGluZV9udW1iZXJfbmV3Ck9SREVSIEJZIGxpbmVfbnVtYmVyX25ldyBBU0MKTElNSVQgMjA=) - ```sql SELECT line_number_new, diff --git a/docs/en/getting-started/example-datasets/menus.md b/docs/en/getting-started/example-datasets/menus.md index 5a35c1d45bc..a364085eeeb 100644 --- a/docs/en/getting-started/example-datasets/menus.md +++ b/docs/en/getting-started/example-datasets/menus.md @@ -354,4 +354,4 @@ At least they have caviar with vodka. Very nice. ## Online Playground {#playground} -The data is uploaded to ClickHouse Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==). +The data is uploaded to ClickHouse Playground, [example](https://sql.clickhouse.com?query_id=KB5KQJJFNBKHE5GBUJCP1B). diff --git a/docs/en/getting-started/example-datasets/ontime.md b/docs/en/getting-started/example-datasets/ontime.md index 9efa1afb5c4..5e1f7c9c97f 100644 --- a/docs/en/getting-started/example-datasets/ontime.md +++ b/docs/en/getting-started/example-datasets/ontime.md @@ -386,7 +386,7 @@ ORDER BY c DESC LIMIT 10; ``` -You can also play with the data in Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==). +You can also play with the data in Playground, [example](https://sql.clickhouse.com?query_id=M4FSVBVMSHY98NKCQP8N4K). This performance test was created by Vadim Tkachenko. See: diff --git a/docs/en/getting-started/example-datasets/opensky.md b/docs/en/getting-started/example-datasets/opensky.md index c0b4d96725d..22f88ce274a 100644 --- a/docs/en/getting-started/example-datasets/opensky.md +++ b/docs/en/getting-started/example-datasets/opensky.md @@ -417,4 +417,4 @@ Result: ### Online Playground {#playground} -You can test other queries to this data set using the interactive resource [Online Playground](https://play.clickhouse.com/play?user=play). 
For example, [like this](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables here. +You can test other queries to this data set using the interactive resource [Online Playground](https://sql.clickhouse.com). For example, [like this](https://sql.clickhouse.com?query_id=BIPDVQNIGVEZFQYFEFQB7O). However, please note that you cannot create temporary tables here. diff --git a/docs/en/getting-started/example-datasets/recipes.md b/docs/en/getting-started/example-datasets/recipes.md index a8808e376e0..78520a34248 100644 --- a/docs/en/getting-started/example-datasets/recipes.md +++ b/docs/en/getting-started/example-datasets/recipes.md @@ -335,4 +335,4 @@ Result: ### Online Playground -The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). +The dataset is also available in the [Online Playground](https://sql.clickhouse.com?query_id=HQXNQZE26Z1QWYP9KC76ML). diff --git a/docs/en/getting-started/example-datasets/tpch.md b/docs/en/getting-started/example-datasets/tpch.md index 655d68720fe..5fa0d779ecd 100644 --- a/docs/en/getting-started/example-datasets/tpch.md +++ b/docs/en/getting-started/example-datasets/tpch.md @@ -155,10 +155,10 @@ The queries are generated by `./qgen -s `. Example queries for ` ## Queries -::::warning -TPC-H makes heavy use of correlated subqueries which are at the time of writing (October 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)). -As a result, many of below benchmark queries will fail with errors. -:::: +**Correctness** + +The result of the queries agrees with the official results unless mentioned otherwise. To verify, generate a TPC-H database with scale +factor = 1 (`dbgen`, see above) and compare with the [expected results in tpch-kit](https://github.com/gregrahn/tpch-kit/tree/master/dbgen/answers). **Q1** @@ -177,7 +177,7 @@ SELECT FROM lineitem WHERE - l_shipdate <= date '1998-12-01' - INTERVAL '90' DAY + l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY GROUP BY l_returnflag, l_linestatus @@ -234,6 +234,62 @@ ORDER BY p_partkey; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. 
+ +```sql +WITH MinSupplyCost AS ( + SELECT + ps_partkey, + MIN(ps_supplycost) AS min_supplycost + FROM + partsupp ps + JOIN + supplier s ON ps.ps_suppkey = s.s_suppkey + JOIN + nation n ON s.s_nationkey = n.n_nationkey + JOIN + region r ON n.n_regionkey = r.r_regionkey + WHERE + r.r_name = 'EUROPE' + GROUP BY + ps_partkey +) +SELECT + s.s_acctbal, + s.s_name, + n.n_name, + p.p_partkey, + p.p_mfgr, + s.s_address, + s.s_phone, + s.s_comment +FROM + part p +JOIN + partsupp ps ON p.p_partkey = ps.ps_partkey +JOIN + supplier s ON s.s_suppkey = ps.ps_suppkey +JOIN + nation n ON s.s_nationkey = n.n_nationkey +JOIN + region r ON n.n_regionkey = r.r_regionkey +JOIN + MinSupplyCost msc ON ps.ps_partkey = msc.ps_partkey AND ps.ps_supplycost = msc.min_supplycost +WHERE + p.p_size = 15 + AND p.p_type LIKE '%BRASS' + AND r.r_name = 'EUROPE' +ORDER BY + s.s_acctbal DESC, + n.n_name, + s.s_name, + p.p_partkey; +``` +:::: + **Q3** ```sql @@ -250,8 +306,8 @@ WHERE c_mktsegment = 'BUILDING' AND c_custkey = o_custkey AND l_orderkey = o_orderkey - AND o_orderdate < date '1995-03-15' - AND l_shipdate > date '1995-03-15' + AND o_orderdate < DATE '1995-03-15' + AND l_shipdate > DATE '1995-03-15' GROUP BY l_orderkey, o_orderdate, @@ -270,8 +326,8 @@ SELECT FROM orders WHERE - o_orderdate >= date '1993-07-01' - AND o_orderdate < date '1993-07-01' + INTERVAL '3' MONTH + o_orderdate >= DATE '1993-07-01' + AND o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH AND EXISTS ( SELECT * @@ -287,6 +343,39 @@ ORDER BY o_orderpriority; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH ValidLineItems AS ( + SELECT + l_orderkey + FROM + lineitem + WHERE + l_commitdate < l_receiptdate + GROUP BY + l_orderkey +) +SELECT + o.o_orderpriority, + COUNT(*) AS order_count +FROM + orders o +JOIN + ValidLineItems vli ON o.o_orderkey = vli.l_orderkey +WHERE + o.o_orderdate >= DATE '1993-07-01' + AND o.o_orderdate < DATE '1993-07-01' + INTERVAL '3' MONTH +GROUP BY + o.o_orderpriority +ORDER BY + o.o_orderpriority; +``` +:::: + **Q5** ```sql @@ -308,8 +397,8 @@ WHERE AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey AND r_name = 'ASIA' - AND o_orderdate >= date '1994-01-01' - AND o_orderdate < date '1994-01-01' + INTERVAL '1' year + AND o_orderdate >= DATE '1994-01-01' + AND o_orderdate < DATE '1994-01-01' + INTERVAL '1' year GROUP BY n_name ORDER BY @@ -324,12 +413,30 @@ SELECT FROM lineitem WHERE - l_shipdate >= date '1994-01-01' - AND l_shipdate < date '1994-01-01' + INTERVAL '1' year + l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year AND l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01 AND l_quantity < 24; ``` +::::note +As of October 2024, the query does not work out-of-the box due to a bug with Decimal addition. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70136 + +This alternative formulation works and was verified to return the reference results. 
+ +```sql +SELECT + sum(l_extendedprice * l_discount) AS revenue +FROM + lineitem +WHERE + l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year + AND l_discount BETWEEN 0.05 AND 0.07 + AND l_quantity < 24; +``` +:::: + **Q7** ```sql @@ -361,7 +468,7 @@ FROM ( (n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY') OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE') ) - AND l_shipdate BETWEEN date '1995-01-01' AND date '1996-12-31' + AND l_shipdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31' ) AS shipping GROUP BY supp_nation, @@ -406,7 +513,7 @@ FROM ( AND n1.n_regionkey = r_regionkey AND r_name = 'AMERICA' AND s_nationkey = n2.n_nationkey - AND o_orderdate BETWEEN date '1995-01-01' AND date '1996-12-31' + AND o_orderdate BETWEEN DATE '1995-01-01' AND DATE '1996-12-31' AND p_type = 'ECONOMY ANODIZED STEEL' ) AS all_nations GROUP BY @@ -471,8 +578,8 @@ FROM WHERE c_custkey = o_custkey AND l_orderkey = o_orderkey - AND o_orderdate >= date '1993-10-01' - AND o_orderdate < date '1993-10-01' + INTERVAL '3' MONTH + AND o_orderdate >= DATE '1993-10-01' + AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH AND l_returnflag = 'R' AND c_nationkey = n_nationkey GROUP BY @@ -544,8 +651,8 @@ WHERE AND l_shipmode in ('MAIL', 'SHIP') AND l_commitdate < l_receiptdate AND l_shipdate < l_commitdate - AND l_receiptdate >= date '1994-01-01' - AND l_receiptdate < date '1994-01-01' + INTERVAL '1' year + AND l_receiptdate >= DATE '1994-01-01' + AND l_receiptdate < DATE '1994-01-01' + INTERVAL '1' year GROUP BY l_shipmode ORDER BY @@ -576,6 +683,37 @@ ORDER BY c_count DESC; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH CustomerOrderCounts AS ( + SELECT + c.c_custkey, + count(o.o_orderkey) AS order_count + FROM + customer c + LEFT OUTER JOIN + orders o ON c.c_custkey = o.o_custkey + AND o.o_comment NOT LIKE '%special%requests%' + GROUP BY + c.c_custkey +) +SELECT + order_count AS c_count, + count(*) AS custdist +FROM + CustomerOrderCounts +GROUP BY + order_count +ORDER BY + custdist DESC, + c_count DESC; +``` +:::: + **Q14** ```sql @@ -590,8 +728,8 @@ FROM part WHERE l_partkey = p_partkey - AND l_shipdate >= date '1995-09-01' - AND l_shipdate < date '1995-09-01' + INTERVAL '1' MONTH; + AND l_shipdate >= DATE '1995-09-01' + AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH; ``` **Q15** @@ -604,8 +742,8 @@ CREATE VIEW revenue0 (supplier_no, total_revenue) AS FROM lineitem WHERE - l_shipdate >= date '1996-01-01' - AND l_shipdate < date '1996-01-01' + INTERVAL '3' MONTH + l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH GROUP BY l_suppkey; @@ -632,6 +770,26 @@ ORDER BY DROP VIEW revenue0; ``` +::::note +As of October 2024, the view definition does not work out-of-the box. 
Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70139 + +This alternative view definition does work: + +```sql +CREATE VIEW revenue0 AS + SELECT + l_suppkey AS supplier_no, + sum(l_extendedprice * (1 - l_discount)) AS total_revenue + FROM + lineitem + WHERE + l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH + GROUP BY + l_suppkey; +``` +:::: + **Q16** ```sql @@ -689,6 +847,37 @@ WHERE ); ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + +This alternative formulation works and was verified to return the reference results. + +```sql +WITH AvgQuantity AS ( + SELECT + l_partkey, + AVG(l_quantity) * 0.2 AS avg_quantity + FROM + lineitem + GROUP BY + l_partkey +) +SELECT + SUM(l.l_extendedprice) / 7.0 AS avg_yearly +FROM + lineitem l +JOIN + part p ON p.p_partkey = l.l_partkey +JOIN + AvgQuantity aq ON l.l_partkey = aq.l_partkey +WHERE + p.p_brand = 'Brand#23' + AND p.p_container = 'MED BOX' + AND l.l_quantity < aq.avg_quantity; + +``` +:::: + **Q18** ```sql @@ -731,7 +920,7 @@ ORDER BY ```sql SELECT - sum(l_extendedprice* (1 - l_discount)) AS revenue + sum(l_extendedprice * (1 - l_discount)) AS revenue FROM lineitem, part @@ -767,6 +956,46 @@ WHERE ); ``` +::::note +As of October 2024, the query is extremely slow due to missing join predicate pushdown. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/70802 + +This alternative formulation works and was verified to return the reference results. + +```sql +SELECT + sum(l_extendedprice * (1 - l_discount)) AS revenue +FROM + lineitem, + part +WHERE + p_partkey = l_partkey + AND l_shipinstruct = 'DELIVER IN PERSON' + AND l_shipmode IN ('AIR', 'AIR REG') + AND ( + ( + p_brand = 'Brand#12' + AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + AND l_quantity >= 1 AND l_quantity <= 1 + 10 + AND p_size BETWEEN 1 AND 5 + ) + OR + ( + p_brand = 'Brand#23' + AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + AND l_quantity >= 10 AND l_quantity <= 10 + 10 + AND p_size BETWEEN 1 AND 10 + ) + OR + ( + p_brand = 'Brand#34' + AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + AND l_quantity >= 20 AND l_quantity <= 20 + 10 + AND p_size BETWEEN 1 AND 15 + ) + ) +``` +:::: + **Q20** ```sql @@ -789,7 +1018,7 @@ WHERE FROM part WHERE - p_name LIKE 'forrest%' + p_name LIKE 'forest%' ) AND ps_availqty > ( SELECT @@ -799,8 +1028,8 @@ WHERE WHERE l_partkey = ps_partkey AND l_suppkey = ps_suppkey - AND l_shipdate >= date '1994-01-01' - AND l_shipdate < date '1994-01-01' + INTERVAL '1' year + AND l_shipdate >= DATE '1994-01-01' + AND l_shipdate < DATE '1994-01-01' + INTERVAL '1' year ) ) AND s_nationkey = n_nationkey @@ -809,6 +1038,10 @@ ORDER BY s_name; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 +:::: + **Q21** ```sql @@ -852,6 +1085,9 @@ ORDER BY numwait DESC, s_name; ``` +::::note +As of October 2024, the query does not work out-of-the box due to correlated subqueries. 
Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 +:::: **Q22** @@ -860,34 +1096,33 @@ SELECT cntrycode, count(*) AS numcust, sum(c_acctbal) AS totacctbal -FROM - ( - SELECT - substring(c_phone FROM 1 for 2) AS cntrycode, - c_acctbal - FROM - customer - WHERE - substring(c_phone FROM 1 for 2) in - ('13', '31', '23', '29', '30', '18', '17') - AND c_acctbal > ( - SELECT - avg(c_acctbal) - FROM - customer - WHERE - c_acctbal > 0.00 - AND substring(c_phone FROM 1 for 2) in - ('13', '31', '23', '29', '30', '18', '17') - ) - AND NOT EXISTS ( - SELECT - * - FROM - orders - WHERE - o_custkey = c_custkey - ) +FROM ( + SELECT + substring(c_phone FROM 1 for 2) AS cntrycode, + c_acctbal + FROM + customer + WHERE + substring(c_phone FROM 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + AND c_acctbal > ( + SELECT + avg(c_acctbal) + FROM + customer + WHERE + c_acctbal > 0.00 + AND substring(c_phone FROM 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + AND NOT EXISTS ( + SELECT + * + FROM + orders + WHERE + o_custkey = c_custkey + ) ) AS custsale GROUP BY cntrycode diff --git a/docs/en/getting-started/example-datasets/uk-price-paid.md b/docs/en/getting-started/example-datasets/uk-price-paid.md index 8ed79c3986f..edc9b0956a9 100644 --- a/docs/en/getting-started/example-datasets/uk-price-paid.md +++ b/docs/en/getting-started/example-datasets/uk-price-paid.md @@ -447,4 +447,4 @@ With projection: 100 rows in set. Elapsed: 0.336 sec. Processed 17.32 thousand r ### Test it in the Playground {#playground} -The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==). +The dataset is also available in the [Online Playground](https://sql.clickhouse.com?query_id=TRCWH5ZETY4SEEK8ISCCAX). diff --git a/docs/en/getting-started/index.md b/docs/en/getting-started/index.md index b520220984c..7898ca01129 100644 --- a/docs/en/getting-started/index.md +++ b/docs/en/getting-started/index.md @@ -23,6 +23,7 @@ functions in ClickHouse. The sample datasets include: - The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables - The ["What's on the Menu?" 
dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data - The [Laion dataset](../getting-started/example-datasets/laion.md) has an example of [Approximate nearest neighbor search indexes](../engines/table-engines/mergetree-family/annindexes.md) usage +- The [TPC-H](../getting-started/example-datasets/tpch.md), [TPC-DS](../getting-started/example-datasets/tpcds.md), and [Star Schema (SSB)](../getting-started/example-datasets/star-schema.md) industry benchmarks for analytics databases - [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset - [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3 - [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs. diff --git a/docs/en/getting-started/playground.md b/docs/en/getting-started/playground.md index 6a6d4092177..80b6f9a9889 100644 --- a/docs/en/getting-started/playground.md +++ b/docs/en/getting-started/playground.md @@ -8,7 +8,7 @@ slug: /en/getting-started/playground # ClickHouse Playground -[ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +[ClickHouse Playground](https://sql.clickhouse.com) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. Several example datasets are available in Playground. You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../integrations/index.mdx). diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 66291014ed7..504f6eec6de 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -190,6 +190,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va - `--config-file` – The name of the configuration file. - `--secure` – If specified, will connect to server over secure connection (TLS). You might need to configure your CA certificates in the [configuration file](#configuration_files). The available configuration settings are the same as for [server-side TLS configuration](../operations/server-configuration-parameters/settings.md#openssl). - `--history_file` — Path to a file containing command history. +- `--history_max_entries` — Maximum number of entries in the history file. Default value: 1 000 000. - `--param_` — Value for a [query with parameters](#cli-queries-with-parameters). - `--hardware-utilization` — Print hardware utilization information in progress bar. - `--print-profile-events` – Print `ProfileEvents` packets. 
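For instance, a value supplied as `--param_id=42` is referenced in the query text as a typed placeholder (a minimal sketch of the substitution syntax):

```sql
-- run as: clickhouse-client --param_id=42 --query="SELECT {id:UInt64} AS id"
SELECT {id:UInt64} AS id
```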
diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index d268a5471cc..b8d16debbac 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2054,35 +2054,40 @@ ClickHouse Avro format supports reading and writing [Avro data files](https://av The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. -| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | -|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------| -| `boolean`, `int`, `long`, `float`, `double` | [Int(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md), [UInt(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](/docs/en/sql-reference/data-types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [String](/docs/en/sql-reference/data-types/string.md) | `bytes` or `string` \* | -| `bytes`, `string`, `fixed` | [FixedString(N)](/docs/en/sql-reference/data-types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum(8\16)](/docs/en/sql-reference/data-types/enum.md) | `enum` | -| `array(T)` | [Array(T)](/docs/en/sql-reference/data-types/array.md) | `array(T)` | -| `map(V, K)` | [Map(V, K)](/docs/en/sql-reference/data-types/map.md) | `map(string, K)` | -| `union(null, T)`, `union(T, null)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(null, T)` | -| `null` | [Nullable(Nothing)](/docs/en/sql-reference/data-types/special-data-types/nothing.md) | `null` | -| `int (date)` \** | [Date](/docs/en/sql-reference/data-types/date.md), [Date32](docs/en/sql-reference/data-types/date32.md) | `int (date)` \** | -| `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \** | -| `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \** | -| `bytes (decimal)` \** | [DateTime64(N)](/docs/en/sql-reference/data-types/datetime.md) | `bytes (decimal)` \** | -| `int` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `int` | -| `fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `fixed(16)` | -| `bytes (decimal)` \** | [Decimal(P, S)](/docs/en/sql-reference/data-types/decimal.md) | `bytes (decimal)` \** | -| `string (uuid)` \** | [UUID](/docs/en/sql-reference/data-types/uuid.md) | `string (uuid)` \** | -| `fixed(16)` | [Int128/UInt128](/docs/en/sql-reference/data-types/int-uint.md) | `fixed(16)` | -| `fixed(32)` | [Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `fixed(32)` | -| `record` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `record` | +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|---------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | 
[Int(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md), [UInt(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](/docs/en/sql-reference/data-types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](/docs/en/sql-reference/data-types/string.md) | `bytes` or `string` \* | +| `bytes`, `string`, `fixed` | [FixedString(N)](/docs/en/sql-reference/data-types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\16)](/docs/en/sql-reference/data-types/enum.md) | `enum` | +| `array(T)` | [Array(T)](/docs/en/sql-reference/data-types/array.md) | `array(T)` | +| `map(V, K)` | [Map(V, K)](/docs/en/sql-reference/data-types/map.md) | `map(string, K)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(null, T)` | +| `union(T1, T2, …)` \** | [Variant(T1, T2, …)](/docs/en/sql-reference/data-types/variant.md) | `union(T1, T2, …)` \** | +| `null` | [Nullable(Nothing)](/docs/en/sql-reference/data-types/special-data-types/nothing.md) | `null` | +| `int (date)` \**\* | [Date](/docs/en/sql-reference/data-types/date.md), [Date32](docs/en/sql-reference/data-types/date32.md) | `int (date)` \**\* | +| `long (timestamp-millis)` \**\* | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \**\* | +| `long (timestamp-micros)` \**\* | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \**\* | +| `bytes (decimal)` \**\* | [DateTime64(N)](/docs/en/sql-reference/data-types/datetime.md) | `bytes (decimal)` \**\* | +| `int` | [IPv4](/docs/en/sql-reference/data-types/ipv4.md) | `int` | +| `fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/ipv6.md) | `fixed(16)` | +| `bytes (decimal)` \**\* | [Decimal(P, S)](/docs/en/sql-reference/data-types/decimal.md) | `bytes (decimal)` \**\* | +| `string (uuid)` \**\* | [UUID](/docs/en/sql-reference/data-types/uuid.md) | `string (uuid)` \**\* | +| `fixed(16)` | [Int128/UInt128](/docs/en/sql-reference/data-types/int-uint.md) | `fixed(16)` | +| `fixed(32)` | [Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `fixed(32)` | +| `record` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `record` | \* `bytes` is default, controlled by [output_format_avro_string_column_pattern](/docs/en/operations/settings/settings-formats.md/#output_format_avro_string_column_pattern) -\** [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) + +\** [Variant type](/docs/en/sql-reference/data-types/variant) implicitly accepts `null` as a field value, so for example the Avro `union(T1, T2, null)` will be converted to `Variant(T1, T2)`. +As a result, when producing Avro from ClickHouse, we have to always include the `null` type to the Avro `union` type set as we don't know if any value is actually `null` during the schema inference. 
+ +\**\* [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) Unsupported Avro logical data types: `time-millis`, `time-micros`, `duration` diff --git a/docs/en/interfaces/prometheus.md b/docs/en/interfaces/prometheus.md index 8e7023cc51f..11f503b54d7 100644 --- a/docs/en/interfaces/prometheus.md +++ b/docs/en/interfaces/prometheus.md @@ -9,7 +9,7 @@ sidebar_label: Prometheus protocols ## Exposing metrics {#expose} :::note -ClickHouse Cloud does not currently support connecting to Prometheus. To be notified when this feature is supported, please contact support@clickhouse.com. +If you are using ClickHouse Cloud, you can expose metrics to Prometheus using the [Prometheus Integration](/en/integrations/prometheus). ::: ClickHouse can expose its own metrics for scraping from Prometheus: diff --git a/docs/en/operations/_troubleshooting.md b/docs/en/operations/_troubleshooting.md index 77389782675..f0ee1ca1d29 100644 --- a/docs/en/operations/_troubleshooting.md +++ b/docs/en/operations/_troubleshooting.md @@ -65,6 +65,34 @@ sudo rm -f /etc/yum.repos.d/clickhouse.repo After that follow the [install guide](../getting-started/install.md#from-rpm-packages) +### You Can't Run Docker Container + +You are running a simple `docker run clickhouse/clickhouse-server` and it crashes with a stack trace similar to following: + +``` +$ docker run -it clickhouse/clickhouse-server +........ +2024.11.06 21:04:48.912036 [ 1 ] {} SentryWriter: Sending crash reports is disabled +Poco::Exception. Code: 1000, e.code() = 0, System exception: cannot start thread, Stack trace (when copying this message, always include the lines below): + +0. Poco::ThreadImpl::startImpl(Poco::SharedPtr>) @ 0x00000000157c7b34 +1. Poco::Thread::start(Poco::Runnable&) @ 0x00000000157c8a0e +2. BaseDaemon::initializeTerminationAndSignalProcessing() @ 0x000000000d267a14 +3. BaseDaemon::initialize(Poco::Util::Application&) @ 0x000000000d2652cb +4. DB::Server::initialize(Poco::Util::Application&) @ 0x000000000d128b38 +5. Poco::Util::Application::run() @ 0x000000001581cfda +6. DB::Server::run() @ 0x000000000d1288f0 +7. Poco::Util::ServerApplication::run(int, char**) @ 0x0000000015825e27 +8. mainEntryClickHouseServer(int, char**) @ 0x000000000d125b38 +9. main @ 0x0000000007ea4eee +10. ? @ 0x00007f67ff946d90 +11. ? @ 0x00007f67ff946e40 +12. _start @ 0x00000000062e802e + (version 24.10.1.2812 (official build)) +``` + +The reason is an old docker daemon with version lower than `20.10.10`. A way to fix it either upgrading it, or running `docker run [--privileged | --security-opt seccomp=unconfined]`. The latter has security implications. + ## Connecting to the Server {#troubleshooting-accepts-no-connections} Possible issues: diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index 48078197309..9f3a48dfa5a 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -33,7 +33,7 @@ The tags or attributes are saved as two parallel arrays, containing the keys and ## Log-query-settings -ClickHouse allows you to log changes to query settings during query execution. When enabled, any modifications made to query settings will be recorded in the OpenTelemetry span log. This feature is particularly useful in production environments for tracking configuration changes that may affect query performance. +Setting [log_query_settings](settings/settings.md) allows log changes to query settings during query execution. 
When enabled, any modifications made to query settings will be recorded in the OpenTelemetry span log. This feature is particularly useful in production environments for tracking configuration changes that may affect query performance. ## Integration with monitoring systems diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index 955cec0234e..f0941aa28aa 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -25,9 +25,10 @@ Query caches can generally be viewed as transactionally consistent or inconsiste slowly enough that the database only needs to compute the report once (represented by the first `SELECT` query). Further queries can be served directly from the query cache. In this example, a reasonable validity period could be 30 min. -Transactionally inconsistent caching is traditionally provided by client tools or proxy packages interacting with the database. As a result, -the same caching logic and configuration is often duplicated. With ClickHouse's query cache, the caching logic moves to the server side. -This reduces maintenance effort and avoids redundancy. +Transactionally inconsistent caching is traditionally provided by client tools or proxy packages (e.g. +[chproxy](https://www.chproxy.org/configuration/caching/)) interacting with the database. As a result, the same caching logic and +configuration is often duplicated. With ClickHouse's query cache, the caching logic moves to the server side. This reduces maintenance +effort and avoids redundancy. ## Configuration Settings and Usage @@ -138,7 +139,10 @@ is only cached if the query runs longer than 5 seconds. It is also possible to s cached - for that use setting [query_cache_min_query_runs](settings/settings.md#query-cache-min-query-runs). Entries in the query cache become stale after a certain time period (time-to-live). By default, this period is 60 seconds but a different -value can be specified at session, profile or query level using setting [query_cache_ttl](settings/settings.md#query-cache-ttl). +value can be specified at session, profile or query level using setting [query_cache_ttl](settings/settings.md#query-cache-ttl). The query +cache evicts entries "lazily", i.e. when an entry becomes stale, it is not immediately removed from the cache. Instead, when a new entry +is to be inserted into the query cache, the database checks whether the cache has enough free space for the new entry. If this is not the +case, the database tries to remove all stale entries. If the cache still has not enough free space, the new entry is not inserted. Entries in the query cache are compressed by default. This reduces the overall memory consumption at the cost of slower writes into / reads from the query cache. To disable compression, use setting [query_cache_compress_entries](settings/settings.md#query-cache-compress-entries). @@ -188,14 +192,9 @@ Also, results of queries with non-deterministic functions are not cached by defa To force caching of results of queries with non-deterministic functions regardless, use setting [query_cache_nondeterministic_function_handling](settings/settings.md#query-cache-nondeterministic-function-handling). -Results of queries that involve system tables, e.g. `system.processes` or `information_schema.tables`, are not cached by default. To force -caching of results of queries with system tables regardless, use setting -[query_cache_system_table_handling](settings/settings.md#query-cache-system-table-handling). 
- -:::note -Prior to ClickHouse v23.11, setting 'query_cache_store_results_of_queries_with_nondeterministic_functions = 0 / 1' controlled whether -results of queries with non-deterministic results were cached. In newer ClickHouse versions, this setting is obsolete and has no effect. -::: +Results of queries that involve system tables (e.g. [system.processes](system-tables/processes.md)` or +[information_schema.tables](system-tables/information_schema.md)) are not cached by default. To force caching of results of queries with +system tables regardless, use setting [query_cache_system_table_handling](settings/settings.md#query-cache-system-table-handling). Finally, entries in the query cache are not shared between users due to security reasons. For example, user A must not be able to bypass a row policy on a table by running the same query as another user B for whom no such policy exists. However, if necessary, cache entries can diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 79407d46ce0..c5f92ccdf68 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -131,16 +131,6 @@ Type: UInt64 Default: 8 -## background_pool_size - -Sets the number of threads performing background merges and mutations for tables with MergeTree engines. You can only increase the number of threads at runtime. To lower the number of threads you have to restart the server. By adjusting this setting, you manage CPU and disk load. Smaller pool size utilizes less CPU and disk resources, but background processes advance slower which might eventually impact query performance. - -Before changing it, please also take a look at related MergeTree settings, such as `number_of_free_entries_in_pool_to_lower_max_size_of_merge` and `number_of_free_entries_in_pool_to_execute_mutation`. - -Type: UInt64 - -Default: 16 - ## background_schedule_pool_size The maximum number of threads that will be used for constantly executing some lightweight periodic operations for replicated tables, Kafka streaming, and DNS cache updates. @@ -1488,6 +1478,8 @@ Keys: - `formatting` – Log format for console output. Currently, only `json` is supported). - `use_syslog` - Also forward log output to syslog. - `syslog_level` - Log level for logging to syslog. +- `message_regexp` - Only log messages that match this regular expression. Defaults to `""`, indicating no filtering. +- `message_regexp_negative` - Only log messages that don't match this regular expression. Defaults to `""`, indicating no filtering. **Log format specifiers** @@ -1576,6 +1568,28 @@ The log level of individual log names can be overridden. For example, to mute al ``` +**Regular Expression Filtering** + +The messages logged can be filtered using regular expressions using `message_regexp` and `message_regexp_negative`. This can be done on a per-level basis or globally. If both a global and logger-specific pattern is specified, the global pattern is overridden (ignored) and only the logger-specific pattern applies. The positive and negative patterns are considered independently for this situation. Note: Using this feature may cause a slight slowdown in performance. + + +```xml + + trace + + .*Trace.* + + + + + executeQuery + .*Read.* + .*from.* + + + +``` + ### syslog To write log messages additionally to syslog: @@ -1951,6 +1965,22 @@ The default is `false`. 
true ``` +## async_load_system_database {#async_load_system_database} + +Asynchronous loading of system tables. Helpful if there is a high amount of log tables and parts in the `system` database. Independent of the `async_load_databases` setting. + +If set to `true`, all system databases with `Ordinary`, `Atomic`, and `Replicated` engines will be loaded asynchronously after the ClickHouse server starts. See `system.asynchronous_loader` table, `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a system table, that is not yet loaded, will wait for exactly this table to be started up. The table that is waited for by at least one query will be loaded with higher priority. Also consider setting the `max_waiting_queries` setting to limit the total number of waiting queries. + +If `false`, system database loads before server start. + +The default is `false`. + +**Example** + +``` xml +true +``` + ## tables_loader_foreground_pool_size {#tables_loader_foreground_pool_size} Sets the number of threads performing load jobs in foreground pool. The foreground pool is used for loading table synchronously before server start listening on a port and for loading tables that are waited for. Foreground pool has higher priority than background pool. It means that no job starts in background pool while there are jobs running in foreground pool. @@ -2193,6 +2223,39 @@ If the table does not exist, ClickHouse will create it. If the structure of the ``` +# query_metric_log {#query_metric_log} + +It is disabled by default. + +**Enabling** + +To manually turn on metrics history collection [`system.query_metric_log`](../../operations/system-tables/query_metric_log.md), create `/etc/clickhouse-server/config.d/query_metric_log.xml` with the following content: + +``` xml + + + system + query_metric_log
+ 7500 + 1000 + 1048576 + 8192 + 524288 + false +
+
+``` + +**Disabling** + +To disable `query_metric_log` setting, you should create the following file `/etc/clickhouse-server/config.d/disable_query_metric_log.xml` with the following content: + +``` xml + + + +``` + ## query_cache {#server_configuration_parameters_query-cache} [Query cache](../query-cache.md) configuration. @@ -3085,7 +3148,7 @@ By default, tunneling (i.e, `HTTP CONNECT`) is used to make `HTTPS` requests ove ### no_proxy By default, all requests will go through the proxy. In order to disable it for specific hosts, the `no_proxy` variable must be set. -It can be set inside the `` clause for list and remote resolvers and as an environment variable for environment resolver. +It can be set inside the `` clause for list and remote resolvers and as an environment variable for environment resolver. It supports IP addresses, domains, subdomains and `'*'` wildcard for full bypass. Leading dots are stripped just like curl does. Example: @@ -3151,6 +3214,34 @@ Default value: "default" **See Also** - [Workload Scheduling](/docs/en/operations/workload-scheduling.md) +## workload_path {#workload_path} + +The directory used as a storage for all `CREATE WORKLOAD` and `CREATE RESOURCE` queries. By default `/workload/` folder under server working directory is used. + +**Example** + +``` xml +/var/lib/clickhouse/workload/ +``` + +**See Also** +- [Workload Hierarchy](/docs/en/operations/workload-scheduling.md#workloads) +- [workload_zookeeper_path](#workload_zookeeper_path) + +## workload_zookeeper_path {#workload_zookeeper_path} + +The path to a ZooKeeper node, which is used as a storage for all `CREATE WORKLOAD` and `CREATE RESOURCE` queries. For consistency all SQL definitions are stored as a value of this single znode. By default ZooKeeper is not used and definitions are stored on [disk](#workload_path). + +**Example** + +``` xml +/clickhouse/workload/definitions.sql +``` + +**See Also** +- [Workload Hierarchy](/docs/en/operations/workload-scheduling.md#workloads) +- [workload_path](#workload_path) + ## max_authentication_methods_per_user {#max_authentication_methods_per_user} The maximum number of authentication methods a user can be created with or altered to. diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 2fd34c4067c..45c4cdf9458 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -1079,6 +1079,8 @@ Possible values: Default value: 0 bytes. +Note that if both `min_free_disk_bytes_to_perform_insert` and `min_free_disk_ratio_to_perform_insert` are specified, ClickHouse will count on the value that will allow to perform inserts on a bigger amount of free memory. + ## min_free_disk_ratio_to_perform_insert The minimum free to total disk space ratio to perform an `INSERT`. Must be a floating point value between 0 and 1. Note that this setting: diff --git a/docs/en/operations/system-tables/azure_queue_settings.md b/docs/en/operations/system-tables/azure_queue_settings.md new file mode 100644 index 00000000000..89235691110 --- /dev/null +++ b/docs/en/operations/system-tables/azure_queue_settings.md @@ -0,0 +1,20 @@ +--- +slug: /en/operations/system-tables/azure_queue_settings +--- +# azure_queue_settings + +Contains information about settings of [AzureQueue](../../engines/table-engines/integrations/azure-queue.md) tables. +Available from `24.10` server version. 
+ +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `table` ([String](../../sql-reference/data-types/string.md)) — Database name. +- `name` ([String](../../sql-reference/data-types/string.md)) — Setting name. +- `value` ([String](../../sql-reference/data-types/string.md)) — Setting value. +- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Whether the setting was explicitly defined in the config or explicitly changed. +- `description` ([String](../../sql-reference/data-types/string.md)) — Setting description. +- `alterable` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the setting can be changes via `ALTER TABLE ... MODIFY SETTING`. + - `0` — Current user can alter the setting. + - `1` — Current user can’t alter the setting. +- `type` ([String](../../sql-reference/data-types/string.md)) — Setting type (implementation specific string value). diff --git a/docs/en/operations/system-tables/grants.md b/docs/en/operations/system-tables/grants.md index 262a53a87a5..debc3146008 100644 --- a/docs/en/operations/system-tables/grants.md +++ b/docs/en/operations/system-tables/grants.md @@ -19,7 +19,7 @@ Columns: - `column` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Name of a column to which access is granted. - `is_partial_revoke` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows whether some privileges have been revoked. Possible values: -- `0` — The row describes a partial revoke. -- `1` — The row describes a grant. +- `0` — The row describes a grant. +- `1` — The row describes a partial revoke. - `grant_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Permission is granted `WITH GRANT OPTION`, see [GRANT](../../sql-reference/statements/grant.md#granting-privilege-syntax). diff --git a/docs/en/operations/system-tables/merge_tree_settings.md b/docs/en/operations/system-tables/merge_tree_settings.md index 48217d63f9d..473315d3941 100644 --- a/docs/en/operations/system-tables/merge_tree_settings.md +++ b/docs/en/operations/system-tables/merge_tree_settings.md @@ -18,6 +18,11 @@ Columns: - `1` — Current user can’t change the setting. - `type` ([String](../../sql-reference/data-types/string.md)) — Setting type (implementation specific string value). - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete. +- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values: + - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features. . + - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. + - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time. + - `'Obsolete'` — No longer supported. Either it is already removed or it will be removed in future releases. 
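For instance, a quick ad-hoc check for settings that are not yet production-tier might look like this (a hypothetical query, separate from the reference example that follows):

```sql
SELECT name, value, tier
FROM system.merge_tree_settings
WHERE tier != 'Production'
ORDER BY tier, name
LIMIT 10
```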
**Example** ```sql diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md index 2ad2ae68ab5..f3cf013b4a0 100644 --- a/docs/en/operations/system-tables/part_log.md +++ b/docs/en/operations/system-tables/part_log.md @@ -13,10 +13,12 @@ The `system.part_log` table contains the following columns: - `query_id` ([String](../../sql-reference/data-types/string.md)) — Identifier of the `INSERT` query that created this data part. - `event_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the event that occurred with the data part. Can have one of the following values: - `NewPart` — Inserting of a new data part. - - `MergeParts` — Merging of data parts. + - `MergePartsStart` — Merging of data parts has started. + - `MergeParts` — Merging of data parts has finished. - `DownloadPart` — Downloading a data part. - `RemovePart` — Removing or detaching a data part using [DETACH PARTITION](../../sql-reference/statements/alter/partition.md#alter_detach-partition). - - `MutatePart` — Mutating of a data part. + - `MutatePartStart` — Mutating of a data part has started. + - `MutatePart` — Mutating of a data part has finished. - `MovePart` — Moving the data part from the one disk to another one. - `merge_reason` ([Enum8](../../sql-reference/data-types/enum.md)) — The reason for the event with type `MERGE_PARTS`. Can have one of the following values: - `NotAMerge` — The current event has the type other than `MERGE_PARTS`. diff --git a/docs/en/operations/system-tables/query_metric_log.md b/docs/en/operations/system-tables/query_metric_log.md new file mode 100644 index 00000000000..38d44c0e19a --- /dev/null +++ b/docs/en/operations/system-tables/query_metric_log.md @@ -0,0 +1,49 @@ +--- +slug: /en/operations/system-tables/query_metric_log +--- +# query_metric_log + +Contains history of memory and metric values from table `system.events` for individual queries, periodically flushed to disk. + +Once a query starts, data is collected at periodic intervals of `query_metric_log_interval` milliseconds (which is set to 1000 +by default). The data is also collected when the query finishes if the query takes longer than `query_metric_log_interval`. + +Columns: +- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query. +- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query. +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds resolution. + +**Example** + +``` sql +SELECT * FROM system.query_metric_log LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +query_id: 97c8ba04-b6d4-4bd7-b13e-6201c5c6e49d +hostname: clickhouse.eu-central1.internal +event_date: 2020-09-05 +event_time: 2020-09-05 16:22:33 +event_time_microseconds: 2020-09-05 16:22:33.196807 +memory_usage: 313434219 +peak_memory_usage: 598951986 +ProfileEvent_Query: 0 +ProfileEvent_SelectQuery: 0 +ProfileEvent_InsertQuery: 0 +ProfileEvent_FailedQuery: 0 +ProfileEvent_FailedSelectQuery: 0 +... +``` + +**See also** + +- [query_metric_log setting](../../operations/server-configuration-parameters/settings.md#query_metric_log) — Enabling and disabling the setting. 
+- [query_metric_log_interval](../../operations/settings/settings.md#query_metric_log_interval)
+- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md) — Contains periodically calculated metrics.
+- [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred.
+- [system.metrics](../../operations/system-tables/metrics.md) — Contains instantly calculated metrics.
+- [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring.
diff --git a/docs/en/operations/system-tables/resources.md b/docs/en/operations/system-tables/resources.md
new file mode 100644
index 00000000000..6329f05f610
--- /dev/null
+++ b/docs/en/operations/system-tables/resources.md
@@ -0,0 +1,37 @@
+---
+slug: /en/operations/system-tables/resources
+---
+# resources
+
+Contains information for [resources](/docs/en/operations/workload-scheduling.md#workload_entity_storage) residing on the local server. The table contains a row for every resource.
+
+Example:
+
+``` sql
+SELECT *
+FROM system.resources
+FORMAT Vertical
+```
+
+``` text
+Row 1:
+──────
+name: io_read
+read_disks: ['s3']
+write_disks: []
+create_query: CREATE RESOURCE io_read (READ DISK s3)
+
+Row 2:
+──────
+name: io_write
+read_disks: []
+write_disks: ['s3']
+create_query: CREATE RESOURCE io_write (WRITE DISK s3)
+```
+
+Columns:
+
+- `name` (`String`) - Resource name.
+- `read_disks` (`Array(String)`) - The array of disk names that use this resource for read operations.
+- `write_disks` (`Array(String)`) - The array of disk names that use this resource for write operations.
+- `create_query` (`String`) - The definition of the resource.
diff --git a/docs/en/operations/system-tables/s3_queue_settings.md b/docs/en/operations/system-tables/s3_queue_settings.md
new file mode 100644
index 00000000000..87e067b35fb
--- /dev/null
+++ b/docs/en/operations/system-tables/s3_queue_settings.md
@@ -0,0 +1,20 @@
+---
+slug: /en/operations/system-tables/s3_queue_settings
+---
+# s3_queue_settings
+
+Contains information about settings of [S3Queue](../../engines/table-engines/integrations/s3queue.md) tables.
+Available starting from server version `24.10`.
+
+Columns:
+
+- `database` ([String](../../sql-reference/data-types/string.md)) — Database name.
+- `table` ([String](../../sql-reference/data-types/string.md)) — Table name.
+- `name` ([String](../../sql-reference/data-types/string.md)) — Setting name.
+- `value` ([String](../../sql-reference/data-types/string.md)) — Setting value.
+- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Whether the setting was explicitly defined in the config or explicitly changed.
+- `description` ([String](../../sql-reference/data-types/string.md)) — Setting description.
+- `alterable` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the setting can be changed via `ALTER TABLE ... MODIFY SETTING`.
+    - `0` — Current user can alter the setting.
+    - `1` — Current user can’t alter the setting.
+- `type` ([String](../../sql-reference/data-types/string.md)) — Setting type (implementation specific string value).
diff --git a/docs/en/operations/system-tables/settings.md b/docs/en/operations/system-tables/settings.md
index a04e095e990..1cfee0ba5f4 100644
--- a/docs/en/operations/system-tables/settings.md
+++ b/docs/en/operations/system-tables/settings.md
@@ -18,6 +18,11 @@ Columns:
- `1` — Current user can’t change the setting.
- `default` ([String](../../sql-reference/data-types/string.md)) — Setting default value. - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete. +- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values: + - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features. . + - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. + - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time. + - `'Obsolete'` — No longer supported. Either it is already removed or it will be removed in future releases. **Example** @@ -26,19 +31,99 @@ The following example shows how to get information about settings which name con ``` sql SELECT * FROM system.settings -WHERE name LIKE '%min_i%' +WHERE name LIKE '%min_insert_block_size_%' +FORMAT Vertical ``` ``` text -┌─name───────────────────────────────────────────────_─value─────_─changed─_─description───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────_─min──_─max──_─readonly─_─type─────────_─default───_─alias_for─_─is_obsolete─┐ -│ min_insert_block_size_rows │ 1048449 │ 0 │ Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough. │ ____ │ ____ │ 0 │ UInt64 │ 1048449 │ │ 0 │ -│ min_insert_block_size_bytes │ 268402944 │ 0 │ Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough. │ ____ │ ____ │ 0 │ UInt64 │ 268402944 │ │ 0 │ -│ min_insert_block_size_rows_for_materialized_views │ 0 │ 0 │ Like min_insert_block_size_rows, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_rows) │ ____ │ ____ │ 0 │ UInt64 │ 0 │ │ 0 │ -│ min_insert_block_size_bytes_for_materialized_views │ 0 │ 0 │ Like min_insert_block_size_bytes, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_bytes) │ ____ │ ____ │ 0 │ UInt64 │ 0 │ │ 0 │ -│ read_backoff_min_interval_between_events_ms │ 1000 │ 0 │ Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. │ ____ │ ____ │ 0 │ Milliseconds │ 1000 │ │ 0 │ -└────────────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────── -──────────────────────────────────────────────────────┴──────┴──────┴──────────┴──────────────┴───────────┴───────────┴─────────────┘ -``` +Row 1: +────── +name: min_insert_block_size_rows +value: 1048449 +changed: 0 +description: Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. 
+min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 1048449 +alias_for: +is_obsolete: 0 +tier: Production + +Row 2: +────── +name: min_insert_block_size_bytes +value: 268402944 +changed: 0 +description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 268402944 +alias_for: +is_obsolete: 0 +tier: Production + +Row 3: +────── +name: min_insert_block_size_rows_for_materialized_views +value: 0 +changed: 0 +description: Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See Also** + +- [min_insert_block_size_rows](#min-insert-block-size-rows) +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 0 +alias_for: +is_obsolete: 0 +tier: Production + +Row 4: +────── +name: min_insert_block_size_bytes_for_materialized_views +value: 0 +changed: 0 +description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See also** + +- [min_insert_block_size_bytes](#min-insert-block-size-bytes) +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 0 +alias_for: +is_obsolete: 0 +tier: Production + ``` Using of `WHERE changed` can be useful, for example, when you want to check: diff --git a/docs/en/operations/system-tables/workloads.md b/docs/en/operations/system-tables/workloads.md new file mode 100644 index 00000000000..d9c62372044 --- /dev/null +++ b/docs/en/operations/system-tables/workloads.md @@ -0,0 +1,40 @@ +--- +slug: /en/operations/system-tables/workloads +--- +# workloads + +Contains information for [workloads](/docs/en/operations/workload-scheduling.md#workload_entity_storage) residing on the local server. The table contains a row for every workload. + +Example: + +``` sql +SELECT * +FROM system.workloads +FORMAT Vertical +``` + +``` text +Row 1: +────── +name: production +parent: all +create_query: CREATE WORKLOAD production IN `all` SETTINGS weight = 9 + +Row 2: +────── +name: development +parent: all +create_query: CREATE WORKLOAD development IN `all` + +Row 3: +────── +name: all +parent: +create_query: CREATE WORKLOAD `all` +``` + +Columns: + +- `name` (`String`) - Workload name. +- `parent` (`String`) - Parent workload name. +- `create_query` (`String`) - The definition of the workload. 
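+
+As a minimal sketch that relies only on the `name` and `parent` columns above, the following query lists leaf workloads, i.e. workloads that are not referenced as a parent by any other workload:
+
+```sql
+-- Leaf workloads are the ones typically referenced in `SETTINGS workload = '...'`
+SELECT name
+FROM system.workloads
+WHERE name NOT IN (SELECT parent FROM system.workloads);
+```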
diff --git a/docs/en/operations/workload-scheduling.md b/docs/en/operations/workload-scheduling.md
index 08629492ec6..a43bea7a5b1 100644
--- a/docs/en/operations/workload-scheduling.md
+++ b/docs/en/operations/workload-scheduling.md
@@ -43,6 +43,20 @@ Example:
```

+An alternative way to express which disks are used by a resource is SQL syntax:
+
+```sql
+CREATE RESOURCE resource_name (WRITE DISK disk1, READ DISK disk2)
+```
+
+A resource can be used for any number of disks, for READ, for WRITE, or for both READ and WRITE. There is also a syntax that allows using a resource for all disks:
+
+```sql
+CREATE RESOURCE all_io (READ ANY DISK, WRITE ANY DISK);
+```
+
+Note that server configuration options have priority over the SQL way of defining resources.
+
## Workload markup {#workload_markup}

Queries can be marked with setting `workload` to distinguish different workloads. If `workload` is not set, than value "default" is used. Note that you are able to specify the other value using settings profiles. Setting constraints can be used to make `workload` constant if you want all queries from the user to be marked with fixed value of `workload` setting.
@@ -153,9 +167,48 @@ Example:
```

+## Workload hierarchy (SQL only) {#workloads}
+
+Defining resources and classifiers in XML could be challenging. ClickHouse provides SQL syntax that is much more convenient. All resources that were created with `CREATE RESOURCE` share the same structure of the hierarchy, but could differ in some aspects. Every workload created with `CREATE WORKLOAD` maintains a few automatically created scheduling nodes for every resource. A child workload can be created inside another parent workload. Here is an example that defines exactly the same hierarchy as the XML configuration above:
+
+```sql
+CREATE RESOURCE network_write (WRITE DISK s3)
+CREATE RESOURCE network_read (READ DISK s3)
+CREATE WORKLOAD all SETTINGS max_requests = 100
+CREATE WORKLOAD development IN all
+CREATE WORKLOAD production IN all SETTINGS weight = 3
+```
+
+The name of a leaf workload without children can be used in the query setting `SETTINGS workload = 'name'`. Note that workload classifiers are also created automatically when using SQL syntax.
+
+To customize a workload, the following settings can be used:
+* `priority` - sibling workloads are served according to static priority values (lower value means higher priority).
+* `weight` - sibling workloads having the same static priority share resources according to weights.
+* `max_requests` - the limit on the number of concurrent resource requests in this workload.
+* `max_cost` - the limit on the total in-flight bytes of concurrent resource requests in this workload.
+* `max_speed` - the limit on the byte processing rate of this workload (the limit is independent for every resource).
+* `max_burst` - the maximum number of bytes that can be processed by the workload without being throttled (independently for every resource).
+
+Note that workload settings are translated into a proper set of scheduling nodes. For more details, see the description of the scheduling node [types and options](#hierarchy).
+
+There is no way to specify different hierarchies of workloads for different resources.
But there is a way to specify a different workload setting value for a specific resource:
+
+```sql
+CREATE OR REPLACE WORKLOAD all SETTINGS max_requests = 100, max_speed = 1000000 FOR network_read, max_speed = 2000000 FOR network_write
+```
+
+Also note that a workload or resource cannot be dropped while it is referenced from another workload. To update the definition of a workload, use the `CREATE OR REPLACE WORKLOAD` query.
+
+## Workloads and resources storage {#workload_entity_storage}
+Definitions of all workloads and resources in the form of `CREATE WORKLOAD` and `CREATE RESOURCE` queries are stored persistently either on disk at `workload_path` or in ZooKeeper at `workload_zookeeper_path`. ZooKeeper storage is recommended to achieve consistency between nodes. Alternatively, the `ON CLUSTER` clause can be used together with disk storage.
+
## See also

- [system.scheduler](/docs/en/operations/system-tables/scheduler.md)
+ - [system.workloads](/docs/en/operations/system-tables/workloads.md)
+ - [system.resources](/docs/en/operations/system-tables/resources.md)
- [merge_workload](/docs/en/operations/settings/merge-tree-settings.md#merge_workload) merge tree setting
- [merge_workload](/docs/en/operations/server-configuration-parameters/settings.md#merge_workload) global server setting
- [mutation_workload](/docs/en/operations/settings/merge-tree-settings.md#mutation_workload) merge tree setting
- [mutation_workload](/docs/en/operations/server-configuration-parameters/settings.md#mutation_workload) global server setting
+ - [workload_path](/docs/en/operations/server-configuration-parameters/settings.md#workload_path) global server setting
+ - [workload_zookeeper_path](/docs/en/operations/server-configuration-parameters/settings.md#workload_zookeeper_path) global server setting
diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast.md b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
index 202d2e9fb10..4fe21531c76 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
@@ -17,7 +17,7 @@ anyLast(column) [RESPECT NULLS]
- `column`: The column name.

:::note
-Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the first value passed, regardless of whether it is `NULL` or not.
+Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the last value passed, regardless of whether it is `NULL` or not.
:::

**Returned value**
@@ -40,4 +40,4 @@ SELECT anyLast(city) FROM any_last_nulls;
┌─anyLast(city)─┐
│ Valencia │
└───────────────┘
-```
\ No newline at end of file
+```
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
index 4ce212888c4..6004e8392f1 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
@@ -23,7 +23,7 @@ Alias: `medianExactWeighted`.

- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). -- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. +- `weight` — Column with weights of sequence members. Weight is a number of value occurrences with [Unsigned integer types](../../../sql-reference/data-types/int-uint.md). **Returned value** diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweightedinterpolated.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweightedinterpolated.md new file mode 100644 index 00000000000..6b38e130cb2 --- /dev/null +++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweightedinterpolated.md @@ -0,0 +1,77 @@ +--- +slug: /en/sql-reference/aggregate-functions/reference/quantileExactWeightedInterpolated +sidebar_position: 176 +--- + +# quantileExactWeightedInterpolated + +Computes [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using linear interpolation, taking into account the weight of each element. + +To get the interpolated value, all the passed values are combined into an array, which are then sorted by their corresponding weights. Quantile interpolation is then performed using the [weighted percentile method](https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method) by building a cumulative distribution based on weights and then a linear interpolation is performed using the weights and the values to compute the quantiles. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +We strongly recommend using `quantileExactWeightedInterpolated` instead of `quantileInterpolatedWeighted` because `quantileExactWeightedInterpolated` is more accurate than `quantileInterpolatedWeighted`. Here is an example: + +``` sql +SELECT + quantileExactWeightedInterpolated(0.99)(number, 1), + quantile(0.99)(number), + quantileInterpolatedWeighted(0.99)(number, 1) +FROM numbers(9) + + +┌─quantileExactWeightedInterpolated(0.99)(number, 1)─┬─quantile(0.99)(number)─┬─quantileInterpolatedWeighted(0.99)(number, 1)─┐ +│ 7.92 │ 7.92 │ 8 │ +└────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────────────────────────┘ +``` + +**Syntax** + +``` sql +quantileExactWeightedInterpolated(level)(expr, weight) +``` + +Alias: `medianExactWeightedInterpolated`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — Column with weights of sequence members. Weight is a number of value occurrences with [Unsigned integer types](../../../sql-reference/data-types/int-uint.md). 
+ +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─n─┬─val─┐ +│ 0 │ 3 │ +│ 1 │ 2 │ +│ 2 │ 1 │ +│ 5 │ 4 │ +└───┴─────┘ +``` + +Result: + +``` text +┌─quantileExactWeightedInterpolated(n, val)─┐ +│ 1.5 │ +└───────────────────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md index e2c3295221d..aed017d295f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md @@ -9,7 +9,7 @@ sidebar_position: 177 Syntax: `quantiles(level1, level2, ...)(x)` -All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDD`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. +All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileExactWeightedInterpolated`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDD`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. ## quantilesExactExclusive diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index 5fbf47f7ef2..aa7455c8f68 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -512,6 +512,8 @@ The result of operator `<` for values `d1` with underlying type `T1` and `d2` wi - If `T1 = T2 = T`, the result will be `d1.T < d2.T` (underlying values will be compared). - If `T1 != T2`, the result will be `T1 < T2` (type names will be compared). +By default `Dynamic` type is not allowed in `GROUP BY`/`ORDER BY` keys, if you want to use it consider its special comparison rule and enable `allow_suspicious_types_in_group_by`/`allow_suspicious_types_in_order_by` settings. 
+ Examples: ```sql CREATE TABLE test (d Dynamic) ENGINE=Memory; @@ -535,7 +537,7 @@ SELECT d, dynamicType(d) FROM test; ``` ```sql -SELECT d, dynamicType(d) FROM test ORDER BY d; +SELECT d, dynamicType(d) FROM test ORDER BY d SETTINGS allow_suspicious_types_in_order_by=1; ``` ```sql @@ -557,7 +559,7 @@ Example: ```sql CREATE TABLE test (d Dynamic) ENGINE=Memory; INSERT INTO test VALUES (1::UInt32), (1::Int64), (100::UInt32), (100::Int64); -SELECT d, dynamicType(d) FROM test ORDER by d; +SELECT d, dynamicType(d) FROM test ORDER BY d SETTINGS allow_suspicious_types_in_order_by=1; ``` ```text @@ -570,7 +572,7 @@ SELECT d, dynamicType(d) FROM test ORDER by d; ``` ```sql -SELECT d, dynamicType(d) FROM test GROUP by d; +SELECT d, dynamicType(d) FROM test GROUP by d SETTINGS allow_suspicious_types_in_group_by=1; ``` ```text @@ -582,7 +584,7 @@ SELECT d, dynamicType(d) FROM test GROUP by d; └─────┴────────────────┘ ``` -**Note**: the described comparison rule is not applied during execution of comparison functions like `<`/`>`/`=` and others because of [special work](#using-dynamic-type-in-functions) of functions with `Dynamic` type +**Note:** the described comparison rule is not applied during execution of comparison functions like `<`/`>`/`=` and others because of [special work](#using-dynamic-type-in-functions) of functions with `Dynamic` type ## Reaching the limit in number of different data types stored inside Dynamic diff --git a/docs/en/sql-reference/data-types/newjson.md b/docs/en/sql-reference/data-types/newjson.md index 68952590eb9..4a21900545d 100644 --- a/docs/en/sql-reference/data-types/newjson.md +++ b/docs/en/sql-reference/data-types/newjson.md @@ -5,7 +5,7 @@ sidebar_label: JSON keywords: [json, data type] --- -# JSON +# JSON Data Type Stores JavaScript Object Notation (JSON) documents in a single column. @@ -58,10 +58,10 @@ SELECT json FROM test; └───────────────────────────────────┘ ``` -Using CAST from 'String': +Using CAST from `String`: ```sql -SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json; +SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON AS json; ``` ```text @@ -70,7 +70,47 @@ SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json └────────────────────────────────────────────────┘ ``` -CAST from `JSON`, named `Tuple`, `Map` and `Object('json')` to `JSON` type will be supported later. +Using CAST from `Tuple`: + +```sql +SELECT (tuple(42 AS b) AS a, [1, 2, 3] AS c, 'Hello, World!' AS d)::JSON AS json; +``` + +```text +┌─json───────────────────────────────────────────┐ +│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │ +└────────────────────────────────────────────────┘ +``` + +Using CAST from `Map`: + +```sql +SELECT map('a', map('b', 42), 'c', [1,2,3], 'd', 'Hello, World!')::JSON AS json; +``` + +```text +┌─json───────────────────────────────────────────┐ +│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │ +└────────────────────────────────────────────────┘ +``` + +Using CAST from deprecated `Object('json')`: + +```sql + SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::Object('json')::JSON AS json; + ``` + +```text +┌─json───────────────────────────────────────────┐ +│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │ +└────────────────────────────────────────────────┘ +``` + +:::note +CAST from `Tuple`/`Map`/`Object('json')` to `JSON` is implemented via serializing the column into `String` column containing JSON objects and deserializing it back to `JSON` type column. 
+:::
+
+CAST between `JSON` types with different arguments will be supported later.

## Reading JSON paths as subcolumns

@@ -630,6 +670,28 @@ SELECT arrayJoin(distinctJSONPathsAndTypes(json)) FROM s3('s3://clickhouse-publi
└─arrayJoin(distinctJSONPathsAndTypes(json))──────────────────┘
```

+## ALTER MODIFY COLUMN to JSON type
+
+It's possible to alter an existing table and change the type of a column to the new `JSON` type. Right now, only altering from the `String` type is supported.
+
+**Example**
+
+```sql
+CREATE TABLE test (json String) ENGINE=MergeTree ORDER BY tuple();
+INSERT INTO test VALUES ('{"a" : 42}'), ('{"a" : 43, "b" : "Hello"}'), ('{"a" : 44, "b" : [1, 2, 3]}'), ('{"c" : "2020-01-01"}');
+ALTER TABLE test MODIFY COLUMN json JSON;
+SELECT json, json.a, json.b, json.c FROM test;
+```
+
+```text
+┌─json─────────────────────────┬─json.a─┬─json.b──┬─json.c─────┐
+│ {"a":"42"} │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │
+│ {"a":"43","b":"Hello"} │ 43 │ Hello │ ᴺᵁᴸᴸ │
+│ {"a":"44","b":["1","2","3"]} │ 44 │ [1,2,3] │ ᴺᵁᴸᴸ │
+│ {"c":"2020-01-01"} │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 │
+└──────────────────────────────┴────────┴─────────┴────────────┘
+```
+
## Tips for better usage of the JSON type

Before creating `JSON` column and loading data into it, consider the following tips:
diff --git a/docs/en/sql-reference/data-types/variant.md b/docs/en/sql-reference/data-types/variant.md
index 3c2b6e0a362..7cb0f4ad4ea 100644
--- a/docs/en/sql-reference/data-types/variant.md
+++ b/docs/en/sql-reference/data-types/variant.md
@@ -441,6 +441,8 @@ SELECT v, variantType(v) FROM test ORDER by v;
└─────┴────────────────┘
```

+**Note:** By default, the `Variant` type is not allowed in `GROUP BY`/`ORDER BY` keys. If you want to use it, consider its special comparison rule and enable the `allow_suspicious_types_in_group_by`/`allow_suspicious_types_in_order_by` settings.
+
## JSONExtract functions with Variant

All `JSONExtract*` functions support `Variant` type:
diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index 5c39f880a0e..91bae2fe9da 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -6867,6 +6867,18 @@ Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that

Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed.

+## parseDateTime64InJodaSyntax
+
+Similar to [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax), but it returns a value of type [DateTime64](../data-types/datetime64.md).
+
+## parseDateTime64InJodaSyntaxOrZero
+
+Same as for [parseDateTime64InJodaSyntax](#parsedatetime64injodasyntax) except that it returns a zero date when it encounters a date format that cannot be processed.
+
+## parseDateTime64InJodaSyntaxOrNull
+
+Same as for [parseDateTime64InJodaSyntax](#parsedatetime64injodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed.
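+
+**Example**
+
+A minimal sketch of how these functions can be called; it assumes that Joda `S` placeholders are accepted for fractional seconds, and the input strings are illustrative only:
+
+```sql
+SELECT
+    parseDateTime64InJodaSyntax('2024-10-31 11:22:33.123', 'yyyy-MM-dd HH:mm:ss.SSS') AS parsed,
+    -- the OrNull variant returns NULL instead of throwing on malformed input
+    parseDateTime64InJodaSyntaxOrNull('not a date', 'yyyy-MM-dd HH:mm:ss.SSS') AS failed;
+```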
+
## parseDateTimeBestEffort

## parseDateTime32BestEffort
diff --git a/docs/en/sql-reference/statements/alter/user.md b/docs/en/sql-reference/statements/alter/user.md
index a56532e2ab0..1514b16a657 100644
--- a/docs/en/sql-reference/statements/alter/user.md
+++ b/docs/en/sql-reference/statements/alter/user.md
@@ -12,7 +12,7 @@ Syntax:

``` sql
ALTER USER [IF EXISTS] name1 [RENAME TO new_name |, name2 [,...]] [ON CLUSTER cluster_name]
-    [NOT IDENTIFIED | RESET AUTHENTICATION METHODS TO NEW | {IDENTIFIED | ADD IDENTIFIED} {[WITH {plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | WITH NO_PASSWORD | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name' | SAN 'TYPE:subject_alt_name'} | {WITH ssh_key BY KEY 'public_key' TYPE 'ssh-rsa|...'} | {WITH http SERVER 'server_name' [SCHEME 'Basic']}
+    [NOT IDENTIFIED | RESET AUTHENTICATION METHODS TO NEW | {IDENTIFIED | ADD IDENTIFIED} {[WITH {plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | WITH NO_PASSWORD | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name' | SAN 'TYPE:subject_alt_name'} | {WITH ssh_key BY KEY 'public_key' TYPE 'ssh-rsa|...'} | {WITH http SERVER 'server_name' [SCHEME 'Basic']} [VALID UNTIL datetime]
    [, {[{plaintext_password | sha256_password | sha256_hash | ...}] BY {'password' | 'hash'}} | {ldap SERVER 'server_name'} | {...} | ... [,...]]]
    [[ADD | DROP] HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE]
    [VALID UNTIL datetime]
@@ -91,3 +91,15 @@ Reset authentication methods and keep the most recent added one:

``` sql
ALTER USER user1 RESET AUTHENTICATION METHODS TO NEW
```
+
+## VALID UNTIL Clause
+
+Allows you to specify the expiration date and, optionally, the time for an authentication method. It accepts a string as a parameter. It is recommended to use the `YYYY-MM-DD [hh:mm:ss] [timezone]` format for datetime. By default, this parameter equals `'infinity'`.
+The `VALID UNTIL` clause can only be specified along with an authentication method, except for the case where no authentication method has been specified in the query. In this scenario, the `VALID UNTIL` clause will be applied to all existing authentication methods.
+
+Examples:
+
+- `ALTER USER name1 VALID UNTIL '2025-01-01'`
+- `ALTER USER name1 VALID UNTIL '2025-01-01 12:00:00 UTC'`
+- `ALTER USER name1 VALID UNTIL 'infinity'`
+- `ALTER USER name1 IDENTIFIED WITH plaintext_password BY 'no_expiration', bcrypt_password BY 'expiration_set' VALID UNTIL '2025-01-01'`
diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md
index ab44f545430..a9fc5712b4d 100644
--- a/docs/en/sql-reference/statements/create/table.md
+++ b/docs/en/sql-reference/statements/create/table.md
@@ -427,19 +427,6 @@ High compression levels are useful for asymmetric scenarios, like compress once,
ZSTD_QAT is not available in ClickHouse Cloud.
:::

-#### DEFLATE_QPL
-
-`DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply:
-
-- DEFLATE_QPL is disabled by default and can only be used after enabling configuration setting [enable_deflate_qpl_codec](../../../operations/settings/settings.md#enable_deflate_qpl_codec).
-- DEFLATE_QPL requires a ClickHouse build compiled with SSE 4.2 instructions (by default, this is the case). Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. -- DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. -- DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with SSE 4.2 enabled. - -:::note -DEFLATE_QPL is not available in ClickHouse Cloud. -::: - ### Specialized Codecs These codecs are designed to make compression more effective by exploiting specific features of the data. Some of these codecs do not compress data themselves, they instead preprocess the data such that a second compression stage using a general-purpose codec can achieve a higher data compression rate. diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index a018e28306c..03d93fc3365 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -11,7 +11,7 @@ Syntax: ``` sql CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [, name2 [,...]] [ON CLUSTER cluster_name] - [NOT IDENTIFIED | IDENTIFIED {[WITH {plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | WITH NO_PASSWORD | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name' | SAN 'TYPE:subject_alt_name'} | {WITH ssh_key BY KEY 'public_key' TYPE 'ssh-rsa|...'} | {WITH http SERVER 'server_name' [SCHEME 'Basic']} + [NOT IDENTIFIED | IDENTIFIED {[WITH {plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | WITH NO_PASSWORD | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']} | {WITH ssl_certificate CN 'common_name' | SAN 'TYPE:subject_alt_name'} | {WITH ssh_key BY KEY 'public_key' TYPE 'ssh-rsa|...'} | {WITH http SERVER 'server_name' [SCHEME 'Basic']} [VALID UNTIL datetime] [, {[{plaintext_password | sha256_password | sha256_hash | ...}] BY {'password' | 'hash'}} | {ldap SERVER 'server_name'} | {...} | ... [,...]]] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [VALID UNTIL datetime] @@ -178,13 +178,16 @@ ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technica ## VALID UNTIL Clause -Allows you to specify the expiration date and, optionally, the time for user credentials. It accepts a string as a parameter. It is recommended to use the `YYYY-MM-DD [hh:mm:ss] [timezone]` format for datetime. By default, this parameter equals `'infinity'`. +Allows you to specify the expiration date and, optionally, the time for an authentication method. It accepts a string as a parameter. It is recommended to use the `YYYY-MM-DD [hh:mm:ss] [timezone]` format for datetime. By default, this parameter equals `'infinity'`. +The `VALID UNTIL` clause can only be specified along with an authentication method, except for the case where no authentication method has been specified in the query. 
In this scenario, the `VALID UNTIL` clause will be applied to all existing authentication methods. Examples: - `CREATE USER name1 VALID UNTIL '2025-01-01'` - `CREATE USER name1 VALID UNTIL '2025-01-01 12:00:00 UTC'` - `CREATE USER name1 VALID UNTIL 'infinity'` +- ```CREATE USER name1 VALID UNTIL '2025-01-01 12:00:00 `Asia/Tokyo`'``` +- `CREATE USER name1 IDENTIFIED WITH plaintext_password BY 'no_expiration', bcrypt_password BY 'expiration_set' VALID UNTIL '2025-01-01''` ## GRANTEES Clause diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 0e5d5250e0f..c770348bce0 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -55,7 +55,7 @@ SELECT * FROM view(column1=value1, column2=value2 ...) ## Materialized View ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] AS SELECT ... [COMMENT 'comment'] diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index c11299baf38..6decaf19d5b 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -78,6 +78,10 @@ Specifying privileges you can use asterisk (`*`) instead of a table or a databas Also, you can omit database name. In this case privileges are granted for current database. For example, `GRANT SELECT ON * TO john` grants the privilege on all the tables in the current database, `GRANT SELECT ON mytable TO john` grants the privilege on the `mytable` table in the current database. +:::note +The feature described below is available starting with the 24.10 ClickHouse version. +::: + You can also put asterisks at the end of a table or a database name. This feature allows you to grant privileges on an abstract prefix of the table's path. Example: `GRANT SELECT ON db.my_tables* TO john`. This query allows `john` to execute the `SELECT` query over all the `db` database tables with the prefix `my_tables*`. @@ -113,6 +117,7 @@ GRANT SELECT ON db*.* TO john -- correct GRANT SELECT ON *.my_table TO john -- wrong GRANT SELECT ON foo*bar TO john -- wrong GRANT SELECT ON *suffix TO john -- wrong +GRANT SELECT(foo) ON db.table* TO john -- wrong ``` ## Privileges @@ -238,10 +243,13 @@ Hierarchy of privileges: - `HDFS` - `HIVE` - `JDBC` + - `KAFKA` - `MONGO` - `MYSQL` + - `NATS` - `ODBC` - `POSTGRES` + - `RABBITMQ` - `REDIS` - `REMOTE` - `S3` @@ -520,10 +528,13 @@ Allows using external data sources. Applies to [table engines](../../engines/tab - `HDFS`. Level: `GLOBAL` - `HIVE`. Level: `GLOBAL` - `JDBC`. Level: `GLOBAL` + - `KAFKA`. Level: `GLOBAL` - `MONGO`. Level: `GLOBAL` - `MYSQL`. Level: `GLOBAL` + - `NATS`. Level: `GLOBAL` - `ODBC`. Level: `GLOBAL` - `POSTGRES`. Level: `GLOBAL` + - `RABBITMQ`. Level: `GLOBAL` - `REDIS`. Level: `GLOBAL` - `REMOTE`. Level: `GLOBAL` - `S3`. 
Level: `GLOBAL` diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index 667a5b51f5c..ff6f64a97fe 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -83,7 +83,7 @@ The presence of long-running or incomplete mutations often indicates that a Clic - Or manually kill some of these mutations by sending a `KILL` command. ``` sql -KILL MUTATION [ON CLUSTER cluster] +KILL MUTATION WHERE [TEST] [FORMAT format] @@ -135,7 +135,6 @@ KILL MUTATION WHERE database = 'default' AND table = 'table' -- Cancel the specific mutation: KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt' ``` -:::tip If you are killing a mutation in ClickHouse Cloud or in a self-managed cluster, then be sure to use the ```ON CLUSTER [cluster-name]``` option, in order to ensure the mutation is killed on all replicas::: The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table). diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index 512a58d7cd9..25d2e7123fd 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -291,7 +291,7 @@ All missed values of `expr` column will be filled sequentially and other columns To fill multiple columns, add `WITH FILL` modifier with optional parameters after each field name in `ORDER BY` section. ``` sql -ORDER BY expr [WITH FILL] [FROM const_expr] [TO const_expr] [STEP const_numeric_expr], ... exprN [WITH FILL] [FROM expr] [TO expr] [STEP numeric_expr] +ORDER BY expr [WITH FILL] [FROM const_expr] [TO const_expr] [STEP const_numeric_expr] [STALENESS const_numeric_expr], ... exprN [WITH FILL] [FROM expr] [TO expr] [STEP numeric_expr] [STALENESS numeric_expr] [INTERPOLATE [(col [AS expr], ... colN [AS exprN])]] ``` @@ -300,6 +300,7 @@ When `FROM const_expr` not defined sequence of filling use minimal `expr` field When `TO const_expr` not defined sequence of filling use maximum `expr` field value from `ORDER BY`. When `STEP const_numeric_expr` defined then `const_numeric_expr` interprets `as is` for numeric types, as `days` for Date type, as `seconds` for DateTime type. It also supports [INTERVAL](https://clickhouse.com/docs/en/sql-reference/data-types/special-data-types/interval/) data type representing time and date intervals. When `STEP const_numeric_expr` omitted then sequence of filling use `1.0` for numeric type, `1 day` for Date type and `1 second` for DateTime type. +When `STALENESS const_numeric_expr` is defined, the query will generate rows until the difference from the previous row in the original data exceeds `const_numeric_expr`. `INTERPOLATE` can be applied to columns not participating in `ORDER BY WITH FILL`. Such columns are filled based on previous fields values by applying `expr`. If `expr` is not present will repeat previous value. Omitted list will result in including all allowed columns. Example of a query without `WITH FILL`: @@ -497,6 +498,64 @@ Result: └────────────┴────────────┴──────────┘ ``` +Example of a query without `STALENESS`: + +``` sql +SELECT number as key, 5 * number value, 'original' AS source +FROM numbers(16) WHERE key % 5 == 0 +ORDER BY key WITH FILL; +``` + +Result: + +``` text + ┌─key─┬─value─┬─source───┐ + 1. │ 0 │ 0 │ original │ + 2. │ 1 │ 0 │ │ + 3. │ 2 │ 0 │ │ + 4. 
│ 3 │ 0 │ │ + 5. │ 4 │ 0 │ │ + 6. │ 5 │ 25 │ original │ + 7. │ 6 │ 0 │ │ + 8. │ 7 │ 0 │ │ + 9. │ 8 │ 0 │ │ +10. │ 9 │ 0 │ │ +11. │ 10 │ 50 │ original │ +12. │ 11 │ 0 │ │ +13. │ 12 │ 0 │ │ +14. │ 13 │ 0 │ │ +15. │ 14 │ 0 │ │ +16. │ 15 │ 75 │ original │ + └─────┴───────┴──────────┘ +``` + +Same query after applying `STALENESS 3`: + +``` sql +SELECT number as key, 5 * number value, 'original' AS source +FROM numbers(16) WHERE key % 5 == 0 +ORDER BY key WITH FILL STALENESS 3; +``` + +Result: + +``` text + ┌─key─┬─value─┬─source───┐ + 1. │ 0 │ 0 │ original │ + 2. │ 1 │ 0 │ │ + 3. │ 2 │ 0 │ │ + 4. │ 5 │ 25 │ original │ + 5. │ 6 │ 0 │ │ + 6. │ 7 │ 0 │ │ + 7. │ 10 │ 50 │ original │ + 8. │ 11 │ 0 │ │ + 9. │ 12 │ 0 │ │ +10. │ 15 │ 75 │ original │ +11. │ 16 │ 0 │ │ +12. │ 17 │ 0 │ │ + └─────┴───────┴──────────┘ +``` + Example of a query without `INTERPOLATE`: ``` sql diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 181c92b92d4..b14eb84392f 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -93,7 +93,6 @@ LIMIT 5; ClickHouse also can determine the compression method of the file. For example, if the file was zipped up with a `.csv.gz` extension, ClickHouse would decompress the file automatically. ::: - ## Usage Suppose that we have several files with following URIs on S3: @@ -248,6 +247,25 @@ FROM s3( LIMIT 5; ``` +## Using S3 credentials (ClickHouse Cloud) + +For non-public buckets, users can pass an `aws_access_key_id` and `aws_secret_access_key` to the function. For example: + +```sql +SELECT count() FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/mta/*.tsv', '', '','TSVWithNames') +``` + +This is appropriate for one-off accesses or in cases where credentials can easily be rotated. However, this is not recommended as a long-term solution for repeated access or where credentials are sensitive. In this case, we recommend users rely on role-based access. + +Role-based access for S3 in ClickHouse Cloud is documented [here](/docs/en/cloud/security/secure-s3#access-your-s3-bucket-with-the-clickhouseaccess-role). + +Once configured, a `roleARN` can be passed to the s3 function via an `extra_credentials` parameter. For example: + +```sql +SELECT count() FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/mta/*.tsv','CSVWithNames',extra_credentials(role_arn = 'arn:aws:iam::111111111111:role/ClickHouseAccessRole-001')) +``` + +Further examples can be found [here](/docs/en/cloud/security/secure-s3#access-your-s3-bucket-with-the-clickhouseaccess-role) ## Working with archives @@ -266,6 +284,14 @@ FROM s3( ); ``` +:::note +ClickHouse supports three archive formats: +ZIP +TAR +7Z +While ZIP and TAR archives can be accessed from any supported storage location, 7Z archives can only be read from the local filesystem where ClickHouse is installed. +::: + ## Virtual Columns {#virtual-columns} diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index 9bf5a6b4da6..0aa4784d27a 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -70,6 +70,15 @@ SELECT count(*) FROM s3Cluster( ) ``` +## Accessing private and public buckets + +Users can use the same approaches as document for the s3 function [here](/docs/en/sql-reference/table-functions/s3#accessing-public-buckets). 
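+
+For example, a private bucket could be read with explicit credentials, as in the following minimal sketch (the cluster name, URL and credential placeholders below are illustrative, not real values):
+
+```sql
+SELECT count()
+FROM s3Cluster(
+    'default',
+    'https://my-private-bucket.s3.eu-west-3.amazonaws.com/data/*.tsv',
+    '<aws_access_key_id>', '<aws_secret_access_key>',
+    'TSVWithNames'
+);
+```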
+ +## Optimizing performance + +For details on optimizing the performance of the s3 function see [our detailed guide](/docs/en/integrations/s3/performance). + + **See Also** - [S3 engine](../../engines/table-engines/integrations/s3.md) diff --git a/docs/ru/development/contrib.md b/docs/ru/development/contrib.md index f3a88a2da0c..67da2b2a6bf 100644 --- a/docs/ru/development/contrib.md +++ b/docs/ru/development/contrib.md @@ -93,7 +93,7 @@ sidebar_label: "Используемые сторонние библиотеки SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en'; ``` -[Пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) +[Пример](https://sql.clickhouse.com?query_id=478GCPU7LRTSZJBNY3EJT3) ## Рекомендации по добавлению сторонних библиотек и поддержанию в них пользовательских изменений {#adding-third-party-libraries} diff --git a/docs/ru/engines/table-engines/integrations/s3.md b/docs/ru/engines/table-engines/integrations/s3.md index a1c69df4d0a..2bab78c0612 100644 --- a/docs/ru/engines/table-engines/integrations/s3.md +++ b/docs/ru/engines/table-engines/integrations/s3.md @@ -138,6 +138,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) - `use_insecure_imds_request` — признак использования менее безопасного соединения при выполнении запроса к IMDS при получении учётных данных из метаданных Amazon EC2. Значение по умолчанию — `false`. - `region` — название региона S3. - `header` — добавляет указанный HTTP-заголовок к запросу на заданную точку приема запроса. Может быть определен несколько раз. +- `access_header` - добавляет указанный HTTP-заголовок к запросу на заданную точку приема запроса, в случая если не указаны другие способы авторизации. - `server_side_encryption_customer_key_base64` — устанавливает необходимые заголовки для доступа к объектам S3 с шифрованием SSE-C. - `single_read_retries` — Максимальное количество попыток запроса при единичном чтении. Значение по умолчанию — `4`. diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md index c830d639095..d37be9f48d5 100644 --- a/docs/ru/getting-started/example-datasets/brown-benchmark.md +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -412,4 +412,4 @@ ORDER BY yr, mo; ``` -Данные также доступны для работы с интерактивными запросами через [Playground](https://play.clickhouse.com/play?user=play), [пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). 
+Данные также доступны для работы с интерактивными запросами через [Playground](https://sql.clickhouse.com), [пример](https://sql.clickhouse.com?query_id=1MXMHASDLEQIP4P1D1STND). diff --git a/docs/ru/getting-started/example-datasets/cell-towers.md b/docs/ru/getting-started/example-datasets/cell-towers.md index cf1a02ae8f0..2f91bed1c04 100644 --- a/docs/ru/getting-started/example-datasets/cell-towers.md +++ b/docs/ru/getting-started/example-datasets/cell-towers.md @@ -126,4 +126,4 @@ SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) ``` -Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://play.clickhouse.com/play?user=play). Например, [вот так](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы. +Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://sql.clickhouse.com). Например, [вот так](https://sql.clickhouse.com?query_id=UV8M4MAGS2PWAUOAYAAARM). Однако, обратите внимание, что здесь нельзя создавать временные таблицы. diff --git a/docs/ru/getting-started/example-datasets/recipes.md b/docs/ru/getting-started/example-datasets/recipes.md index b91fe3314ff..860d1ff450c 100644 --- a/docs/ru/getting-started/example-datasets/recipes.md +++ b/docs/ru/getting-started/example-datasets/recipes.md @@ -338,4 +338,4 @@ WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake'; ### Online Playground -Этот набор данных доступен в [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). +Этот набор данных доступен в [Online Playground](https://sql.clickhouse.com?query_id=HQXNQZE26Z1QWYP9KC76ML). diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index f8a660fbec9..083ddc8c39c 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -95,7 +95,7 @@ sudo yum install -y clickhouse-server clickhouse-client sudo systemctl enable clickhouse-server sudo systemctl start clickhouse-server sudo systemctl status clickhouse-server -clickhouse-client # илм "clickhouse-client --password" если установлен пароль +clickhouse-client # или "clickhouse-client --password" если установлен пароль ``` Для использования наиболее свежих версий нужно заменить `stable` на `testing` (рекомендуется для тестовых окружений). Также иногда доступен `prestable`. diff --git a/docs/ru/getting-started/playground.md b/docs/ru/getting-started/playground.md index a2d5498fb9a..b4ec89784ac 100644 --- a/docs/ru/getting-started/playground.md +++ b/docs/ru/getting-started/playground.md @@ -6,7 +6,7 @@ sidebar_label: Playground # ClickHouse Playground {#clickhouse-playground} -[ClickHouse Playground](https://play.clickhouse.com/play?user=play) позволяет пользователям экспериментировать с ClickHouse, выполняя запросы мгновенно, без необходимости настройки сервера или кластера. +[ClickHouse Playground](https://sql.clickhouse.com) позволяет пользователям экспериментировать с ClickHouse, выполняя запросы мгновенно, без необходимости настройки сервера или кластера. В Playground доступны несколько примеров наборов данных. 
Вы можете выполнять запросы к Playground, используя любой HTTP-клиент, например [curl](https://curl.haxx.se) или [wget](https://www.gnu.org/software/wget/), или настроить соединение, используя драйверы [JDBC](../interfaces/jdbc.md) или [ODBC](../interfaces/odbc.md). Дополнительную информацию о программных продуктах, поддерживающих ClickHouse, можно найти [здесь](../interfaces/index.md). diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 8fa30446bb3..5dbffd90205 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...) ## Материализованные представления {#materialized} ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] AS SELECT ... ``` diff --git a/docs/ru/sql-reference/statements/grant.md b/docs/ru/sql-reference/statements/grant.md index 2ccc2d05452..79682dc42cd 100644 --- a/docs/ru/sql-reference/statements/grant.md +++ b/docs/ru/sql-reference/statements/grant.md @@ -192,14 +192,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `addressToSymbol` - `demangle` - [SOURCES](#grant-sources) + - `AZURE` - `FILE` - - `URL` - - `REMOTE` - - `MYSQL` - - `ODBC` - - `JDBC` - `HDFS` + - `HIVE` + - `JDBC` + - `KAFKA` + - `MONGO` + - `MYSQL` + - `NATS` + - `ODBC` + - `POSTGRES` + - `RABBITMQ` + - `REDIS` + - `REMOTE` - `S3` + - `SQLITE` + - `URL` - [dictGet](#grant-dictget) Примеры того, как трактуется данная иерархия: @@ -461,14 +470,23 @@ GRANT INSERT(x,y) ON db.table TO john Разрешает использовать внешние источники данных. Применяется к [движкам таблиц](../../engines/table-engines/index.md) и [табличным функциям](../table-functions/index.md#table-functions). - `SOURCES`. Уровень: `GROUP` + - `AZURE`. Уровень: `GLOBAL` - `FILE`. Уровень: `GLOBAL` - - `URL`. Уровень: `GLOBAL` - - `REMOTE`. Уровень: `GLOBAL` - - `MYSQL`. Уровень: `GLOBAL` - - `ODBC`. Уровень: `GLOBAL` - - `JDBC`. Уровень: `GLOBAL` - `HDFS`. Уровень: `GLOBAL` + - `HIVE`. Уровень: `GLOBAL` + - `JDBC`. Уровень: `GLOBAL` + - `KAFKA`. Уровень: `GLOBAL` + - `MONGO`. Уровень: `GLOBAL` + - `MYSQL`. Уровень: `GLOBAL` + - `NATS`. Уровень: `GLOBAL` + - `ODBC`. Уровень: `GLOBAL` + - `POSTGRES`. Уровень: `GLOBAL` + - `RABBITMQ`. Уровень: `GLOBAL` + - `REDIS`. Уровень: `GLOBAL` + - `REMOTE`. Уровень: `GLOBAL` - `S3`. Уровень: `GLOBAL` + - `SQLITE`. Уровень: `GLOBAL` + - `URL`. Уровень: `GLOBAL` Привилегия `SOURCES` разрешает использование всех источников. Также вы можете присвоить привилегию для каждого источника отдельно. Для использования источников необходимы дополнительные привилегии. 
diff --git a/docs/zh/getting-started/example-datasets/brown-benchmark.mdx b/docs/zh/getting-started/example-datasets/brown-benchmark.mdx index 6db4982f50f..74bfeb58d6d 100644 --- a/docs/zh/getting-started/example-datasets/brown-benchmark.mdx +++ b/docs/zh/getting-started/example-datasets/brown-benchmark.mdx @@ -457,4 +457,4 @@ ORDER BY yr, mo; ``` -此数据集可在 [Playground](https://play.clickhouse.com/play?user=play) 中进行交互式的请求, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). +此数据集可在 [Playground](https://sql.clickhouse.com) 中进行交互式的请求, [example](https://sql.clickhouse.com?query_id=1MXMHASDLEQIP4P1D1STND). diff --git a/docs/zh/getting-started/example-datasets/cell-towers.mdx b/docs/zh/getting-started/example-datasets/cell-towers.mdx index 9738680519a..b98e92c378a 100644 --- a/docs/zh/getting-started/example-datasets/cell-towers.mdx +++ b/docs/zh/getting-started/example-datasets/cell-towers.mdx @@ -228,5 +228,5 @@ WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) ``` -虽然不能创建临时表,但此数据集仍可在 [Playground](https://play.clickhouse.com/play?user=play) 中进行交互式的请求, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). +虽然不能创建临时表,但此数据集仍可在 [Playground](https://sql.clickhouse.com) 中进行交互式的请求, [example](https://sql.clickhouse.com?query_id=UV8M4MAGS2PWAUOAYAAARM). 
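As with the Russian pages, the replaced Chinese Playground links above carried their example queries base64-encoded in the old URLs. Decoded, the cell-towers and brown-benchmark examples are approximately the following (reproduced only so the examples stay readable after the switch to saved query ids):

```sql
-- cell-towers example: tower count per mobile country code
SELECT mcc, count()
FROM cell_towers
GROUP BY mcc
ORDER BY count() DESC;

-- brown-benchmark (mgbench) example: per-machine CPU and network summary
SELECT machine_name,
       MIN(cpu) AS cpu_min,
       MAX(cpu) AS cpu_max,
       AVG(cpu) AS cpu_avg,
       MIN(net_in) AS net_in_min,
       MAX(net_in) AS net_in_max,
       AVG(net_in) AS net_in_avg,
       MIN(net_out) AS net_out_min,
       MAX(net_out) AS net_out_max,
       AVG(net_out) AS net_out_avg
FROM
(
    SELECT machine_name,
           COALESCE(cpu_user, 0.0) AS cpu,
           COALESCE(bytes_in, 0.0) AS net_in,
           COALESCE(bytes_out, 0.0) AS net_out
    FROM mgbench.logs1
    WHERE machine_name IN ('anansi', 'aragog', 'urd')
      AND log_time >= TIMESTAMP '2017-01-11 00:00:00'
) AS r
GROUP BY machine_name;
```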
diff --git a/docs/zh/getting-started/example-datasets/menus.mdx b/docs/zh/getting-started/example-datasets/menus.mdx index 10e9f2bd318..33ec031c1ad 100644 --- a/docs/zh/getting-started/example-datasets/menus.mdx +++ b/docs/zh/getting-started/example-datasets/menus.mdx @@ -349,4 +349,4 @@ ORDER BY d ASC; ## 在线 Playground{#playground} -此数据集已经上传到了 ClickHouse Playground 中,[example](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==)。 +此数据集已经上传到了 ClickHouse Playground 中,[example](https://sql.clickhouse.com?query_id=KB5KQJJFNBKHE5GBUJCP1B)。 diff --git a/docs/zh/getting-started/example-datasets/opensky.mdx b/docs/zh/getting-started/example-datasets/opensky.mdx index b79c02ab780..0116515b28f 100644 --- a/docs/zh/getting-started/example-datasets/opensky.mdx +++ b/docs/zh/getting-started/example-datasets/opensky.mdx @@ -413,4 +413,4 @@ ORDER BY k ASC; ### 在线 Playground {#playground} -你可以使用交互式资源 [Online Playground](https://play.clickhouse.com/play?user=play) 来尝试对此数据集的其他查询。 例如, [执行这个查询](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). 但是,请注意无法在 Playground 中创建临时表。 +你可以使用交互式资源 [Online Playground](https://sql.clickhouse.com) 来尝试对此数据集的其他查询。 例如, [执行这个查询](https://sql.clickhouse.com?query_id=BIPDVQNIGVEZFQYFEFQB7O). 但是,请注意无法在 Playground 中创建临时表。 diff --git a/docs/zh/getting-started/example-datasets/recipes.mdx b/docs/zh/getting-started/example-datasets/recipes.mdx index b7f8fe8eafd..a7b3ddbe0da 100644 --- a/docs/zh/getting-started/example-datasets/recipes.mdx +++ b/docs/zh/getting-started/example-datasets/recipes.mdx @@ -334,6 +334,6 @@ WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake' ### 在线 Playground -此数据集也可在 [在线 Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==) 中体验。 +此数据集也可在 [在线 Playground](https://sql.clickhouse.com?query_id=HQXNQZE26Z1QWYP9KC76ML) 中体验。 [原文链接](https://clickhouse.com/docs/en/getting-started/example-datasets/recipes/) diff --git a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx index 7d4c299b919..158ce08216c 100644 --- a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx +++ b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx @@ -447,4 +447,4 @@ With projection: 100 rows in set. Elapsed: 0.336 sec. 
Processed 17.32 thousand r ### 在 Playground 上测试{#playground} -也可以在 [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==) 上找到此数据集。 +也可以在 [Online Playground](https://sql.clickhouse.com?query_id=TRCWH5ZETY4SEEK8ISCCAX) 上找到此数据集。 diff --git a/docs/zh/getting-started/playground.md b/docs/zh/getting-started/playground.md index 2874b307cee..5d8927d8a6c 100644 --- a/docs/zh/getting-started/playground.md +++ b/docs/zh/getting-started/playground.md @@ -6,7 +6,7 @@ sidebar_label: 体验平台 # ClickHouse Playground {#clickhouse-playground} -无需搭建服务或集群,[ClickHouse Playground](https://play.clickhouse.com/play?user=play)允许人们通过执行查询语句立即体验ClickHouse,在Playground中我们提供了一些示例数据集。 +无需搭建服务或集群,[ClickHouse Playground](https://sql.clickhouse.com)允许人们通过执行查询语句立即体验ClickHouse,在Playground中我们提供了一些示例数据集。 你可以使用任意HTTP客户端向Playground提交查询语句,比如[curl](https://curl.haxx.se)或者[wget](https://www.gnu.org/software/wget/),也可以通过[JDBC](../interfaces/jdbc.md)或者[ODBC](../interfaces/odbc.md)驱动建立连接,更多信息详见[客户端](../interfaces/index.md)。 diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index 49a1d66bdf1..6c93240644d 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...) ## Materialized {#materialized} ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... ``` 物化视图存储由相应的[SELECT](../../../sql-reference/statements/select/index.md)管理. diff --git a/docs/zh/sql-reference/statements/grant.md b/docs/zh/sql-reference/statements/grant.md index fea51d590d5..3fd314c791f 100644 --- a/docs/zh/sql-reference/statements/grant.md +++ b/docs/zh/sql-reference/statements/grant.md @@ -170,14 +170,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `addressToSymbol` - `demangle` - [SOURCES](#grant-sources) + - `AZURE` - `FILE` - - `URL` - - `REMOTE` - - `YSQL` - - `ODBC` - - `JDBC` - `HDFS` + - `HIVE` + - `JDBC` + - `KAFKA` + - `MONGO` + - `MYSQL` + - `NATS` + - `ODBC` + - `POSTGRES` + - `RABBITMQ` + - `REDIS` + - `REMOTE` - `S3` + - `SQLITE` + - `URL` - [dictGet](#grant-dictget) 如何对待该层级的示例: @@ -428,14 +437,23 @@ GRANT INSERT(x,y) ON db.table TO john 允许在 [table engines](../../engines/table-engines/index.md) 和 [table functions](../../sql-reference/table-functions/index.md#table-functions)中使用外部数据源。 - `SOURCES`. 级别: `GROUP` + - `AZURE`. 级别: `GLOBAL` - `FILE`. 级别: `GLOBAL` - - `URL`. 级别: `GLOBAL` - - `REMOTE`. 级别: `GLOBAL` - - `YSQL`. 级别: `GLOBAL` - - `ODBC`. 级别: `GLOBAL` - - `JDBC`. 级别: `GLOBAL` - `HDFS`. 级别: `GLOBAL` + - `HIVE`. 级别: `GLOBAL` + - `JDBC`. 级别: `GLOBAL` + - `KAFKA`. 级别: `GLOBAL` + - `MONGO`. 级别: `GLOBAL` + - `MYSQL`. 级别: `GLOBAL` + - `NATS`. 级别: `GLOBAL` + - `ODBC`. 级别: `GLOBAL` + - `POSTGRES`. 级别: `GLOBAL` + - `RABBITMQ`. 级别: `GLOBAL` + - `REDIS`. 级别: `GLOBAL` + - `REMOTE`. 级别: `GLOBAL` - `S3`. 级别: `GLOBAL` + - `SQLITE`. 级别: `GLOBAL` + - `URL`. 
级别: `GLOBAL` `SOURCES` 权限允许使用所有数据源。当然也可以单独对每个数据源进行授权。要使用数据源时,还需要额外的权限。 diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index ffb029404d3..d7190444f0b 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -192,6 +192,10 @@ void Client::parseConnectionsCredentials(Poco::Util::AbstractConfiguration & con history_file = home_path + "/" + history_file.substr(1); config.setString("history_file", history_file); } + if (config.has(prefix + ".history_max_entries")) + { + config.setUInt("history_max_entries", history_max_entries); + } if (config.has(prefix + ".accept-invalid-certificate")) config.setBool("accept-invalid-certificate", config.getBool(prefix + ".accept-invalid-certificate")); } @@ -1160,6 +1164,9 @@ void Client::processOptions(const OptionsDescription & options_description, /// (There is no need to copy the context because clickhouse-client has no background tasks so it won't use that context in parallel.) client_context = global_context; initClientContext(); + + /// Allow to pass-through unknown settings to the server. + client_context->getAccessControl().allowAllSettings(); } diff --git a/programs/compressor/Compressor.cpp b/programs/compressor/Compressor.cpp index 050bb495024..819f16cfd64 100644 --- a/programs/compressor/Compressor.cpp +++ b/programs/compressor/Compressor.cpp @@ -80,7 +80,6 @@ int mainEntryClickHouseCompressor(int argc, char ** argv) ("block-size,b", po::value()->default_value(DBMS_DEFAULT_BUFFER_SIZE), "compress in blocks of specified size") ("hc", "use LZ4HC instead of LZ4") ("zstd", "use ZSTD instead of LZ4") - ("deflate_qpl", "use deflate_qpl instead of LZ4") ("codec", po::value>()->multitoken(), "use codecs combination instead of LZ4") ("level", po::value(), "compression level for codecs specified via flags") ("none", "use no compression instead of LZ4") @@ -107,7 +106,6 @@ int mainEntryClickHouseCompressor(int argc, char ** argv) bool decompress = options.count("decompress"); bool use_lz4hc = options.count("hc"); bool use_zstd = options.count("zstd"); - bool use_deflate_qpl = options.count("deflate_qpl"); bool stat_mode = options.count("stat"); bool use_none = options.count("none"); print_stacktrace = options.count("stacktrace"); @@ -116,7 +114,7 @@ int mainEntryClickHouseCompressor(int argc, char ** argv) if (options.count("codec")) codecs = options["codec"].as>(); - if ((use_lz4hc || use_zstd || use_deflate_qpl || use_none) && !codecs.empty()) + if ((use_lz4hc || use_zstd || use_none) && !codecs.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong options, codec flags like --zstd and --codec options are mutually exclusive"); if (!codecs.empty() && options.count("level")) @@ -128,8 +126,6 @@ int mainEntryClickHouseCompressor(int argc, char ** argv) method_family = "LZ4HC"; else if (use_zstd) method_family = "ZSTD"; - else if (use_deflate_qpl) - method_family = "DEFLATE_QPL"; else if (use_none) method_family = "NONE"; diff --git a/programs/disks/DisksApp.cpp b/programs/disks/DisksApp.cpp index 5fddfce0678..610d8eaa638 100644 --- a/programs/disks/DisksApp.cpp +++ b/programs/disks/DisksApp.cpp @@ -236,6 +236,7 @@ void DisksApp::runInteractiveReplxx() ReplxxLineReader lr( suggest, history_file, + history_max_entries, /* multiline= */ false, /* ignore_shell_suspend= */ false, query_extenders, @@ -398,6 +399,8 @@ void DisksApp::initializeHistoryFile() throw; } } + + history_max_entries = config().getUInt("history-max-entries", 1000000); } void DisksApp::init(const std::vector & common_arguments) diff --git 
a/programs/disks/DisksApp.h b/programs/disks/DisksApp.h index 5b240648508..4f2bd7fcad6 100644 --- a/programs/disks/DisksApp.h +++ b/programs/disks/DisksApp.h @@ -62,6 +62,8 @@ private: // Fields responsible for the REPL work String history_file; + UInt32 history_max_entries = 0; /// Maximum number of entries in the history file. Needs to be initialized to 0 since we don't have a proper constructor. Worry not, actual value is set within the initializeHistoryFile method. + LineReader::Suggest suggest; static LineReader::Patterns query_extenders; static LineReader::Patterns query_delimiters; diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp index 97caa142124..2a426fad7ac 100644 --- a/programs/keeper-client/KeeperClient.cpp +++ b/programs/keeper-client/KeeperClient.cpp @@ -163,6 +163,10 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options) .argument("") .binding("operation-timeout")); + options.addOption( + Poco::Util::Option("use-xid-64", "", "use 64-bit XID. default false.") + .binding("use-xid-64")); + options.addOption( Poco::Util::Option("config-file", "c", "if set, will try to get a connection string from clickhouse config. default `config.xml`") .argument("") @@ -239,6 +243,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */) } } + history_max_entries = config().getUInt("history-max-entries", 1000000); + String default_log_level; if (config().has("query")) /// We don't want to see any information log in query mode, unless it was set explicitly @@ -315,6 +321,7 @@ void KeeperClient::runInteractiveReplxx() ReplxxLineReader lr( suggest, history_file, + history_max_entries, /* multiline= */ false, /* ignore_shell_suspend= */ false, query_extenders, @@ -411,6 +418,7 @@ int KeeperClient::main(const std::vector & /* args */) zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000; zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000; zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000; + zk_args.use_xid_64 = config().hasOption("use-xid-64"); zookeeper = zkutil::ZooKeeper::createWithoutKillingPreviousSessions(zk_args); if (config().has("no-confirmation") || config().has("query")) diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h index 0d3db3c2f02..359663c6a13 100644 --- a/programs/keeper-client/KeeperClient.h +++ b/programs/keeper-client/KeeperClient.h @@ -59,6 +59,8 @@ protected: std::vector getCompletions(const String & prefix) const; String history_file; + UInt32 history_max_entries; /// Maximum number of entries in the history file. 
+ LineReader::Suggest suggest; zkutil::ZooKeeperArgs zk_args; diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 3007df60765..74af9950e13 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -590,6 +590,7 @@ try #if USE_SSL CertificateReloader::instance().tryLoad(*config); + CertificateReloader::instance().tryLoadClient(*config); #endif }); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index b6b67724b0a..1dcef5eb25e 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -821,11 +821,11 @@ void LocalServer::processConfig() status.emplace(fs::path(path) / "status", StatusFile::write_full_info); LOG_DEBUG(log, "Loading metadata from {}", path); - auto startup_system_tasks = loadMetadataSystem(global_context); + auto load_system_metadata_tasks = loadMetadataSystem(global_context); attachSystemTablesServer(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE), false); attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE)); - waitLoad(TablesLoaderForegroundPoolId, startup_system_tasks); + waitLoad(TablesLoaderForegroundPoolId, load_system_metadata_tasks); if (!getClientConfiguration().has("only-system-tables")) { diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 15585ac8d57..5159f95419e 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -86,7 +86,7 @@ #include #include #include -#include +#include #include #include #include "MetricsTransmitter.h" @@ -168,9 +168,11 @@ namespace ServerSetting { extern const ServerSettingsUInt32 asynchronous_heavy_metrics_update_period_s; extern const ServerSettingsUInt32 asynchronous_metrics_update_period_s; + extern const ServerSettingsBool asynchronous_metrics_enable_heavy_metrics; extern const ServerSettingsBool async_insert_queue_flush_on_shutdown; extern const ServerSettingsUInt64 async_insert_threads; extern const ServerSettingsBool async_load_databases; + extern const ServerSettingsBool async_load_system_database; extern const ServerSettingsUInt64 background_buffer_flush_schedule_pool_size; extern const ServerSettingsUInt64 background_common_pool_size; extern const ServerSettingsUInt64 background_distributed_schedule_pool_size; @@ -205,7 +207,6 @@ namespace ServerSetting extern const ServerSettingsBool format_alter_operations_with_parentheses; extern const ServerSettingsUInt64 global_profiler_cpu_time_period_ns; extern const ServerSettingsUInt64 global_profiler_real_time_period_ns; - extern const ServerSettingsDouble gwp_asan_force_sample_probability; extern const ServerSettingsUInt64 http_connections_soft_limit; extern const ServerSettingsUInt64 http_connections_store_limit; extern const ServerSettingsUInt64 http_connections_warn_limit; @@ -620,7 +621,7 @@ void sanityChecks(Server & server) #if defined(OS_LINUX) try { - const std::unordered_set fastClockSources = { + const std::unordered_set fast_clock_sources = { // ARM clock "arch_sys_counter", // KVM guest clock @@ -629,7 +630,7 @@ void sanityChecks(Server & server) "tsc", }; const char * filename = "/sys/devices/system/clocksource/clocksource0/current_clocksource"; - if (!fastClockSources.contains(readLine(filename))) + if (!fast_clock_sources.contains(readLine(filename))) 
server.context()->addWarningMessage("Linux is not using a fast clock source. Performance can be degraded. Check " + String(filename)); } catch (...) // NOLINT(bugprone-empty-catch) @@ -919,7 +920,6 @@ try registerFormats(); registerRemoteFileMetadatas(); registerSchedulerNodes(); - registerResourceManagers(); CurrentMetrics::set(CurrentMetrics::Revision, ClickHouseRevision::getVersionRevision()); CurrentMetrics::set(CurrentMetrics::VersionInteger, ClickHouseRevision::getVersionInteger()); @@ -1060,6 +1060,7 @@ try ServerAsynchronousMetrics async_metrics( global_context, server_settings[ServerSetting::asynchronous_metrics_update_period_s], + server_settings[ServerSetting::asynchronous_metrics_enable_heavy_metrics], server_settings[ServerSetting::asynchronous_heavy_metrics_update_period_s], [&]() -> std::vector { @@ -1352,9 +1353,11 @@ try } FailPointInjection::enableFromGlobalConfig(config()); +#endif memory_worker.start(); +#if defined(OS_LINUX) int default_oom_score = 0; #if !defined(NDEBUG) @@ -1927,10 +1930,6 @@ try if (global_context->isServerCompletelyStarted()) CannotAllocateThreadFaultInjector::setFaultProbability(new_server_settings[ServerSetting::cannot_allocate_thread_fault_injection_probability]); -#if USE_GWP_ASAN - GWPAsan::setForceSampleProbability(new_server_settings[ServerSetting::gwp_asan_force_sample_probability]); -#endif - ProfileEvents::increment(ProfileEvents::MainConfigLoads); /// Must be the last. @@ -2082,7 +2081,7 @@ try auto & access_control = global_context->getAccessControl(); try { - access_control.setUpFromMainConfig(config(), config_path, [&] { return global_context->getZooKeeper(); }); + access_control.setupFromMainConfig(config(), config_path, [&] { return global_context->getZooKeeper(); }); } catch (...) { @@ -2199,6 +2198,7 @@ try LOG_INFO(log, "Loading metadata from {}", path_str); + LoadTaskPtrs load_system_metadata_tasks; LoadTaskPtrs load_metadata_tasks; // Make sure that if exception is thrown during startup async, new async loading jobs are not going to be called. @@ -2222,12 +2222,8 @@ try auto & database_catalog = DatabaseCatalog::instance(); /// We load temporary database first, because projections need it. database_catalog.initializeAndLoadTemporaryDatabase(); - auto system_startup_tasks = loadMetadataSystem(global_context); - maybeConvertSystemDatabase(global_context, system_startup_tasks); - /// This has to be done before the initialization of system logs, - /// otherwise there is a race condition between the system database initialization - /// and creation of new tables in the database. - waitLoad(TablesLoaderForegroundPoolId, system_startup_tasks); + load_system_metadata_tasks = loadMetadataSystem(global_context, server_settings[ServerSetting::async_load_system_database]); + maybeConvertSystemDatabase(global_context, load_system_metadata_tasks); /// Startup scripts can depend on the system log tables. if (config().has("startup_scripts") && !server_settings[ServerSetting::prepare_system_log_tables_on_startup].changed) @@ -2258,6 +2254,8 @@ try database_catalog.assertDatabaseExists(default_database); /// Load user-defined SQL functions. global_context->getUserDefinedSQLObjectsStorage().loadObjects(); + /// Load WORKLOADs and RESOURCEs. 
+ global_context->getWorkloadEntityStorage().loadEntities(); global_context->getRefreshSet().setRefreshesStopped(false); } @@ -2267,6 +2265,30 @@ try throw; } + bool found_stop_flag = false; + + if (has_zookeeper && global_context->getMacros()->getMacroMap().contains("replica")) + { + try + { + auto zookeeper = global_context->getZooKeeper(); + String stop_flag_path = "/clickhouse/stop_replicated_ddl_queries/{replica}"; + stop_flag_path = global_context->getMacros()->expand(stop_flag_path); + found_stop_flag = zookeeper->exists(stop_flag_path); + } + catch (const Coordination::Exception & e) + { + if (e.code != Coordination::Error::ZCONNECTIONLOSS) + throw; + tryLogCurrentException(log); + } + } + + if (found_stop_flag) + LOG_INFO(log, "Found a stop flag for replicated DDL queries. They will be disabled"); + else + DatabaseCatalog::instance().startReplicatedDDLQueries(); + LOG_DEBUG(log, "Loaded metadata."); if (has_trace_collector) @@ -2321,6 +2343,7 @@ try #if USE_SSL CertificateReloader::instance().tryLoad(config()); + CertificateReloader::instance().tryLoadClient(config()); #endif /// Must be done after initialization of `servers`, because async_metrics will access `servers` variable from its thread. @@ -2369,17 +2392,28 @@ try if (has_zookeeper && config().has("distributed_ddl")) { /// DDL worker should be started after all tables were loaded - String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); + String ddl_queue_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); + String ddl_replicas_path = config().getString("distributed_ddl.replicas_path", "/clickhouse/task_queue/replicas/"); int pool_size = config().getInt("distributed_ddl.pool_size", 1); if (pool_size < 1) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "distributed_ddl.pool_size should be greater then 0"); - global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, global_context, &config(), - "distributed_ddl", "DDLWorker", - &CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID), - load_metadata_tasks); + global_context->setDDLWorker( + std::make_unique( + pool_size, + ddl_queue_path, + ddl_replicas_path, + global_context, + &config(), + "distributed_ddl", + "DDLWorker", + &CurrentMetrics::MaxDDLEntryID, + &CurrentMetrics::MaxPushedDDLEntryID), + joinTasks(load_system_metadata_tasks, load_metadata_tasks)); } /// Do not keep tasks in server, they should be kept inside databases. Used here to make dependent tasks only. + load_system_metadata_tasks.clear(); + load_system_metadata_tasks.shrink_to_fit(); load_metadata_tasks.clear(); load_metadata_tasks.shrink_to_fit(); @@ -2405,7 +2439,6 @@ try #if USE_GWP_ASAN GWPAsan::initFinished(); - GWPAsan::setForceSampleProbability(server_settings[ServerSetting::gwp_asan_force_sample_probability]); #endif try @@ -2999,7 +3032,7 @@ void Server::updateServers( for (auto * server : all_servers) { - if (!server->isStopping()) + if (server->supportsRuntimeReconfiguration() && !server->isStopping()) { std::string port_name = server->getPortName(); bool has_host = false; diff --git a/programs/server/config.xml b/programs/server/config.xml index 10ad831465a..9807f8c0d5a 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -1195,6 +1195,19 @@ false + + + system + query_metric_log
+ 7500 + 1048576 + 8192 + 524288 + 1000 + false +
+ + + + @@ -1437,6 +1454,8 @@ /clickhouse/task_queue/ddl + + /clickhouse/task_queue/replicas diff --git a/programs/server/config.yaml.example b/programs/server/config.yaml.example index 5d5499f876c..5b0330df572 100644 --- a/programs/server/config.yaml.example +++ b/programs/server/config.yaml.example @@ -743,6 +743,13 @@ error_log: flush_interval_milliseconds: 7500 collect_interval_milliseconds: 1000 +# Query metric log contains history of memory and metric values from table system.events for individual queries, periodically flushed to disk. +query_metric_log: + database: system + table: query_metric_log + flush_interval_milliseconds: 7500 + collect_interval_milliseconds: 1000 + # Asynchronous metric log contains values of metrics from # system.asynchronous_metrics. asynchronous_metric_log: diff --git a/programs/server/merges.html b/programs/server/merges.html new file mode 100644 index 00000000000..119fb058b0b --- /dev/null +++ b/programs/server/merges.html @@ -0,0 +1,441 @@ + + + + + ClickHouse Merges Visualizer + + + + +
+ + + diff --git a/programs/static-files-disk-uploader/static-files-disk-uploader.cpp b/programs/static-files-disk-uploader/static-files-disk-uploader.cpp index f7696dd37f1..590e0364040 100644 --- a/programs/static-files-disk-uploader/static-files-disk-uploader.cpp +++ b/programs/static-files-disk-uploader/static-files-disk-uploader.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index 1b397304a06..9b3b8d2a977 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -282,7 +282,7 @@ void AccessControl::shutdown() } -void AccessControl::setUpFromMainConfig(const Poco::Util::AbstractConfiguration & config_, const String & config_path_, +void AccessControl::setupFromMainConfig(const Poco::Util::AbstractConfiguration & config_, const String & config_path_, const zkutil::GetZooKeeper & get_zookeeper_function_) { if (config_.has("custom_settings_prefixes")) @@ -608,7 +608,7 @@ AuthResult AccessControl::authenticate(const Credentials & credentials, const Po } catch (...) { - tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName() + ": Authentication failed"); + tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName() + ": Authentication failed", LogsLevel::information); WriteBufferFromOwnString message; message << credentials.getUserName() << ": Authentication failed: password is incorrect, or there is no user with such name."; @@ -622,8 +622,9 @@ AuthResult AccessControl::authenticate(const Credentials & credentials, const Po << "and deleting this file will reset the password.\n" << "See also /etc/clickhouse-server/users.xml on the server where ClickHouse is installed.\n\n"; - /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons, - /// only the log will show the exact reason. + /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons. + /// Only the log ((*), above) will show the exact reason. Note that (*) logs at information level instead of the default error level as + /// authentication failures are not an unusual event. throw Exception(PreformattedMessage{message.str(), "{}: Authentication failed: password is incorrect, or there is no user with such name", std::vector{credentials.getUserName()}}, @@ -869,4 +870,10 @@ const ExternalAuthenticators & AccessControl::getExternalAuthenticators() const return *external_authenticators; } + +void AccessControl::allowAllSettings() +{ + custom_settings_prefixes->registerPrefixes({""}); +} + } diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index cc1b7b2ca0d..a342c5300bf 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -9,6 +9,8 @@ #include +#include "config.h" + namespace Poco { @@ -57,7 +59,7 @@ public: void shutdown() override; /// Initializes access storage (user directories). - void setUpFromMainConfig(const Poco::Util::AbstractConfiguration & config_, const String & config_path_, + void setupFromMainConfig(const Poco::Util::AbstractConfiguration & config_, const String & config_path_, const zkutil::GetZooKeeper & get_zookeeper_function_); /// Parses access entities from a configuration loaded from users.xml. @@ -238,6 +240,9 @@ public: /// Gets manager of notifications. 
AccessChangesNotifier & getChangesNotifier(); + /// Allow all setting names - this can be used in clients to pass-through unknown settings to the server. + void allowAllSettings(); + private: class ContextAccessCache; class CustomSettingsPrefixes; diff --git a/src/Access/Authentication.cpp b/src/Access/Authentication.cpp index 8d5d04a4ed2..1d69a659cd6 100644 --- a/src/Access/Authentication.cpp +++ b/src/Access/Authentication.cpp @@ -12,6 +12,7 @@ #include "config.h" + namespace DB { diff --git a/src/Access/AuthenticationData.cpp b/src/Access/AuthenticationData.cpp index 57a1cd756ff..37a4e356af8 100644 --- a/src/Access/AuthenticationData.cpp +++ b/src/Access/AuthenticationData.cpp @@ -1,12 +1,16 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include +#include #include #include @@ -113,7 +117,8 @@ bool operator ==(const AuthenticationData & lhs, const AuthenticationData & rhs) && (lhs.ssh_keys == rhs.ssh_keys) #endif && (lhs.http_auth_scheme == rhs.http_auth_scheme) - && (lhs.http_auth_server_name == rhs.http_auth_server_name); + && (lhs.http_auth_server_name == rhs.http_auth_server_name) + && (lhs.valid_until == rhs.valid_until); } @@ -384,14 +389,34 @@ std::shared_ptr AuthenticationData::toAST() const throw Exception(ErrorCodes::LOGICAL_ERROR, "AST: Unexpected authentication type {}", toString(auth_type)); } + + if (valid_until) + { + WriteBufferFromOwnString out; + writeDateTimeText(valid_until, out); + + node->valid_until = std::make_shared(out.str()); + } + return node; } AuthenticationData AuthenticationData::fromAST(const ASTAuthenticationData & query, ContextPtr context, bool validate) { + time_t valid_until = 0; + + if (query.valid_until) + { + valid_until = getValidUntilFromAST(query.valid_until, context); + } + if (query.type && query.type == AuthenticationType::NO_PASSWORD) - return AuthenticationData(); + { + AuthenticationData auth_data; + auth_data.setValidUntil(valid_until); + return auth_data; + } /// For this type of authentication we have ASTPublicSSHKey as children for ASTAuthenticationData if (query.type && query.type == AuthenticationType::SSH_KEY) @@ -418,6 +443,7 @@ AuthenticationData AuthenticationData::fromAST(const ASTAuthenticationData & que } auth_data.setSSHKeys(std::move(keys)); + auth_data.setValidUntil(valid_until); return auth_data; #else throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSH is disabled, because ClickHouse is built without libssh"); @@ -451,6 +477,8 @@ AuthenticationData AuthenticationData::fromAST(const ASTAuthenticationData & que AuthenticationData auth_data(current_type); + auth_data.setValidUntil(valid_until); + if (validate) context->getAccessControl().checkPasswordComplexityRules(value); @@ -494,6 +522,7 @@ AuthenticationData AuthenticationData::fromAST(const ASTAuthenticationData & que } AuthenticationData auth_data(*query.type); + auth_data.setValidUntil(valid_until); if (query.contains_hash) { diff --git a/src/Access/AuthenticationData.h b/src/Access/AuthenticationData.h index a0c100264f8..2d8d008c925 100644 --- a/src/Access/AuthenticationData.h +++ b/src/Access/AuthenticationData.h @@ -74,6 +74,9 @@ public: const String & getHTTPAuthenticationServerName() const { return http_auth_server_name; } void setHTTPAuthenticationServerName(const String & name) { http_auth_server_name = name; } + time_t getValidUntil() const { return valid_until; } + void setValidUntil(time_t valid_until_) { valid_until = valid_until_; } + friend bool operator ==(const AuthenticationData & lhs, const 
AuthenticationData & rhs); friend bool operator !=(const AuthenticationData & lhs, const AuthenticationData & rhs) { return !(lhs == rhs); } @@ -106,6 +109,7 @@ private: /// HTTP authentication properties String http_auth_server_name; HTTPAuthenticationScheme http_auth_scheme = HTTPAuthenticationScheme::BASIC; + time_t valid_until = 0; }; } diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index e9f24a8c685..ec543104167 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -99,6 +99,8 @@ enum class AccessType : uint8_t M(CREATE_ARBITRARY_TEMPORARY_TABLE, "", GLOBAL, CREATE) /* allows to create and manipulate temporary tables with arbitrary table engine */\ M(CREATE_FUNCTION, "", GLOBAL, CREATE) /* allows to execute CREATE FUNCTION */ \ + M(CREATE_WORKLOAD, "", GLOBAL, CREATE) /* allows to execute CREATE WORKLOAD */ \ + M(CREATE_RESOURCE, "", GLOBAL, CREATE) /* allows to execute CREATE RESOURCE */ \ M(CREATE_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_ADMIN) /* allows to execute CREATE NAMED COLLECTION */ \ M(CREATE, "", GROUP, ALL) /* allows to execute {CREATE|ATTACH} */ \ \ @@ -108,6 +110,8 @@ enum class AccessType : uint8_t implicitly enabled by the grant DROP_TABLE */\ M(DROP_DICTIONARY, "", DICTIONARY, DROP) /* allows to execute {DROP|DETACH} DICTIONARY */\ M(DROP_FUNCTION, "", GLOBAL, DROP) /* allows to execute DROP FUNCTION */\ + M(DROP_WORKLOAD, "", GLOBAL, DROP) /* allows to execute DROP WORKLOAD */\ + M(DROP_RESOURCE, "", GLOBAL, DROP) /* allows to execute DROP RESOURCE */\ M(DROP_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_ADMIN) /* allows to execute DROP NAMED COLLECTION */\ M(DROP, "", GROUP, ALL) /* allows to execute {DROP|DETACH} */\ \ @@ -159,6 +163,7 @@ enum class AccessType : uint8_t M(SYSTEM_SHUTDOWN, "SYSTEM KILL, SHUTDOWN", GLOBAL, SYSTEM) \ M(SYSTEM_DROP_DNS_CACHE, "SYSTEM DROP DNS, DROP DNS CACHE, DROP DNS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_CONNECTIONS_CACHE, "SYSTEM DROP CONNECTIONS CACHE, DROP CONNECTIONS CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_PREWARM_MARK_CACHE, "SYSTEM PREWARM MARK, PREWARM MARK CACHE, PREWARM MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_MARK_CACHE, "SYSTEM DROP MARK, DROP MARK CACHE, DROP MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_UNCOMPRESSED_CACHE, "SYSTEM DROP UNCOMPRESSED, DROP UNCOMPRESSED CACHE, DROP UNCOMPRESSED", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_MMAP_CACHE, "SYSTEM DROP MMAP, DROP MMAP CACHE, DROP MMAP", GLOBAL, SYSTEM_DROP_CACHE) \ @@ -193,6 +198,7 @@ enum class AccessType : uint8_t M(SYSTEM_SENDS, "SYSTEM STOP SENDS, SYSTEM START SENDS, STOP SENDS, START SENDS", GROUP, SYSTEM) \ M(SYSTEM_REPLICATION_QUEUES, "SYSTEM STOP REPLICATION QUEUES, SYSTEM START REPLICATION QUEUES, STOP REPLICATION QUEUES, START REPLICATION QUEUES", TABLE, SYSTEM) \ M(SYSTEM_VIRTUAL_PARTS_UPDATE, "SYSTEM STOP VIRTUAL PARTS UPDATE, SYSTEM START VIRTUAL PARTS UPDATE, STOP VIRTUAL PARTS UPDATE, START VIRTUAL PARTS UPDATE", TABLE, SYSTEM) \ + M(SYSTEM_REDUCE_BLOCKING_PARTS, "SYSTEM STOP REDUCE BLOCKING PARTS, SYSTEM START REDUCE BLOCKING PARTS, STOP REDUCE BLOCKING PARTS, START REDUCE BLOCKING PARTS", TABLE, SYSTEM) \ M(SYSTEM_DROP_REPLICA, "DROP REPLICA", TABLE, SYSTEM) \ M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \ M(SYSTEM_REPLICA_READINESS, "SYSTEM REPLICA READY, SYSTEM REPLICA UNREADY", GLOBAL, SYSTEM) \ @@ -237,6 +243,9 @@ enum class AccessType : uint8_t M(S3, "", GLOBAL, SOURCES) \ M(HIVE, "", GLOBAL, SOURCES) \ M(AZURE, "", 
GLOBAL, SOURCES) \ + M(KAFKA, "", GLOBAL, SOURCES) \ + M(NATS, "", GLOBAL, SOURCES) \ + M(RABBITMQ, "", GLOBAL, SOURCES) \ M(SOURCES, "", GROUP, ALL) \ \ M(CLUSTER, "", GLOBAL, ALL) /* ON CLUSTER queries */ \ diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 949fd37e403..06e89d78339 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -52,7 +52,10 @@ namespace {AccessType::HDFS, "HDFS"}, {AccessType::S3, "S3"}, {AccessType::HIVE, "Hive"}, - {AccessType::AZURE, "AzureBlobStorage"} + {AccessType::AZURE, "AzureBlobStorage"}, + {AccessType::KAFKA, "Kafka"}, + {AccessType::NATS, "NATS"}, + {AccessType::RABBITMQ, "RabbitMQ"} }; @@ -701,15 +704,17 @@ bool ContextAccess::checkAccessImplHelper(const ContextPtr & context, AccessFlag const AccessFlags dictionary_ddl = AccessType::CREATE_DICTIONARY | AccessType::DROP_DICTIONARY; const AccessFlags function_ddl = AccessType::CREATE_FUNCTION | AccessType::DROP_FUNCTION; + const AccessFlags workload_ddl = AccessType::CREATE_WORKLOAD | AccessType::DROP_WORKLOAD; + const AccessFlags resource_ddl = AccessType::CREATE_RESOURCE | AccessType::DROP_RESOURCE; const AccessFlags table_and_dictionary_ddl = table_ddl | dictionary_ddl; const AccessFlags table_and_dictionary_and_function_ddl = table_ddl | dictionary_ddl | function_ddl; const AccessFlags write_table_access = AccessType::INSERT | AccessType::OPTIMIZE; const AccessFlags write_dcl_access = AccessType::ACCESS_MANAGEMENT - AccessType::SHOW_ACCESS; - const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_and_function_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; + const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_and_function_ddl | workload_ddl | resource_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; const AccessFlags not_readonly_1_flags = AccessType::CREATE_TEMPORARY_TABLE; - const AccessFlags ddl_flags = table_ddl | dictionary_ddl | function_ddl; + const AccessFlags ddl_flags = table_ddl | dictionary_ddl | function_ddl | workload_ddl | resource_ddl; const AccessFlags introspection_flags = AccessType::INTROSPECTION; }; static const PrecalculatedFlags precalc; diff --git a/src/Access/Credentials.h b/src/Access/Credentials.h index f220b8d2c48..b21b7e6921f 100644 --- a/src/Access/Credentials.h +++ b/src/Access/Credentials.h @@ -15,6 +15,9 @@ public: explicit Credentials() = default; explicit Credentials(const String & user_name_); + Credentials(const Credentials &) = default; + Credentials(Credentials &&) = default; + virtual ~Credentials() = default; const String & getUserName() const; diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 3249d89ba87..72e0933e214 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -554,7 +554,7 @@ std::optional IAccessStorage::authenticateImpl( continue; } - if (areCredentialsValid(user->getName(), user->valid_until, auth_method, credentials, external_authenticators, auth_result.settings)) + if (areCredentialsValid(user->getName(), auth_method, credentials, external_authenticators, auth_result.settings)) { auth_result.authentication_data = auth_method; return auth_result; @@ -579,7 +579,6 @@ std::optional IAccessStorage::authenticateImpl( bool IAccessStorage::areCredentialsValid( const std::string & user_name, - time_t valid_until, const AuthenticationData & authentication_method, const Credentials & credentials, const ExternalAuthenticators & 
external_authenticators, @@ -591,6 +590,7 @@ bool IAccessStorage::areCredentialsValid( if (credentials.getUserName() != user_name) return false; + auto valid_until = authentication_method.getValidUntil(); if (valid_until) { const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index 84cbdd0a751..4e2b27a1864 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -236,7 +236,6 @@ protected: bool allow_plaintext_password) const; virtual bool areCredentialsValid( const std::string & user_name, - time_t valid_until, const AuthenticationData & authentication_method, const Credentials & credentials, const ExternalAuthenticators & external_authenticators, diff --git a/src/Access/RoleCache.h b/src/Access/RoleCache.h index 75d1fd32685..b707a05346f 100644 --- a/src/Access/RoleCache.h +++ b/src/Access/RoleCache.h @@ -22,6 +22,10 @@ public: const std::vector & current_roles, const std::vector & current_roles_with_admin_option); + std::shared_ptr getEnabledRoles( + boost::container::flat_set current_roles, + boost::container::flat_set current_roles_with_admin_option); + private: using SubscriptionsOnRoles = std::vector>; diff --git a/src/Access/SettingsConstraints.cpp b/src/Access/SettingsConstraints.cpp index 4305568dd8b..cdf3dac192e 100644 --- a/src/Access/SettingsConstraints.cpp +++ b/src/Access/SettingsConstraints.cpp @@ -223,8 +223,8 @@ void SettingsConstraints::clamp(const Settings & current_settings, SettingsChang }); } -template -bool getNewValueToCheck(const T & current_settings, SettingChange & change, Field & new_value, bool throw_on_failure) +template +bool getNewValueToCheck(const SettingsT & current_settings, SettingChange & change, Field & new_value, bool throw_on_failure) { Field current_value; bool has_current_value = current_settings.tryGet(change.name, current_value); @@ -234,12 +234,12 @@ bool getNewValueToCheck(const T & current_settings, SettingChange & change, Fiel return false; if (throw_on_failure) - new_value = T::castValueUtil(change.name, change.value); + new_value = SettingsT::castValueUtil(change.name, change.value); else { try { - new_value = T::castValueUtil(change.name, change.value); + new_value = SettingsT::castValueUtil(change.name, change.value); } catch (...) 
{ diff --git a/src/Access/User.cpp b/src/Access/User.cpp index 887abc213f9..1c92f467003 100644 --- a/src/Access/User.cpp +++ b/src/Access/User.cpp @@ -19,8 +19,7 @@ bool User::equal(const IAccessEntity & other) const return (authentication_methods == other_user.authentication_methods) && (allowed_client_hosts == other_user.allowed_client_hosts) && (access == other_user.access) && (granted_roles == other_user.granted_roles) && (default_roles == other_user.default_roles) - && (settings == other_user.settings) && (grantees == other_user.grantees) && (default_database == other_user.default_database) - && (valid_until == other_user.valid_until); + && (settings == other_user.settings) && (grantees == other_user.grantees) && (default_database == other_user.default_database); } void User::setName(const String & name_) @@ -88,7 +87,6 @@ void User::clearAllExceptDependencies() access = {}; settings.removeSettingsKeepProfiles(); default_database = {}; - valid_until = 0; } } diff --git a/src/Access/User.h b/src/Access/User.h index 03d62bf2277..f54e74a305d 100644 --- a/src/Access/User.h +++ b/src/Access/User.h @@ -23,7 +23,6 @@ struct User : public IAccessEntity SettingsProfileElements settings; RolesOrUsersSet grantees = RolesOrUsersSet::AllTag{}; String default_database; - time_t valid_until = 0; bool equal(const IAccessEntity & other) const override; std::shared_ptr clone() const override { return cloneImpl(); } diff --git a/src/Access/tests/gtest_access_rights_ops.cpp b/src/Access/tests/gtest_access_rights_ops.cpp index 902fc949840..41567905a10 100644 --- a/src/Access/tests/gtest_access_rights_ops.cpp +++ b/src/Access/tests/gtest_access_rights_ops.cpp @@ -284,7 +284,8 @@ TEST(AccessRights, Union) "CREATE DICTIONARY, DROP DATABASE, DROP TABLE, DROP VIEW, DROP DICTIONARY, UNDROP TABLE, " "TRUNCATE, OPTIMIZE, BACKUP, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, " "SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, " - "SYSTEM MOVES, SYSTEM PULLING REPLICATION LOG, SYSTEM CLEANUP, SYSTEM VIEWS, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM VIRTUAL PARTS UPDATE, " + "SYSTEM MOVES, SYSTEM PULLING REPLICATION LOG, SYSTEM CLEANUP, SYSTEM VIEWS, SYSTEM SENDS, " + "SYSTEM REPLICATION QUEUES, SYSTEM VIRTUAL PARTS UPDATE, SYSTEM REDUCE BLOCKING PARTS, " "SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, " "SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, " "SYSTEM UNLOAD PRIMARY KEY, dictGet ON db1.*, GRANT TABLE ENGINE ON db1, " diff --git a/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp b/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp index 86f7661e53f..061a1e519e1 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp @@ -59,13 +59,13 @@ constexpr size_t group_array_sorted_sort_strategy_max_elements_threshold = 10000 template struct GroupArraySortedData { + static constexpr bool is_value_generic_field = std::is_same_v; + using Allocator = MixedAlignedArenaAllocator; - using Array = PODArray; + using Array = typename std::conditional_t, PODArray>; static constexpr size_t partial_sort_max_elements_factor = 2; - static constexpr bool is_value_generic_field = std::is_same_v; - Array values; static bool compare(const T & lhs, const T & rhs) @@ -144,7 +144,7 @@ struct GroupArraySortedData } if (values.size() > max_elements) - values.resize(max_elements, arena); + resize(max_elements, arena); } 
ALWAYS_INLINE void partialSortAndLimitIfNeeded(size_t max_elements, Arena * arena) @@ -153,7 +153,23 @@ struct GroupArraySortedData return; ::nth_element(values.begin(), values.begin() + max_elements, values.end(), Comparator()); - values.resize(max_elements, arena); + resize(max_elements, arena); + } + + ALWAYS_INLINE void resize(size_t n, Arena * arena) + { + if constexpr (is_value_generic_field) + values.resize(n); + else + values.resize(n, arena); + } + + ALWAYS_INLINE void push_back(T && element, Arena * arena) + { + if constexpr (is_value_generic_field) + values.push_back(element); + else + values.push_back(element, arena); } ALWAYS_INLINE void addElement(T && element, size_t max_elements, Arena * arena) @@ -171,12 +187,12 @@ struct GroupArraySortedData return; } - values.push_back(std::move(element), arena); + push_back(std::move(element), arena); std::push_heap(values.begin(), values.end(), Comparator()); } else { - values.push_back(std::move(element), arena); + push_back(std::move(element), arena); partialSortAndLimitIfNeeded(max_elements, arena); } } @@ -210,14 +226,6 @@ struct GroupArraySortedData result_array_data[result_array_data_insert_begin + i] = values[i]; } } - - ~GroupArraySortedData() - { - for (auto & value : values) - { - value.~T(); - } - } }; template @@ -313,14 +321,12 @@ public: throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size, it should not exceed {}", max_elements); auto & values = this->data(place).values; - values.resize_exact(size, arena); - if constexpr (std::is_same_v) + if constexpr (Data::is_value_generic_field) { + values.resize(size); for (Field & element : values) { - /// We must initialize the Field type since some internal functions (like operator=) use them - new (&element) Field; bool has_value = false; readBinary(has_value, buf); if (has_value) @@ -329,6 +335,7 @@ public: } else { + values.resize_exact(size, arena); if constexpr (std::endian::native == std::endian::little) { buf.readStrict(reinterpret_cast(values.data()), size * sizeof(values[0])); diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.h b/src/AggregateFunctions/AggregateFunctionQuantile.h index 423fd4bc569..aa6755f237d 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantile.h +++ b/src/AggregateFunctions/AggregateFunctionQuantile.h @@ -312,6 +312,9 @@ struct NameQuantilesExactInclusive { static constexpr auto name = "quantilesExac struct NameQuantileExactWeighted { static constexpr auto name = "quantileExactWeighted"; }; struct NameQuantilesExactWeighted { static constexpr auto name = "quantilesExactWeighted"; }; +struct NameQuantileExactWeightedInterpolated { static constexpr auto name = "quantileExactWeightedInterpolated"; }; +struct NameQuantilesExactWeightedInterpolated { static constexpr auto name = "quantilesExactWeightedInterpolated"; }; + struct NameQuantileInterpolatedWeighted { static constexpr auto name = "quantileInterpolatedWeighted"; }; struct NameQuantilesInterpolatedWeighted { static constexpr auto name = "quantilesInterpolatedWeighted"; }; diff --git a/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp b/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp index 469abdf45a2..116b04bf4ba 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp @@ -1,13 +1,14 @@ -#include #include +#include #include +#include #include #include -#include - #include #include +#include + namespace DB { @@ -29,7 +30,7 @@ 
namespace * It uses O(distinct(N)) memory. Can be naturally applied for values with weight. * In case of many identical values, it can be more efficient than QuantileExact even when weight is not used. */ -template +template struct QuantileExactWeighted { struct Int128Hash @@ -46,6 +47,7 @@ struct QuantileExactWeighted /// When creating, the hash table must be small. using Map = HashMapWithStackMemory; + using Pair = typename Map::value_type; Map map; @@ -58,8 +60,18 @@ struct QuantileExactWeighted void add(const Value & x, Weight weight) { - if (!isNaN(x)) - map[x] += weight; + if constexpr (!interpolated) + { + /// Keep compatibility for function quantilesExactWeighted. + if (!isNaN(x)) + map[x] += weight; + } + else + { + /// Ignore values with zero weight in function quantilesExactWeightedInterpolated. + if (!isNaN(x) && weight) + map[x] += weight; + } } void merge(const QuantileExactWeighted & rhs) @@ -85,6 +97,43 @@ struct QuantileExactWeighted /// Get the value of the `level` quantile. The level must be between 0 and 1. Value get(Float64 level) const + { + if constexpr (interpolated) + return getInterpolatedImpl(level); + else + return getImpl(level); + } + + /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address. + /// indices - an array of index levels such that the corresponding elements will go in ascending order. + void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + { + if constexpr (interpolated) + getManyInterpolatedImpl(levels, indices, num_levels, result); + else + getManyImpl(levels, indices, num_levels, result); + } + + Float64 getFloat(Float64 level) const + { + if constexpr (interpolated) + return getFloatInterpolatedImpl(level); + else + return getFloatImpl(level); + } + + void getManyFloat(const Float64 * levels, const size_t * indices, size_t num_levels, Float64 * result) const + { + if constexpr (interpolated) + getManyFloatInterpolatedImpl(levels, indices, num_levels, result); + else + getManyFloatImpl(levels, indices, num_levels, result); + } + +private: + /// get implementation without interpolation + Value getImpl(Float64 level) const + requires(!interpolated) { size_t size = map.size(); @@ -92,7 +141,6 @@ struct QuantileExactWeighted return std::numeric_limits::quiet_NaN(); /// Copy the data to a temporary array to get the element you need in order. - using Pair = typename Map::value_type; std::unique_ptr array_holder(new Pair[size]); Pair * array = array_holder.get(); @@ -135,9 +183,9 @@ struct QuantileExactWeighted return it->first; } - /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address. - /// indices - an array of index levels such that the corresponding elements will go in ascending order. - void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + /// getMany implementation without interpolation + void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + requires(!interpolated) { size_t size = map.size(); @@ -149,7 +197,6 @@ struct QuantileExactWeighted } /// Copy the data to a temporary array to get the element you need in order. - using Pair = typename Map::value_type; std::unique_ptr array_holder(new Pair[size]); Pair * array = array_holder.get(); @@ -197,23 +244,165 @@ struct QuantileExactWeighted } } - /// The same, but in the case of an empty state, NaN is returned. 
- Float64 getFloat(Float64) const + /// getFloat implementation without interpolation + Float64 getFloatImpl(Float64) const + requires(!interpolated) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileExact"); } - void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const + /// getManyFloat implementation without interpolation + void getManyFloatImpl(const Float64 *, const size_t *, size_t, Float64 *) const + requires(!interpolated) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileExact"); } + + /// get implementation with interpolation + Value getInterpolatedImpl(Float64 level) const + requires(interpolated) + { + size_t size = map.size(); + if (0 == size) + return Value(); + + Float64 res = getFloatInterpolatedImpl(level); + if constexpr (is_decimal) + return Value(static_cast(res)); + else + return static_cast(res); + } + + /// getMany implementation with interpolation + void getManyInterpolatedImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + requires(interpolated) + { + size_t size = map.size(); + if (0 == size) + { + for (size_t i = 0; i < num_levels; ++i) + result[i] = Value(); + return; + } + + std::unique_ptr res_holder(new Float64[num_levels]); + Float64 * res = res_holder.get(); + getManyFloatInterpolatedImpl(levels, indices, num_levels, res); + for (size_t i = 0; i < num_levels; ++i) + { + if constexpr (is_decimal) + result[i] = Value(static_cast(res[i])); + else + result[i] = Value(res[i]); + } + } + + /// getFloat implementation with interpolation + Float64 getFloatInterpolatedImpl(Float64 level) const + requires(interpolated) + { + size_t size = map.size(); + + if (0 == size) + return std::numeric_limits::quiet_NaN(); + + /// Copy the data to a temporary array to get the element you need in order. + std::unique_ptr array_holder(new Pair[size]); + Pair * array = array_holder.get(); + + size_t i = 0; + for (const auto & pair : map) + { + array[i] = pair.getValue(); + ++i; + } + + ::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::partial_sum(array, array + size, array, [](const Pair & acc, const Pair & p) { return Pair(p.first, acc.second + p.second); }); + Weight max_position = array[size - 1].second - 1; + Float64 position = max_position * level; + return quantileInterpolated(array, size, position); + } + + /// getManyFloat implementation with interpolation + void getManyFloatInterpolatedImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Float64 * result) const + requires(interpolated) + { + size_t size = map.size(); + if (0 == size) + { + for (size_t i = 0; i < num_levels; ++i) + result[i] = std::numeric_limits::quiet_NaN(); + return; + } + + /// Copy the data to a temporary array to get the element you need in order. 
+ std::unique_ptr array_holder(new Pair[size]); + Pair * array = array_holder.get(); + + size_t i = 0; + for (const auto & pair : map) + { + array[i] = pair.getValue(); + ++i; + } + + ::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::partial_sum(array, array + size, array, [](Pair acc, Pair & p) { return Pair(p.first, acc.second + p.second); }); + Weight max_position = array[size - 1].second - 1; + + for (size_t j = 0; j < num_levels; ++j) + { + Float64 position = max_position * levels[indices[j]]; + result[indices[j]] = quantileInterpolated(array, size, position); + } + } + + /// Calculate quantile, using linear interpolation between two closest values + Float64 NO_SANITIZE_UNDEFINED quantileInterpolated(const Pair * array, size_t size, Float64 position) const + requires(interpolated) + { + size_t lower = static_cast(std::floor(position)); + size_t higher = static_cast(std::ceil(position)); + + const auto * lower_it = std::lower_bound(array, array + size, lower + 1, [](const Pair & a, size_t b) { return a.second < b; }); + const auto * higher_it = std::lower_bound(array, array + size, higher + 1, [](const Pair & a, size_t b) { return a.second < b; }); + if (lower_it == array + size) + lower_it = array + size - 1; + if (higher_it == array + size) + higher_it = array + size - 1; + + UnderlyingType lower_key = lower_it->first; + UnderlyingType higher_key = higher_it->first; + + if (lower == higher || lower_key == higher_key) + return static_cast(lower_key); + + return (static_cast(higher) - position) * lower_key + (position - static_cast(lower)) * higher_key; + } }; -template using FuncQuantileExactWeighted = AggregateFunctionQuantile, NameQuantileExactWeighted, true, void, false, false>; -template using FuncQuantilesExactWeighted = AggregateFunctionQuantile, NameQuantilesExactWeighted, true, void, true, false>; +template +using FuncQuantileExactWeighted = AggregateFunctionQuantile< + Value, + QuantileExactWeighted, + std::conditional_t, + true, + std::conditional_t, + false, + false>; +template +using FuncQuantilesExactWeighted = AggregateFunctionQuantile< + Value, + QuantileExactWeighted, + std::conditional_t, + true, + std::conditional_t, + true, + false>; -template